In [23]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import xgboost as xgb
import lightgbm as lgb
from sklearn import model_selection
from sklearn.metrics import accuracy_score, mean_squared_error, mean_squared_log_error
import json
import ast
import eli5
from catboost import CatBoostRegressor
from urllib.request import urlopen
from PIL import Image
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

#import shap
#stop = set(stopwords.words('english'))

## Helper functions

In [2]:
def fix_date(x):
    """
    Fixes dates which are in 20xx
    """
    year = x.split('/')[2]
    if int(year) <= 19:
        return x[:-2] + '20' + year
    else:
        return x[:-2] + '19' + year
    
def text_to_dict(df):
    dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
            'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']
    for column in dict_columns:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df

## Feature engg function

In [3]:
def fea_engg(df, top_genres, top_companies, top_countries, top_languages, top_keywords, top_cast_names, top_cast_characters,
            top_crew_names,top_crew_jobs,top_crew_departments):

    
    # -------------------- step 2 - collection ------------------------------ #
    # if it has collection name, get the collection name else add 0
    df['collection_name'] = df['belongs_to_collection'].apply(lambda x: x[0]['name'] if x != {} else 0)
    # if it has collection, give length else return 0
    df['has_collection'] = df['belongs_to_collection'].apply(lambda x: len(x) if x != {} else 0)
    # drop the column we used
    df = df.drop(['belongs_to_collection'], axis=1)
    
    # -------------------- step 3 - genres ------------------------------ #
    # get count of all genres
    df['num_genres'] = df['genres'].apply(lambda x: len(x) if x != {} else 0)
    # join all the genres together to form a word. 
    df['all_genres'] = df['genres'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
    # form seperate column for each genres and set one if the genere is present
    for g in top_genres:
        df['genre_' + g] = df['all_genres'].apply(lambda x: 1 if g in x else 0)
    # drop the column we used
    df = df.drop(['genres'], axis=1)
    
    # -------------------- step 4 - companies ------------------------------ #
    # get count of all companies
    df['num_companies'] = df['production_companies'].apply(lambda x: len(x) if x != {} else 0)
    # join all companies
    df['all_production_companies'] = df['production_companies'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
    # form seperate column for each top companies and set one if that company is present
    for g in top_companies:
        df['production_company_' + g] = df['all_production_companies'].apply(lambda x: 1 if g in x else 0)
    # drop the column we used
    df = df.drop(['production_companies', 'all_production_companies'], axis=1)

    # -------------------- step 5 - coutries ------------------------------ #
    # get count of all countries
    df['num_countries'] = df['production_countries'].apply(lambda x: len(x) if x != {} else 0)
    # join all countries. 
    df['all_countries'] = df['production_countries'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
    # form seperate column for each top countries and set one if that country is present
    for g in top_countries:
        df['production_country_' + g] = df['all_countries'].apply(lambda x: 1 if g in x else 0)
    # drop the column we used
    df = df.drop(['production_countries', 'all_countries'], axis=1)
    
    # -------------------- step 6 - lang ------------------------------ #
    # get count of all lang
    df['num_languages'] = df['spoken_languages'].apply(lambda x: len(x) if x != {} else 0)
    # join all lang.
    df['all_languages'] = df['spoken_languages'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
    # form seperate column for each top lang and set one if that lang is present
    for g in top_languages:
        df['language_' + g] = df['all_languages'].apply(lambda x: 1 if g in x else 0)
    # drop the column we used
    df = df.drop(['spoken_languages', 'all_languages'], axis=1)

    # -------------------- step 7 - keyword ------------------------------ #
    # follow same procedure for keyword. you got the idea.
    df['num_Keywords'] = df['Keywords'].apply(lambda x: len(x) if x != {} else 0)
    df['all_Keywords'] = df['Keywords'].apply(lambda x: ' '.join(sorted([i['name'] for i in x])) if x != {} else '')
    for g in top_keywords:
        df['keyword_' + g] = df['all_Keywords'].apply(lambda x: 1 if g in x else 0)
    df = df.drop(['Keywords', 'all_Keywords'], axis=1)

    # -------------------- step 8 - cast ------------------------------ #
    df['num_cast'] = df['cast'].apply(lambda x: len(x) if x != {} else 0)
    for g in top_cast_names:
        df['cast_name_' + g] = df['cast'].apply(lambda x: 1 if g in str(x) else 0)
    df['genders_0_cast'] = df['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
    df['genders_1_cast'] = df['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
    df['genders_2_cast'] = df['cast'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))
    for g in top_cast_characters:
        df['cast_character_' + g] = df['cast'].apply(lambda x: 1 if g in str(x) else 0)
    df = df.drop(['cast'], axis=1)

    
    # -------------------- step 9 - crew gender, job, dept, name ------------------------------ #
    df['num_crew'] = df['crew'].apply(lambda x: len(x) if x != {} else 0)
    for g in top_crew_names:
        df['crew_name_' + g] = df['crew'].apply(lambda x: 1 if g in str(x) else 0)
    df['genders_0_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 0]))
    df['genders_1_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 1]))
    df['genders_2_crew'] = df['crew'].apply(lambda x: sum([1 for i in x if i['gender'] == 2]))
    for j in top_crew_jobs:
        df['jobs_' + j] = df['crew'].apply(lambda x: sum([1 for i in x if i['job'] == j]))
    for j in top_crew_departments:
        df['departments_' + j] = df['crew'].apply(lambda x: sum([1 for i in x if i['department'] == j])) 
    df = df.drop(['crew'], axis=1)

    # -------------------- step 10 - convert budget to log ------------------------------ #
    df['log_budget'] = np.log1p(df['budget'])

    # -------------------- step 11 - create has homepage varaible ------------------------------ #
    df['has_homepage'] = 0
    df.loc[df['homepage'].isnull() == False, 'has_homepage'] = 1
    
    # -------------------- step 12 - get release date and convert it to year with 4 character and date time------------------------------ #
    df['release_date'] = df['release_date'].apply(lambda x: '2/20/2015' if pd.isna(x) else fix_date(x))
    df['release_date'] = pd.to_datetime(df['release_date'])
    
    # -------------------- step 13 - get release date split it into several parts ------------------------------ #
    date_parts = ["year", "weekday", "month", 'weekofyear', 'day', 'quarter']
    for part in date_parts:
        part_col = 'release_date' + "_" + part
        df[part_col] = getattr(df['release_date'].dt, part).astype(int)
        
    # -------------------- step 14 - drop columns that are not used ------------------------------ #
    df = df.drop(['homepage', 'imdb_id', 'poster_path', 'release_date', 'status'], axis=1)
    
    # -------------------- step 15 - get length and number of words in title, tagline, overview, original_title for both test and train------------------------------ #
    for col in ['title', 'tagline', 'overview', 'original_title']:
        df['len_' + col] = df[col].fillna('').apply(lambda x: len(str(x)))
        df['words_' + col] = df[col].fillna('').apply(lambda x: len(str(x.split(' '))))
        df = df.drop(col, axis=1)
    
    return df

## Load the data & get top informations

In [5]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# -------------------- step 1 ------------------------------ #
# get dict columns which are like strings and convert to dict columns.

train = text_to_dict(train)
test = text_to_dict(test)

# ---------------- get list of generes and top geners -------------------#
list_of_genres = list(train['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_genres = [m[0] for m in Counter([i for j in list_of_genres for i in j]).most_common(15)]

# ---------------- get list of companies and top companies -------------------#
list_of_companies = list(train['production_companies'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_companies = [m[0] for m in Counter([i for j in list_of_companies for i in j]).most_common(30)]

# ---------------- get list of countries and top coutries -------------------#
list_of_countries = list(train['production_countries'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_countries = [m[0] for m in Counter([i for j in list_of_countries for i in j]).most_common(25)]

# ---------------- get list of lang and top lang -------------------#
list_of_languages = list(train['spoken_languages'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_languages = [m[0] for m in Counter([i for j in list_of_languages for i in j]).most_common(30)]

# ---------------- get list of keywords and top keywords -------------------#
list_of_keywords = list(train['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_keywords = [m[0] for m in Counter([i for j in list_of_keywords for i in j]).most_common(30)]

# ---------------- get list of cast name and top cast name -------------------#
list_of_cast_names = list(train['cast'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_cast_names = [m[0] for m in Counter([i for j in list_of_cast_names for i in j]).most_common(15)]

# ---------------- get list of cast character and top cast character -------------------#
list_of_cast_characters = list(train['cast'].apply(lambda x: [i['character'] for i in x] if x != {} else []).values)
top_cast_characters = [m[0] for m in Counter([i for j in list_of_cast_characters for i in j]).most_common(15)]

# ---------------- get list of crew names and top crew names -------------------#
list_of_crew_names = list(train['crew'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values)
top_crew_names = [m[0] for m in Counter([i for j in list_of_crew_names for i in j]).most_common(15)]

# ---------------- get list of crew jobs and top crew jobs -------------------#
list_of_crew_jobs = list(train['crew'].apply(lambda x: [i['job'] for i in x] if x != {} else []).values)
top_crew_jobs = [m[0] for m in Counter([i for j in list_of_crew_jobs for i in j]).most_common(15)]

# ---------------- get list of crew dept and top crew dept -------------------#
list_of_crew_departments = list(train['crew'].apply(lambda x: [i['department'] for i in x] if x != {} else []).values)
top_crew_departments = [m[0] for m in Counter([i for j in list_of_crew_departments for i in j]).most_common(15)]


## Feature Engg

In [6]:
train = fea_engg(train, top_genres, top_companies, top_countries, top_languages, top_keywords, top_cast_names, top_cast_characters,
                top_crew_names,top_crew_jobs,top_crew_departments)

test = fea_engg(test, top_genres, top_companies, top_countries, top_languages, top_keywords, top_cast_names, top_cast_characters,
               top_crew_names,top_crew_jobs,top_crew_departments)

In [7]:
# -------------------- step 16 - number of unique value in that column is just 1, then drop that column in both train and test ------------------------------ #
for col in train.columns:
    if train[col].nunique() == 1:
        #print(col)
        train = train.drop([col], axis=1)
        test = test.drop([col], axis=1)
        
# -------------------- step 17 - label encode test and train for below 3 columns ------------------------------ #
for col in ['original_language', 'collection_name', 'all_genres']:
    le = LabelEncoder()
    le.fit(list(train[col].fillna('')) + list(test[col].fillna('')))
    train[col] = le.transform(train[col].fillna('').astype(str))
    test[col] = le.transform(test[col].fillna('').astype(str))
    
# ---------------- step 18 - fix power 6 error in train data --------------------
power_six = train.id[train.budget > 1000][train.revenue < 100]

for k in power_six :
    train.loc[train['id'] == k,'revenue'] =  train.loc[train['id'] == k,'revenue'] * 1000000
    
# ---------------- step 19 - fix run time nan --------------------
train['runtime'] = train['runtime'].fillna(train['runtime'].mean())
test['runtime'] = test['runtime'].fillna(train['runtime'].mean())

0        90.0
1        65.0
2       100.0
3       130.0
4        92.0
5       121.0
6       119.0
7        77.0
8       120.0
9        92.0
10       88.0
11      112.0
12      109.0
13       88.0
14      114.0
15      100.0
16      101.0
17      119.0
18      123.0
19      194.0
20      100.0
21      101.0
22      105.0
23       99.0
24      105.0
25      128.0
26      100.0
27      128.0
28       84.0
29       94.0
        ...  
4368     98.0
4369    108.0
4370    100.0
4371    119.0
4372    126.0
4373     98.0
4374    130.0
4375    101.0
4376    144.0
4377     83.0
4378    122.0
4379     96.0
4380     86.0
4381    225.0
4382    100.0
4383    115.0
4384    114.0
4385     93.0
4386    104.0
4387     90.0
4388     81.0
4389     84.0
4390     85.0
4391    110.0
4392    126.0
4393    118.0
4394     95.0
4395    129.0
4396    100.0
4397     85.0
Name: runtime, Length: 4398, dtype: float64

## Form X and y, X_test

In [9]:
X = train.drop(['id', 'revenue'], axis=1)
y = np.log1p(train['revenue'])
X_test = test.drop(['id'], axis=1)

## Splitting training data into train and validation

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)

## Model 1: LGBMRegressor

In [30]:
params = {'num_leaves': 30,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 5,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}
model1 = lgb.LGBMRegressor(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)
model1.fit(X_train, y_train, 
        eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='rmse',
        verbose=1000, early_stopping_rounds=200)

Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[358]	training's rmse: 1.63645	valid_1's rmse: 1.82652


LGBMRegressor(bagging_fraction=0.9, bagging_freq=1, bagging_seed=11,
       boosting='gbdt', boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, feature_fraction=0.9, importance_type='split',
       lambda_l1=0.2, learning_rate=0.01, max_depth=5, metric='rmse',
       min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=20,
       min_split_gain=0.0, n_estimators=20000, n_jobs=-1, nthread=4,
       num_leaves=30, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0, verbosity=-1)

## Model 2: Linear Regression

In [14]:
model2 = LinearRegression().fit(X_train, y_train)
np.sqrt(mean_squared_error(y_valid, model2.predict(X_valid)))

2.0443409926791474

## Model 3: Ridge with grid search

In [28]:
params = {'alpha': np.logspace(-4, 5)}
gs = GridSearchCV(Ridge(), param_grid=params, cv=5, n_jobs=5).fit(X_train, y_train)
np.sqrt(mean_squared_error(y_valid, gs.predict(X_valid)))


Ill-conditioned matrix (rcond=7.46425e-18): result may not be accurate.



1.9976966389581265

## Create answer file after training with entire train data using the best selected model. 

In [31]:
model = lgb.LGBMRegressor(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)
#model = LinearRegression()
#model = Ridge(**gs.best_params_)
model.fit(X, y)

test_id = test.id
ans = pd.Series(np.exp(model.predict(X_test)),name = 'revenue')
pd.concat([test_id, ans], axis=1).to_csv(index = False, path_or_buf  = 'data/result.csv')