# Kaggle - TMDB

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import math

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

### Original language

In [4]:
l_rev = {'en': 16.201695631547217,
 'fr': 13.718204235553607,
 'ru': 13.815132182879807,
 'es': 14.645970166012837,
 'hi': 15.371121660763546,
 'ja': 15.818050019285394,
 'it': 14.610307296701814,
 'ko': 14.561503498231747,
 'cn': 15.720496475312752,
 'zh': 15.246036823468886,
 'de': 14.583008872938295,
 'ta': 15.073328869838628,
 'sv': 13.405171677584297}

train['l_rev'] = train['original_language'].map(l_rev).fillna(0)

### Collection

In [5]:
train['coll_id'] = train['belongs_to_collection'].fillna('[{"id":0}]').map(
    lambda x: (ast.literal_eval(x))[0]['id'])
colls = pd.read_csv('collections.csv',index_col=0)
train['coll_rev_logav'] = train['coll_id'].map(colls['log_of_averages']).fillna(0)

### genres

In [6]:
genres = pd.read_csv('genres.csv',index_col=0)
train['genres_id'] = train['genres'].fillna('[{"id":-1}]').map(
    lambda x: [i['id'] for i in (ast.literal_eval(x))])
gen_rev = dict(genres['log_revenue'])
gen_rev[-1] = 0
train['genre_ave'] = train['genres_id'].map(lambda x: np.array([gen_rev[g] for g in x]).mean())

### production company

In [7]:
train['production_ids'] = train['production_companies'].fillna('[{"id":-123}]').map(
    lambda x: [i['id'] for i in (ast.literal_eval(x))])
productions = pd.read_csv('production_companies_short.csv',index_col=0)
prod_rev = dict(productions['revenue'])
train['production_revs'] = train['production_ids'].map(lambda x: ([prod_rev[p] for p in x if p in prod_rev]))
train['prod_ave'] = train['production_revs'].map(lambda x: np.array(x).mean() if len(x)>0 else 0)
train['prod_top'] = train['production_revs'].map(lambda x: np.array(x).max() if len(x)>0 else 0)

### release_date

In [8]:
train['year'] = train['release_date'].map(lambda x: (x.split('/'))[2])
train['year'] = train['year'].astype(int)
train['year'] = train['year'].map(lambda x: x+1900 if x>17 else x+2000)

### runtime

In [9]:
train['runtime'] = train['runtime'].fillna(107)

### spoken language

In [24]:
train['spoken'] = train['spoken_languages'].fillna('[{"iso_639_1":"nan"}]').map(lambda x: [a['iso_639_1'] for a in ast.literal_eval(x)])

In [27]:
train['spoken'].map(lambda x: 'en' in x).sum()

2618

In [85]:
limit=5

uniques = []
for i in train['spoken']:
    uniques.extend(i)
uniques = set(uniques)

counts = {}
for u in uniques:
    c = train['spoken'].map(lambda x: u in x).sum()
    counts[u] = c
    
spoken = pd.DataFrame.from_dict(counts,orient='index',columns=['count'])

spoken_names = []
for u in counts:
    if (counts[u]> limit):
        name = f'spoken_{u}'
        spoken_names.append(name)
        train[name] = train['spoken'].map(lambda x: u in x).astype(int)

In [62]:
train.head(3)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,spoken_ar,spoken_de,spoken_en,spoken_cn,spoken_fr,spoken_zh,spoken_es,spoken_hi,spoken_ja,spoken_ta
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,0,0,1,0,0,0,0,0,0,0
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,0,0,1,0,0,0,0,0,0,0
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,0,0,1,0,0,0,0,0,0,0


### Fitting

In [63]:
boost = GradientBoostingRegressor()

In [64]:
y = train['revenue'].map(math.log)

In [65]:
# best score so far (reference)
columns = ['popularity','budget',
           'l_rev',
           'coll_rev_logav',
           'genre_ave',
          'prod_top','prod_ave',
           'year', 'runtime']

X = train[columns]
-cross_val_score(boost,X,y,cv=10,scoring='neg_mean_squared_error').mean()

3.986956713697304

In [66]:
columns = ['popularity','budget',
           'l_rev',
           'coll_rev_logav',
           'genre_ave',
          'prod_top','prod_ave',
           'year', 'runtime',
          *spoken_names]

X = train[columns]
-cross_val_score(boost,X,y,cv=10,scoring='neg_mean_squared_error').mean()

3.973993086328734

### Test set

In [67]:
test['l_rev'] = test['original_language'].map(l_rev).fillna(13.61844005781211)

In [68]:
test['coll_id'] = test['belongs_to_collection'].fillna('[{"id":0}]').map(lambda x: (ast.literal_eval(x))[0]['id'])
test['coll_rev_logav'] = test['coll_id'].map(colls['log_of_averages']).fillna(0)

In [69]:
test['genres_id'] = test['genres'].fillna('[{"id":-1}]').map(lambda x: [i['id'] for i in (ast.literal_eval(x))])
test['genre_ave'] = test['genres_id'].map(lambda x: np.array([gen_rev[g] for g in x]).mean())

In [70]:
test['production_ids'] = test['production_companies'].fillna('[{"id":-123}]').map(
    lambda x: [i['id'] for i in (ast.literal_eval(x))])

In [71]:
test['production_revs'] = test['production_ids'].map(lambda x: ([prod_rev[p] for p in x if p in prod_rev]))

In [72]:
test['prod_ave'] = test['production_revs'].map(lambda x: np.array(x).mean() if len(x)>0 else 0)
test['prod_top'] = test['production_revs'].map(lambda x: np.array(x).max() if len(x)>0 else 0)

In [73]:
# missing value - I filled with wikipedia info
test.loc[828,'release_date'] = '05/01/00'

In [74]:
test['year'] = test['release_date'].map(lambda x: (x.split('/'))[2])
test['year'] = test['year'].astype(int)
test['year'] = test['year'].map(lambda x: x+1900 if x>17 else x+2000)

In [75]:
test['runtime'] = test['runtime'].fillna(107)

In [86]:
test['spoken'] = test['spoken_languages'].fillna('[{"iso_639_1":"nan"}]').map(lambda x: [a['iso_639_1'] for a in ast.literal_eval(x)])
for u in counts:
    if (counts[u]> limit):
        name = f'spoken_{u}'
        spoken_names.append(name)
        test[name] = test['spoken'].map(lambda x: u in x).astype(int)

In [77]:
submit = pd.read_csv('sample_submission.csv',index_col='id')
submit.head()

Unnamed: 0_level_0,revenue
id,Unnamed: 1_level_1
3001,1000000
3002,1000000
3003,1000000
3004,1000000
3005,1000000


In [87]:
columns = ['popularity','budget',
           'l_rev',
           'coll_rev_logav',
           'genre_ave',
          'prod_top','prod_ave',
           'year', 'runtime',
          *spoken_names]

X = train[columns]

X_test = test[columns]

boost.fit(X,y)
pred = boost.predict(X_test)
pred = np.exp(pred)
submit['revenue'] = pred
submit.to_csv('0803-5_spok.csv')