# Kaggle - TMDB

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import math

In [2]:
# Load the training and test tables from the working directory.
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

### Original language

In [4]:
# Per-language score keyed by the `original_language` code.
# NOTE(review): presumably a precomputed mean log-revenue per language -- the
# computation is not part of this notebook; confirm the source of these values.
l_rev = dict(
    en=16.201695631547217,
    fr=13.718204235553607,
    ru=13.815132182879807,
    es=14.645970166012837,
    hi=15.371121660763546,
    ja=15.818050019285394,
    it=14.610307296701814,
    ko=14.561503498231747,
    cn=15.720496475312752,
    zh=15.246036823468886,
    de=14.583008872938295,
    ta=15.073328869838628,
)

In [5]:
# Encode each film's language through the `l_rev` lookup; languages that are
# not in the lookup all receive the same fallback value.
train_lang_score = train['original_language'].map(l_rev)
train['l_rev'] = train_lang_score.fillna(13.61844005781211)

### Collection

In [6]:
# `belongs_to_collection` holds a stringified list of dicts. Films without a
# collection get a placeholder entry with id 0, then the first entry's id is kept.
train_coll_raw = train['belongs_to_collection'].fillna('[{"id":0}]')
train['coll_id'] = train_coll_raw.map(lambda raw: ast.literal_eval(raw)[0]['id'])

In [8]:
# Collection-level lookup table; the first CSV column becomes the index, which
# is what `coll_id` values are matched against further down.
colls = pd.read_csv(filepath_or_buffer='collections.csv', index_col=0)

In [9]:
# Look up each film's collection in the `log_of_averages` column; ids with no
# match in the collections table end up as 0.
coll_log_of_averages = colls['log_of_averages']
train['coll_rev_logav'] = train['coll_id'].map(coll_log_of_averages).fillna(0)

### Genres

In [10]:
# Genre-level lookup table; the first CSV column becomes the index, which the
# genre ids are matched against below.
genres = pd.read_csv(filepath_or_buffer='genres.csv', index_col=0)

In [11]:
# Turn the stringified list of genre dicts into a plain list of genre ids.
# Films with no genre information get the single placeholder id -1.
train_genres_raw = train['genres'].fillna('[{"id":-1}]')
train['genres_id'] = train_genres_raw.map(
    lambda raw: [entry['id'] for entry in ast.literal_eval(raw)])

In [12]:
# Plain dict from the genres table's index to its `log_revenue` column, for
# direct per-id lookups.
genre_log_revenue = genres['log_revenue']
gen_rev = dict(genre_log_revenue)

In [13]:
# Mean log-revenue of the training films that have no genre information.
has_no_genre = train['genres'].isnull()
null_mean = train.loc[has_no_genre, 'revenue'].map(math.log).mean()

In [14]:
# Register the score for the -1 "no genre" placeholder id.
gen_rev.update({-1: null_mean})

In [15]:
# Average the per-genre scores over all genres attached to a film.
train['genre_ave'] = train['genres_id'].map(
    lambda ids: np.array([gen_rev[genre_id] for genre_id in ids]).mean()
)

### Production companies

In [30]:
# Turn the stringified list of production-company dicts into a list of company
# ids. Films with no company information get the single placeholder id -123.
train_companies_raw = train['production_companies'].fillna('[{"id":-123}]')
train['production_ids'] = train_companies_raw.map(
    lambda raw: [entry['id'] for entry in ast.literal_eval(raw)])

In [31]:
# Company-level lookup table, indexed by the first CSV column (the company id
# that `production_ids` entries are matched against); preview the first rows.
productions = pd.read_csv(filepath_or_buffer='production_companies.csv', index_col=0)
productions.head(5)

Unnamed: 0,revenue,counts,name
4,17.381079,161,Paramount Pictures
60,17.000037,44,United Artists
8411,16.413752,84,Metro-Goldwyn-Mayer (MGM)
2,18.857807,62,Walt Disney Pictures
2266,15.829551,4,Bold Films


In [32]:
# Plain dict from company id (the table index) to its `revenue` column value.
company_revenue = productions['revenue']
prod_rev = dict(company_revenue)

In [39]:
# Mean log-revenue of training films whose company list is just the -123
# placeholder, i.e. films with no production-company information.
only_placeholder = train['production_ids'].map(
    lambda ids: len(ids) == 1 and -123 in ids)
train[only_placeholder]['revenue'].map(math.log).mean()

12.462054733899363

In [40]:
# Score for the -123 "no production company" placeholder: the mean log-revenue
# of training films whose company list is only that placeholder.
# Computed from the data rather than pasted from a previous cell's printed
# output, so it cannot go stale when train.csv or the preprocessing changes.
no_company_rows = train['production_ids'].map(
    lambda ids: len(ids) == 1 and -123 in ids)
prod_rev[-123] = train.loc[no_company_rows, 'revenue'].map(math.log).mean()

In [41]:
def _train_company_scores(ids):
    """Return the `prod_rev` scores of the given company ids as an array."""
    return np.array([prod_rev[company_id] for company_id in ids])


# Two per-film summaries of the company scores: their mean and their maximum.
train['prod_ave'] = train['production_ids'].map(lambda ids: _train_company_scores(ids).mean())
train['prod_top'] = train['production_ids'].map(lambda ids: _train_company_scores(ids).max())

### Fitting

In [42]:
# Gradient-boosted regressor with default hyper-parameters.
# The seed is fixed so that the cross-validation scores below and the final
# submission are reproducible from a fresh kernel; without it the estimator
# draws its randomness from the global NumPy state.
boost = GradientBoostingRegressor(random_state=0)

In [43]:
# Regression target: natural log of the revenue.
y = train['revenue'].apply(math.log)

In [67]:
# Baseline feature set: raw popularity/budget plus the language, collection
# and genre encodings built above.
baseline_features = ['popularity', 'budget', 'l_rev', 'coll_rev_logav', 'genre_ave']
X = train[baseline_features]

In [69]:
# 10-fold cross-validated MSE on the log target (the scorer returns negated
# MSE, hence the sign flip).
baseline_fold_scores = cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error')
-baseline_fold_scores.mean()

4.5302109421793

In [70]:
# Baseline features plus the mean production-company score.
prod_ave_features = ['popularity', 'budget', 'l_rev', 'coll_rev_logav', 'genre_ave', 'prod_ave']
X = train[prod_ave_features]
prod_ave_fold_scores = cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error')
-prod_ave_fold_scores.mean()

2.005956411507207

In [71]:
# Baseline features plus the maximum production-company score.
prod_top_features = ['popularity', 'budget', 'l_rev', 'coll_rev_logav', 'genre_ave', 'prod_top']
X = train[prod_top_features]
prod_top_fold_scores = cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error')
-prod_top_fold_scores.mean()

2.7981850984362717

In [72]:
# Baseline features plus both production-company summaries. This is the last
# assignment to `X`, so it is also the matrix the final model is fitted on.
prod_both_features = ['popularity', 'budget', 'l_rev', 'coll_rev_logav', 'genre_ave',
                      'prod_top', 'prod_ave']
X = train[prod_both_features]
prod_both_fold_scores = cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error')
-prod_both_fold_scores.mean()

1.9308145906055567

Expected score on Kaggle (square root of the best cross-validated MSE above):

In [73]:
# Root of the cross-validated MSE printed for the prod_top + prod_ave feature
# set (value pasted from that cell's output).
best_cv_mse = 1.9308145906055567
np.sqrt(best_cv_mse)

1.3895375455904588

### Test set

In [74]:
# Same language encoding as for the training set, with the same fallback value
# for languages missing from the lookup.
test_lang_score = test['original_language'].map(l_rev)
test['l_rev'] = test_lang_score.fillna(13.61844005781211)

In [75]:
# Same collection encoding as for the training set: first collection id
# (0 when absent), then its `log_of_averages` value (0 when not in the table).
test_coll_raw = test['belongs_to_collection'].fillna('[{"id":0}]')
test['coll_id'] = test_coll_raw.map(lambda raw: ast.literal_eval(raw)[0]['id'])
test_coll_lookup = colls['log_of_averages']
test['coll_rev_logav'] = test['coll_id'].map(test_coll_lookup).fillna(0)

In [76]:
# Same genre encoding as for the training set: list of genre ids (-1 when
# absent), then the mean of their `gen_rev` scores.
test_genres_raw = test['genres'].fillna('[{"id":-1}]')
test['genres_id'] = test_genres_raw.map(
    lambda raw: [entry['id'] for entry in ast.literal_eval(raw)])
test['genre_ave'] = test['genres_id'].map(
    lambda ids: np.array([gen_rev[genre_id] for genre_id in ids]).mean())

In [77]:
# Same company-id extraction as for the training set (-123 placeholder when
# the film has no production-company information).
test_companies_raw = test['production_companies'].fillna('[{"id":-123}]')
test['production_ids'] = test_companies_raw.map(
    lambda raw: [entry['id'] for entry in ast.literal_eval(raw)])

In [65]:
# Hand-picked company ids that receive the "no company" placeholder score.
# NOTE(review): this cell carries an older execution count than its neighbours
# and the `missing` loop further down assigns the same score to every id that
# lacks an entry -- it looks redundant; confirm before removing.
to_add = [9233, 7272, 7273, 6342, 36417, 36418, 7980, 11245]
prod_rev.update(dict.fromkeys(to_add, prod_rev[-123]))

In [81]:
# Every distinct production-company id that occurs anywhere in the test set ...
flat_test_ids = [company_id
                 for ids in test['production_ids'].values
                 for company_id in ids]
all_test_prod = list(set(flat_test_ids))
# ... filtered down to those that have no entry in the score lookup.
missing = [company_id for company_id in all_test_prod if company_id not in prod_rev]
len(missing)

3419

In [83]:
# Spot-check one of the company ids that has no entry in `prod_rev`.
missing[2]

16

In [87]:
# Give every company id without an entry the "no company" placeholder score,
# so the per-film lookups below cannot hit a KeyError.
prod_rev.update(dict.fromkeys(missing, prod_rev[-123]))

In [88]:
def _test_company_scores(ids):
    """Return the `prod_rev` scores of the given company ids as an array."""
    return np.array([prod_rev[company_id] for company_id in ids])


# Mean and maximum company score per test film, mirroring the training features.
test['prod_ave'] = test['production_ids'].map(lambda ids: _test_company_scores(ids).mean())
test['prod_top'] = test['production_ids'].map(lambda ids: _test_company_scores(ids).max())

In [89]:
# Test design matrix: same columns, in the same order, as the final training `X`.
test_features = ['popularity', 'budget', 'l_rev', 'coll_rev_logav', 'genre_ave',
                 'prod_top', 'prod_ave']
X_test = test[test_features]

In [90]:
# Use the sample submission as a template (indexed by film id) and preview it.
submit = pd.read_csv(filepath_or_buffer='sample_submission.csv', index_col='id')
submit.head(5)

Unnamed: 0_level_0,revenue
id,Unnamed: 1_level_1
3001,1000000
3002,1000000
3003,1000000
3004,1000000
3005,1000000


In [91]:
# Fit on the full training matrix, then predict log-revenue for the test films
# (`fit` returns the fitted estimator, so the calls can be chained).
pred = boost.fit(X, y).predict(X_test)

In [92]:
# Undo the log transform applied to the target so predictions are on the
# revenue scale again.
# NOTE(review): this overwrites `pred` with a function of itself, so running
# the cell twice exponentiates twice -- re-run the fit/predict cell first.
pred = np.exp(pred)

In [93]:
# Overwrite the template's revenue column with the predictions and save it.
# NOTE(review): assignment is positional -- this assumes the rows of test.csv
# and sample_submission.csv are in the same order; confirm.
submission_path = '2302_prod.csv'
submit['revenue'] = pred
submit.to_csv(submission_path)