# Kaggle - TMDB

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import math

In [2]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

### Original language

In [4]:
# Hard-coded score per `original_language` code (presumably a mean log-revenue
# per language computed offline -- TODO confirm the provenance of the values).
# Languages not listed here receive a fallback constant where this is applied.
l_rev = {
    "en": 16.201695631547217,
    "fr": 13.718204235553607,
    "ru": 13.815132182879807,
    "es": 14.645970166012837,
    "hi": 15.371121660763546,
    "ja": 15.818050019285394,
    "it": 14.610307296701814,
    "ko": 14.561503498231747,
    "cn": 15.720496475312752,
    "zh": 15.246036823468886,
    "de": 14.583008872938295,
    "ta": 15.073328869838628,
}

In [5]:
# Look up each film's language score in l_rev; languages that are missing or
# not present in l_rev fall back to the constant 13.61844005781211.
train_lang_scores = train['original_language'].map(l_rev)
train['l_rev'] = train_lang_scores.fillna(13.61844005781211)

In [11]:
# One 0/1 indicator column per language listed in l_rev.
# NOTE: the column prefix 'langauge_' is misspelt but is kept as-is because the
# test-set cell and the l_cols list build the same names.
for lang_code in l_rev:
    train[f'langauge_{lang_code}'] = train['original_language'].map(
        lambda value: int(value == lang_code))

### Collection

In [13]:
# Extract the id of the first entry of `belongs_to_collection` (a stringified
# list of dicts); rows with no collection get the placeholder id 0.
train_collections = train['belongs_to_collection'].fillna('[{"id":0}]')
train['coll_id'] = train_collections.map(
    lambda cell: ast.literal_eval(cell)[0]['id'])

In [14]:
# Per-collection statistics, indexed by the CSV's first column
# (used below as a lookup keyed by collection id).
colls = pd.read_csv(
    'collections.csv',
    index_col=0,
)

In [15]:
# Map each collection id to the `log_of_averages` column of colls;
# ids that are not found in colls get 0.
train_coll_scores = train['coll_id'].map(colls['log_of_averages'])
train['coll_rev_logav'] = train_coll_scores.fillna(0)

### Genres

In [17]:
# Per-genre statistics, indexed by the CSV's first column
# (used below as a lookup keyed by genre id).
genres = pd.read_csv(
    'genres.csv',
    index_col=0,
)

In [18]:
# Parse the stringified list of genre dicts into a list of genre ids;
# rows with no genres become [-1].
train_genres_raw = train['genres'].fillna('[{"id":-1}]')
train['genres_id'] = train_genres_raw.map(
    lambda cell: [entry['id'] for entry in ast.literal_eval(cell)])

In [19]:
# Plain-dict lookup from the index of genres to its `log_revenue` value.
genre_log_revenue = genres['log_revenue']
gen_rev = dict(genre_log_revenue)

In [20]:
# Mean natural-log revenue over the training rows that have no genres.
rows_without_genres = train['genres'].isnull()
null_mean = train.loc[rows_without_genres, 'revenue'].map(math.log).mean()

In [21]:
# Register the "no genres" placeholder id (-1) so it can be looked up
# like any other genre id.
gen_rev.update({-1: null_mean})

In [22]:
# Average the gen_rev values of all of a film's genre ids.
# A genre id that is missing from gen_rev raises KeyError here.
train['genre_ave'] = train['genres_id'].map(
    lambda ids: np.mean([gen_rev[genre_id] for genre_id in ids]))

In [25]:
# One 0/1 indicator column per key of gen_rev (this includes the -1
# placeholder, which yields the column 'genre_-1').
for genre_id in gen_rev:
    train[f'genre_{genre_id}'] = train['genres_id'].map(
        lambda ids: int(genre_id in ids))

### Production company

In [27]:
# Parse the stringified list of production-company dicts into a list of ids;
# rows with no companies become [-123].
train_companies_raw = train['production_companies'].fillna('[{"id":-123}]')
train['production_ids'] = train_companies_raw.map(
    lambda cell: [entry['id'] for entry in ast.literal_eval(cell)])

In [28]:
# Per-company statistics (columns revenue, counts, name), indexed by the
# CSV's first column; the trailing .head() only previews the table.
productions = pd.read_csv(
    'production_companies_short.csv',
    index_col=0,
)
productions.head()

Unnamed: 0,revenue,counts,name
4,17.381079,161,Paramount Pictures
60,17.000037,44,United Artists
8411,16.413752,84,Metro-Goldwyn-Mayer (MGM)
2,18.857807,62,Walt Disney Pictures
3172,18.064469,18,Blumhouse Productions


In [29]:
# Plain-dict lookup from the index of productions to its `revenue` value.
company_revenue = productions['revenue']
prod_rev = dict(company_revenue)

In [30]:
# For each film, collect the prod_rev values of its companies;
# companies that are not in prod_rev are skipped.
train['production_revs'] = train['production_ids'].map(
    lambda ids: [prod_rev[company] for company in ids if company in prod_rev])

In [31]:
# Mean and maximum of each film's company values; films with no known
# company (empty list) get 0 for both.
train['prod_ave'] = train['production_revs'].map(
    lambda revs: np.mean(revs) if revs else 0)
train['prod_top'] = train['production_revs'].map(
    lambda revs: np.max(revs) if revs else 0)

In [32]:
# One 0/1 indicator column per company listed in prod_rev.
for company_id in prod_rev:
    train[f'production_{company_id}'] = train['production_ids'].map(
        lambda ids: int(company_id in ids))

### Fitting

In [35]:
# Single regressor instance that is re-fitted for every feature set below.
# random_state is fixed so repeated runs produce the same submissions:
# scikit-learn permutes the features at each split, so without a seed the
# chosen split can differ between runs when several candidates tie (likely
# with many 0/1 dummy columns).
boost = GradientBoostingRegressor(random_state=0)

In [36]:
# Regression target: natural log of revenue (math.log raises ValueError
# for a non-positive revenue).
train_revenue = train['revenue']
y = train_revenue.map(math.log)

### Test set

In [38]:
# Same language-score lookup as for the training set, with the same
# fallback constant for languages that are missing or not in l_rev.
test_lang_scores = test['original_language'].map(l_rev)
test['l_rev'] = test_lang_scores.fillna(13.61844005781211)

In [39]:
# Collection id of the first listed collection (placeholder 0 when absent),
# then its `log_of_averages` from colls (0 when the id is not found).
test_collections = test['belongs_to_collection'].fillna('[{"id":0}]')
test['coll_id'] = test_collections.map(
    lambda cell: ast.literal_eval(cell)[0]['id'])
test_coll_scores = test['coll_id'].map(colls['log_of_averages'])
test['coll_rev_logav'] = test_coll_scores.fillna(0)

In [40]:
# Parse genre ids ([-1] when absent) and average their gen_rev values.
# A genre id that is missing from gen_rev raises KeyError here.
test_genres_raw = test['genres'].fillna('[{"id":-1}]')
test['genres_id'] = test_genres_raw.map(
    lambda cell: [entry['id'] for entry in ast.literal_eval(cell)])
test['genre_ave'] = test['genres_id'].map(
    lambda ids: np.mean([gen_rev[genre_id] for genre_id in ids]))

In [41]:
# Parse the stringified list of production-company dicts into a list of ids;
# rows with no companies become [-123].
test_companies_raw = test['production_companies'].fillna('[{"id":-123}]')
test['production_ids'] = test_companies_raw.map(
    lambda cell: [entry['id'] for entry in ast.literal_eval(cell)])

In [42]:
# For each film, collect the prod_rev values of its companies;
# companies that are not in prod_rev are skipped.
test['production_revs'] = test['production_ids'].map(
    lambda ids: [prod_rev[company] for company in ids if company in prod_rev])

In [43]:
# Mean and maximum of each film's company values; films with no known
# company (empty list) get 0 for both.
test['prod_ave'] = test['production_revs'].map(
    lambda revs: np.mean(revs) if revs else 0)
test['prod_top'] = test['production_revs'].map(
    lambda revs: np.max(revs) if revs else 0)

In [44]:
# One 0/1 indicator column per language listed in l_rev.
# NOTE: the misspelt 'langauge_' prefix is kept so the column names match
# the ones created on the training set and listed in l_cols.
for lang_code in l_rev:
    test[f'langauge_{lang_code}'] = test['original_language'].map(
        lambda value: int(value == lang_code))

In [56]:
# One 0/1 indicator column per key of gen_rev (including the -1 placeholder).
for genre_id in gen_rev:
    test[f'genre_{genre_id}'] = test['genres_id'].map(
        lambda ids: int(genre_id in ids))

In [57]:
# One 0/1 indicator column per company listed in prod_rev.
for company_id in prod_rev:
    test[f'production_{company_id}'] = test['production_ids'].map(
        lambda ids: int(company_id in ids))

In [46]:
# Names of the indicator columns created for languages, genres and
# production companies, in the iteration order of the respective dicts.
l_cols = [f'langauge_{lang_code}' for lang_code in l_rev]
g_cols = [f'genre_{genre_id}' for genre_id in gen_rev]
p_cols = [f'production_{company_id}' for company_id in prod_rev]

In [47]:
# Submission template indexed by film id; its `revenue` column is
# overwritten with predictions below. The trailing .head() only previews it.
submit = pd.read_csv(
    'sample_submission.csv',
    index_col='id',
)
submit.head()

Unnamed: 0_level_0,revenue
id,Unnamed: 1_level_1
3001,1000000
3002,1000000
3003,1000000
3004,1000000
3005,1000000


In [51]:
# Baseline feature set: the two raw numeric columns plus the aggregated
# language, collection, genre and production-company scores.
base_columns = [
    'popularity',
    'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top',
    'prod_ave',
]

X = train[base_columns]
X_test = test[base_columns]

In [52]:
# Fit on the log-revenue target, predict the test set, undo the log with
# exp, and write the baseline submission file.
boost.fit(X, y)
pred = np.exp(boost.predict(X_test))
submit['revenue'] = pred
submit.to_csv('2402-base.csv')

In [53]:
# Variant: the aggregated 'l_rev' score is replaced by the per-language
# indicator columns; all other baseline features are kept.
columns = (
    ['popularity', 'budget']
    + l_cols
    + ['coll_rev_logav', 'genre_ave', 'prod_top', 'prod_ave']
)

X, X_test = train[columns], test[columns]

# Refit, predict, convert log-revenue back to revenue and save.
boost.fit(X, y)
pred = np.exp(boost.predict(X_test))
submit['revenue'] = pred
submit.to_csv('2402-l_dummy.csv')

In [58]:
# Variant: the aggregated 'genre_ave' score is replaced by the per-genre
# indicator columns; all other baseline features are kept.
columns = (
    ['popularity', 'budget', 'l_rev', 'coll_rev_logav']
    + g_cols
    + ['prod_top', 'prod_ave']
)

X, X_test = train[columns], test[columns]

# Refit, predict, convert log-revenue back to revenue and save.
boost.fit(X, y)
pred = np.exp(boost.predict(X_test))
submit['revenue'] = pred
submit.to_csv('2402-g_dummy.csv')

In [59]:
# Variant: the aggregated 'prod_top'/'prod_ave' scores are replaced by the
# per-company indicator columns; all other baseline features are kept.
columns = (
    ['popularity', 'budget', 'l_rev', 'coll_rev_logav', 'genre_ave']
    + p_cols
)

X, X_test = train[columns], test[columns]

# Refit, predict, convert log-revenue back to revenue and save.
boost.fit(X, y)
pred = np.exp(boost.predict(X_test))
submit['revenue'] = pred
submit.to_csv('2402-p_dummy.csv')