# Kaggle - TMDB

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import math

In [2]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

### Original language

In [4]:
# Hard-coded score per `original_language` code, used as a lookup table below.
# NOTE(review): values look like log-scale revenue statistics per language
# (the target is log revenue) — confirm how they were derived.
l_rev = dict(
    en=16.201695631547217,
    fr=13.718204235553607,
    ru=13.815132182879807,
    es=14.645970166012837,
    hi=15.371121660763546,
    ja=15.818050019285394,
    it=14.610307296701814,
    ko=14.561503498231747,
    cn=15.720496475312752,
    zh=15.246036823468886,
    de=14.583008872938295,
    ta=15.073328869838628,
)

In [5]:
# Map each film's original language to its score from `l_rev`; languages that
# are missing or not in the table receive a fixed fallback value.
# NOTE(review): 13.61844005781211 is presumably the score for "all other
# languages" — confirm its origin and consider naming it as a constant.
train['l_rev'] = (
    train['original_language']
    .map(l_rev)
    .fillna(13.61844005781211)
)

### Collection

In [6]:
# Parse the stringified list in `belongs_to_collection` and keep the id of its
# first entry; rows with no collection are given the placeholder id 0.
train['coll_id'] = (
    train['belongs_to_collection']
    .fillna('[{"id":0}]')
    .map(lambda raw: ast.literal_eval(raw)[0]['id'])
)

In [7]:
# Per-collection lookup table; the first CSV column becomes the index
# (used below as the collection id key).
colls = pd.read_csv(
    'collections.csv',
    index_col=0,
)

In [8]:
# Look up each collection id in the `log_of_averages` column of `colls`;
# ids with no match in the table become 0.
train['coll_rev_logav'] = (
    train['coll_id']
    .map(colls['log_of_averages'])
    .fillna(0)
)

### Genres

In [9]:
# Per-genre lookup table; the first CSV column becomes the index
# (used below as the genre id key).
genres = pd.read_csv(
    'genres.csv',
    index_col=0,
)

In [10]:
# Parse the stringified genre list into a plain list of genre ids;
# rows with no genres get the single placeholder id -1.
train['genres_id'] = (
    train['genres']
    .fillna('[{"id":-1}]')
    .map(lambda raw: [entry['id'] for entry in ast.literal_eval(raw)])
)

In [11]:
# Genre id -> value of the `log_revenue` column, as a plain dict for fast lookup.
gen_rev = dict(genres['log_revenue'])

In [12]:
# Mean natural-log revenue over the training rows whose `genres` is missing;
# used as the score for the placeholder genre id -1.
null_mean = train.loc[train['genres'].isnull(), 'revenue'].map(math.log).mean()

In [13]:
# Register the "no genres" placeholder id (-1) in the genre lookup.
gen_rev.update({-1: null_mean})

In [14]:
# Average the looked-up `gen_rev` score over all of a film's genre ids.
train['genre_ave'] = train['genres_id'].map(
    lambda ids: np.mean([gen_rev[gid] for gid in ids])
)

### Production companies

In [22]:
# Parse the stringified production-company list into a list of company ids;
# rows with no companies get the single placeholder id -123.
train['production_ids'] = (
    train['production_companies']
    .fillna('[{"id":-123}]')
    .map(lambda raw: [company['id'] for company in ast.literal_eval(raw)])
)

In [23]:
# Lookup table for a subset of production companies, indexed by the first CSV
# column (company id); preview the first rows.
productions = pd.read_csv(
    'production_companies_short.csv',
    index_col=0,
)
productions.head()

Unnamed: 0,revenue,counts,name
4,17.381079,161,Paramount Pictures
60,17.000037,44,United Artists
8411,16.413752,84,Metro-Goldwyn-Mayer (MGM)
2,18.857807,62,Walt Disney Pictures
3172,18.064469,18,Blumhouse Productions


In [24]:
# Production-company id -> value of the `revenue` column, as a plain dict.
# NOTE(review): the previewed values (~16-19) suggest this is a log-scale
# revenue figure — confirm against how the CSV was built.
prod_rev = dict(productions['revenue'])

In [47]:
# For each film, collect the scores of those production companies that appear
# in `prod_rev`; unknown companies (and the -123 placeholder, unless it is in
# the table) are dropped.
train['production_revs'] = train['production_ids'].map(
    lambda ids: [prod_rev[pid] for pid in ids if pid in prod_rev]
)

In [48]:
# Summarise each film's known-company scores by their mean and their maximum;
# films with no known company get 0 for both.
train['prod_ave'] = train['production_revs'].map(
    lambda revs: np.mean(revs) if revs else 0
)
train['prod_top'] = train['production_revs'].map(
    lambda revs: np.max(revs) if revs else 0
)

### Fitting

In [50]:
# Gradient-boosted regressor with default hyper-parameters.
# A fixed `random_state` makes the cross-validation scores and the final
# submission reproducible across kernel restarts (the estimator uses it for
# its per-tree seeds and for the feature permutation at each split).
boost = GradientBoostingRegressor(random_state=0)

In [51]:
# Regression target: natural log of revenue.
# `math.log` raises ValueError if any revenue is zero or negative.
y = train['revenue'].apply(math.log)

In [52]:
# Baseline feature matrix: raw numeric columns plus the language, collection
# and genre scores (no production-company features yet).
X = train.loc[:, [
    'popularity',
    'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
]]

In [53]:
# 10-fold cross-validated mean squared error on the log-revenue target
# (the scorer returns negated MSE, hence the leading minus).
-np.mean(cross_val_score(boost, X, y, scoring='neg_mean_squared_error', cv=10))

4.533537636365974

In [54]:
# Baseline features plus the mean production-company score.
X = train.loc[:, [
    'popularity',
    'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_ave',
]]
# 10-fold cross-validated MSE on log revenue (scorer is negated MSE).
-np.mean(cross_val_score(boost, X, y, scoring='neg_mean_squared_error', cv=10))

4.286517300175151

In [55]:
# Baseline features plus the maximum production-company score.
X = train.loc[:, [
    'popularity',
    'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top',
]]
# 10-fold cross-validated MSE on log revenue (scorer is negated MSE).
-np.mean(cross_val_score(boost, X, y, scoring='neg_mean_squared_error', cv=10))

4.276060611653323

In [56]:
# Baseline features plus both production-company summaries (max and mean).
# This is the feature set the final models are fitted on further down.
X = train.loc[:, [
    'popularity',
    'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top',
    'prod_ave',
]]
# 10-fold cross-validated MSE on log revenue (scorer is negated MSE).
-np.mean(cross_val_score(boost, X, y, scoring='neg_mean_squared_error', cv=10))

4.245023378966365

In [57]:
from sklearn.linear_model import LinearRegression

In [58]:
# Ordinary least-squares model with default settings; fitted further down to
# produce a second, linear submission from the same features.
reg = LinearRegression()

### Test set

In [59]:
# Same language feature as for the training set: score from `l_rev`, with the
# same fixed fallback for missing or unlisted languages.
# NOTE(review): 13.61844005781211 is presumably the score for "all other
# languages" — confirm its origin.
test['l_rev'] = (
    test['original_language']
    .map(l_rev)
    .fillna(13.61844005781211)
)

In [60]:
# Same collection features as for the training set: id of the first collection
# entry (placeholder 0 when missing), then its `log_of_averages` score from
# `colls`, with 0 for ids not in the table.
test['coll_id'] = (
    test['belongs_to_collection']
    .fillna('[{"id":0}]')
    .map(lambda raw: ast.literal_eval(raw)[0]['id'])
)
test['coll_rev_logav'] = (
    test['coll_id']
    .map(colls['log_of_averages'])
    .fillna(0)
)

In [61]:
# Parse the stringified genre list into a list of genre ids; rows with no
# genres get the single placeholder id -1.
test['genres_id'] = (
    test['genres']
    .fillna('[{"id":-1}]')
    .map(lambda raw: [entry['id'] for entry in ast.literal_eval(raw)])
)
# Average the `gen_rev` score over a film's genres. Unlike the training set,
# the test set may contain genre ids that are absent from the lookup, so
# unknown ids are skipped (mirroring how unknown production companies are
# handled) instead of raising KeyError. If nothing usable remains — an empty
# list or only unknown ids — fall back to the "no genres" score gen_rev[-1]
# rather than producing NaN.
test['genre_ave'] = (
    test['genres_id']
    .map(lambda ids: [gen_rev[gid] for gid in ids if gid in gen_rev])
    .map(lambda scores: np.mean(scores) if scores else gen_rev[-1])
)

In [62]:
# Parse the stringified production-company list into a list of company ids;
# rows with no companies get the single placeholder id -123.
test['production_ids'] = (
    test['production_companies']
    .fillna('[{"id":-123}]')
    .map(lambda raw: [company['id'] for company in ast.literal_eval(raw)])
)

In [63]:
# Keep the scores of the production companies that appear in `prod_rev`;
# companies not in the table are dropped.
test['production_revs'] = test['production_ids'].map(
    lambda ids: [prod_rev[pid] for pid in ids if pid in prod_rev]
)

In [64]:
# Mean and maximum of each film's known-company scores; 0 when none is known.
test['prod_ave'] = test['production_revs'].map(
    lambda revs: np.mean(revs) if revs else 0
)
test['prod_top'] = test['production_revs'].map(
    lambda revs: np.max(revs) if revs else 0
)

In [65]:
# Test feature matrix; same columns, in the same order, as the final training
# feature set.
X_test = test.loc[:, [
    'popularity',
    'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top',
    'prod_ave',
]]

In [66]:
# Submission template, indexed by film id; its `revenue` column is overwritten
# with model predictions below. Preview the first rows.
submit = pd.read_csv(
    'sample_submission.csv',
    index_col='id',
)
submit.head()

Unnamed: 0_level_0,revenue
id,Unnamed: 1_level_1
3001,1000000
3002,1000000
3003,1000000
3004,1000000
3005,1000000


In [67]:
# Fit the boosted model on the training columns that match `X_test` exactly.
# Selecting by `X_test.columns` instead of reusing the global `X` keeps the
# train/test feature sets aligned even if one of the earlier experiment cells
# (which reassign `X`) was the last one run.
boost.fit(train[X_test.columns], y)
# Predictions are on the log-revenue scale.
pred = boost.predict(X_test)

In [68]:
# Convert the boosted model's log-revenue predictions back to revenue.
# Recomputed from the fitted model rather than from `pred` itself, so
# re-running this cell cannot exponentiate the values a second time.
pred = np.exp(boost.predict(X_test))

In [69]:
# Write the boosted-model predictions into the submission template and save.
# NOTE(review): `pred` is assigned positionally — this assumes the rows of
# `test` are in the same order as the ids in sample_submission.csv; confirm.
submit['revenue'] = pred
submit.to_csv('2302_prod_short.csv')

In [70]:
# Second submission: ordinary least squares on the same features.
# Fit on the training columns that match `X_test` exactly, rather than on the
# global `X`, so the result does not depend on which experiment cell last
# reassigned `X`.
reg.fit(train[X_test.columns], y)
# The model predicts log revenue; exponentiate to get back to revenue.
pred = np.exp(reg.predict(X_test))
# NOTE(review): assigned positionally — assumes `test` rows are in the same
# order as the ids in sample_submission.csv; confirm.
submit['revenue'] = pred
submit.to_csv('2302_prod_short_lin.csv')