# Kaggle - TMDB

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import math

In [2]:
# Load the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [3]:
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

### Original language

In [4]:
# Hard-coded lookup from `original_language` code to a numeric score, used
# below to build the `l_rev` feature.
# NOTE(review): the values look like precomputed per-language mean
# log-revenues — confirm where they were derived.
l_rev = {'en': 16.201695631547217,
 'fr': 13.718204235553607,
 'ru': 13.815132182879807,
 'es': 14.645970166012837,
 'hi': 15.371121660763546,
 'ja': 15.818050019285394,
 'it': 14.610307296701814,
 'ko': 14.561503498231747,
 'cn': 15.720496475312752,
 'zh': 15.246036823468886,
 'de': 14.583008872938295,
 'ta': 15.073328869838628}

In [5]:
# Language feature: look each film's language up in `l_rev`; languages that
# are not in the table receive the fixed fallback value.
train['l_rev'] = (
    train['original_language']
    .map(l_rev)
    .fillna(13.61844005781211)
)

### Collection

In [6]:
# Collection id: the column stores a Python-literal list of dicts; take the
# `id` of its first entry.  Rows with no collection get the placeholder id 0.
train['coll_id'] = (
    train['belongs_to_collection']
    .fillna('[{"id":0}]')
    .map(ast.literal_eval)
    .map(lambda entries: entries[0]['id'])
)

In [7]:
# Per-collection lookup table, indexed by the file's first column.
# NOTE(review): presumably the index holds collection ids (it is matched
# against `coll_id` below) — confirm against how collections.csv was built.
colls = pd.read_csv('collections.csv',index_col=0)

In [8]:
# Collection feature: map each collection id to its `log_of_averages` entry;
# ids missing from the table (including the placeholder 0, if absent) become 0.
collection_scores = colls['log_of_averages']
train['coll_rev_logav'] = train['coll_id'].map(collection_scores).fillna(0)

### genres

In [9]:
# Per-genre lookup table, indexed by the file's first column; its
# `log_revenue` column is turned into a dict further down.
genres = pd.read_csv('genres.csv',index_col=0)

In [10]:
# Genre ids: parse the Python-literal list of dicts and keep every `id`.
# Rows without genres are given the single placeholder id -1.
train['genres_id'] = (
    train['genres']
    .fillna('[{"id":-1}]')
    .map(ast.literal_eval)
    .map(lambda entries: [entry['id'] for entry in entries])
)

In [11]:
# Plain dict view of the genre table: index label -> `log_revenue` value.
gen_rev = dict(genres['log_revenue'])

In [12]:
# Mean natural-log revenue over the training rows whose `genres` is missing.
null_mean = (
    train.loc[train['genres'].isnull(), 'revenue']
    .map(math.log)
    .mean()
)

In [13]:
# -1 is the placeholder genre id assigned to rows with missing genres, so
# register its score (the mean log revenue of those rows) in the lookup.
gen_rev[-1] = null_mean

In [14]:
# Genre feature: average of the `gen_rev` scores of all genres of the film.
train['genre_ave'] = train['genres_id'].map(
    lambda genre_ids: np.mean([gen_rev[genre] for genre in genre_ids])
)

### production company

In [15]:
# Production-company ids: parse the Python-literal list of dicts and keep
# every `id`.  Rows without companies get the single placeholder id -123.
train['production_ids'] = (
    train['production_companies']
    .fillna('[{"id":-123}]')
    .map(ast.literal_eval)
    .map(lambda companies: [company['id'] for company in companies])
)

In [16]:
# Per-production-company lookup table, indexed by the file's first column;
# the preview shows `revenue`, `counts` and `name` columns.
productions = pd.read_csv('production_companies_short.csv',index_col=0)
# Display the first rows as a sanity check.
productions.head()

Unnamed: 0,revenue,counts,name
4,17.381079,161,Paramount Pictures
60,17.000037,44,United Artists
8411,16.413752,84,Metro-Goldwyn-Mayer (MGM)
2,18.857807,62,Walt Disney Pictures
3172,18.064469,18,Blumhouse Productions


In [17]:
# Plain dict view of the company table: index label -> `revenue` value.
prod_rev = dict(productions['revenue'])

In [18]:
# Per film, the `prod_rev` scores of its production companies; companies
# that are not in the lookup (including the -123 placeholder) are dropped.
train['production_revs'] = train['production_ids'].map(
    lambda company_ids: [
        prod_rev[company] for company in company_ids if company in prod_rev
    ]
)

In [19]:
# Company features: mean and maximum of the per-company scores, or 0 when
# the film has no company with a known score.
train['prod_ave'] = train['production_revs'].map(
    lambda scores: np.mean(scores) if scores else 0
)
train['prod_top'] = train['production_revs'].map(
    lambda scores: np.max(scores) if scores else 0
)

### release_date

In [20]:
# `release_date` is split on '/': field 0 is used as the month and field 2
# as a two-digit year.
train['month'] = train['release_date'].map(lambda date: int(date.split('/')[0]))
train['year'] = train['release_date'].map(lambda date: int(date.split('/')[2]))
# Expand the two-digit year: values above 17 are placed in the 1900s,
# everything else in the 2000s.
train['year'] = train['year'].map(
    lambda yy: (1900 if yy > 17 else 2000) + yy
)

In [21]:
# One 0/1 indicator column per calendar month (month_1 ... month_12);
# `months` keeps the generated column names in order.
months = [f'month_{i}' for i in range(1, 13)]
for i, name in enumerate(months, start=1):
    train[name] = (train['month'] == i).astype('int64')

### cast

In [25]:
# Cast ids: parse the Python-literal list of dicts and keep every `id`.
# Rows without cast information get the single placeholder id -234.
train['cast_expand'] = (
    train['cast']
    .fillna('[{"id":-234}]')
    .map(ast.literal_eval)
    .map(lambda members: [member['id'] for member in members])
)

In [26]:
# Display the first rows of the parsed cast-id lists as a sanity check.
train['cast_expand'].head()

0    [52997, 64342, 54729, 36801, 54812, 94098, 115...
1    [1813, 5823, 1210, 655, 33656, 62064, 68287, 1...
2    [996701, 18999, 129104, 970216, 223012, 159366...
3    [35068, 85047, 1021524, 1093644, 86033, 92686,...
4                       [84751, 64453, 84752, 1130534]
Name: cast_expand, dtype: object

In [42]:
# Per-actor lookup table, indexed by the file's first column; the preview
# shows `name`, `counts` and `revenue` columns.
cast = pd.read_csv('actors.csv',index_col=0)
# Display the first rows as a sanity check.
cast.head()

Unnamed: 0,name,counts,revenue
52997,Rob Corddry,5,17.301041
64342,Craig Robinson,7,16.783682
54729,Clark Duke,6,17.483381
36801,Adam Scott,13,16.460071
54812,Chevy Chase,8,17.201902


In [45]:
# Per film, the `revenue` entries of its cast members whose `counts` value
# in actors.csv is at least 5.
#
# The actor table is filtered once into a plain dict, which
#   * replaces two `cast.loc[...]` row lookups per actor with one hash lookup;
#   * skips cast ids that are absent from actors.csv instead of raising
#     KeyError, matching the guard the test-set version of this feature uses.
cast_rev_min5 = cast.loc[cast['counts'] >= 5, 'revenue'].to_dict()
train['cast_rev_5'] = train['cast_expand'].map(
    lambda actor_ids: [
        cast_rev_min5[actor] for actor in actor_ids if actor in cast_rev_min5
    ]
)

In [54]:
# Cast feature: highest score among the qualifying cast members, 0 if none.
train['top_cast'] = train['cast_rev_5'].map(
    lambda scores: max(scores, default=0)
)

In [58]:
# Cast feature: arithmetic mean of the qualifying cast members' scores,
# 0 if there are none.
train['ave_cast'] = train['cast_rev_5'].map(
    lambda scores: sum(scores) / len(scores) if scores else 0
)

### Fitting

In [59]:
# Gradient-boosting regressor with scikit-learn's default hyper-parameters;
# the same estimator object is reused for every experiment below.
boost = GradientBoostingRegressor()

In [60]:
# Regression target: natural logarithm of the revenue.
y = train['revenue'].apply(math.log)

In [65]:
# Reference experiment (the best score before the cast features were added).
columns = [
    'popularity', 'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top', 'prod_ave',
    'year',
]

X = train.loc[:, columns]
# 10-fold cross-validated mean squared error on the log-revenue target
# (sign flipped because scikit-learn reports it negated).
-np.mean(cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error'))

4.0979703565013255

In [66]:
# Experiment: reference features plus `top_cast`.
columns = [
    'popularity', 'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top', 'prod_ave',
    'year',
    'top_cast',
]

X = train.loc[:, columns]
# 10-fold cross-validated mean squared error on the log-revenue target.
-np.mean(cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error'))

3.7743360090147675

In [67]:
# Experiment: reference features plus `ave_cast`.
columns = [
    'popularity', 'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top', 'prod_ave',
    'year',
    'ave_cast',
]

X = train.loc[:, columns]
# 10-fold cross-validated mean squared error on the log-revenue target.
-np.mean(cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error'))

3.698995565964494

In [68]:
# Experiment: reference features plus both `ave_cast` and `top_cast`.
columns = [
    'popularity', 'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top', 'prod_ave',
    'year',
    'ave_cast', 'top_cast',
]

X = train.loc[:, columns]
# 10-fold cross-validated mean squared error on the log-revenue target.
-np.mean(cross_val_score(boost, X, y, cv=10, scoring='neg_mean_squared_error'))

3.633830498753995

### Test set

In [69]:
# Language feature for the test set: same lookup and same fallback value as
# used for the training set.
test['l_rev'] = (
    test['original_language']
    .map(l_rev)
    .fillna(13.61844005781211)
)

In [70]:
# Collection id for the test set: `id` of the first entry of the parsed
# list; rows without a collection get the placeholder id 0.
test['coll_id'] = (
    test['belongs_to_collection']
    .fillna('[{"id":0}]')
    .map(ast.literal_eval)
    .map(lambda entries: entries[0]['id'])
)
# Collection feature: `log_of_averages` entry of that id, 0 when unknown.
test['coll_rev_logav'] = (
    test['coll_id']
    .map(colls['log_of_averages'])
    .fillna(0)
)

In [71]:
# Genre ids for the test set; rows without genres get the placeholder id -1.
test['genres_id'] = (
    test['genres']
    .fillna('[{"id":-1}]')
    .map(ast.literal_eval)
    .map(lambda entries: [entry['id'] for entry in entries])
)
# Genre feature: average `gen_rev` score over the film's genres.
test['genre_ave'] = test['genres_id'].map(
    lambda genre_ids: np.mean([gen_rev[genre] for genre in genre_ids])
)

In [72]:
# Production-company ids for the test set; rows without companies get the
# single placeholder id -123.
test['production_ids'] = (
    test['production_companies']
    .fillna('[{"id":-123}]')
    .map(ast.literal_eval)
    .map(lambda companies: [company['id'] for company in companies])
)

In [73]:
# Per film, the `prod_rev` scores of its production companies; companies
# that are not in the lookup are dropped.
test['production_revs'] = test['production_ids'].map(
    lambda company_ids: [
        prod_rev[company] for company in company_ids if company in prod_rev
    ]
)

In [74]:
# Company features for the test set: mean and maximum of the per-company
# scores, or 0 when no company of the film has a known score.
test['prod_ave'] = test['production_revs'].map(
    lambda scores: np.mean(scores) if scores else 0
)
test['prod_top'] = test['production_revs'].map(
    lambda scores: np.max(scores) if scores else 0
)

In [75]:
# The release date of the row labelled 828 is missing in the test data; it is
# filled in by hand with a date looked up on Wikipedia.  Without this patch
# the string split in the next cell would fail on the missing value.
test.loc[828,'release_date'] = '05/01/00'

In [76]:
# `release_date` is split on '/': field 0 is used as the month and field 2
# as a two-digit year.
test['month'] = test['release_date'].map(lambda date: int(date.split('/')[0]))
test['year'] = test['release_date'].map(lambda date: int(date.split('/')[2]))
# Expand the two-digit year: values above 17 are placed in the 1900s,
# everything else in the 2000s.
# NOTE(review): if the test set contains releases after 2017, this cutoff
# would send them to 1918+ — confirm against the data.
test['year'] = test['year'].map(
    lambda yy: (1900 if yy > 17 else 2000) + yy
)
# One 0/1 indicator column per calendar month (month_1 ... month_12).
for i in range(1, 13):
    name = f'month_{i}'
    test[name] = (test['month'] == i).astype('int64')

In [78]:
# Cast ids for the test set; rows without cast information get the single
# placeholder id -234.
test['cast_expand'] = (
    test['cast']
    .fillna('[{"id":-234}]')
    .map(ast.literal_eval)
    .map(lambda members: [member['id'] for member in members])
)

In [88]:
# Per film, the `revenue` entries of its cast members whose `counts` value
# in actors.csv is at least 5.  Actors that are not present in actors.csv
# (i.e. that did not appear in any training-set movie) are skipped.
#
# The actor table is filtered once into a plain dict so that each actor costs
# a single hash lookup, instead of an index membership test plus two
# `cast.loc[...]` row lookups per actor per film.
cast_rev_min5 = cast.loc[cast['counts'] >= 5, 'revenue'].to_dict()
test['cast_rev_5'] = test['cast_expand'].map(
    lambda actor_ids: [
        cast_rev_min5[actor] for actor in actor_ids if actor in cast_rev_min5
    ]
)

In [89]:
# Cast features for the test set: highest and mean score among the
# qualifying cast members, 0 when there are none.
test['top_cast'] = test['cast_rev_5'].map(
    lambda scores: max(scores, default=0)
)
test['ave_cast'] = test['cast_rev_5'].map(
    lambda scores: sum(scores) / len(scores) if scores else 0
)

In [90]:
# Submission template indexed by `id`; its `revenue` column is overwritten
# with model predictions below.
submit = pd.read_csv('sample_submission.csv',index_col='id')
# Display the first rows as a sanity check.
submit.head()

Unnamed: 0_level_0,revenue
id,Unnamed: 1_level_1
3001,1000000
3002,1000000
3003,1000000
3004,1000000
3005,1000000


In [91]:
# Submission 1: reference features plus `ave_cast`.
columns = [
    'popularity', 'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top', 'prod_ave',
    'year',
    'ave_cast',
]

X = train[columns]
X_test = test[columns]

# Fit on the full training set; the model predicts log revenue, so the
# predictions are exponentiated before being written out.
boost.fit(X, y)
pred = np.exp(boost.predict(X_test))
# Assigned positionally: assumes `test` rows are in the same order as the
# submission template — TODO confirm.
submit['revenue'] = pred
submit.to_csv('2802-cast_ave.csv')

In [92]:
# Submission 2: reference features plus both `ave_cast` and `top_cast`.
columns = [
    'popularity', 'budget',
    'l_rev',
    'coll_rev_logav',
    'genre_ave',
    'prod_top', 'prod_ave',
    'year',
    'ave_cast', 'top_cast',
]

X = train[columns]
X_test = test[columns]

# Fit on the full training set; the model predicts log revenue, so the
# predictions are exponentiated before being written out.
boost.fit(X, y)
pred = np.exp(boost.predict(X_test))
# Assigned positionally: assumes `test` rows are in the same order as the
# submission template — TODO confirm.
submit['revenue'] = pred
submit.to_csv('2802-cast_both.csv')