# Kaggle - TMDB

**Here I will try to use the genres table and test different options on adding this feature and the resulting scores**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score

In [4]:
train['coll_id'] = train['belongs_to_collection'].fillna('[{"id":0}]').map(lambda x: (ast.literal_eval(x))[0]['id'])

In [5]:
colls = pd.read_csv('collections.csv',index_col=0)
colls.head()

Unnamed: 0,average_revenue,stdev
10,749699164,141846802
84,531269279,232145436
119,898827882,38833622
151,96733333,32511741
230,190916236,76579911


In [6]:
train['coll_rev'] = train['coll_id'].map(colls['average_revenue']).fillna(0)

In [7]:
reg = LinearRegression()
ml = MLPRegressor()
boost = GradientBoostingRegressor()

In [10]:
# previously established that coll_rev is a good feature
X = train[['budget','popularity','coll_rev']].values
y = train['revenue'].values

In [14]:
cross_val_score(reg,X,y,cv=10).mean()

0.7616451754160671

In [15]:
cross_val_score(ml,X,y,cv=10).mean()

0.7642585636098407

In [16]:
cross_val_score(boost,X,y,cv=10).mean()

0.7511230807236592

### Add genres

In [17]:
genres = pd.read_csv('genres.csv',index_col=0)
genres.head()

Unnamed: 0,name,revenue
10752,War,49915870.0
10402,Music,42870900.0
35,Comedy,60875740.0
99,Documentary,4638009.0
37,Western,51370640.0


**As dummy variables**

In [21]:
genres.index

Int64Index([10752, 10402,    35,    99,    37,    36,    12,   878,    14,
               16,    80,    18,  9648, 10769,    53, 10770,    27,    28,
            10749, 10751],
           dtype='int64')

In [55]:
train['genres_id'] = train['genres'].fillna('[{"id":-1}]').map(lambda x: [i['id'] for i in (ast.literal_eval(x))])

In [59]:
# test with a few
g = 28
name = f'genre_{g}'
train['genres_id'].head(10).map(lambda x: 1 if g in x else 0)

0    0
1    0
2    0
3    0
4    1
5    0
6    0
7    0
8    1
9    0
Name: genres_id, dtype: int64

In [60]:
# build dummy variables
for g in genres.index:
    name = f'genre_{g}'
    train[name] = train['genres_id'].map(lambda x: 1 if g in x else 0)

In [64]:
train['genres_id'].head()

0                      [35]
1    [35, 18, 10751, 10749]
2                      [18]
3                  [53, 18]
4                  [28, 53]
Name: genres_id, dtype: object

In [65]:
train.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,genre_80,genre_18,genre_9648,genre_10769,genre_53,genre_10770,genre_27,genre_28,genre_10749,genre_10751
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,0,0,0,0,0,0,0,0,0,0
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,0,1,0,0,0,0,0,0,1,1
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,0,1,0,0,0,0,0,0,0,0
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,0,1,0,0,1,0,0,0,0,0
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,0,0,0,0,1,0,0,1,0,0


Looks correct

In [68]:
cols_to_consider = ['budget','popularity','coll_rev']+[f'genre_{g}' for g in genres.index]

In [69]:
X_genre = train[cols_to_consider].values

In [71]:
cross_val_score(reg,X_genre,y,cv=10).mean(),cross_val_score(ml,X_genre,y,cv=10).mean(),cross_val_score(boost,X_genre,y,cv=10).mean()

(0.7630480887675855, 0.7637846327546703, 0.7659376683172325)

Marginal improvement

**As mean of revenues**

In [75]:
train['genres_id'].head()

0                      [35]
1    [35, 18, 10751, 10749]
2                      [18]
3                  [53, 18]
4                  [28, 53]
Name: genres_id, dtype: object

In [80]:
gen_rev = dict(genres['revenue'])
gen_rev[35]

60875742.9192607

In [110]:
# there was an issue with some movies that don't have any genre

In [115]:
null_mean = train[train['genres'].isnull()]['revenue'].mean()

In [116]:
gen_rev[-1] = null_mean

In [119]:
train['genre_ave'] = train['genres_id'].map(lambda x: np.array([gen_rev[g] for g in x]).mean())
train['genre_top'] = train['genres_id'].map(lambda x: np.array([gen_rev[g] for g in x]).max())

In [120]:
# try either

In [121]:
X_ave  = train[['budget','popularity','coll_rev','genre_ave']].values
X_top  = train[['budget','popularity','coll_rev','genre_top']].values
X_both = train[['budget','popularity','coll_rev','genre_ave','genre_top']].values

In [123]:
def consider_all(X):
    a = cross_val_score(reg,X,y,cv=10).mean()
    b = cross_val_score(ml,X,y,cv=10).mean()
    c = cross_val_score(boost,X,y,cv=10).mean()
    return (a,b,c)

In [124]:
consider_all(X_ave)

(0.7614775397444806, 0.7646639556513876, 0.7472757469152138)

In [125]:
consider_all(X_top)

(0.7616044493382613, 0.7650185242741468, 0.7449064007372368)

In [126]:
consider_all(X_both)



(0.7612417160096043, 0.763206969777159, 0.7499987563095789)

It appears that dummy variables work best in this case<br>
Although the score is not much better, I will leave it in for now

### Test set

In [128]:
# collection
test['coll_id'] = test['belongs_to_collection'].fillna('[{"id":0}]').map(
    lambda x: (ast.literal_eval(x))[0]['id'])
test['coll_rev'] = test['coll_id'].map(colls['average_revenue']).fillna(0)
# genres
test['genres_id'] = test['genres'].fillna('[{"id":-1}]').map(
    lambda x: [i['id'] for i in (ast.literal_eval(x))])
for g in genres.index:
    name = f'genre_{g}'
    test[name] = test['genres_id'].map(lambda x: 1 if g in x else 0)

In [129]:
X_test = test[cols_to_consider].values

In [130]:
reg.fit(X_genre,y)
ml.fit(X_genre,y)
boost.fit(X_genre,y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [131]:
pred_1 = reg.predict(X_test)
pred_2 = ml.predict(X_test)
pred_3 = boost.predict(X_test)

In [133]:
submit = pd.read_csv('sample_submission.csv',index_col='id')
submit.head()

Unnamed: 0_level_0,revenue
id,Unnamed: 1_level_1
3001,1000000
3002,1000000
3003,1000000
3004,1000000
3005,1000000


In [134]:
submit['revenue'] = pred_1
submit.to_csv('reg_dm_gen.csv')

submit['revenue'] = pred_2
submit.to_csv('ml_dm_gen.csv')

submit['revenue'] = pred_3
submit.to_csv('boost_dm_gen.csv')