In [350]:
import numpy as np 
import pandas as pd
import json
import ast
from collections import Counter, OrderedDict
import time
import datetime
import random
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import eli5

import os
print(os.listdir("../input"))

['moviestmdb-datapreparation', 'tmdb-box-office-prediction']


## Import data, append train and test sets for feature engineering and transform some strings into dictionaries

In [351]:
df_train = pd.read_csv('../input/tmdb-box-office-prediction/train.csv')
df_test = pd.read_csv('../input/tmdb-box-office-prediction/test.csv')

In [352]:
features=df_train.drop(['revenue'],axis=1).append(df_test).reset_index()

In [353]:
# from this kernel: https://www.kaggle.com/gravix/gradient-in-a-box
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

In [354]:
def text_to_dict(df, columns_to_parse):
    for column in columns_to_parse:
        df[column] = df[column].apply(lambda x: {} if pd.isna(x) else ast.literal_eval(x) )
    return df

In [355]:
f_clean = text_to_dict(features,dict_columns)

 - we will also add date variables, since they will be useful right next after this

In [356]:
def fix_date(x):
    """
    Fixes dates which are in 20xx
    """
    year = x.split('/')[2]
    if int(year) <= 19:
        return x[:-2] + '20' + year
    else:
        return x[:-2] + '19' + year

In [357]:
f_clean.loc[f_clean['release_date'].isnull() == True, 'release_date'] = '01/01/98' 

In [358]:
f_clean['release_date'] = f_clean['release_date'].apply(lambda x: fix_date(x))
f_clean['release_date'] = pd.to_datetime(f_clean['release_date'])

In [359]:
f_clean['year']=pd.DatetimeIndex(f_clean['release_date']).year
f_clean['month']=pd.DatetimeIndex(f_clean['release_date']).month
f_clean['yr_mth']=f_clean['year']*100+f_clean['month']

In [360]:
min_date_months = f_clean["year"].min()*12 + f_clean["month"].min()

def change_time_to_num(year_month, min_date):
    date_to_months = year_month.apply(lambda x: int(str(x)[:4]) * 12 + int(str(x)[-2:]))
    return date_to_months.apply(lambda x: x - min_date)

f_clean['timediff'] = change_time_to_num(f_clean['yr_mth'], min_date_months)

# We will try to learn embeddings from movie-keyword pairs to generate extra features

(closely following this method: https://github.com/WillKoehrsen/wikipedia-data-science/blob/master/notebooks/Book%20Recommendation%20System.ipynb)****

### Add movie and keyword index dictionaries

 - before creating movie indices, we should transform the `original_title`, since there are some duplicates that may mix up our results

In [361]:
f_clean['original_title'][f_clean.duplicated('original_title')].shape

(133,)

In [362]:
f_clean['edited_title'] = f_clean['original_title'].copy()
f_clean['edited_title'][f_clean.duplicated('original_title')] = f_clean['edited_title'].map(str) + ' (' + f_clean['year'].map(str) + ')'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [363]:
f_clean['edited_title'][f_clean.duplicated('edited_title')].shape

(0,)

 - now that these 133 duplicates were taken care of, we can use `edited_title` to produce the movie indices

In [364]:
movie_index={v: k for k, v in f_clean['edited_title'].to_dict().items()}
index_movie=f_clean['edited_title'].to_dict()

 - let us now produce the indices for keywords

In [365]:
f_clean['list_keywords']=f_clean['Keywords'].apply(lambda x: [i['name'] for i in x] if x != {} else []).values

In [366]:
len(set([i for j in f_clean['list_keywords'] for i in j])) # check number of unique keywords

11930

In [367]:
count_keywords=Counter([i for j in f_clean['list_keywords'] for i in j]).most_common()

In [368]:
count_keywords[0:10] # check most common keywords

[('woman director', 457),
 ('independent film', 384),
 ('duringcreditsstinger', 350),
 ('based on novel', 312),
 ('murder', 305),
 ('violence', 245),
 ('love', 190),
 ('revenge', 188),
 ('sex', 186),
 ('aftercreditsstinger', 183)]

In [369]:
for i in range(1,5):
    print(f'There are {len([t[0] for t in count_keywords if t[1] == i])} keywords that appear in {i} movies')

print(f'There are {len([t[0] for t in count_keywords if t[1] > 4])} keywords that appear in 5 or more movies')
print(f'In total, {len([t[0] for t in count_keywords if t[1] > 1])} keywords appear more than once')

There are 5963 keywords that appear in 1 movies
There are 1978 keywords that appear in 2 movies
There are 985 keywords that appear in 3 movies
There are 623 keywords that appear in 4 movies
There are 2381 keywords that appear in 5 or more movies
In total, 5967 keywords appear more than once


 - we will disregard keywords that appear only once

In [370]:
keywords = [t[0] for t in count_keywords if t[1] > 1]

 - we will investigate which movies have greater and lower keyword count

In [371]:
kcount=pd.concat([f_clean['edited_title'],
                  f_clean['list_keywords'].apply(lambda x: [i for i in x if i in keywords]),
                  f_clean['list_keywords'].apply(lambda x: len([i for i in x if i in keywords]))],
                 axis=1)
kcount.columns=['movie','keywords','kcount']

In [372]:
kcount.sort_values(by='kcount',ascending=False)[0:10]

Unnamed: 0,movie,keywords,kcount
1793,Werckmeister harmóniák,"[dancing, male nudity, circus, moon, bathroom,...",126
2955,Brooklyn's Finest,"[male nudity, female nudity, tattoo, gambling,...",78
6804,Southland Tales,"[suicide, brother brother relationship, spy, p...",51
2448,15 Minutes,"[new york, female nudity, prison, prostitute, ...",42
3832,Fair Game (1995),"[bomb, miami, sex, detective, handcuffs, based...",41
1463,Hard Candy,"[suicide, rape, age difference, photographer, ...",38
5139,Straight Outta Compton,"[brother brother relationship, aids, police br...",38
5344,Ein Lied von Liebe und Tod – Gloomy Sunday,"[suicide, male nudity, female nudity, poison, ...",37
4075,Lost Highway,"[schizophrenia, prison, pornography, sadistic,...",37
153,The Boy Next Door,"[male nudity, female nudity, sex, adultery, in...",36


In [373]:
kcount['movie'].loc[kcount['kcount']==0].count() # check how many movies have zero keywords

724

- let us create keyword indices

In [374]:
kword_index = {kword: idx for idx, kword in enumerate(keywords)}
index_kword = {idx: kword for kword, idx in kword_index.items()}

### Let us now create the embedding learning task

In [375]:
pairs = []

for movie in movie_index.values():
    pairs.extend((movie,kword_index[kword]) for kword in kcount['keywords'][kcount.index==movie].iloc[0]) 

 - Setting up a Random training example generator (as in the wikipedia book example)
 - Since the neural network will be trained one batch at a time, the Random training example generator is made in a way that it yields batches of samples each time it is called (which will happen during training of the network)

In [376]:
random.seed(100)

pairs_set = set(pairs)

def generate_batch(pairs, n_positive = 50, negative_ratio = 1.0, classification = False):
    """Generate batches of samples for training"""
    batch_size = n_positive * (1 + negative_ratio)
    batch = np.zeros((batch_size, 3))
    
    # Adjust label based on task
    if classification:
        neg_label = 0
    else:
        neg_label = -1
    
    # This creates a generator
    while True:
        # randomly choose positive examples
        for idx, (movie_id, kword_id) in enumerate(random.sample(pairs, n_positive)):
            batch[idx, :] = (movie_id, kword_id, 1)

        # Increment idx by 1
        idx += 1
        
        # Add negative examples until reach batch size
        while idx < batch_size:
            
            # random selection
            random_movie = random.randrange(len(index_movie))
            random_kword = random.randrange(len(index_kword))
            
            # Check to make sure this is not a positive example
            if (random_movie, random_kword) not in pairs_set:
                
                # Add to batch and increment index
                batch[idx, :] = (random_movie, random_kword, neg_label)
                idx += 1
                
        # Make sure to shuffle order
        np.random.shuffle(batch)
        yield {'movie': batch[:, 0], 'kword': batch[:, 1]}, batch[:, 2]

### Neural network embedding model

The neural network will have the following layers:
1. Input layer (movie and keyword inputs)
2. Embedding layer (embeddings for movies and keywords. These will be trained to map our inputs into a 50-dimensional vector)
3. Dot product layer
4. Reshape layer (to correct the shape)
5. Dense (in classification): fully connected layer with sigmoid activation to generate output for classification

In [377]:
from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model

In [378]:

def embedding_model(embedding_size = 50, classification = False):
      
    # Layer 1: 1-dimensional inputs
    movie = Input(name = 'movie', shape = [1])
    kword = Input(name = 'kword', shape = [1])
    
    # Layer 2: Embedding the movie (shape will be (None, 1, 50))
    movie_embedding = Embedding(name = 'movie_embedding',
                               input_dim = len(movie_index),
                               output_dim = embedding_size)(movie)
    
    # Layer 2: Embedding the keyword (shape will be (None, 1, 50))
    kword_embedding = Embedding(name = 'kword_embedding',
                               input_dim = len(kword_index),
                               output_dim = embedding_size)(kword)
    
    # Layer 3: Merge the layers with a dot product along the second axis (shape will be (None, 1, 1))
    merged = Dot(name = 'dot_product', normalize = True, axes = 2)([movie_embedding, kword_embedding])
    
    # Layer 4: Reshape to be a single number (shape will be (None, 1))
    merged = Reshape(target_shape = [1])(merged)
    
    # If classifcation, add extra layer and loss function is binary cross entropy
    if classification:
        merged = Dense(1, activation = 'sigmoid')(merged) # layer 5: for classification
        model = Model(inputs = [movie, kword], outputs = merged)
        model.compile(optimizer = 'Adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    
    # Otherwise loss function is mean squared error
    else:
        model = Model(inputs = [movie, kword], outputs = merged)
        model.compile(optimizer = 'Adam', loss = 'mse')
    
    return model

In [379]:
model = embedding_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
kword (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
movie_embedding (Embedding)     (None, 1, 50)        369900      movie[0][0]                      
__________________________________________________________________________________________________
kword_embedding (Embedding)     (None, 1, 50)        298350      kword[0][0]                      
__________________________________________________________________________________________________
dot_produc

### Training the model

In [380]:
n_positive = 1024

gen = generate_batch(pairs, n_positive, negative_ratio = 2)

# Train
h = model.fit_generator(gen, epochs = 50, 
                        steps_per_epoch = len(pairs) // n_positive,
                        verbose = 2)

Epoch 1/50
 - 1s - loss: 0.9820
Epoch 2/50
 - 1s - loss: 0.9328
Epoch 3/50
 - 1s - loss: 0.8902
Epoch 4/50
 - 1s - loss: 0.8478
Epoch 5/50
 - 1s - loss: 0.8174
Epoch 6/50
 - 1s - loss: 0.7934
Epoch 7/50
 - 1s - loss: 0.7713
Epoch 8/50
 - 1s - loss: 0.7404
Epoch 9/50
 - 1s - loss: 0.6948
Epoch 10/50
 - 1s - loss: 0.6498
Epoch 11/50
 - 1s - loss: 0.6023
Epoch 12/50
 - 1s - loss: 0.5565
Epoch 13/50
 - 1s - loss: 0.5256
Epoch 14/50
 - 1s - loss: 0.5152
Epoch 15/50
 - 1s - loss: 0.5054
Epoch 16/50
 - 1s - loss: 0.5048
Epoch 17/50
 - 1s - loss: 0.4920
Epoch 18/50
 - 1s - loss: 0.4930
Epoch 19/50
 - 1s - loss: 0.4798
Epoch 20/50
 - 1s - loss: 0.4831
Epoch 21/50
 - 1s - loss: 0.4750
Epoch 22/50
 - 1s - loss: 0.4717
Epoch 23/50
 - 1s - loss: 0.4664
Epoch 24/50
 - 1s - loss: 0.4732
Epoch 25/50
 - 1s - loss: 0.4628
Epoch 26/50
 - 1s - loss: 0.4682
Epoch 27/50
 - 1s - loss: 0.4656
Epoch 28/50
 - 1s - loss: 0.4678
Epoch 29/50
 - 1s - loss: 0.4594
Epoch 30/50
 - 1s - loss: 0.4556
Epoch 31/50
 - 1s -

### Extract the embeddings

In [381]:
movie_layer = model.get_layer('movie_embedding')
movie_weights = movie_layer.get_weights()[0]
movie_weights.shape

(7398, 50)

 - each movie can now be represented on the 50-dimensional vector obtained based on keywords
 - to be able to calculate similarities based on cosine similarity, we should first normalize embeddings so that they have the dot product of two movie embeddings is the cosine similarity
 - this normalization is achieved by dividing each vector by the square root of the sum of squared components

In [382]:
movie_weights = movie_weights / np.linalg.norm(movie_weights, axis = 1).reshape((-1, 1))

In [383]:
def similar_movies(name, n=10):
    
    dists = np.dot(movie_weights, movie_weights[movie_index[name]])
    sorted_dists = np.argsort(dists)
    closest = sorted_dists[-n:]
    max_width = max([len(index_movie[c]) for c in closest])
    
    for c in reversed(closest):
        print(f'Movie: {index_movie[c]:{max_width + 2}} Similarity: {dists[c]:.{2}}')

In [384]:
similar_movies('Avatar')

Movie: Avatar                    Similarity: 1.0
Movie: Aliens                    Similarity: 0.75
Movie: Treasure Planet           Similarity: 0.74
Movie: Apollo 13                 Similarity: 0.72
Movie: The Martian               Similarity: 0.72
Movie: Starship Troopers         Similarity: 0.71
Movie: Interstellar              Similarity: 0.7
Movie: Alien: Covenant           Similarity: 0.7
Movie: Sunshine                  Similarity: 0.69
Movie: Star Trek Into Darkness   Similarity: 0.69


In [385]:
col_names = ['membed_'+str(i) for i in range(1,51)]

movie_embeds = pd.DataFrame(movie_weights,columns=col_names)

In [386]:
f_clean = pd.concat([f_clean,movie_embeds],axis=1)

# FE: Now we will again separate the feature data set into train and test sets and work on feature engineering

In [387]:
train = pd.concat([f_clean.iloc[0:df_train.shape[0]],df_train['revenue']],axis=1)
test = f_clean.iloc[df_train.shape[0]:]

 - start by examining the target variable:

In [388]:
train['revenue'].describe().T

count    3.000000e+03
mean     6.672585e+07
std      1.375323e+08
min      1.000000e+00
25%      2.379808e+06
50%      1.680707e+07
75%      6.891920e+07
max      1.519558e+09
Name: revenue, dtype: float64

 - some movies with revenue of 1\$, which seems very unusual. Let's see how many movies are under 300\$ revenue

In [389]:
train[['original_title','revenue']][train['revenue']<300].sort_values(by=['revenue'])

Unnamed: 0,original_title,revenue
1917,The Merry Widow,1
1754,Mute Witness,1
1874,Vermist,1
347,The Wind in the Willows,1
695,Tere Naam,2
2383,Borsalino,3
1541,Все и сразу,3
334,Saamy,3
2117,American Adobo,4
1346,East of Eden,5


 - some of these values may be correct. However, let us assume that a minimum 300\$ is more acceptable

In [390]:
train['revenue'][train['revenue']<300] = 300

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


 - `director` might be important?

In [391]:
train['director'] = train['crew'].apply(lambda x: [i['name'] for i in x if i['job'] == 'Director']).apply(pd.Series).iloc[:,0]

In [392]:
train['revenue'].groupby(train['director']).count().describe().T

count    1857.000000
mean        1.606893
std         1.221448
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        11.000000
Name: revenue, dtype: float64

 - there are 1,857 directors in the training set, which means most directors only show up once. These will not be very informative to discriminate revenue
  - how many directors show up twice or more?

In [393]:
dir_rev= pd.concat([train['revenue'].groupby(train['director']).count(),
           train['revenue'].groupby(train['director']).sum(),
           train['revenue'].groupby(train['director']).mean(),
           train['revenue'].groupby(train['director']).max(),
           train['revenue'].groupby(train['director']).min()], axis=1).reset_index()

dir_rev.columns = ['director','N_movies','Total_rev','Average_rev','Highest_rev','Lowest_rev']

In [394]:
dir_rev[dir_rev['N_movies']>1].sort_values(by=['Average_rev'],ascending=False)[0:10]

Unnamed: 0,director,N_movies,Total_rev,Average_rev,Highest_rev,Lowest_rev
935,Joss Whedon,2,2924961604,1462481000.0,1519557910,1405403694
718,James Gunn,2,1636744770,818372400.0,863416141,773328629
253,Carlos Saldanha,3,2047816032,682605300.0,886686817,500188435
1337,Peter Jackson,7,4450044134,635720600.0,1021103568,29359216
89,Andrew Stanton,3,1834021849,611340600.0,1028570889,284139100
1441,Rob Marshall,2,1207956764,603978400.0,1045713802,162242962
205,Brad Bird,3,1535309794,511769900.0,694713380,209154322
586,George Lucas,3,1501835328,500611800.0,850000000,2437000
517,Francis Lawrence,3,1469661999,489887300.0,653428261,230884728
1160,Michael Bay,8,3906752453,488344100.0,1123746996,69411370


 - perhaps having a high-profile director helps a movie becoming more successful. We could try to extract some features from this
 - let's add summary variables such as average revenue of the director and (High-Low)/Average to give a sense of revenue dispersion
 - we will calculate these measures based on cast members in the training set, but we will then trickle that information down to the test set

In [395]:
dir_rev['Hi_lo_rev'] = (dir_rev['Highest_rev'] - dir_rev['Lowest_rev']) / dir_rev['Average_rev']

In [396]:
f_clean['director'] = f_clean['crew'].apply(lambda x: [i['name'] for i in x if i['job'] == 'Director']).apply(pd.Series).iloc[:,0]

In [397]:
f_clean = f_clean.merge(dir_rev[['director','Average_rev','Hi_lo_rev']],how='left',on='director')
f_clean = f_clean.rename(columns={'Average_rev': 'Dir_avg_rev', 'Hi_lo_rev': 'Dir_HL_rev'})

 - what about the relation between `cast` and revenue?

In [398]:
cast_list = train['cast'].apply(lambda x: [i['name'] for i in x])

In [399]:
len(set([i for j in cast_list for i in j])) # over 38,000 unique cast members in the training set

38588

In [400]:
cast_revenue = []

for i,r in enumerate(train['revenue'].values):
    cast_revenue.extend((act,r) for act in cast_list[cast_list.index==i].iloc[0])
    
cast_revenue = pd.DataFrame(list(cast_revenue), columns=['Name','Revenue'])

cast_rev_sum = pd.concat([cast_revenue.groupby(['Name']).count(),
                          cast_revenue.groupby(['Name']).sum(),
                          cast_revenue.groupby(['Name']).mean(),
                          cast_revenue.groupby(['Name']).max(),
                          cast_revenue.groupby(['Name']).min()], axis=1)

cast_rev_sum.columns = ['N_movies','Total_rev','Average_rev','Highest_rev','Lowest_rev']

In [401]:
cast_rev_sum.sort_values(by=['Highest_rev'],ascending=False)[0:10]

Unnamed: 0_level_0,N_movies,Total_rev,Average_rev,Highest_rev,Lowest_rev
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Brent Reichert,1,1519557910,1519558000.0,1519557910,1519557910
Robert B. Schneider IV,1,1519557910,1519558000.0,1519557910,1519557910
Jeff Seich,1,1519557910,1519558000.0,1519557910,1519557910
Maria Perossa,1,1519557910,1519558000.0,1519557910,1519557910
Jeff Wolfe,3,1702813638,567604500.0,1519557910,78054825
Christina Shaffer,1,1519557910,1519558000.0,1519557910,1519557910
Kelly Ruble,1,1519557910,1519558000.0,1519557910,1519557910
Jeffrey Feingold,2,1673584046,836792000.0,1519557910,154026136
Walter Perez,1,1519557910,1519558000.0,1519557910,1519557910
Kelley Robins,1,1519557910,1519558000.0,1519557910,1519557910


 - the relation between cast members and revenue could be slightly trickier than that of directors and revenue? Should try to summarize this somehow
 - we will try to use summary measures such as number of actors which have participated in movies with extremly high or relatively low revenue, or the minimum highest revenue any cast member has achieved in the past
 - we will calculate these measures based on cast members in the training set, but we will then trickle that information down to the test set

In [402]:
cast_rev_sum['rev99p'] = (cast_rev_sum['Highest_rev'] >cast_revenue['Revenue'].quantile(0.99))*1
cast_rev_sum['rev20p'] = (cast_rev_sum['Highest_rev'] <cast_revenue['Revenue'].quantile(0.20))*1

In [403]:
full_cast_list = f_clean['cast'].apply(lambda x: [i['name'] for i in x])

In [404]:
id_cast = []

for i,r in enumerate(f_clean['id'].values):
    id_cast.extend((r,act) for act in full_cast_list[full_cast_list.index==i].iloc[0])
    
id_cast = pd.DataFrame(list(id_cast), columns=['id','Name'])

In [405]:
cast_rev_movie = id_cast.merge(cast_rev_sum,how='left',on='Name')

In [406]:
cast_rev_summary = pd.concat([cast_rev_movie.groupby(['id']).sum()['rev99p'],
                             cast_rev_movie.groupby(['id']).sum()['rev20p'],
                             cast_rev_movie.groupby(['id']).min()['Highest_rev']],
                             axis=1).reset_index()

In [407]:
f_clean = f_clean.merge(cast_rev_summary,how='left',on='id')
f_clean = f_clean.rename(columns={'rev99p': 'N_cast_99p', 'rev20p': 'N_cast_20p','Highest_rev':'Cast_low_bound'})

 - examining `production_companies` and `revenue`
 - we will treat this variable similarly to the `cast` variable

In [408]:
companies_list = train['production_companies'].apply(lambda x: [i['name'] for i in x])

In [409]:
comp_revenue = []

for i,r in enumerate(train['revenue'].values):
    comp_revenue.extend((comp,r) for comp in companies_list[companies_list.index==i].iloc[0])
    
comp_revenue = pd.DataFrame(list(comp_revenue), columns=['Company','Revenue'])

comp_rev_sum = pd.concat([comp_revenue.groupby(['Company']).count(),
                          comp_revenue.groupby(['Company']).sum(),
                          comp_revenue.groupby(['Company']).mean(),
                          comp_revenue.groupby(['Company']).max(),
                          comp_revenue.groupby(['Company']).min()], axis=1)

comp_rev_sum.columns = ['N_movies','Total_rev','Average_rev','Highest_rev','Lowest_rev']

In [410]:
comp_rev_sum['rev75p'] = (comp_rev_sum['Highest_rev'] >comp_revenue['Revenue'].quantile(0.75))*1
comp_rev_sum['rev25p'] = (comp_rev_sum['Highest_rev'] <comp_revenue['Revenue'].quantile(0.25))*1

In [411]:
full_comp_list = f_clean['production_companies'].apply(lambda x: [i['name'] for i in x])

In [412]:
id_comp = []

for i,r in enumerate(f_clean['id'].values):
    id_comp.extend((r,comp) for comp in full_comp_list[full_comp_list.index==i].iloc[0])
    
id_comp = pd.DataFrame(list(id_comp), columns=['id','Company'])

In [413]:
comp_rev_movie = id_comp.merge(comp_rev_sum,how='left',on='Company')

In [414]:
comp_rev_summary = pd.concat([comp_rev_movie.groupby(['id']).sum()['rev75p'],
                             comp_rev_movie.groupby(['id']).sum()['rev25p'],
                             comp_rev_movie.groupby(['id']).min()['Highest_rev']],
                             axis=1).reset_index()

In [415]:
f_clean = f_clean.merge(comp_rev_summary,how='left',on='id')
f_clean = f_clean.rename(columns={'rev75p': 'N_comp_75p', 'rev25p': 'N_comp_25p','Highest_rev':'Comp_low_bound'})

 - there seem to be some zero values for `budget`, which is odd. We will simply replace them by the training set median, due to time constraints

In [416]:
med_budget = train['budget'].median()

In [417]:
f_clean['budget'] = f_clean['budget'].replace(0, med_budget)

 - there are still some `NaN` values in some of the created columns and in `runtime`. We will replace them by the mode or median

In [418]:
f_clean['runtime'] = f_clean['runtime'].fillna(train['runtime'].median())
f_clean['Dir_avg_rev'] = f_clean['Dir_avg_rev'].fillna(f_clean['Dir_avg_rev'].median())
f_clean['Dir_HL_rev'] = f_clean['Dir_HL_rev'].fillna(f_clean['Dir_HL_rev'].median())
f_clean['N_cast_99p'] = f_clean['N_cast_99p'].fillna(0)
f_clean['N_cast_20p'] = f_clean['N_cast_20p'].fillna(0)
f_clean['Cast_low_bound'] = f_clean['Cast_low_bound'].fillna(f_clean['Cast_low_bound'].median())
f_clean['N_comp_75p'] = f_clean['N_comp_75p'].fillna(1)
f_clean['N_comp_25p'] = f_clean['N_comp_25p'].fillna(0)
f_clean['Comp_low_bound'] = f_clean['Comp_low_bound'].fillna(f_clean['Comp_low_bound'].median())


 - we will extract some features from this notebook https://www.kaggle.com/joanalpinto/moviestmdb-datapreparation

In [419]:
extra_train = pd.read_csv('../input/moviestmdb-datapreparation/train_prep.csv')
extra_test = pd.read_csv('../input/moviestmdb-datapreparation/test_prep.csv')

In [420]:
cols_add = ['has_collection','num_cast','num_crew','genres_name_Drama','genres_name_Comedy','genres_name_Thriller',
            'genres_name_Action','genres_name_Romance','genres_name_Crime','genres_name_Adventure',
            'genres_name_Horror','genres_name_Science Fiction','genres_name_Family',
            'production_countries_name_United States of America',
            'spoken_languages_name_English','spoken_languages_name_Français','spoken_languages_name_Español']

In [460]:
train = pd.concat([f_clean.iloc[0:df_train.shape[0]],extra_train[cols_add],df_train['revenue']],axis=1)
test = pd.concat([f_clean.iloc[df_train.shape[0]:].reset_index(),extra_test[cols_add]],axis=1)

# Modelling

In [461]:
cols_to_drop = ['index','id','belongs_to_collection','genres','homepage','imdb_id','original_language',
                'original_title','overview','poster_path','production_companies','production_countries',
               'release_date','spoken_languages','status','tagline','title','Keywords','cast','crew',
               'edited_title','list_keywords','director']

In [471]:
X = train.drop(['revenue'],axis=1).drop(cols_to_drop,axis=1)
y = np.log1p(train['revenue'])
X_test = test.drop(['level_0'],axis=1).drop(cols_to_drop,axis=1)

In [472]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [473]:
params = {'num_leaves': 30,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 5,
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.2,
         "verbosity": -1}

lgbm = lgb.LGBMRegressor(**params, n_estimators = 20000, nthread = 4, n_jobs = -1)
lgbm.fit(X_train, y_train, 
        eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='rmse',
        verbose=1000, early_stopping_rounds=200)

Training until validation scores don't improve for 200 rounds.
[1000]	training's rmse: 0.313195	valid_1's rmse: 0.386213
[2000]	training's rmse: 0.212507	valid_1's rmse: 0.372151
Early stopping, best iteration is:
[2203]	training's rmse: 0.198274	valid_1's rmse: 0.370867


LGBMRegressor(bagging_fraction=0.9, bagging_freq=1, bagging_seed=11,
       boosting='gbdt', boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, feature_fraction=0.9, importance_type='split',
       lambda_l1=0.2, learning_rate=0.01, max_depth=5, metric='rmse',
       min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=20,
       min_split_gain=0.0, n_estimators=20000, n_jobs=-1, nthread=4,
       num_leaves=30, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0, verbosity=-1)

In [474]:
eli5.show_weights(lgbm, feature_filter=lambda x: x != '<BIAS>')

Weight,Feature
0.8462,Cast_low_bound
0.0747,Dir_avg_rev
0.0386,N_cast_20p
0.0175,Comp_low_bound
0.0083,num_cast
0.0031,budget
0.0026,popularity
0.0009,num_crew
0.0008,Dir_HL_rev
0.0005,membed_22


In [475]:
lasso = linear_model.Lasso(alpha=0.1)
print(np.sqrt(-cross_val_score(lasso, X, y, cv=10, scoring='neg_mean_squared_error')))

[2.10195367 2.28832129 2.0321788  2.39715568 2.07826831 1.87932137
 2.37074068 2.2762209  1.99636467 1.72257812]


In [476]:
lasso.fit(X,y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [479]:
preds_lasso = lasso.predict(X_test)

In [480]:
sub = pd.read_csv('../input/tmdb-box-office-prediction/sample_submission.csv')
sub['revenue'] = np.expm1(preds_lasso)
sub.to_csv("lasso_sub.csv", index=False)