## exploratory data analysis

#### import packages

In [1]:
import pandas as pd
import json
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from functools import reduce
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, Normalizer, StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

#### upload dataset and set movie_id as index

In [2]:
data = pd.read_csv('datacleaned.csv')
data.shape

(7339, 21)

In [3]:
data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [4]:
data = data.set_index('movie_id')

## regression model test with only numericals

In [5]:
def LR():
    model = LinearRegression()
    model.fit(X_train,y_train)

    predictions  = model.predict(X_test)
    predictions.shape

    return r2_score(y_test, predictions)

In [6]:
def KNR():
    model = KNeighborsRegressor(n_neighbors=4)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    return model.score(X_test, y_test)

In [7]:
def MLPR():

    model = MLPRegressor()
    model.fit(X_train, y_train)
    expected_y  = y_test
    predicted_y = model.predict(X_test)

    return r2_score(expected_y, predicted_y)

In [8]:
X = data.select_dtypes(np.number)

In [9]:
X.drop(['revenue'], axis=1, inplace=True)
y=data[('revenue')]

In [10]:
X.isna().sum().sort_values(ascending=False)

budget          0
popularity      0
runtime         0
vote_average    0
vote_count      0
dtype: int64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
LR(), KNR(), MLPR()

(0.7180452385676053, 0.6638051021294267, 0.5514132403519849)

### handling non-numerical columns

In [13]:
data.select_dtypes(object).columns

Index(['title', 'overview', 'release_date', 'genres', 'original_language',
       'original_title', 'production_companies', 'production_countries',
       'spoken_languages', 'status', 'keywords', 'cast', 'crew'],
      dtype='object')

#### column: overview

based on the poor information overview column gave when data visualizing, droping this column from the model

In [14]:
data.drop(['overview'], axis=1, inplace=True)

#### column: release date

based on data visualization, it is interesting that we perform hot encoding with months. First, we will change the column to datetime for extracting a release month column

In [15]:
data['release_month'] = pd.to_datetime(data['release_date']).dt.month

now, we will perform hot encoding with release month new column and save as 'month_oh'

In [16]:
months_oh = pd.get_dummies(data['release_month'], columns=['release_month'])

In [17]:
months_oh.shape

(7339, 12)

#### column: genres

from the data visualization part, we saw there are 19 genres, we will handle this column info splitted by ',' and perform hot encoding with them and save as 'genres_oh'

In [18]:
genres_oh = data.genres.str.split(',', expand=True).stack()

In [19]:
genres_oh = pd.get_dummies(genres_oh).groupby(level=0).sum()

In [20]:
genres_oh

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
12,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
13,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
14,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118784,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
118957,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
118991,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0
119123,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0


#### column: original language

in the data visualization part, we saw 6k films were in english as original language. I will encode this column by en and the rest

In [21]:
origlang_oh = data['original_language']

In [22]:
origlang_oh = pd.DataFrame(origlang_oh).rename(columns={"title": "count"})

In [23]:
origlang_oh.loc[~origlang_oh["original_language"].isin(['en']), "original_language"] = "Others"

In [24]:
origlang_oh = pd.get_dummies(origlang_oh).groupby(level=0).sum()

#### column: production companies

we will hot encode the 3 most frequent production companies according to data visualization step: Warner Bros, Universal Pictures and Columbia Pictures, and others for the rest of production companies. Will save it in 'bigfour_oh'

In [25]:
data['production_companies'] = data['production_companies'].fillna('None')

In [26]:
warner = data[data.production_companies.str.contains('Warner')]
warner['warner'] = 1
warner = warner[['warner']]

In [27]:
data = pd.merge(data,warner, on='movie_id',how='left')
data['warner'] = data['warner'].fillna(0)

In [28]:
universal = data[data.production_companies.str.contains('Universal Pictures')]
universal['universal'] = 1
universal = universal[['universal']]

In [29]:
data = pd.merge(data,universal, on='movie_id',how='left')
data['universal'] = data['universal'].fillna(0)

In [30]:
columbia = data[data.production_companies.str.contains('Columbia Pictures')]
columbia['columbia'] = 1
columbia = columbia[['columbia']]

In [31]:
data = pd.merge(data,columbia, on='movie_id',how='left')
data['columbia'] = data['columbia'].fillna(0)

#### column: spoken languages

hot encoding of spoken languages column

In [32]:
spokenlanguages_oh = data['spoken_languages'].fillna('None')

In [33]:
spokenlanguages_oh = data.spoken_languages.str.split(',', expand=True).stack()

In [34]:
spokenlanguages_oh = pd.get_dummies(spokenlanguages_oh).groupby(level=0).sum()

In [35]:
spokenlanguages_oh.shape

(7272, 66)

#### column: keywords

will hot encode keywords column with most frequesnt words (according to data visualization). These are: murder, police, revenge

In [37]:
kw_oh = data['keywords']

In [38]:
kw_oh = pd.DataFrame(kw_oh).rename(columns={"title": "count"})

In [39]:
kw_oh.loc[~kw_oh["keywords"].isin(['revenge','police']), "keywords"] = "Others"

In [40]:
data['keywords'] = data['keywords'].fillna('None')

In [41]:
murder = data[data.keywords.str.contains('murder')]

In [42]:
murder['keyword_murder'] = 1

In [43]:
murder = murder['keyword_murder']

In [44]:
kw_oh = pd.merge(kw_oh,murder, on='movie_id',how='left')

In [45]:
kw_oh = pd.get_dummies(kw_oh).groupby(level=0).sum()

In [46]:
kw_oh.head(5)

Unnamed: 0_level_0,keyword_murder,keywords_Others,keywords_police,keywords_revenge
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5,0.0,1,0,0
6,0.0,1,0,0
12,0.0,1,0,0
13,0.0,1,0,0
14,0.0,1,0,0


#### column: crew

I will hot encode crew column by Steven Spielberg films, Clint Eastwood films, and others

In [47]:
crew_oh = data['crew']

In [48]:
crew_oh = pd.DataFrame(crew_oh).rename(columns={"title": "count"})

In [49]:
crew_oh.loc[~crew_oh["crew"].isin(['StevenSpielberg','ClintEastwood']), "crew"] = "None"

In [50]:
crew_oh = pd.get_dummies(crew_oh).groupby(level=0).sum()

In [51]:
crew_oh.shape

(7339, 3)

## build model with categoricals oh dataframes created

In [52]:
rmodel_categoricals = [months_oh, genres_oh, origlang_oh, spokenlanguages_oh, kw_oh, crew_oh]

df_merged = reduce(lambda left,right: pd.merge(left,right, on='movie_id', how='outer'), rmodel_categoricals)

In [53]:
X_num = data.select_dtypes(np.number)

In [54]:
X_cat = df_merged

In [55]:
X = pd.merge(X_num,df_merged, on='movie_id')

In [56]:
X = pd.merge(X_num,X_cat, on='movie_id')

## feature engineering

In [57]:
data.loc[27205]

title                                                           Inception
release_date                                                   2010-07-15
genres                                   Action,Science Fiction,Adventure
budget                                                          160000000
original_language                                                      en
original_title                                                  Inception
popularity                                                        155.584
production_companies     Legendary Pictures,Syncopy,Warner Bros. Pictures
production_countries              United Kingdom,United States of America
runtime                                                             148.0
spoken_languages                                              English,日本語
status                                                           Released
vote_average                                                          8.3
vote_count                            

In [58]:
data['vote_count'].sort_values(ascending=False)[:2]

movie_id
27205    30176
155      26143
Name: vote_count, dtype: int64

In [59]:
data = data[data['vote_count']<26143]

In [60]:
data.drop(['release_month'], axis=1, inplace=True)

In [61]:
X_num = data.select_dtypes(np.number)

In [62]:
X = pd.merge(X_num,X_cat, on='movie_id',how='left')

In [63]:
X.drop(['revenue'], axis=1, inplace=True)
y=data[('revenue')]

In [64]:
X = X.fillna(0)

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [66]:
LR(), KNR(), MLPR()

(0.8024859663336612, 0.673625818894129, 0.6176045811946846)

##### random forest

In [67]:
feature_list = list(X.columns)

In [68]:
y=data[('revenue')]
y=pd.DataFrame(y)

In [69]:
Y=np.array(y)

In [70]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 42)
regressor.fit(X_train,y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

In [71]:
y_pred = regressor.predict(X_test)

In [72]:
r2_score(y_test,y_pred)

0.818873350961909

In [79]:
importances = list(regressor.feature_importances_)
feature_importances = [(feature, round(importance, 4)) for feature, importance in zip(feature_list, importances)]
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
[print('{:20} {}'.format(*pair)) for pair in feature_importances]

vote_count           0.6257
budget               0.173
popularity           0.0563
runtime              0.0336
vote_average         0.0193
                   6 0.0084
Adventure            0.0054
                   5 0.0053
Science Fiction      0.005
Drama                0.0042
Comedy               0.0038
Romance              0.0034
Français             0.0033
Family               0.003
crew_StevenSpielberg 0.0029
Fantasy              0.0028
Thriller             0.0027
Action               0.0026
warner               0.0021
                  11 0.002
ελληνικά             0.002
universal            0.0019
Crime                0.0019
                  12 0.0018
                   7 0.0017
Animation            0.0017
                   3 0.0015
Music                0.0013
日本語                  0.0013
columbia             0.0012
Pусский              0.0012
עִבְרִית             0.0012
                   4 0.0011
                   9 0.0011
Español              0.001
Italiano             0.001

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [74]:
X = pd.merge(X_num,df_merged, on='movie_id')

In [75]:
X = X.filter(['vote_count','budget','popularity','runtime','vote_average','warner','6','5','Drama','Science Fiction','Adventure','Romance','Comedy','Family','Français','crew_StevenSpielberg','Thriller','12','Fantasy','Action','11','ελληνικά','Crime','3','Español','Português','Pусский','7','Animation','Music'],axis=1)

In [76]:
X = X.fillna(0)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
LR(), KNR(), MLPR()

(0.8006861003810676, 0.67357509019965, 0.6176970345250721)