# **Environment Setup**

In [1]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from datetime import datetime
from collections import Counter

In [2]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [3]:
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Sample Data Loading

In [4]:
sample_df=pd.read_csv('encoded_data/sample_data.csv')

In [5]:
sample_df.head()

Unnamed: 0,year,duration,avg_vote,votes,budget,worlwide_gross_income,day,language_English,language_Spanish,language_French,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,8.1,55601,18000,8811,58,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,7.2,3058,800000,9183673,106,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,8.3,109038,250000,26916,330,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,7.0,4735,351000,11233,157,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,8.2,97480,923000,26916,296,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X = sample_df.drop(["avg_vote","worlwide_gross_income"], axis=1)

In [7]:
X.head()

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,55601,18000,58,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
rate_y=sample_df["avg_vote"]
revenue_y=sample_df["worlwide_gross_income"]

# Data Scaling

In [9]:
duration_scaler=MinMaxScaler().fit(X[["duration"]])
day_scaler=MinMaxScaler().fit(X[["day"]])

In [10]:
X_scaled=X

In [11]:
X_scaled["duration"]=duration_scaler.transform(X_scaled[["duration"]])
X_scaled["day"]=day_scaler.transform(X_scaled[["day"]])

# Feature Importance - Revenue

In [12]:
regressor = XGBRegressor()
regressor.fit(X_scaled, revenue_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
rev_select_features={'feature':[],'importance':[]}
for col,score in zip(X_scaled.columns,regressor.feature_importances_):
  if score >0.00001:
    rev_select_features['feature'].append(col)
    rev_select_features['importance'].append(score)
    print(col,score)

year 0.0021819887
duration 0.002200983
votes 0.02438603
budget 0.0740325
day 0.0017901697
language_English 0.00020763616
language_Spanish 0.0016469425
language_French 0.0028030742
language_Russian 0.0009708054
language_German 0.0034133736
language_Italian 0.00019187255
language_Japanese 0.0016167535
language_Mandarin 0.0059919814
language_Arabic 0.00021303396
language_Korean 0.0031267297
language_Cantonese 0.0006513523
language_Hindi 0.0048205573
language_Portuguese 0.00067356887
language_Chinese 0.00047186445
language_Swedish 0.00025421227
language_Thai 0.001983986
language_American Sign Language 0.00046733805
language_Dutch 0.00011604634
language_Afrikaans 0.021414721
language_Romanian 0.003301475
genre_Drama 0.00650941
genre_Comedy 0.0026876165
genre_Action 0.0040166783
genre_Crime 0.0019480704
genre_Romance 0.0012606783
genre_Adventure 0.018557286
genre_Thriller 0.0033317043
genre_Horror 0.00021580342
genre_Mystery 0.00018888841
genre_Fantasy 0.0004086685
genre_Sci-Fi 0.0048249597


In [14]:
len(rev_select_features['feature'])

426

In [15]:
rev_feature_importance_df=pd.DataFrame.from_dict(rev_select_features)

In [16]:
rev_feature_importance_df.head()

Unnamed: 0,feature,importance
0,year,0.002182
1,duration,0.002201
2,votes,0.024386
3,budget,0.074033
4,day,0.00179


In [17]:
rev_select_sample_df=sample_df[rev_select_features['feature']]

In [18]:
rev_select_sample_df.head()

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Daniel Stern,actor_Catherine O'Hara,actor_Queen Latifah,actor_Allison Janney,actor_Kate Mara,actor_Karl Urban,actor_Michael Sheen,actor_Tyler Perry,actor_Miles Teller,actor_Dave Bautista
0,1920,76,55601,18000,58,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
rev_feature_importance_df.to_csv('encoded_data/feature_importance_rev_xgb.csv',index=False)
rev_select_sample_df.to_csv('encoded_data/select_sample_data_rev_xgb.csv',index=False)
revenue_y.to_csv("encoded_data/revenue_xgb.csv",index=False)

# Feature Importance - Rating

In [20]:
regressor = XGBRegressor()
regressor.fit(X_scaled, rate_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
rat_select_features={'feature':[],'importance':[]}
for col,score in zip(X_scaled.columns,regressor.feature_importances_):
  if score >0.00001:
    rat_select_features['feature'].append(col)
    rat_select_features['importance'].append(score)
    print(col,score)

year 0.002672742
duration 0.0052705724
votes 0.014334809
budget 0.004408414
day 0.001467419
language_English 0.00952788
language_Spanish 0.00087450637
language_French 0.0017532142
language_Russian 0.0016313283
language_German 0.0011400174
language_Italian 0.00017043548
language_Japanese 0.002523162
language_Mandarin 0.0015690048
language_Arabic 0.0014567571
language_Korean 0.003748367
language_Cantonese 0.0019394917
language_Turkish 0.0040750555
language_Latin 0.00014987226
language_Hindi 0.0074302168
language_Portuguese 0.0011151909
language_Hebrew 0.004973364
language_Chinese 0.00013009248
language_Polish 0.009263098
language_Swedish 0.00020465617
language_Hungarian 0.0024384386
language_Vietnamese 0.0020041585
language_American Sign Language 0.00087500946
language_Dutch 0.00096261594
language_Afrikaans 0.0019299405
language_Czech 0.0035622106
genre_Drama 0.048618052
genre_Comedy 0.002061581
genre_Action 0.008468408
genre_Crime 0.0017996436
genre_Romance 0.0015911587
genre_Adventure 

In [22]:
len(rat_select_features['feature'])

473

In [23]:
rat_feature_importance_df=pd.DataFrame.from_dict(rat_select_features)

In [24]:
rat_feature_importance_df.head()

Unnamed: 0,feature,importance
0,year,0.002673
1,duration,0.005271
2,votes,0.014335
3,budget,0.004408
4,day,0.001467


In [25]:
rat_select_sample_df=sample_df[rat_select_features['feature']]

In [26]:
rat_select_sample_df.head()

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Jeremy Piven,actor_Emily Watson,actor_Michael Angarano,actor_Karl Urban,actor_Diane Kruger,actor_Daniel Wu,actor_Vanessa Hudgens,actor_Adam Brody,actor_Michael Ealy,actor_Tyler Perry
0,1920,76,55601,18000,58,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# rat_feature_importance_df.to_csv('encoded_data/feature_importance_rate_xgb.csv',index=False)
# rat_select_sample_df.to_csv('encoded_data/select_sample_data_rate_xgb.csv',index=False)
# rate_y.to_csv("encoded_data/rating_xgb.csv",index=False)

In [28]:
rat_languages = rat_select_sample_df.filter(regex="^language", axis=1).columns
rat_languages

Index(['language_English', 'language_Spanish', 'language_French',
       'language_Russian', 'language_German', 'language_Italian',
       'language_Japanese', 'language_Mandarin', 'language_Arabic',
       'language_Korean', 'language_Cantonese', 'language_Turkish',
       'language_Latin', 'language_Hindi', 'language_Portuguese',
       'language_Hebrew', 'language_Chinese', 'language_Polish',
       'language_Swedish', 'language_Hungarian', 'language_Vietnamese',
       'language_American Sign Language', 'language_Dutch',
       'language_Afrikaans', 'language_Czech'],
      dtype='object')

In [29]:
rat_lang_list = rat_languages.tolist()
rat_lang_list_clean = [ x.split(sep="_")[1] for x in rat_lang_list ]
rat_lang_list_clean

['English',
 'Spanish',
 'French',
 'Russian',
 'German',
 'Italian',
 'Japanese',
 'Mandarin',
 'Arabic',
 'Korean',
 'Cantonese',
 'Turkish',
 'Latin',
 'Hindi',
 'Portuguese',
 'Hebrew',
 'Chinese',
 'Polish',
 'Swedish',
 'Hungarian',
 'Vietnamese',
 'American Sign Language',
 'Dutch',
 'Afrikaans',
 'Czech']

In [30]:
rev_languages = rev_select_sample_df.filter(regex="^language", axis=1).columns
rev_languages

Index(['language_English', 'language_Spanish', 'language_French',
       'language_Russian', 'language_German', 'language_Italian',
       'language_Japanese', 'language_Mandarin', 'language_Arabic',
       'language_Korean', 'language_Cantonese', 'language_Hindi',
       'language_Portuguese', 'language_Chinese', 'language_Swedish',
       'language_Thai', 'language_American Sign Language', 'language_Dutch',
       'language_Afrikaans', 'language_Romanian'],
      dtype='object')

In [31]:
rev_lang_list = rev_languages.tolist()
rev_lang_list_clean = [ x.split(sep="_")[1] for x in rev_lang_list ]
rev_lang_list_clean

['English',
 'Spanish',
 'French',
 'Russian',
 'German',
 'Italian',
 'Japanese',
 'Mandarin',
 'Arabic',
 'Korean',
 'Cantonese',
 'Hindi',
 'Portuguese',
 'Chinese',
 'Swedish',
 'Thai',
 'American Sign Language',
 'Dutch',
 'Afrikaans',
 'Romanian']

In [36]:
rev_companies = rev_select_sample_df.filter(regex="^company", axis=1).columns
rev_co_list = rev_companies.tolist()
rev_co_list_clean = [ x.split(sep="_")[1] for x in rev_co_list ]

In [37]:
rat_companies = rat_select_sample_df.filter(regex="^company", axis=1).columns
rat_co_list = rat_companies.tolist()
rat_co_list_clean = [ x.split(sep="_")[1] for x in rat_co_list ]

In [38]:
rev_co_list_clean 

['Universal Pictures',
 'Warner Bros.',
 'Paramount Pictures',
 'Twentieth Century Fox',
 'New Line Cinema',
 'Touchstone Pictures',
 'Walt Disney Pictures',
 'DreamWorks',
 'Screen Gems',
 'Summit Entertainment',
 'Fox 2000 Pictures',
 'Dimension Films',
 'Focus Features',
 'Hollywood Pictures',
 'DreamWorks Animation',
 'Walt Disney Animation Studios',
 'Walt Disney Productions',
 'StudioCanal',
 'Blumhouse Productions',
 'Polygram Filmed Entertainment',
 'Amblin Entertainment',
 'Marvel Studios',
 'Annapurna Pictures',
 'Lucasfilm',
 'Twisted Pictures',
 'Twentieth Century Fox Animation',
 'A24',
 'Bona Film Group',
 'Selznick International Pictures',
 'Constantin Film',
 'Disney Television Animation',
 'Icon Productions',
 'Gary Sanchez Productions',
 'Blue Sky Studios']

In [39]:
rat_co_list_clean

['Universal Pictures',
 'Warner Bros.',
 'Columbia Pictures',
 'Paramount Pictures',
 'Twentieth Century Fox',
 'New Line Cinema',
 'Metro-Goldwyn-Mayer (MGM)',
 'Walt Disney Pictures',
 'Miramax',
 'Screen Gems',
 'Fox 2000 Pictures',
 'Dimension Films',
 'Castle Rock Entertainment',
 'Focus Features',
 'Orion Pictures',
 'Revolution Studios',
 'Regency Enterprises',
 'The Weinstein Company',
 'Lions Gate Films',
 'Imagine Entertainment',
 'Eon Productions',
 'StudioCanal',
 'Artisan Entertainment',
 'Polygram Filmed Entertainment',
 'Central Partnership',
 'Amblin Entertainment',
 'Emmett/Furla/Oasis Films (EFO Films)',
 'Rogue Pictures',
 'CTB Film Company',
 'Art Pictures Studio',
 'Cannon Group',
 'China Film Group Corporation (CFGC)',
 'Incorporated Television Company (ITC)',
 'American Playhouse',
 '40 Acres & A Mule Filmworks',
 'Davis Entertainment',
 'New Regency Pictures',
 'Yash Raj Films',
 "Anarchist's Convention Films",
 'Channel Four Films',
 'Sony Pictures Classics',
 

In [56]:
all_companies = list(set(rev_co_list_clean + rat_co_list_clean))

In [57]:
len(all_companies)

75

In [59]:
company_list = list()

for i in range(len(all_companies)):
    comp = all_companies[i]
    company_list.append({"company": comp})

In [60]:
company_list

[{'company': 'Destination Films'},
 {'company': 'Bona Film Group'},
 {'company': 'Annapurna Pictures'},
 {'company': 'Selznick International Pictures'},
 {'company': 'Regency Enterprises'},
 {'company': 'Revolution Studios'},
 {'company': 'Cannon Group'},
 {'company': 'Channel Four Films'},
 {'company': 'Columbia Pictures'},
 {'company': 'DreamWorks'},
 {'company': 'Cinema Group Ventures'},
 {'company': 'Amblin Entertainment'},
 {'company': 'American Playhouse'},
 {'company': 'Central Partnership'},
 {'company': 'Summit Entertainment'},
 {'company': 'CTB Film Company'},
 {'company': "Anarchist's Convention Films"},
 {'company': 'Gary Sanchez Productions'},
 {'company': 'Atlas Entertainment'},
 {'company': 'DreamWorks Animation'},
 {'company': 'Icon Productions'},
 {'company': 'Trimark Pictures'},
 {'company': 'A24'},
 {'company': 'Cannon Films'},
 {'company': 'Art Pictures Studio'},
 {'company': 'Touchstone Pictures'},
 {'company': 'Yash Raj Films'},
 {'company': 'Dimension Films'},
 {