# Environment Setup

In [1]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from datetime import datetime
from collections import Counter

In [2]:
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
import joblib

# Sample Data Loading

In [3]:
# Encoded feature frames and target files for the revenue and rating models,
# read in the same order as the names they are bound to below.
encoded_csv_paths = [
    "model/encoded_data/select_sample_data_rev_xgb.csv",
    "model/encoded_data/revenue_xgb.csv",
    "model/encoded_data/select_sample_data_rate_xgb.csv",
    "model/encoded_data/rating_xgb.csv",
]
rev_select_sample_df, revenue_y, rat_select_sample_df, rate_y = [
    pd.read_csv(csv_path) for csv_path in encoded_csv_paths
]

In [4]:
# Preview the first rows of the revenue feature frame.
rev_select_sample_df.head()

Unnamed: 0,year,duration,votes,budget,day,language_Spanish,language_French,language_Russian,language_German,language_Italian,...,actor_Daniel Stern,actor_Blythe Danner,actor_Catherine O'Hara,actor_Allison Janney,actor_Karl Urban,actor_Vanessa Hudgens,actor_Christoph Waltz,actor_Tyler Perry,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,55601,18000,58,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Mean of the budget column in the revenue sample data.
rev_select_sample_df["budget"].mean()

25100977.97102593

In [6]:
# Short aliases for the two feature frames (same objects, no copy is made).
X_rev, X_rat = rev_select_sample_df, rat_select_sample_df

# Data Scaler

In [7]:
# Fit one min-max scaler per numeric column on the revenue feature frame.
# The column is selected with double brackets so the scaler sees a 2-D frame.
duration_scaler = MinMaxScaler()
duration_scaler.fit(X_rev[["duration"]])

day_scaler = MinMaxScaler()
day_scaler.fit(X_rev[["day"]])

# Revenue Predictor Load

In [8]:
# Load the serialized revenue model from disk.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading; only load model files from a trusted source.
revenue_filename = 'model/revenue_xgboost_model.sav'
revenue_regressor = joblib.load(revenue_filename)

In [9]:
# User inputs for the revenue prediction.
#   "comp": compulsory numeric inputs, keyed by feature column name.
#   "opt":  optional categorical inputs, keyed by one-hot column prefix; each
#           starts as its own empty list and is filled in a later cell.
revenue_inputs = {
    "comp": {
        "year": 2021,
        "day": 355,
        "duration": 120,
        "votes": 500000,
        "budget": 25000000,
    },
    "opt": {
        opt_prefix: []
        for opt_prefix in (
            "language_",  # Spanish/French/Russian etc.
            "genre_",     # Drama/Action/Comedy etc.
            "country_",   # UK/China/France/Australia etc.
            "director_",  # Woody Allen/Renny Harlin/Paul Schrader etc.
            "writer_",    # Zak Penn/Tyler Perry/Christopher Nolan etc.
            "company_",   # Warner Bros./Columbia Pictures/Paramount Pictures etc.
            "actor_",     # Tom Hanks/Bruce Willis/Tom Cruise etc.
        )
    },
}

In [10]:
# Column names of the revenue feature frame, in frame order.
rev_features = list(rev_select_sample_df.columns)

In [11]:
# Build a single all-zero row with one column per revenue feature; the
# input cells below overwrite the columns the user actually specifies.
emp_dict = {column_name: [0] for column_name in rev_features}
test_df = pd.DataFrame(emp_dict)
test_df

Unnamed: 0,year,duration,votes,budget,day,language_Spanish,language_French,language_Russian,language_German,language_Italian,...,actor_Daniel Stern,actor_Blythe Danner,actor_Catherine O'Hara,actor_Allison Janney,actor_Karl Urban,actor_Vanessa Hudgens,actor_Christoph Waltz,actor_Tyler Perry,actor_Octavia Spencer,actor_Dave Bautista
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Language Input
revenue_inputs["opt"]["language_"].append("Spanish") # English/Spanish/French/Russian etc.
# Genre Input
revenue_inputs["opt"]["genre_"].append("Drama") # Drama/Action/Comedy etc.
# Country Input
revenue_inputs["opt"]["country_"].append("UK") # UK/China/France/Australia etc. 
# Director Input
revenue_inputs["opt"]["director_"].append("Woody Allen") # Woody Allen/Renny Harlin/Paul Schrader etc.
# Writer Input
revenue_inputs["opt"]["writer_"].append("Zak Penn") # Zak Penn/Tyler Perry/Christopher Nolan etc.
# Company Input
revenue_inputs["opt"]["company_"].append("Warner Bros.") # Warner Bros./Columbia Pictures/Paramount Pictures etc.
# Actor Input
revenue_inputs["opt"]["actor_"].append("Tom Hanks") # Tom Hanks/Bruce Willis/Tom Cruise etc.

In [13]:
# Populate the single-row revenue feature frame from the user inputs.
# Compulsory inputs are written as-is into the column of the same name.
for rev_comp_key, rev_comp_value in revenue_inputs["comp"].items():
    test_df[rev_comp_key] = rev_comp_value

# Optional inputs are one-hot encoded: prefix + value names the column to set.
# A value with no matching column cannot influence the prediction, so such
# values are collected and reported instead of being dropped silently.
rev_unmatched = []
for rev_opt_key, rev_opt_values in revenue_inputs["opt"].items():
    for x in rev_opt_values:
        feature = rev_opt_key + x
        if feature in rev_features:
            test_df[feature] = 1
        else:
            rev_unmatched.append(feature)
if rev_unmatched:
    print("Ignored inputs with no matching revenue feature: " + ", ".join(rev_unmatched))

In [14]:
# Scale duration and day with the MinMaxScalers fitted on X_rev.
# The raw values are read from revenue_inputs rather than from test_df, so the
# cell is idempotent: re-running it no longer rescales an already-scaled value.
for rev_scaled_col, rev_col_scaler in (("duration", duration_scaler), ("day", day_scaler)):
    # One-row frame whose column name matches the one the scaler was fitted on.
    rev_raw_value = pd.DataFrame({rev_scaled_col: [revenue_inputs["comp"][rev_scaled_col]]})
    # transform returns an (n_rows, 1) array; take its single column.
    test_df[rev_scaled_col] = rev_col_scaler.transform(rev_raw_value)[:, 0]

In [15]:
# Display the populated, scaled revenue feature row.
test_df

Unnamed: 0,year,duration,votes,budget,day,language_Spanish,language_French,language_Russian,language_German,language_Italian,...,actor_Daniel Stern,actor_Blythe Danner,actor_Catherine O'Hara,actor_Allison Janney,actor_Karl Urban,actor_Vanessa Hudgens,actor_Christoph Waltz,actor_Tyler Perry,actor_Octavia Spencer,actor_Dave Bautista
0,2021,0.274038,500000,25000000,0.969863,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Predict revenue for the single-row feature frame; predict returns an
# array-like with one entry per input row (here exactly one).
revenue_pred = revenue_regressor.predict(test_df)
# Index the element explicitly: formatting the whole array with %d relies on
# implicit size-1 array -> scalar conversion, deprecated since NumPy 1.25.
print("$ %d" % revenue_pred[0])

$ 295511776


# Rating Predictor Load

In [17]:
# Load the serialized rating model from disk.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading; only load model files from a trusted source.
rate_filename = 'model/rating_xgboost_model.sav'
rate_regressor = joblib.load(rate_filename)

In [18]:
# User inputs for the rating prediction.
#   "comp": compulsory numeric inputs, keyed by feature column name.
#   "opt":  optional categorical inputs, keyed by one-hot column prefix; each
#           starts as its own empty list and is filled in a later cell.
rate_inputs = {
    "comp": {
        "year": 2021,
        "day": 355,
        "duration": 120,
        "votes": 500000,
        "budget": 25000000,
    },
    "opt": {
        opt_prefix: []
        for opt_prefix in (
            "language_",  # English/Spanish/French/Russian etc.
            "genre_",     # Drama/Action/Comedy etc.
            "country_",   # USA/UK/China/France/Australia etc.
            "director_",  # Woody Allen/Renny Harlin/Paul Schrader etc.
            "writer_",    # Zak Penn/Tyler Perry/Christopher Nolan etc.
            "company_",   # Warner Bros./Columbia Pictures/Paramount Pictures etc.
            "actor_",     # Nicolas Cage/Bruce Willis/Tom Cruise etc.
        )
    },
}

In [19]:
# Column names of the rating feature frame, in frame order.
rate_features = list(rat_select_sample_df.columns)

In [20]:
# Build a single all-zero row with one column per rating feature; the
# input cells below overwrite the columns the user actually specifies.
emp_dict = {column_name: [0] for column_name in rate_features}
rate_test_df = pd.DataFrame(emp_dict)
rate_test_df

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Queen Latifah,actor_Demián Bichir,actor_Natascha McElhone,actor_Johnathon Schaech,actor_Emily Watson,actor_Michael Angarano,actor_Kate Mara,actor_Diane Kruger,actor_Vanessa Hudgens,actor_Michael Ealy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Language Input
rate_inputs["opt"]["language_"].append("English") # English/Spanish/French/Russian etc.
# Genre Input
rate_inputs["opt"]["genre_"].append("Drama") # Drama/Action/Comedy etc.
# Country Input
rate_inputs["opt"]["country_"].append("USA") # UK/China/France/Australia etc. 
# Director Input
rate_inputs["opt"]["director_"].append("Woody Allen") # Woody Allen/Renny Harlin/Paul Schrader etc.
# Writer Input
rate_inputs["opt"]["writer_"].append("Zak Penn") # Zak Penn/Tyler Perry/Christopher Nolan etc.
# Company Input
rate_inputs["opt"]["company_"].append("Warner Bros.") # Warner Bros./Columbia Pictures/Paramount Pictures etc.
# Actor Input
rate_inputs["opt"]["actor_"].append("Nicolas Cage") # Nicolas Cage/Bruce Willis/Tom Cruise etc.

In [22]:
# Populate the single-row rating feature frame from the user inputs.
# Compulsory inputs are written as-is into the column of the same name.
for rate_comp_key, rate_comp_value in rate_inputs["comp"].items():
    rate_test_df[rate_comp_key] = rate_comp_value

# Optional inputs are one-hot encoded: prefix + value names the column to set.
# A value with no matching column cannot influence the prediction, so such
# values are collected and reported instead of being dropped silently.
rate_unmatched = []
for rate_opt_key, rate_opt_values in rate_inputs["opt"].items():
    for x in rate_opt_values:
        feature = rate_opt_key + x
        if feature in rate_features:
            rate_test_df[feature] = 1
        else:
            rate_unmatched.append(feature)
if rate_unmatched:
    print("Ignored inputs with no matching rating feature: " + ", ".join(rate_unmatched))

In [23]:
# Scale duration and day for the rating feature row.
# The raw values are read from rate_inputs rather than from rate_test_df, so
# the cell is idempotent: re-running it no longer rescales a scaled value.
# NOTE(review): these scalers were fitted on X_rev (the revenue frame), not
# X_rat - confirm the rating model was trained with the same scaling.
for rate_scaled_col, rate_col_scaler in (("duration", duration_scaler), ("day", day_scaler)):
    # One-row frame whose column name matches the one the scaler was fitted on.
    rate_raw_value = pd.DataFrame({rate_scaled_col: [rate_inputs["comp"][rate_scaled_col]]})
    # transform returns an (n_rows, 1) array; take its single column.
    rate_test_df[rate_scaled_col] = rate_col_scaler.transform(rate_raw_value)[:, 0]

In [24]:
# Display the populated, scaled rating feature row.
rate_test_df

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Queen Latifah,actor_Demián Bichir,actor_Natascha McElhone,actor_Johnathon Schaech,actor_Emily Watson,actor_Michael Angarano,actor_Kate Mara,actor_Diane Kruger,actor_Vanessa Hudgens,actor_Michael Ealy
0,2021,0.274038,500000,25000000,0.969863,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Predict the rating for the single-row feature frame; predict returns an
# array-like with one entry per input row (here exactly one).
rate_pred = rate_regressor.predict(rate_test_df)
# Index the element explicitly: formatting the whole array with %.2f relies on
# implicit size-1 array -> scalar conversion, deprecated since NumPy 1.25.
print("%.2f" % rate_pred[0])

8.10
