# **Environment Setup**

In [1]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from datetime import datetime
from collections import Counter

In [2]:
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# Sample Data Loading

In [3]:
rev_select_sample_df = pd.read_csv("encoded_data/select_sample_data_rev_xgb.csv")
revenue_y = pd.read_csv("encoded_data/revenue_xgb.csv")
rat_select_sample_df = pd.read_csv("encoded_data/select_sample_data_rate_xgb.csv")
rate_y = pd.read_csv("encoded_data/rating_xgb.csv")

In [4]:
X_rev = rev_select_sample_df
X_rat = rat_select_sample_df

# Data Scaling

In [5]:
from sklearn.model_selection import train_test_split
X_rev_train, X_rev_test, revenue_y_train, revenue_y_test = train_test_split(X_rev, revenue_y, test_size=0.2)
X_rat_train, X_rat_test, rate_y_train, rate_y_test = train_test_split(X_rat, rate_y, test_size=0.2)

In [6]:
duration_scaler=MinMaxScaler().fit(X_rev[["duration"]])
day_scaler=MinMaxScaler().fit(X_rev[["day"]])

In [7]:
X_rev_train_scaled=X_rev_train
X_rev_test_scaled=X_rev_test

X_rat_train_scaled=X_rat_train
X_rat_test_scaled=X_rat_test

In [8]:
X_rev_train_scaled["duration"]=duration_scaler.transform(X_rev_train_scaled[["duration"]])
X_rev_train_scaled["day"]=day_scaler.transform(X_rev_train_scaled[["day"]])
X_rev_test_scaled["duration"]=duration_scaler.transform(X_rev_test_scaled[["duration"]])
X_rev_test_scaled["day"]=day_scaler.transform(X_rev_test_scaled[["day"]])

X_rat_train_scaled["duration"]=duration_scaler.transform(X_rat_train_scaled[["duration"]])
X_rat_train_scaled["day"]=day_scaler.transform(X_rat_train_scaled[["day"]])
X_rat_test_scaled["duration"]=duration_scaler.transform(X_rat_test_scaled[["duration"]])
X_rat_test_scaled["day"]=day_scaler.transform(X_rat_test_scaled[["day"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

# Revenue Predictor Training

In [9]:
regressor = XGBRegressor()
regressor.fit(X_rev_train_scaled, revenue_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
revenue_y_pred = regressor.predict(X_rev_test_scaled)
score = r2_score(revenue_y_test, revenue_y_pred)
print(score)

0.7571356807526265


In [11]:
regressor = XGBRegressor(max_depth= 7, min_child_weight= 4, learning_rate=0.1,
                         n_estimators=500, colsample_bytree= 0.9, subsample = 0.7,
                         gamma= 0.7, objective='reg:squarederror')

In [12]:
gridparas = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.06,0.1,0.3],
    'n_estimators': [100,200],
    'min_child_weight': [2, 3, 4],
    'subsample': [0.5,0.6,0.7],
    'colsample_bytree': [0.8,0.9]
}

In [13]:
gsearch = GridSearchCV(regressor, param_grid=gridparas,verbose=3, n_jobs=-1)
gsearch.fit(X_rev_train_scaled,revenue_y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   11.6s


KeyboardInterrupt: 

In [None]:
print(gsearch.best_params_)
print(gsearch.best_score_)

In [None]:
regressor = XGBRegressor(max_depth= 7, min_child_weight= 2, learning_rate=0.1,
                         n_estimators=200, colsample_bytree= 0.9, subsample = 0.7,
                         gamma= 0.7, objective='reg:squarederror')
regressor.fit(X_rev_train_scaled, revenue_y_train)

In [None]:
revenue_y_pred = regressor.predict(X_rev_test_scaled)
score = r2_score(revenue_y_test, revenue_y_pred)
print(score)

In [None]:
import joblib
filename = 'revenue_xgboost_model.sav'
joblib.dump(regressor, filename)
filename = 'revenue_xgboost_model.h5'
joblib.dump(regressor, filename)
regressor.save_model("revenue_xgboost_regressor.sav")

# Rating Predictor Training

In [None]:
rate_regressor = XGBRegressor()
rate_regressor.fit(X_rat_train_scaled, rate_y_train)

In [None]:
rate_y_pred = rate_regressor.predict(X_rat_test_scaled)
score = r2_score(rate_y_test, rate_y_pred)
print(score)

In [None]:
rate_regressor = XGBRegressor(max_depth= 7, min_child_weight= 2, learning_rate=0.1,
                         n_estimators=200, colsample_bytree= 0.9, subsample = 0.7,
                         gamma= 0.7, objective='reg:squarederror')

In [None]:
gridparas = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.06,0.1,0.3],
    'n_estimators': [100,200],
    'min_child_weight': [2, 3, 4],
    'subsample': [0.5,0.6,0.7],
    'colsample_bytree': [0.8,0.9]
}

In [None]:
gsearch = GridSearchCV(rate_regressor, param_grid=gridparas,verbose=3, n_jobs=-1)
gsearch.fit(X_rat_train_scaled,rate_y_train)

In [None]:
print(gsearch.best_params_)
print(gsearch.best_score_)

In [None]:
rate_regressor = XGBRegressor(max_depth= 7, min_child_weight= 2, learning_rate=0.06,
                         n_estimators=200, colsample_bytree= 0.9, subsample = 0.5,
                         gamma= 0.7, objective='reg:squarederror')
rate_regressor.fit(X_rat_train_scaled, rate_y_train)

In [None]:
rate_y_pred = rate_regressor.predict(X_rat_test_scaled)
score = r2_score(rate_y_test, rate_y_pred)
print(score)

In [None]:
import joblib
filename = 'rating_xgboost_model.sav'
joblib.dump(rate_regressor, filename)
filename = 'rating_xgboost_model.h5'
joblib.dump(rate_regressor, filename)
rate_regressor.save_model("rating_xgboost_regressor.sav")

# Test

In [15]:
def movie_prediction(year, day, budget, duration, votes, language):
    
    for_df = {}
    
    for col in col_list:
        for_df[col] = [0]
    
    input_variables = pd.DataFrame(for_df)
    
    
    if f"language_{language.capitalize()}" in col_list:
        
        
        if (year, day, budget, duration, votes, language):
            
            language_cap = "language_"+language.capitalize()

            
            input_variables.loc[:,"year"] = year
            input_variables.loc[:,"day"] = day
            input_variables.loc[:,"budget"] = budget
            input_variables.loc[:,"duration"] = duration
            input_variables.loc[:,"votes"] = votes
            input_variables.loc[:,language_cap] = 1
    
            prediction = model.predict(input_variables)
    
    return prediction[0]






#using pickle to load model 
import pickle
file_name = 'rating_xgboost_model.h5'

model = pickle.load(open(file_name, "rb"))

# creating dictionary to turn into dataframe for all inputs
for_df = {}

# list of columns names
col_list = list(rat_select_sample_df.columns)

# dummy data
year = 2021
day = 360
budget = 100000000
duration = 90
votes = 100
language = 'english'

# for loop that makes the value in all cells zero
for col in col_list:
    for_df[col] = [0]
    
# creating dataframe 
input_variables = pd.DataFrame(for_df)

# if the user's selected language is in our list of languages...  
if f"language_{language.capitalize()}" in col_list:

    #and if there values for each of the following inputs
    if (year, day, budget, duration, votes, language):

        # in case the input is all caps
        language = language.lower()
        
        # taking 'english' and making it conform to our language columns => language_English
        language_cap = "language_"+language.capitalize()

        #updating dataframe with input data
        input_variables.loc[:,"year"] = year
        input_variables.loc[:,"day"] = day
        input_variables.loc[:,"budget"] = budget
        input_variables.loc[:,"duration"] = duration
        input_variables.loc[:,"votes"] = votes
        input_variables.loc[:,language_cap] = 1
        
#running model
prediction = model.predict(input_variables)

print(prediction)

[5.400668]
