# **Environment Setup**

In [1]:
# !pip install wordcloud

In [2]:
# !pip install lightgbm

# Data Pre-processing

In [3]:
# In case you get an import error, run again notebook cell number 3, 
# to install again miniconda and RAPIDS dependencies. If the problem persists,
# Go to Runtime -> Reset all runtimes and then run again the noebook cells (two 
# times the one for installing miniconda and RAPIDS dependencies)

# Dependencies

# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedKFold
# from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from matplotlib import pyplot
import pandas as pd
import numpy as np

from numpy import mean
from numpy import std

# Light GBM Regressor Machine Learning Model
from lightgbm import LGBMRegressor

# Machine Learning Model Exporter
import joblib

In [4]:
# Import Cleaned Dataset as CSV

# Google Colab import with Google Drive mounted:
# df = pd.read_csv('/content/drive/MyDrive/MonashDataBootcamp/Final_Project/movie_sample.csv') 

df = pd.read_csv('data/movie_sample.csv') 

df.head()

Unnamed: 0.1,Unnamed: 0,year,duration,avg_vote,votes,budget,worlwide_gross_income,day,language_English,language_Spanish,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,0,1920,76,8.1,55601,18000,8811,58,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1921,150,7.2,3058,800000,9183673,106,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1921,68,8.3,109038,250000,26916,330,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1923,82,7.0,4735,351000,11233,157,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1925,95,8.2,97480,923000,26916,296,1,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Confirm all columns are numerical datatypes
df.dtypes

Unnamed: 0                 int64
year                       int64
duration                   int64
avg_vote                 float64
votes                      int64
                          ...   
actor_Cam Gigandet         int64
actor_Miles Teller         int64
actor_Scott Adkins         int64
actor_Octavia Spencer      int64
actor_Dave Bautista        int64
Length: 1873, dtype: object

In [6]:
df.shape

(8870, 1873)

# Model test

In [7]:
# Drop superfluous index column
sample_df = df.drop(["Unnamed: 0"], axis=1)

In [8]:
sample_df.head()

Unnamed: 0,year,duration,avg_vote,votes,budget,worlwide_gross_income,day,language_English,language_Spanish,language_French,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,8.1,55601,18000,8811,58,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,7.2,3058,800000,9183673,106,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,8.3,109038,250000,26916,330,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,7.0,4735,351000,11233,157,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,8.2,97480,923000,26916,296,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Assign X variables by dropping Y targets
X = sample_df.drop(["avg_vote","worlwide_gross_income"], axis=1)

In [10]:
X.head()

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,55601,18000,58,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Assign two separate Y targets:
# film review score set as vote_y
# film worldwide gross income set as revenue_y

vote_y = sample_df["avg_vote"]

revenue_y = sample_df["worlwide_gross_income"]

In [12]:
# Function to train an LGBM Model

def LightGBM_Model(Variable, variable_scaled, target, rand_state=42):
    
    # Split the data to train and test
    X_train, X_test, y_train, y_test = \
    train_test_split(variable_scaled, target, test_size=0.2, random_state=rand_state)

    print('X_train dimensions: ', X_train.shape, 'y_train: ', y_train.shape)
    print('X_test dimensions:', X_test.shape, 'y_test: ', y_test.shape)

    # Scale only duration and day columns using Min Max Scaler
    duration_scaler = MinMaxScaler().fit(Variable[["duration"]])
    day_scaler = MinMaxScaler().fit(Variable[["day"]])

    # Duplicate X_train and X_test for scaling purposes
    X_train_scaled = X_train
    X_test_scaled = X_test
    
    # Use Scaler to transform relevant X_train and X_test columns
    X_train_scaled["duration"] = duration_scaler.transform(X_train_scaled[["duration"]])
    X_train_scaled["day"] = day_scaler.transform(X_train_scaled[["day"]])

    X_test_scaled["duration"] = duration_scaler.transform(X_test_scaled[["duration"]])
    X_test_scaled["day"] = day_scaler.transform(X_test_scaled[["day"]])
    
    # Set the model to LGBM Regressor
    regressor = LGBMRegressor()
    
    # Train the model 
    regressed = regressor.fit(X_train_scaled,y_train) 
    return regressed, X_train_scaled, X_test_scaled, y_test

## Revenue Predictor
### All Variables
<hr>
Light GBM Regression Model for Revenue Prediction

In [13]:
# Create and train a LightGBM model for revenue prediction
revenue_y_regressor, revenue_X_train_scaled, revenue_X_test_scaled, revenue_y_test = LightGBM_Model(X, X, revenue_y, rand_state=42)

X_train dimensions:  (7096, 1870) y_train:  (7096,)
X_test dimensions: (1774, 1870) y_test:  (1774,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [14]:
# Make prediction using test data
revenue_y_pred = revenue_y_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_score}")

R-squared Score: 0.7701690631865028


## Revenue Predictor
### Selected Features / Feature Importances
<hr>
select the most important features to train a new model

In [15]:
# Function to find most important features and train a refined model

def important_features(X_train_scaled, y_regressor):
    # Train a new model using only the most important features using .feature_importance_ 
    # Save important features in a list select_features
    select_features=[]

    for col,score in zip(X_train_scaled.columns, y_regressor.feature_importances_):
      if score > 1:
        select_features.append(col)
        print(col,score)
        
    print(f"Number of features: {len(select_features)}")
    
    # Set new DataFrame with only important features
    select_sample_df = sample_df[select_features]
    
    return select_sample_df

In [16]:
# Set X_Selected variables on important features DataFrame
revenue_X_selected = important_features(revenue_X_train_scaled, revenue_y_regressor)

#Train a Light GBM Model
revenue_y_s_regressor, revenue_X_s_train_scaled, revenue_X_s_test_scaled, revenue_y_s_test = \
LightGBM_Model(X, revenue_X_selected, revenue_y, rand_state=42)

# Make prediction using test data
revenue_y_s_pred = revenue_y_s_regressor.predict(X_s_test_scaled)

# Calculate R-squared score
revenue_s_score = r2_score(revenue_y_s_test, revenue_y_s_pred)
print(f"R-squared Score: {revenue_s_score}")

year 325
duration 308
votes 599
budget 535
day 319
language_English 2
language_Spanish 11
language_French 8
language_Russian 3
language_German 8
language_Italian 9
language_Japanese 3
language_Mandarin 13
language_Korean 3
language_Cantonese 3
language_Hindi 6
language_Hebrew 4
language_Greek 4
language_Hungarian 2
language_Thai 5
language_Vietnamese 5
genre_Drama 37
genre_Comedy 33
genre_Action 30
genre_Crime 19
genre_Romance 20
genre_Adventure 39
genre_Thriller 4
genre_Horror 6
genre_Mystery 4
genre_Fantasy 19
genre_Sci-Fi 35
genre_Family 38
genre_Biography 13
genre_Animation 48
genre_Music 5
genre_War 2
genre_Musical 2
country_USA 2
country_UK 15
country_France 3
country_Canada 5
country_Germany 7
country_China 15
country_Australia 10
country_Spain 2
country_New Zealand 3
country_Denmark 3
director_Woody Allen 5
director_Clint Eastwood 4
director_Steven Spielberg 12
director_Martin Scorsese 12
writer_Stephen King 2
company_Universal Pictures 23
company_Warner Bros. 4
company_Columbi

NameError: name 'X_selected' is not defined

## Revenue Models Export
<hr>
Export revenue model as .sav and .h5 formats

In [None]:
filename = 'Model/light_gbm_revenue_model.sav'
joblib.dump(revenue_y_regressor, filename)

In [None]:
filename = 'Model/light_gbm_revenue_model.h5'
joblib.dump(revenue_y_regressor, filename)

In [None]:
filename = 'Model/light_gbm_revenue_model.sav'
joblib.dump(revenue_y_s_regressor, filename)

In [None]:
filename = 'Model/light_gbm_revenue_model.h5'
joblib.dump(revenue_y_s_regressor, filename)

## Review Score Predictor
### All Variables
<hr>
Light GBM Regression Model for Review Score Prediction

In [None]:
vote_y_regressor, vote_X_train_scaled, vote_X_test_scaled, vote_y_test = LightGBM_Model(X, X, vote_y, rand_state=42)

In [None]:
# Make prediction using test data
vote_y_pred = vote_y_regressor.predict(vote_X_test_scaled)

# Calculate R-squared score
vote_score = r2_score(vote_y_test, vote_y_pred)
print(f"R-squared Score: {vote_score}")

## Review Score Predictor
### Selected Features / Feature Importances
<hr>
select the most important features to train a new model

In [None]:
# Set X_Selected variables on important features DataFrame
vote_X_selected = important_features(vote_X_train_scaled, vote_y_regressor)

#Train a Light GBM Model
vote_y_s_regressor, vote_X_s_train_scaled, vote_X_s_test_scaled, vote_y_s_test = \
LightGBM_Model(X, vote_X_selected, vote_y, rand_state=42)

# Make prediction using test data
vote_y_s_pred = vote_y_s_regressor.predict(vote_X_s_test_scaled)

# Calculate R-squared score
vote_s_score = r2_score(vote_y_s_test, vote_y_s_pred)
print(f"R-squared Score: {vote_s_score}")

## Review Score Models Export
<hr>
Export review score model as .sav and .h5 formats

In [None]:
filename = 'Model/light_gbm_vote_model.sav'
joblib.dump(vote_y_regressor, filename)

In [None]:
filename_s = 'Model/light_gbm_vote_model.h5'
joblib.dump(vote_y_regressor, filename)

In [None]:
filename_s = 'Model/light_gbm_vote_model.sav'
joblib.dump(vote_y_s_regressor, filename)

In [None]:
filename_s = 'Model/light_gbm_vote_model.h5'
joblib.dump(vote_y_s_regressor, filename)

In [84]:
# # get a list of models to evaluate
# def get_models():
# 	models = dict()
# 	trees = [10, 50, 100, 500, 1000, 5000]
# 	for n in trees:
# 		models[str(n)] = LGBMClassifier(n_estimators=n)
# 	return models

# # get the models to evaluate
# models = get_models()

# # evaluate a give model using cross-validation
# def evaluate_model(model):
# 	cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1)
# 	scores = cross_val_score(model, X_rev_train, revenue_y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# 	return scores

# # evaluate the models and store results
# results, names = list(), list()
# for name, model in models.items():
# 	scores = evaluate_model(model)
# 	results.append(scores)
# 	names.append(name)
# 	print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# # plot model performance for comparison
# pyplot.boxplot(results, labels=names, showmeans=True)
# pyplot.show()

# # evaluate the model
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, X_train_scaled, revenue_y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# # report performance
# print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: -30785080.589 (2127300.619)


# Hyper Tuning
<hr>
Using GridSearchCV to hyper tune model

In [30]:
param_grid = {
    'max_depth': [2,63],
    'num_leaves': [7, 4095],
    'learning_rate': [0.06,0.08,0.1],
    'n_estimators': [500,1000,2000],
    'min_child_weight': [0.01, 8.87],
    'scale_pos_weight': [1, 10000], 
    'subsample': [0.4, 0.1],
    'colsample_bytree': [0.1,0.2]
    
#    'max_delta_step': [0, 0.2, 0.6, 1, 2],
#    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
#    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    # 'gamma': [0.1, 0.3],
#    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}

In [31]:
gsearch = GridSearchCV(model_selected, param_grid=param_grid,verbose=3, n_jobs=-1)
gsearch.fit(X_selected_train_scaled,revenue_y_train)

Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed: 33.9min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 42.9min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 51.1min finished


GridSearchCV(estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'colsample_bytree': [0.1, 0.2],
                         'learning_rate': [0.06, 0.08, 0.1],
                         'max_depth': [2, 63], 'min_child_weight': [0.01, 8.87],
                         'n_estimators': [500, 1000, 2000],
                         'num_leaves': [7, 4095],
                         'scale_pos_weight': [1, 10000],
                         'subsample': [0.4, 0.1]},
             verbose=3)

In [32]:
print(gsearch.best_params_)
print(gsearch.best_score_)

{'colsample_bytree': 0.2, 'learning_rate': 0.06, 'max_depth': 63, 'min_child_weight': 0.01, 'n_estimators': 2000, 'num_leaves': 7, 'scale_pos_weight': 1, 'subsample': 0.4}
0.7419403930457946
