# **Environment Setup**

In [1]:
# !pip install wordcloud

In [2]:
# !pip install lightgbm

In [1]:
# In case you get an import error, run again notebook cell number 3, 
# to install again miniconda and RAPIDS dependencies. If the problem persists,
# Go to Runtime -> Reset all runtimes and then run again the noebook cells (two 
# times the one for installing miniconda and RAPIDS dependencies)

# Dependencies

# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedKFold
# from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from matplotlib import pyplot
import pandas as pd
import numpy as np
import lightgbm as lgb

from numpy import mean
from numpy import std

# Light GBM Regressor Machine Learning Model
from lightgbm import LGBMRegressor

# Machine Learning Model Exporter
import joblib

# Sample Data Loading

In [2]:
# Import Cleaned Dataset as CSV

# Google Colab import with Google Drive mounted:
# df = pd.read_csv('/content/drive/MyDrive/MonashDataBootcamp/Final_Project/movie_sample.csv') 

sample_df=pd.read_csv('encoded_data/sample_data.csv')

sample_df.head()

Unnamed: 0,year,duration,avg_vote,votes,budget,worlwide_gross_income,day,language_English,language_Spanish,language_French,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,8.1,55601,18000,8811,58,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,7.2,3058,800000,9183673,106,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,8.3,109038,250000,26916,330,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,7.0,4735,351000,11233,157,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,8.2,97480,923000,26916,296,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Confirm all columns are numerical datatypes
sample_df.dtypes

year                       int64
duration                   int64
avg_vote                 float64
votes                      int64
budget                     int64
                          ...   
actor_Cam Gigandet         int64
actor_Miles Teller         int64
actor_Scott Adkins         int64
actor_Octavia Spencer      int64
actor_Dave Bautista        int64
Length: 1872, dtype: object

In [4]:
sample_df.shape

(8870, 1872)

In [5]:
# Assign X variables by dropping Y targets
X = sample_df.drop(["avg_vote","worlwide_gross_income"], axis=1)

In [6]:
X.head()

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,55601,18000,58,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Assign two separate Y targets:
# film review score set as vote_y
# film worldwide gross income set as revenue_y

vote_y = sample_df["avg_vote"]

revenue_y = sample_df["worlwide_gross_income"]

# Data Scaling

In [8]:
# Function to train an LGBM Model

def data_scaling(Variable, target, rand_state=42):
    
    # Split the data to train and test
    X_train, X_test, y_train, y_test = \
    train_test_split(Variable, target, test_size=0.2, random_state=rand_state)

    print('X_train dimensions: ', X_train.shape, 'y_train: ', y_train.shape)
    print('X_test dimensions:', X_test.shape, 'y_test: ', y_test.shape)

    # Scale only duration and day columns using Min Max Scaler
    duration_scaler = MinMaxScaler().fit(Variable[["duration"]])
    day_scaler = MinMaxScaler().fit(Variable[["day"]])

    # Duplicate X_train and X_test for scaling purposes
    X_train_scaled = X_train
    X_test_scaled = X_test
    
    # Use Scaler to transform relevant X_train and X_test columns
    X_train_scaled["duration"] = duration_scaler.transform(X_train_scaled[["duration"]])
    X_train_scaled["day"] = day_scaler.transform(X_train_scaled[["day"]])

    X_test_scaled["duration"] = duration_scaler.transform(X_test_scaled[["duration"]])
    X_test_scaled["day"] = day_scaler.transform(X_test_scaled[["day"]])
    return X_train_scaled, X_test_scaled, y_train, y_test

In [9]:
# def LightGBM_Model(X_train_scaled, y_train, params):
#     # Set the model to LGBM Regressor
#     regressor = LGBMRegressor(params)
    
#     # Train the model 
#     regressed = regressor.fit(X_train_scaled,y_train) 
#     return regressed

## All Variables

### Revenue Predictor
<hr>
Light GBM Regression Model for Revenue Prediction

In [9]:
# Create and train a LightGBM model for revenue prediction
revenue_X_train_scaled, revenue_X_test_scaled, revenue_y_train, revenue_y_test = \
data_scaling(X, revenue_y, rand_state=42)


# Set up LightGBM regressor model using default settings
revenue_y_regressor = LGBMRegressor()

# Train model using training data
revenue_y_regressor.fit(revenue_X_train_scaled,revenue_y_train)

X_train dimensions:  (7096, 1870) y_train:  (7096,)
X_test dimensions: (1774, 1870) y_test:  (1774,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

LGBMRegressor()

In [10]:
# Make prediction using test data
revenue_y_pred = revenue_y_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_score}")

R-squared Score: 0.7701690631865028


### Review Score Predictor
<hr>
Light GBM Regression Model for Review Score Prediction

In [11]:
# Create and train a LightGBM model for revenue prediction
vote_X_train_scaled, vote_X_test_scaled, vote_y_train, vote_y_test = \
data_scaling(X, vote_y, rand_state=42)

# Set up LightGBM regressor model using default settings
vote_y_regressor = LGBMRegressor()

# Train model using training data
vote_y_regressor.fit(vote_X_train_scaled,vote_y_train)

X_train dimensions:  (7096, 1870) y_train:  (7096,)
X_test dimensions: (1774, 1870) y_test:  (1774,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

LGBMRegressor()

In [12]:
# Make prediction using test data
vote_y_pred = vote_y_regressor.predict(vote_X_test_scaled)

# Calculate R-squared score
vote_score = r2_score(vote_y_test, vote_y_pred)
print(f"R-squared Score: {vote_score}")

R-squared Score: 0.5708410594303142


## Hyperparameter Tuning

In [13]:
# Function for Random Grid Search 

def random_grid_search_hypertune(X_train_scaled, y_train):
    #Set the minimum error arbitrarily large
    min = 99999999999999999999999 
    count = 0 #Used for keeping track of the iteration number
    #How many runs to perform using randomly selected hyperparameters
    iterations = 20
    for i in range(iterations):
        print('iteration number', count)
        count += 1 #increment count
        try:
            d_train = lgb.Dataset(X_train_scaled, label=y_train) #Load in data
            params = {} #initialize parameters
            params['learning_rate'] = np.random.uniform(0, 1)
            params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
            params['objective'] = 'regression'
            params['metric'] = 'mae'
            params['sub_feature'] = np.random.uniform(0, 1)
            params['num_leaves'] = np.random.randint(20, 300)
            params['min_data'] = np.random.randint(10, 100)
            params['max_depth'] = np.random.randint(5, 200)
            iterations = np.random.randint(10, 10000)
            print(params, iterations)
            #Train using selected parameters
            clf = lgb.train(params, d_train, iterations)
            random_revenue_y_pred=clf.predict(revenue_X_test_scaled) #Create predictions on test set
            mae=mean_absolute_error(random_revenue_y_pred, revenue_y_test)
            print('MAE:', mae)
            if mae < min:
                min = mae
                pp = params 
        except: #in case something goes wrong
                print('failed with')
                print(params)
    print("*" * 20)
    print('Minimum is: ', min)
    print('Used params', pp)

In [14]:
# Parameters for GridsearchCV

param_grid = {
    'max_depth': [2, 200],
    'num_leaves': [5, 500],
    'learning_rate': [0.06,0.2,0.6],
    'n_estimators': [500, 2000],
    'min_child_weight': [0.01, 8.87],
    'scale_pos_weight': [1, 10000], 
    'subsample': [0.4, 0.1],
    'colsample_bytree': [0.1,0.2]
    
#    'max_delta_step': [0, 0.2, 0.6, 1, 2],
#    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
#    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    # 'gamma': [0.1, 0.3],
#    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}

### Revenue Hyper parameter Tuning
#### Revenue Predictor Random Grid Search Hypertune 

In [None]:
random_grid_search_hypertune(revenue_X_train_scaled, revenue_y_train)

In [15]:
# Use random grid search hypertuning parameters to train model
revenue_y_rgs_regressor = LGBMRegressor(learning_rate=0.09056583116806483, boosting_type='gbdt', \
                                    objective='regression', metric='mae', \
                                    sub_feature=0.7772631331051166, num_leaves=78, min_data=13, max_depth=22)

revenue_y_rgs_regressor.fit(revenue_X_train_scaled,revenue_y_train)



LGBMRegressor(learning_rate=0.09056583116806483, max_depth=22, metric='mae',
              min_data=13, num_leaves=78, objective='regression',
              sub_feature=0.7772631331051166)

In [17]:
# Make prediction using test data on hypertuned model
revenue_y_pred = revenue_y_rgs_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_rgs_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_rgs_score}")

R-squared Score: 0.7684639727141196


#### Revenue Predictor GridSearchCV Hypertune 

In [16]:
gsearch = GridSearchCV(revenue_y_regressor, param_grid=param_grid,verbose=3, n_jobs=-1)
gsearch.fit(revenue_X_train_scaled, revenue_y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [25]:
print(gsearch.best_params_)
print(gsearch.best_score_)

NameError: name 'gsearch' is not defined

In [17]:
# Use hypertuned parameters to train model
revenue_y_gscv_regressor = LGBMRegressor(learning_rate=0.09056583116806483,
                                     max_depth=22, metric='mae', min_data=13,
                                     num_leaves=78, objective='regression',
                                     sub_feature=0.7772631331051166)

revenue_y_gscv_regressor.fit(revenue_X_train_scaled,revenue_y_train)

LGBMRegressor(learning_rate=0.09056583116806483, max_depth=22, metric='mae',
              min_data=13, num_leaves=78, objective='regression',
              sub_feature=0.7772631331051166)

In [18]:
# Make prediction using test data on hypertuned model
revenue_y_pred = revenue_y_gscv_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_gscv_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_gscv_score}")

R-squared Score: 0.7684639727141196


### Review Score Hyperparameter Tuning
#### Review Score Predictor Random Grid Search Hypertune 

In [14]:
random_grid_search_hypertune(vote_X_train_scaled, vote_y_train)

iteration number 0
{'learning_rate': 0.09430878661507791, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.7534357801194584, 'num_leaves': 82, 'min_data': 93, 'max_depth': 175} 3341
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 58
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875






































MAE: 64890086.580824286
iteration number 1
{'learning_rate': 0.7129096715599723, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.24450276277266014, 'num_leaves': 126, 'min_data': 82, 'max_depth': 99} 78
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1045
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 61
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875


MAE: 64890086.58222984
iteration number 2
{'learning_rate': 0.9782858471775676, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.25707878882309776, 'num_leaves': 221, 'min_data': 66, 'max_depth': 75} 6096
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1057
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 67
[LightGBM] [Info] Start training from score 6.203875










































































































MAE: 64890086.57148519
iteration number 3
{'learning_rate': 0.18630278858923222, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.3148858638034946, 'num_leaves': 146, 'min_data': 38, 'max_depth': 194} 288
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 94
[LightGBM] [Info] Start training from score 6.203875






MAE: 64890086.55914948
iteration number 4
{'learning_rate': 0.6716634437094945, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.7157228623927595, 'num_leaves': 291, 'min_data': 27, 'max_depth': 89} 8330
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1221
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 149
[LightGBM] [Info] Start training from score 6.203875




















































































































































MAE: 64890086.57586385
iteration number 5
{'learning_rate': 0.8665904482611387, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9204353753608535, 'num_leaves': 57, 'min_data': 99, 'max_depth': 147} 7832
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1037
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 57
[LightGBM] [Info] Start training from score 6.203875


























































































MAE: 64890086.58197138
iteration number 6
{'learning_rate': 0.6506291315412116, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.8971719900199382, 'num_leaves': 220, 'min_data': 77, 'max_depth': 130} 1665
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 65
[LightGBM] [Info] Start training from score 6.203875






















MAE: 64890086.573876254
iteration number 7
{'learning_rate': 0.21620395275523396, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.3463840857014391, 'num_leaves': 296, 'min_data': 87, 'max_depth': 63} 4221
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1043
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 60
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875














































MAE: 64890086.57592952
iteration number 8
{'learning_rate': 0.8373536360290718, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.05562110691838773, 'num_leaves': 242, 'min_data': 27, 'max_depth': 187} 5135
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1221
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 149
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875
























































MAE: 64890086.57942356
iteration number 9
{'learning_rate': 0.8345779043338999, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.13734569304695599, 'num_leaves': 169, 'min_data': 88, 'max_depth': 157} 6111
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 59
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875
























































































MAE: 64890086.57685856
iteration number 10
{'learning_rate': 0.4340076776613708, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.274147690076871, 'num_leaves': 169, 'min_data': 59, 'max_depth': 102} 6394
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 70
[LightGBM] [Info] Start training from score 6.203875




























































































MAE: 64890086.56286864
iteration number 11
{'learning_rate': 0.27432492053463153, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.3993502583601609, 'num_leaves': 186, 'min_data': 85, 'max_depth': 75} 1012
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1043
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 60
[LightGBM] [Info] Start training from score 6.203875


















MAE: 64890086.5849758
iteration number 12
{'learning_rate': 0.6258775115038833, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.06775448528981687, 'num_leaves': 47, 'min_data': 66, 'max_depth': 101} 3788
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1057
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 67
[LightGBM] [Info] Start training from score 6.203875




































MAE: 64890086.552099474
iteration number 13
{'learning_rate': 0.5532664040090594, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.8474105150711506, 'num_leaves': 153, 'min_data': 37, 'max_depth': 17} 316
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1115
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 96
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875




MAE: 64890086.57555621
iteration number 14
{'learning_rate': 0.837852237415914, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.5873288375727206, 'num_leaves': 87, 'min_data': 39, 'max_depth': 148} 7226
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 94
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875
























































































































MAE: 64925531.393801786
iteration number 15
{'learning_rate': 0.913255556198943, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.3844510071434817, 'num_leaves': 90, 'min_data': 39, 'max_depth': 182} 8981


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 94
[LightGBM] [Info] Start training from score 6.203875












MAE: 64890086.57113828
iteration number 16
{'learning_rate': 0.592660417054006, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.4899394915374986, 'num_leaves': 40, 'min_data': 60, 'max_depth': 71} 7603
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 70
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 6.203875








































































































MAE: 64890086.5723216
iteration number 17
{'learning_rate': 0.2577869149540145, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.321154075191313, 'num_leaves': 164, 'min_data': 53, 'max_depth': 123} 3132
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1071
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 74
[LightGBM] [Info] Start training from score 6.203875




















































MAE: 64890086.565285355
iteration number 18
{'learning_rate': 0.7694736143127154, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.8857703312733883, 'num_leaves': 30, 'min_data': 36, 'max_depth': 25} 5389
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1119
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 98
[LightGBM] [Info] Start training from score 6.203875


MAE: 64890086.561864324
iteration number 19
{'learning_rate': 0.4909286257427665, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9262251330906152, 'num_leaves': 135, 'min_data': 63, 'max_depth': 180} 4614
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1059
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 68
[LightGBM] [Info] Start training from score 6.203875








































































MAE: 64890086.57646501
********************
Minimum is:  64890086.552099474
Used params {'learning_rate': 0.6258775115038833, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.06775448528981687, 'num_leaves': 47, 'min_data': 66, 'max_depth': 101}


In [30]:
# Use Random Grid Search hypertuned parameters to train model
vote_y_rgs_regressor = LGBMRegressor(colsample_bytree=0.2, 
                                     learning_rate= 0.06, 
                                     max_depth= 200, 
                                     min_child_weight=0.01, 
                                     n_estimators= 2000,
                                     num_leaves= 5, 
                                     scale_pos_weight= 1, 
                                     subsample= 0.4
                                    )

vote_y_rgs_regressor.fit(vote_X_train_scaled, vote_y_train)

LGBMRegressor(colsample_bytree=0.2, learning_rate=0.06, max_depth=200,
              min_child_weight=0.01, n_estimators=2000, num_leaves=5,
              scale_pos_weight=1, subsample=0.4)

In [31]:
# Make prediction using test data on hypertuned model
vote_y_pred = vote_y_rgs_regressor.predict(vote_X_test_scaled)

# Calculate R-squared score
vote_rgs_score = r2_score(vote_y_test, vote_y_pred)
print(f"R-squared Score: {vote_rgs_score}")

R-squared Score: 0.5730069471504173


#### Revenue Predictor GridSearchCV Hypertune 

In [28]:
# Review score GridsearchCV
gsearch = GridSearchCV(vote_y_regressor, param_grid=param_grid,verbose=3, n_jobs=-1)
gsearch.fit(vote_X_train_scaled, vote_y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed: 16.4min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed: 25.4min
[Parallel(n_jobs=-1)]: Done 1920 out of 1920 | elapsed: 33.1min finished


GridSearchCV(estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'colsample_bytree': [0.1, 0.2],
                         'learning_rate': [0.06, 0.2, 0.6],
                         'max_depth': [2, 200],
                         'min_child_weight': [0.01, 8.87],
                         'n_estimators': [500, 2000], 'num_leaves': [5, 500],
                         'scale_pos_weight': [1, 10000],
                         'subsample': [0.4, 0.1]},
             verbose=3)

In [29]:
print(gsearch.best_params_)
print(gsearch.best_score_)

{'colsample_bytree': 0.2, 'learning_rate': 0.06, 'max_depth': 200, 'min_child_weight': 0.01, 'n_estimators': 2000, 'num_leaves': 5, 'scale_pos_weight': 1, 'subsample': 0.4}
0.5950071756583781


In [39]:
# Use GridSearchCV hypertuned parameters to train model
vote_y_gscv_regressor = LGBMRegressor(colsample_bytree=0.2, 
                                     learning_rate= 0.06, 
                                     max_depth= 200, 
                                     min_child_weight=0.01, 
                                     n_estimators= 2000,
                                     num_leaves= 5, 
                                     scale_pos_weight= 1, 
                                     subsample= 0.4
                                    )

vote_y_gscv_regressor.fit(vote_X_train_scaled, vote_y_train)

LGBMRegressor(colsample_bytree=0.2, learning_rate=0.06, max_depth=200,
              min_child_weight=0.01, n_estimators=2000, num_leaves=5,
              scale_pos_weight=1, subsample=0.4)

In [40]:
# Make prediction using test data on hypertuned model
vote_y_pred = vote_y_gscv_regressor.predict(vote_X_test_scaled)

# Calculate R-squared score
vote_gscv_score = r2_score(vote_y_test, vote_y_pred)
print(f"R-squared Score: {vote_gscv_score}")

R-squared Score: 0.5730069471504173


## Revenue Models Export
<hr>
Export revenue model as .sav and .h5 formats

In [38]:
print(f"R-squared Score for untuned model: {revenue_score}")
print(f"R-squared Score for Random Grid Search hypertuned model: {revenue_rgs_score}")
print(f"R-squared Score for GridSearchCV hypertuned model: {revenue_gscv_score}")

R-squared Score for untuned model: 0.7701690631865028
R-squared Score for Random Grid Search hypertuned model: 0.7684639727141196


NameError: name 'revenue_gscv_score' is not defined

In [41]:
filename = 'revenue_lgbm_model.sav'
joblib.dump(revenue_y_regressor, filename)

['revenue_lgbm_model.sav']

In [42]:
filename = 'revenue_lgbm_model.h5'
joblib.dump(revenue_y_regressor, filename)

['revenue_lgbm_model.h5']

## Review Score Models Export
<hr>
Export review score model as .sav and .h5 formats

In [37]:
print(f"R-squared Score for untuned model: {vote_score}")
print(f"R-squared Score for Random Grid Search hypertuned model: {vote_rgs_score}")
print(f"R-squared Score for GridSearchCV hypertuned model: {vote_rgs_score}")

R-squared Score for untuned model: 0.5708410594303142


NameError: name 'vote_rgs_score' is not defined

In [43]:
filename = 'rating_lgbm_model.sav'
joblib.dump(vote_y_gscv_regressor, filename)

['rating_lgbm_model.sav']

In [44]:
filename_s = 'rating_lgbm_model.h5'
joblib.dump(vote_y_gscv_regressor, filename)

['rating_lgbm_model.sav']

## Selected Features / Feature Importances

In [45]:
# Function to find most important features and train a refined model

def important_features(X_train_scaled, y_regressor):
    # Train a new model using only the most important features using .feature_importance_ 
    # Save important features in a list select_features
    select_features={'feature':[],'importance':[]}

    for col,score in zip(X_train_scaled.columns, y_regressor.feature_importances_):
      if score > 1:
        select_features['feature'].append(col)
        select_features['importance'].append(score)
        print(col,score)
        
    print(f"Number of features: {len(select_features['feature'])}")
    
#     # Set new DataFrame with only important features
#     select_sample_df = sample_df[select_features['feature']]
    
    return select_features 

### Revenue Predictor
<hr>
select the most important features to train a new model

In [46]:
# Set X_Selected variables on important features DataFrame
revenue_X_selected = important_features(revenue_X_train_scaled, revenue_y_regressor)

# #Train a Light GBM Model
# revenue_y_s_regressor, revenue_X_s_train_scaled, revenue_X_s_test_scaled, revenue_y_s_test = \
# LightGBM_Model(X, revenue_X_selected, revenue_y, rand_state=42)

# # Make prediction using test data
# revenue_y_s_pred = revenue_y_s_regressor.predict(revenue_X_s_test_scaled)

# # Calculate R-squared score
# revenue_s_score = r2_score(revenue_y_s_test, revenue_y_s_pred)
# print(f"R-squared Score: {revenue_s_score}")

year 325
duration 308
votes 599
budget 535
day 319
language_English 2
language_Spanish 11
language_French 8
language_Russian 3
language_German 8
language_Italian 9
language_Japanese 3
language_Mandarin 13
language_Korean 3
language_Cantonese 3
language_Hindi 6
language_Hebrew 4
language_Greek 4
language_Hungarian 2
language_Thai 5
language_Vietnamese 5
genre_Drama 37
genre_Comedy 33
genre_Action 30
genre_Crime 19
genre_Romance 20
genre_Adventure 39
genre_Thriller 4
genre_Horror 6
genre_Mystery 4
genre_Fantasy 19
genre_Sci-Fi 35
genre_Family 38
genre_Biography 13
genre_Animation 48
genre_Music 5
genre_War 2
genre_Musical 2
country_USA 2
country_UK 15
country_France 3
country_Canada 5
country_Germany 7
country_China 15
country_Australia 10
country_Spain 2
country_New Zealand 3
country_Denmark 3
director_Woody Allen 5
director_Clint Eastwood 4
director_Steven Spielberg 12
director_Martin Scorsese 12
writer_Stephen King 2
company_Universal Pictures 23
company_Warner Bros. 4
company_Columbi

In [47]:
revenue_X_feature_imp = pd.DataFrame.from_dict(revenue_X_selected)
revenue_X_feature_imp.head()

Unnamed: 0,feature,importance
0,year,325
1,duration,308
2,votes,599
3,budget,535
4,day,319


In [48]:
revenue_X_feature_imp

Unnamed: 0,feature,importance
0,year,325
1,duration,308
2,votes,599
3,budget,535
4,day,319
...,...,...
111,actor_Alfred Molina,4
112,actor_Jeff Goldblum,3
113,actor_Sigourney Weaver,4
114,actor_Rosario Dawson,7


In [49]:
revenue_X_selected_df=sample_df[revenue_X_feature_imp["feature"]]
revenue_X_selected_df

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Will Ferrell,actor_Dwayne Johnson,actor_James Woods,actor_Ben Affleck,actor_Will Smith,actor_Alfred Molina,actor_Jeff Goldblum,actor_Sigourney Weaver,actor_Rosario Dawson,actor_Mark Ruffalo
0,1920,76,55601,18000,58,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8865,2018,99,168,200000,102,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8866,2019,98,197,950000,354,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8867,2018,127,117,500000,149,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8868,2020,116,6196,12000000,79,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Revenue Predictor: Export to CSV

In [50]:
revenue_X_feature_imp.to_csv('encoded_data/feature_importance_rev_lgbm.csv',index=False)
revenue_X_selected_df.to_csv('encoded_data/select_sample_data_rev_lgbm.csv',index=False)
revenue_y.to_csv("encoded_data/revenue_lgbm.csv",index=False)

### Review Score Predictor
<hr>
select the most important features to train a new model

In [51]:
# Set X_Selected variables on important features DataFrame
vote_X_selected = important_features(vote_X_train_scaled, vote_y_regressor)

# #Train a Light GBM Model
# vote_y_s_regressor, vote_X_s_train_scaled, vote_X_s_test_scaled, vote_y_s_test = \
# LightGBM_Model(X, vote_X_selected, vote_y, rand_state=42)

# # Make prediction using test data
# vote_y_s_pred = vote_y_s_regressor.predict(vote_X_s_test_scaled)

# # Calculate R-squared score
# vote_s_score = r2_score(vote_y_s_test, vote_y_s_pred)
# print(f"R-squared Score: {vote_s_score}")

year 352
duration 327
votes 529
budget 375
day 289
language_English 41
language_Spanish 5
language_French 4
language_Russian 5
language_German 4
language_Italian 2
language_Japanese 6
language_Mandarin 5
language_Arabic 10
language_Korean 8
language_Cantonese 11
language_Turkish 10
language_Latin 6
language_Hindi 15
language_Hebrew 6
language_Chinese 2
language_Hungarian 5
language_Afrikaans 5
language_Danish 4
genre_Drama 66
genre_Comedy 35
genre_Action 56
genre_Crime 27
genre_Romance 22
genre_Adventure 25
genre_Thriller 23
genre_Horror 59
genre_Mystery 14
genre_Fantasy 24
genre_Sci-Fi 32
genre_Family 25
genre_Biography 22
genre_Animation 56
genre_History 7
genre_Music 9
genre_Sport 5
genre_War 2
genre_Musical 8
country_USA 44
country_UK 19
country_France 12
country_Canada 5
country_Germany 6
country_China 6
country_Japan 11
country_Russia 21
country_Mexico 2
country_Hong Kong 6
country_Spain 4
country_India 11
country_South Korea 10
country_Belgium 3
country_Turkey 19
country_Ireland

In [52]:
vote_X_feature_imp = pd.DataFrame.from_dict(vote_X_selected)
vote_X_feature_imp.head()

Unnamed: 0,feature,importance
0,year,352
1,duration,327
2,votes,529
3,budget,375
4,day,289


In [53]:
vote_X_selected_df=sample_df[vote_X_feature_imp["feature"]]

### Review Score Predictor: Export to CSV

In [54]:
vote_X_feature_imp.to_csv('encoded_data/feature_importance_vote_lgbm.csv',index=False)
vote_X_selected_df.to_csv('encoded_data/select_sample_data_vote_lgbm.csv',index=False)
vote_y.to_csv("encoded_data/vote_lgbm.csv",index=False)