# **Environment Setup**

In [1]:
# !pip install wordcloud

In [2]:
# !pip install lightgbm

In [24]:
# In case you get an import error, run again notebook cell number 3, 
# to install again miniconda and RAPIDS dependencies. If the problem persists,
# Go to Runtime -> Reset all runtimes and then run again the noebook cells (two 
# times the one for installing miniconda and RAPIDS dependencies)

# Dependencies

# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedKFold
# from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from matplotlib import pyplot
import pandas as pd
import numpy as np
import lightgbm as lgb

from numpy import mean
from numpy import std

# Light GBM Regressor Machine Learning Model
from lightgbm import LGBMRegressor

# Machine Learning Model Exporter
import joblib

# Sample Data Loading

In [2]:
# Import Cleaned Dataset as CSV

# Google Colab import with Google Drive mounted:
# df = pd.read_csv('/content/drive/MyDrive/MonashDataBootcamp/Final_Project/movie_sample.csv') 

sample_df=pd.read_csv('encoded_data/sample_data.csv')

sample_df.head()

Unnamed: 0,year,duration,avg_vote,votes,budget,worlwide_gross_income,day,language_English,language_Spanish,language_French,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,8.1,55601,18000,8811,58,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,7.2,3058,800000,9183673,106,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,8.3,109038,250000,26916,330,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,7.0,4735,351000,11233,157,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,8.2,97480,923000,26916,296,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Confirm all columns are numerical datatypes
sample_df.dtypes

year                       int64
duration                   int64
avg_vote                 float64
votes                      int64
budget                     int64
                          ...   
actor_Cam Gigandet         int64
actor_Miles Teller         int64
actor_Scott Adkins         int64
actor_Octavia Spencer      int64
actor_Dave Bautista        int64
Length: 1872, dtype: object

In [4]:
sample_df.shape

(8870, 1872)

In [5]:
# Assign X variables by dropping Y targets
X = sample_df.drop(["avg_vote","worlwide_gross_income"], axis=1)

In [6]:
X.head()

Unnamed: 0,year,duration,votes,budget,day,language_English,language_Spanish,language_French,language_Russian,language_German,...,actor_Matthew Goode,actor_Edgar Ramírez,actor_Tyler Perry,actor_Sebastian Stan,actor_Rebecca Hall,actor_Cam Gigandet,actor_Miles Teller,actor_Scott Adkins,actor_Octavia Spencer,actor_Dave Bautista
0,1920,76,55601,18000,58,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1921,150,3058,800000,106,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1921,68,109038,250000,330,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1923,82,4735,351000,157,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1925,95,97480,923000,296,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Assign two separate Y targets:
# film review score set as vote_y
# film worldwide gross income set as revenue_y

vote_y = sample_df["avg_vote"]

revenue_y = sample_df["worlwide_gross_income"]

# Data Scaling

In [14]:
# Function to train an LGBM Model

def data_scaling(Variable, target, rand_state=42):
    
    # Split the data to train and test
    X_train, X_test, y_train, y_test = \
    train_test_split(Variable, target, test_size=0.2, random_state=rand_state)

    print('X_train dimensions: ', X_train.shape, 'y_train: ', y_train.shape)
    print('X_test dimensions:', X_test.shape, 'y_test: ', y_test.shape)

    # Scale only duration and day columns using Min Max Scaler
    duration_scaler = MinMaxScaler().fit(Variable[["duration"]])
    day_scaler = MinMaxScaler().fit(Variable[["day"]])

    # Duplicate X_train and X_test for scaling purposes
    X_train_scaled = X_train
    X_test_scaled = X_test
    
    # Use Scaler to transform relevant X_train and X_test columns
    X_train_scaled["duration"] = duration_scaler.transform(X_train_scaled[["duration"]])
    X_train_scaled["day"] = day_scaler.transform(X_train_scaled[["day"]])

    X_test_scaled["duration"] = duration_scaler.transform(X_test_scaled[["duration"]])
    X_test_scaled["day"] = day_scaler.transform(X_test_scaled[["day"]])
    return X_train_scaled, X_test_scaled, y_train, y_test

In [15]:
def LightGBM_Model(X_train_scaled, y_train, params):
    # Set the model to LGBM Regressor
    regressor = LGBMRegressor(params)
    
    # Train the model 
    regressed = regressor.fit(X_train_scaled,y_train) 
    return regressed

## Revenue Predictor
### All Variables
<hr>
Light GBM Regression Model for Revenue Prediction

In [65]:
# Create and train a LightGBM model for revenue prediction
revenue_X_train_scaled, revenue_X_test_scaled, revenue_y_train, revenue_y_test = \
data_scaling(X, revenue_y, rand_state=42)

revenue_y_regressor = LGBMRegressor()

revenue_y_regressor.fit(revenue_X_train_scaled,revenue_y_train)


X_train dimensions:  (7096, 1870) y_train:  (7096,)
X_test dimensions: (1774, 1870) y_test:  (1774,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

LGBMRegressor()

In [66]:
# Make prediction using test data
revenue_y_pred = revenue_y_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_score}")

R-squared Score: 0.7701690631865028


## Hyperparameter Tuning using Random Grid Search
### Revenue Predictor

In [25]:
#Set the minimum error arbitrarily large
min = 99999999999999999999999 
count = 0 #Used for keeping track of the iteration number
#How many runs to perform using randomly selected hyperparameters
iterations = 30
for i in range(iterations):
    print('iteration number', count)
    count += 1 #increment count
    try:
        d_train = lgb.Dataset(revenue_X_train_scaled, label=revenue_y_train) #Load in data
        params = {} #initialize parameters
        params['learning_rate'] = np.random.uniform(0, 1)
        params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
        params['objective'] = 'regression'
        params['metric'] = 'mae'
        params['sub_feature'] = np.random.uniform(0, 1)
        params['num_leaves'] = np.random.randint(20, 300)
        params['min_data'] = np.random.randint(10, 100)
        params['max_depth'] = np.random.randint(5, 200)
        iterations = np.random.randint(10, 10000)
        print(params, iterations)
        #Train using selected parameters
        clf = lgb.train(params, d_train, iterations)
        random_revenue_y_pred=clf.predict(revenue_X_test_scaled) #Create predictions on test set
        mae=mean_absolute_error(random_revenue_y_pred, revenue_y_test)
        print('MAE:', mae)
        if mae < min:
            min = mae
            pp = params 
    except: #in case something goes wrong
            print('failed with')
            print(params)
print("*" * 30)
print('Minimum is: ', min)
print('Used params', pp)

iteration number 0
{'learning_rate': 0.7220525121847937, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.26517855225324394, 'num_leaves': 88, 'min_data': 41, 'max_depth': 57} 1446
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1107
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 92
[LightGBM] [Info] Start training from score 67667443.260147






MAE: 57357850.391175374
iteration number 1
{'learning_rate': 0.5355501058993952, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.30726048199584055, 'num_leaves': 47, 'min_data': 88, 'max_depth': 93} 7215
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 59
[LightGBM] [Info] Start training from score 67667443.260147


















MAE: 40898731.301678166
iteration number 2
{'learning_rate': 0.5444792163246309, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.4814695035744433, 'num_leaves': 139, 'min_data': 51, 'max_depth': 188} 3946
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1075
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 76
[LightGBM] [Info] Start training from score 67667443.260147








































































MAE: 36981052.350512326
iteration number 3
{'learning_rate': 0.8427177746182886, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.24014855414635083, 'num_leaves': 116, 'min_data': 88, 'max_depth': 152} 2298
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1041
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 59
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 67667443.260147


































MAE: 96737344.07672174
iteration number 4
{'learning_rate': 0.46066917600006874, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.20485910700464915, 'num_leaves': 294, 'min_data': 46, 'max_depth': 65} 957
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 79
[LightGBM] [Info] Start training from score 67667443.260147
















MAE: 41660236.86373298
iteration number 5
{'learning_rate': 0.7708381740066722, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.3183235757655579, 'num_leaves': 93, 'min_data': 44, 'max_depth': 71} 593
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 86
[LightGBM] [Info] Start training from score 67667443.260147


MAE: 39856216.7897043
iteration number 6
{'learning_rate': 0.8505852340913139, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.8012170018919376, 'num_leaves': 228, 'min_data': 25, 'max_depth': 136} 7354
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1271
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 174
[LightGBM] [Info] Start training from score 67667443.260147




































































































































MAE: 32376084.038310893
iteration number 7
{'learning_rate': 0.8466688312562304, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.6588258419572436, 'num_leaves': 169, 'min_data': 94, 'max_depth': 81} 3560
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 58
[LightGBM] [Info] Start training from score 67667443.260147






























































MAE: 49751201.44780511
iteration number 8
{'learning_rate': 0.6849784782676905, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.702012398755818, 'num_leaves': 118, 'min_data': 98, 'max_depth': 73} 4873
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1037
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 57
[LightGBM] [Info] Start training from score 67667443.260147


















































































MAE: 44268511.033406846
iteration number 9
{'learning_rate': 0.6316495846790792, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9884871801446896, 'num_leaves': 26, 'min_data': 14, 'max_depth': 171} 1588
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1835
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 456
[LightGBM] [Info] Start training from score 67667443.260147
MAE: 39064982.73623873
iteration number 10
{'learning_rate': 0.33706813595190843, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.6323979004943662, 'num_leaves': 174, 'min_data': 90, 'max_depth': 6} 4127
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1039
[LightGBM] [Info] Number of data points in the trai





























































MAE: 39066056.90493618
iteration number 11
{'learning_rate': 0.39433602981348825, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.7262352798707157, 'num_leaves': 293, 'min_data': 33, 'max_depth': 110} 7755
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1141
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 109
[LightGBM] [Info] Start training from score 67667443.260147
































































































































MAE: 38544021.63648989
iteration number 12
{'learning_rate': 0.020827937466412205, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.18679440753911425, 'num_leaves': 182, 'min_data': 18, 'max_depth': 198} 5423
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1511
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 294
[LightGBM] [Info] Start training from score 67667443.260147










































MAE: 37128897.97284915
iteration number 13
{'learning_rate': 0.2869834086694595, 'boosting_type': 'goss', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.9472412592786208, 'num_leaves': 225, 'min_data': 23, 'max_depth': 63} 6558
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1323
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 200
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 67667443.260147


















































































































MAE: 38900935.78833087
iteration number 14
{'learning_rate': 0.040674395316269374, 'boosting_type': 'gbdt', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.5710575226919281, 'num_leaves': 238, 'min_data': 74, 'max_depth': 131} 6070
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1055
[LightGBM] [Info] Number of data points in the train set: 7096, number of used features: 66
[LightGBM] [Info] Start training from score 67667443.260147


























































































MAE: 36159870.97238377
***************
Minimum is:  32376084.038310893
Used params {'learning_rate': 0.8505852340913139, 'boosting_type': 'dart', 'objective': 'regression', 'metric': 'mae', 'sub_feature': 0.8012170018919376, 'num_leaves': 228, 'min_data': 25, 'max_depth': 136}


<hr>
Using GridSearchCV to hyper tune model

In [72]:
param_grid = {
    'max_depth': [2, 200],
    'num_leaves': [5, 500],
    'learning_rate': [0.06,0.2,0.6],
    'n_estimators': [500, 2000],
    'min_child_weight': [0.01, 8.87],
    'scale_pos_weight': [1, 10000], 
    'subsample': [0.4, 0.1],
    'colsample_bytree': [0.1,0.2]
    
#    'max_delta_step': [0, 0.2, 0.6, 1, 2],
#    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
#    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],
    # 'gamma': [0.1, 0.3],
#    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}

In [73]:
gsearch = GridSearchCV(revenue_y_regressor, param_grid=param_grid,verbose=3, n_jobs=-1)
gsearch.fit(revenue_X_train_scaled, revenue_y_train)

Fitting 5 folds for each of 384 candidates, totalling 1920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [32]:
print(gsearch.best_params_)
print(gsearch.best_score_)

{'colsample_bytree': 0.2, 'learning_rate': 0.06, 'max_depth': 63, 'min_child_weight': 0.01, 'n_estimators': 2000, 'num_leaves': 7, 'scale_pos_weight': 1, 'subsample': 0.4}
0.7419403930457946


In [53]:
revenue_y_regressor = LGBMRegressor(colsample_bytree= 0.2, learning_rate= 0.06, \
                                    max_depth= 63, min_child_weight= 0.01, \
                                    n_estimators= 2000, num_leaves= 7, scale_pos_weight= 1, subsample= 0.4)

revenue_y_regressor.fit(revenue_X_train_scaled,revenue_y_train)

LGBMRegressor(colsample_bytree=0.2, learning_rate=0.06, max_depth=63,
              min_child_weight=0.01, n_estimators=2000, num_leaves=7,
              scale_pos_weight=1, subsample=0.4)

In [54]:
# Make prediction using test data
revenue_y_pred = revenue_y_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_score}")

R-squared Score: 0.7602899574951469


## Revenue Models Export
<hr>
Export revenue model as .sav and .h5 formats

In [None]:
# filename = 'model/light_gbm_revenue_model.sav'
# joblib.dump(revenue_y_regressor, filename)

In [None]:
# filename = 'model/light_gbm_revenue_model.h5'
# joblib.dump(revenue_y_regressor, filename)

In [None]:
# filename = 'model/light_gbm_revenue_model.sav'
# joblib.dump(revenue_y_s_regressor, filename)

In [None]:
# filename = 'model/light_gbm_revenue_model.h5'
# joblib.dump(revenue_y_s_regressor, filename)

## Review Score Predictor
### All Variables
<hr>
Light GBM Regression Model for Review Score Prediction

In [21]:
vote_y_regressor, vote_X_train_scaled, vote_X_test_scaled, vote_y_test = LightGBM_Model(X, X, vote_y, rand_state=42)

X_train dimensions:  (7096, 1870) y_train:  (7096,)
X_test dimensions: (1774, 1870) y_test:  (1774,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [22]:
# Make prediction using test data
vote_y_pred = vote_y_regressor.predict(vote_X_test_scaled)

# Calculate R-squared score
vote_score = r2_score(vote_y_test, vote_y_pred)
print(f"R-squared Score: {vote_score}")

R-squared Score: 0.5708410594303142


## Review Score Models Export
<hr>
Export review score model as .sav and .h5 formats

In [24]:
filename = 'Model/light_gbm_vote_model.sav'
joblib.dump(vote_y_regressor, filename)

['Model/light_gbm_vote_model.sav']

In [25]:
filename_s = 'Model/light_gbm_vote_model.h5'
joblib.dump(vote_y_regressor, filename)

['Model/light_gbm_vote_model.sav']

In [26]:
filename_s = 'Model/light_gbm_vote_model.sav'
joblib.dump(vote_y_s_regressor, filename)

['Model/light_gbm_vote_model.sav']

In [27]:
filename_s = 'Model/light_gbm_vote_model.h5'
joblib.dump(vote_y_s_regressor, filename)

['Model/light_gbm_vote_model.sav']

## Selected Features / Feature Importances

### Revenue Predictor
<hr>
select the most important features to train a new model

In [15]:
# Function to find most important features and train a refined model

def important_features(X_train_scaled, y_regressor):
    # Train a new model using only the most important features using .feature_importance_ 
    # Save important features in a list select_features
    select_features=[]

    for col,score in zip(X_train_scaled.columns, y_regressor.feature_importances_):
      if score > 1:
        select_features.append(col)
        print(col,score)
        
    print(f"Number of features: {len(select_features)}")
    
    # Set new DataFrame with only important features
    select_sample_df = sample_df[select_features]
    
    return select_sample_df

In [16]:
# Set X_Selected variables on important features DataFrame
revenue_X_selected = important_features(revenue_X_train_scaled, revenue_y_regressor)

#Train a Light GBM Model
revenue_y_s_regressor, revenue_X_s_train_scaled, revenue_X_s_test_scaled, revenue_y_s_test = \
LightGBM_Model(X, revenue_X_selected, revenue_y, rand_state=42)

# Make prediction using test data
revenue_y_s_pred = revenue_y_s_regressor.predict(revenue_X_s_test_scaled)

# Calculate R-squared score
revenue_s_score = r2_score(revenue_y_s_test, revenue_y_s_pred)
print(f"R-squared Score: {revenue_s_score}")

year 325
duration 308
votes 599
budget 535
day 319
language_English 2
language_Spanish 11
language_French 8
language_Russian 3
language_German 8
language_Italian 9
language_Japanese 3
language_Mandarin 13
language_Korean 3
language_Cantonese 3
language_Hindi 6
language_Hebrew 4
language_Greek 4
language_Hungarian 2
language_Thai 5
language_Vietnamese 5
genre_Drama 37
genre_Comedy 33
genre_Action 30
genre_Crime 19
genre_Romance 20
genre_Adventure 39
genre_Thriller 4
genre_Horror 6
genre_Mystery 4
genre_Fantasy 19
genre_Sci-Fi 35
genre_Family 38
genre_Biography 13
genre_Animation 48
genre_Music 5
genre_War 2
genre_Musical 2
country_USA 2
country_UK 15
country_France 3
country_Canada 5
country_Germany 7
country_China 15
country_Australia 10
country_Spain 2
country_New Zealand 3
country_Denmark 3
director_Woody Allen 5
director_Clint Eastwood 4
director_Steven Spielberg 12
director_Martin Scorsese 12
writer_Stephen King 2
company_Universal Pictures 23
company_Warner Bros. 4
company_Columbi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

R-squared Score: 0.7720828593945964


### Review Score Predictor
<hr>
select the most important features to train a new model

In [23]:
# Set X_Selected variables on important features DataFrame
vote_X_selected = important_features(vote_X_train_scaled, vote_y_regressor)

#Train a Light GBM Model
vote_y_s_regressor, vote_X_s_train_scaled, vote_X_s_test_scaled, vote_y_s_test = \
LightGBM_Model(X, vote_X_selected, vote_y, rand_state=42)

# Make prediction using test data
vote_y_s_pred = vote_y_s_regressor.predict(vote_X_s_test_scaled)

# Calculate R-squared score
vote_s_score = r2_score(vote_y_s_test, vote_y_s_pred)
print(f"R-squared Score: {vote_s_score}")

year 352
duration 327
votes 529
budget 375
day 289
language_English 41
language_Spanish 5
language_French 4
language_Russian 5
language_German 4
language_Italian 2
language_Japanese 6
language_Mandarin 5
language_Arabic 10
language_Korean 8
language_Cantonese 11
language_Turkish 10
language_Latin 6
language_Hindi 15
language_Hebrew 6
language_Chinese 2
language_Hungarian 5
language_Afrikaans 5
language_Danish 4
genre_Drama 66
genre_Comedy 35
genre_Action 56
genre_Crime 27
genre_Romance 22
genre_Adventure 25
genre_Thriller 23
genre_Horror 59
genre_Mystery 14
genre_Fantasy 24
genre_Sci-Fi 32
genre_Family 25
genre_Biography 22
genre_Animation 56
genre_History 7
genre_Music 9
genre_Sport 5
genre_War 2
genre_Musical 8
country_USA 44
country_UK 19
country_France 12
country_Canada 5
country_Germany 6
country_China 6
country_Japan 11
country_Russia 21
country_Mexico 2
country_Hong Kong 6
country_Spain 4
country_India 11
country_South Korea 10
country_Belgium 3
country_Turkey 19
country_Ireland

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

R-squared Score: 0.5747153040165913
