# **Environment Setup**

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

from matplotlib import pyplot
import pandas as pd
import numpy as np
import lightgbm as lgb
import seaborn as sns

from numpy import mean
from numpy import std

# Light GBM Regressor Machine Learning Model
from lightgbm import LGBMRegressor

# Machine Learning Model Exporter
import joblib

# Sample Data Loading

In [2]:
rev_select_sample_df = pd.read_csv("encoded_data/select_sample_data_rev_xgb.csv")
revenue_y = pd.read_csv("encoded_data/revenue_xgb.csv")
rat_select_sample_df = pd.read_csv("encoded_data/select_sample_data_rate_xgb.csv")
rate_y = pd.read_csv("encoded_data/rating_xgb.csv")

In [3]:
X_rev = rev_select_sample_df
X_rat = rat_select_sample_df

# Data Scaling

In [5]:
# Function to train an LGBM Model

def data_scaling(Variable, target, rand_state=42):
    
    # Split the data to train and test
    X_train, X_test, y_train, y_test = \
    train_test_split(Variable, target, test_size=0.2, random_state=rand_state)

    print('X_train dimensions: ', X_train.shape, 'y_train: ', y_train.shape)
    print('X_test dimensions:', X_test.shape, 'y_test: ', y_test.shape)

    # Scale only duration and day columns using Min Max Scaler
    duration_scaler = MinMaxScaler().fit(Variable[["duration"]])
    day_scaler = MinMaxScaler().fit(Variable[["day"]])

    # Duplicate X_train and X_test for scaling purposes
    X_train_scaled = X_train
    X_test_scaled = X_test
    
    # Use Scaler to transform relevant X_train and X_test columns
    X_train_scaled["duration"] = duration_scaler.transform(X_train_scaled[["duration"]])
    X_train_scaled["day"] = day_scaler.transform(X_train_scaled[["day"]])

    X_test_scaled["duration"] = duration_scaler.transform(X_test_scaled[["duration"]])
    X_test_scaled["day"] = day_scaler.transform(X_test_scaled[["day"]])
    return X_train_scaled, X_test_scaled, y_train, y_test

## Revenue Predictor
<hr>
Light GBM Regression Model for Revenue Prediction

In [7]:
# Create and train a LightGBM model for revenue prediction
revenue_X_train_scaled, revenue_X_test_scaled, revenue_y_train, revenue_y_test = \
data_scaling(X_rev, revenue_y, rand_state=42)

# Set up LightGBM regressor model using default settings
revenue_y_regressor = LGBMRegressor()

# Train model using training data
revenue_y_regressor.fit(revenue_X_train_scaled,revenue_y_train)

X_train dimensions:  (7096, 426) y_train:  (7096, 1)
X_test dimensions: (1774, 426) y_test:  (1774, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_scaled["duration"] = duration_scaler.transform(X_train_scaled[["duration"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_scaled["day"] = day_scaler.transform(X_train_scaled[["day"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_scaled["duration"] = duration_scaler.transf

LGBMRegressor()

In [8]:
# Make prediction using test data
revenue_y_pred = revenue_y_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_score}")

R-squared Score: 0.7694747402648999


In [11]:
# Parameters for GridsearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'num_leaves': [3,7,15,31,63,127],
    'learning_rate': [0.03,0.1,0.3],
    'n_estimators': [200, 500],
    'bagging_fraction ': [0.1, 0.4, 0.7, 1.0],
    'feature_fraction': [0.6, 0.8, 1.0],
    'max_bin':[63,127,255]
}

In [12]:
gsearch = GridSearchCV(revenue_y_regressor, param_grid=param_grid,verbose=3, n_jobs=-1)
gsearch.fit(revenue_X_train_scaled, revenue_y_train)

Fitting 5 folds for each of 4536 candidates, totalling 22680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e



GridSearchCV(estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'bagging_fraction ': [0.1, 0.4, 0.7, 1.0],
                         'feature_fraction': [0.6, 0.8, 1.0],
                         'learning_rate': [0.03, 0.1, 0.3],
                         'max_bin': [63, 127, 255], 'max_depth': [3, 5, 7],
                         'n_estimators': [200, 500],
                         'num_leaves': [3, 7, 15, 31, 63, 127, 256]},
             verbose=3)

In [13]:
print(gsearch.best_params_)
print(gsearch.best_score_)

{'bagging_fraction ': 0.1, 'feature_fraction': 0.6, 'learning_rate': 0.1, 'max_bin': 127, 'max_depth': 5, 'n_estimators': 500, 'num_leaves': 7}
0.775764480016108


In [14]:
# Use hypertuned parameters to train model
revenue_y_gscv_regressor = LGBMRegressor(bagging_fraction = 0.1, 
                                         feature_fraction= 0.6, 
                                         learning_rate= 0.1, 
                                         max_bin= 127, 
                                         max_depth= 5, 
                                         n_estimators= 500, 
                                         num_leaves= 7)

revenue_y_gscv_regressor.fit(revenue_X_train_scaled,revenue_y_train)

LGBMRegressor(bagging_fraction=0.1, feature_fraction=0.6, max_bin=127,
              max_depth=5, n_estimators=500, num_leaves=7)

In [15]:
# Make prediction using test data on hypertuned model
revenue_y_pred = revenue_y_gscv_regressor.predict(revenue_X_test_scaled)

# Calculate R-squared score
revenue_gscv_score = r2_score(revenue_y_test, revenue_y_pred)
print(f"R-squared Score: {revenue_gscv_score}")

R-squared Score: 0.7663601677784938


In [16]:
filename = 'revenue_lgbm_model.sav'
joblib.dump(revenue_y_regressor, filename)

['revenue_lgbm_model.sav']

## Review Score Predictor
<hr>
Light GBM Regression Model for Review Score Prediction

In [18]:
# Create and train a LightGBM model for revenue prediction
rate_X_train_scaled, rate_X_test_scaled, rate_y_train, rate_y_test = \
data_scaling(X_rat, rate_y, rand_state=42)

# Set up LightGBM regressor model using default settings
rate_y_regressor = LGBMRegressor()

# Train model using training data
rate_y_regressor.fit(rate_X_train_scaled,rate_y_train)

X_train dimensions:  (7096, 473) y_train:  (7096, 1)
X_test dimensions: (1774, 473) y_test:  (1774, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_scaled["duration"] = duration_scaler.transform(X_train_scaled[["duration"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_scaled["day"] = day_scaler.transform(X_train_scaled[["day"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_scaled["duration"] = duration_scaler.transf

LGBMRegressor()

In [19]:
# Make prediction using test data
rate_y_pred = rate_y_regressor.predict(rate_X_test_scaled)

# Calculate R-squared score
rate_score = r2_score(rate_y_test, rate_y_pred)
print(f"R-squared Score: {rate_score}")

R-squared Score: 0.572287839432083


In [20]:
# Parameters for GridsearchCV
param_grid = {
    'max_depth': [3, 5, 7],
    'num_leaves': [3,7,15,31,63,127],
    'learning_rate': [0.03,0.1,0.3],
    'n_estimators': [200, 500],
    'bagging_fraction ': [0.1, 0.4, 0.7, 1.0],
    'feature_fraction': [0.6, 0.8, 1.0],
    'max_bin':[63,127,255]
}

In [21]:
gsearch = GridSearchCV(rate_y_regressor, param_grid=param_grid,verbose=3, n_jobs=-1)
gsearch.fit(rate_X_train_scaled, rate_y_train)

Fitting 5 folds for each of 3888 candidates, totalling 19440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2032 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3856 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 4592 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 5392 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 6256 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | e



GridSearchCV(estimator=LGBMRegressor(), n_jobs=-1,
             param_grid={'bagging_fraction ': [0.1, 0.4, 0.7, 1.0],
                         'feature_fraction': [0.6, 0.8, 1.0],
                         'learning_rate': [0.03, 0.1, 0.3],
                         'max_bin': [63, 127, 255], 'max_depth': [3, 5, 7],
                         'n_estimators': [200, 500],
                         'num_leaves': [3, 7, 15, 31, 63, 127]},
             verbose=3)

In [22]:
print(gsearch.best_params_)
print(gsearch.best_score_)

{'bagging_fraction ': 0.1, 'feature_fraction': 0.6, 'learning_rate': 0.03, 'max_bin': 63, 'max_depth': 7, 'n_estimators': 500, 'num_leaves': 31}
0.6053397710804465


In [23]:
# Use hypertuned parameters to train model
rate_y_gscv_regressor = LGBMRegressor(bagging_fraction = 0.1, 
                                         feature_fraction= 0.6, 
                                         learning_rate= 0.03, 
                                         max_bin= 63, 
                                         max_depth= 7, 
                                         n_estimators= 500, 
                                         num_leaves= 31)

rate_y_gscv_regressor.fit(rate_X_train_scaled,rate_y_train)

LGBMRegressor(bagging_fraction=0.1, feature_fraction=0.6, learning_rate=0.03,
              max_bin=63, max_depth=7, n_estimators=500)

In [24]:
# Make prediction using test data on hypertuned model
rate_y_pred = rate_y_gscv_regressor.predict(rate_X_test_scaled)

# Calculate R-squared score
rate_gscv_score = r2_score(rate_y_test, rate_y_pred)
print(f"R-squared Score: {rate_gscv_score}")

R-squared Score: 0.5767690619823476


In [25]:
filename = 'rate_lgbm_model.sav'
joblib.dump(rate_y_gscv_regressor, filename)

['rate_lgbm_model.sav']