# **Environment Setup**

In [1]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score
import pandas as pd
from datetime import datetime
from collections import Counter

In [2]:
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# Sample Data Loading

In [3]:
rev_select_sample_df = pd.read_csv("encoded_data/select_sample_data_rev_xgb.csv")
revenue_y = pd.read_csv("encoded_data/revenue_xgb.csv")
rat_select_sample_df = pd.read_csv("encoded_data/select_sample_data_rate_xgb.csv")
rate_y = pd.read_csv("encoded_data/rating_xgb.csv")

In [4]:
X_rev = rev_select_sample_df
X_rat = rat_select_sample_df

# Data Scaling

In [5]:
from sklearn.model_selection import train_test_split
X_rev_train, X_rev_test, revenue_y_train, revenue_y_test = train_test_split(X_rev, revenue_y, test_size=0.2)
X_rat_train, X_rat_test, rate_y_train, rate_y_test = train_test_split(X_rat, rate_y, test_size=0.2)

In [6]:
duration_scaler=MinMaxScaler().fit(X_rev[["duration"]])
day_scaler=MinMaxScaler().fit(X_rev[["day"]])

In [7]:
X_rev_train_scaled=X_rev_train
X_rev_test_scaled=X_rev_test

X_rat_train_scaled=X_rat_train
X_rat_test_scaled=X_rat_test

In [8]:
X_rev_train_scaled["duration"]=duration_scaler.transform(X_rev_train_scaled[["duration"]])
X_rev_train_scaled["day"]=day_scaler.transform(X_rev_train_scaled[["day"]])
X_rev_test_scaled["duration"]=duration_scaler.transform(X_rev_test_scaled[["duration"]])
X_rev_test_scaled["day"]=day_scaler.transform(X_rev_test_scaled[["day"]])

X_rat_train_scaled["duration"]=duration_scaler.transform(X_rat_train_scaled[["duration"]])
X_rat_train_scaled["day"]=day_scaler.transform(X_rat_train_scaled[["day"]])
X_rat_test_scaled["duration"]=duration_scaler.transform(X_rat_test_scaled[["duration"]])
X_rat_test_scaled["day"]=day_scaler.transform(X_rat_test_scaled[["day"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_rev_train_scaled["duration"]=duration_scaler.transform(X_rev_train_scaled[["duration"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_rev_train_scaled["day"]=day_scaler.transform(X_rev_train_scaled[["day"]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_rev_test_scaled["duration"]=duration

# Revenue Predictor Training

In [9]:
regressor = XGBRegressor()
regressor.fit(X_rev_train_scaled, revenue_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
revenue_y_pred = regressor.predict(X_rev_test_scaled)
score = r2_score(revenue_y_test, revenue_y_pred)
print(score)

0.8030875858205271


In [11]:
regressor = XGBRegressor()

In [15]:
gridparas = {
    'max_depth': [5, 7],
    'learning_rate': [0.06,0.1,0.3],
    'n_estimators': [100,200],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.7,1.0],
    'colsample_bytree': [0.9,1.0]
}

In [16]:
gsearch = GridSearchCV(regressor, param_grid=gridparas,verbose=3, n_jobs=-1)
gsearch.fit(X_rev_train_scaled,revenue_y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   37.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 19.3min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 28.3min finished


GridSearchCV(estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0, gpu_id=-1,
                                    importance_type='gain',
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs=8,
                                    num_parallel_tree=1, random_state=0,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', validate_parameters=1,
                                    verbosity=None),
             n_jobs=-1,
 

In [17]:
print(gsearch.best_params_)
print(gsearch.best_score_)

{'colsample_bytree': 1.0, 'learning_rate': 0.3, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 1.0}
0.781930910327875


In [18]:
regressor = XGBRegressor(max_depth= 5, min_child_weight= 3, learning_rate=0.3,
                         n_estimators=200, colsample_bytree= 1.0, subsample = 1.0)
regressor.fit(X_rev_train_scaled, revenue_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=5,
             min_child_weight=3, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [19]:
revenue_y_pred = regressor.predict(X_rev_test_scaled)
score = r2_score(revenue_y_test, revenue_y_pred)
print(score)

0.7920106808090885


In [20]:
import joblib
filename = 'revenue_xgboost_model.sav'
joblib.dump(regressor, filename)

['revenue_xgboost_model.sav']

# Rating Predictor Training

In [21]:
rate_regressor = XGBRegressor()
rate_regressor.fit(X_rat_train_scaled, rate_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [22]:
rate_y_pred = rate_regressor.predict(X_rat_test_scaled)
score = r2_score(rate_y_test, rate_y_pred)
print(score)

0.5833734513465703


In [23]:
rate_regressor = XGBRegressor(max_depth= 6, min_child_weight= 2, learning_rate=0.3,
                         n_estimators=200, colsample_bytree= 1.0, subsample = 1.0)

In [24]:
gridparas = {
    'max_depth': [5, 7],
    'learning_rate': [0.06,0.1,0.3],
    'n_estimators': [100,200],
    'min_child_weight': [2, 3],
    'subsample': [0.5, 0.7,1.0],
    'colsample_bytree': [0.9,1.0]
}

In [25]:
gsearch = GridSearchCV(rate_regressor, param_grid=gridparas,verbose=3, n_jobs=-1)
gsearch.fit(X_rat_train_scaled,rate_y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 496 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 32.0min finished


GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=1.0,
                                    gamma=None, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.3, max_delta_step=None,
                                    max_depth=6, min_child_weight=2,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=200, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=1.0,
                                    tree_method=None, validate_parameters=None,
               

In [26]:
print(gsearch.best_params_)
print(gsearch.best_score_)

{'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 2, 'n_estimators': 200, 'subsample': 1.0}
0.6093377578749076


In [28]:
rate_regressor = XGBRegressor(max_depth= 7, min_child_weight= 2, learning_rate=0.1,
                         n_estimators=200, colsample_bytree= 0.9, subsample = 1.0)
rate_regressor.fit(X_rat_train_scaled, rate_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=7,
             min_child_weight=2, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
rate_y_pred = rate_regressor.predict(X_rat_test_scaled)
score = r2_score(rate_y_test, rate_y_pred)
print(score)

0.6112028378040094


In [30]:
import joblib
filename = 'rating_xgboost_model.sav'
joblib.dump(rate_regressor, filename)

['rating_xgboost_model.sav']