In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

In [3]:
import pickle

with open('preprocess.pkl', 'rb') as file:
    pre = pickle.load(file)

In [4]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [5]:
X_train = train.drop('log_total_users', axis=1)
y_train = train.log_total_users

In [6]:
X_test = test.drop('log_total_users', axis=1)
y_test = test.log_total_users

## Preprocessing:

In [8]:
X_train_tf = pre.transform(X_train)

In [10]:
X_test_tf = pre.transform(X_test)

## Model Fitting:

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

### Decision Tree

In [13]:
from sklearn.tree import DecisionTreeRegressor

In [15]:
dt = DecisionTreeRegressor(random_state=0)
params = {'max_depth':np.arange(5,21,3), 'min_samples_split':np.arange(2,12,2),
           'max_features':['sqrt','log2',None], 'min_samples_leaf':np.arange(1,10,1)
          }
grid = GridSearchCV(dt, param_grid=params, n_jobs=-1, verbose=1)
grid_model = grid.fit(X_train_tf, y_train)

Fitting 5 folds for each of 810 candidates, totalling 4050 fits


In [16]:
dt_best = grid.best_estimator_
print('Best Hyperparameters:', grid.best_params_)

Best Hyperparameters: {'max_depth': 14, 'max_features': None, 'min_samples_leaf': 8, 'min_samples_split': 2}


In [17]:
dtr = dt_best.fit(X_train_tf, y_train)
dtr_train_pred = dtr.predict(X_train_tf)
dtr_test_pred = dtr.predict(X_test_tf)

In [18]:
dtr_train_mse = mean_squared_error(dtr_train_pred, y_train)
dtr_test_mse = mean_squared_error(dtr_test_pred, y_test)
dtr_train_mse, dtr_test_mse

(0.14196568481554026, 0.21150614711000443)

In [19]:
dtr_train_mape = mean_absolute_percentage_error(dtr_train_pred, y_train)
dtr_test_mape = mean_absolute_percentage_error(dtr_test_pred, y_test)
dtr_train_mape, dtr_test_mape

(0.0418132621695804, 0.05026482648411736)

### Gradient Boosting

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

In [21]:
gb = GradientBoostingRegressor(random_state=0)
params1 = {'n_estimators':np.arange(100,301,50), 'max_depth':np.arange(1,16,3),
           'max_features':['sqrt','log2','auto'], 'learning_rate':[0.01,0.05,0.1,0.5]
          }
grid1 = GridSearchCV(gb, param_grid=params1, n_jobs=-1, verbose=1)
grid_model1 = grid1.fit(X_train_tf, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [22]:
gb_best = grid1.best_estimator_
print('Best Hyperparameters:', grid1.best_params_)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 300}


In [23]:
gbr = gb_best.fit(X_train_tf, y_train)
gbr_train_pred = gbr.predict(X_train_tf)
gbr_test_pred = gbr.predict(X_test_tf)

In [24]:
gbr_train_mse = mean_squared_error(gbr_train_pred, y_train)
gbr_test_mse = mean_squared_error(gbr_test_pred, y_test)
gbr_train_mse, gbr_test_mse

(0.058210659245755206, 0.14383105425995735)

In [25]:
gbr_train_mape = mean_absolute_percentage_error(gbr_train_pred, y_train)
gbr_test_mape = mean_absolute_percentage_error(gbr_test_pred, y_test)
gbr_train_mape, gbr_test_mape

(0.027099523189119054, 0.041955405318118706)

### XGBoost

In [26]:
import xgboost

In [27]:
xg = xgboost.XGBRegressor(random_state=0)
params2 = {'eta':[0.01,0.1,0.3,0.5,1], 'max_delta_step':[1,3,5,7],
           'max_features':['sqrt','log2'], 'subsample':[0.5,0.6,0.7,1]
          }
grid2 = GridSearchCV(xg, param_grid=params2, n_jobs=-1, verbose=1, error_score='raise')
grid_model2 = grid2.fit(X_train_tf, y_train)

Fitting 5 folds for each of 160 candidates, totalling 800 fits


Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are

In [28]:
xgb_best = grid2.best_estimator_
print('Best Hyperparameters:', grid2.best_params_)

Best Hyperparameters: {'eta': 0.3, 'max_delta_step': 3, 'max_features': 'sqrt', 'subsample': 1}


In [29]:
xgb = xgb_best.fit(X_train_tf, y_train)
xgb_train_pred = xgb.predict(X_train_tf)
xgb_test_pred = xgb.predict(X_test_tf)

In [30]:
xgb_train_mse = mean_squared_error(xgb_train_pred, y_train)
xgb_test_mse = mean_squared_error(xgb_test_pred, y_test)
xgb_train_mse, xgb_test_mse

(0.07205307057672146, 0.1428437035372701)

In [31]:
xgb_train_mape = mean_absolute_percentage_error(xgb_train_pred, y_train)
xgb_test_mape = mean_absolute_percentage_error(xgb_test_pred, y_test)
xgb_train_mape, xgb_test_mape

(0.03039845632314615, 0.04168078509697201)

### Pickle

In [32]:
dt = DecisionTreeRegressor(max_depth=14, max_features=None, min_samples_leaf=8, min_samples_split=2, random_state=0)

In [33]:
dt_model = dt.fit(X_train_tf, y_train)

In [34]:
with open('dt.pkl', 'wb') as file:
    pickle.dump(dt_model, file)

In [35]:
gb = GradientBoostingRegressor(learning_rate=0.1, max_depth=7, max_features='sqrt', n_estimators=300, random_state=0)

In [36]:
gb_model = gb.fit(X_train_tf, y_train)

In [37]:
with open('gb.pkl', 'wb') as file:
    pickle.dump(gb_model, file)

In [38]:
xg = xgboost.XGBRegressor(eta=0.3, max_delta_step=3, max_features='sqrt', subsample=1, random_state=0)

In [39]:
xg_model = xg.fit(X_train_tf, y_train)

In [40]:
with open('xg.pkl', 'wb') as file:
    pickle.dump(xg_model, file)

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are not used.

Parameters: { "max_features" } are