# Models built with all the features 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook
import pprint

In [4]:
import pickle

file = open('data_enc.pkl', 'rb')
Xtrain = pickle.load(file)
ytrain = pickle.load(file)
Xtest = pickle.load(file)

file.close()

In [5]:
print("Xtrain", Xtrain.shape)
print("ytrain",ytrain.shape)
print("Xtest",Xtest.shape)

Xtrain (188318, 1187)
ytrain (188318,)
Xtest (125546, 1187)


In [6]:
sample_submit = pd.read_csv('sample_submission.csv')

In [7]:
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_val_score

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor

import time

In [8]:
SEED = 80

#### Select small subset for hyper parameter tuning

In [9]:
from sklearn.model_selection import train_test_split
X1, X2, y1, y2 = train_test_split(Xtrain, ytrain, test_size= 0.1, random_state = SEED)
print(X1.shape, y1.shape)
print(X2.shape, y2.shape)

(169486, 1187) (169486,)
(18832, 1187) (18832,)


In [10]:
from pprint import pprint
from scipy.stats import randint, uniform

# Models without any feature selection 

# Random forest

In [8]:
# random forest parameters

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 600, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 30, num = 10)]
max_depth.append(None)
#min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# Create the random grid
param_rf = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               #'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
           }


In [9]:
pprint(param_rf)

{'max_depth': [10, 12, 14, 16, 18, 21, 23, 25, 27, 30, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'n_estimators': [200, 244, 288, 333, 377, 422, 466, 511, 555, 600]}


#### Hyper-parameter tuning using randomized search and 5-fold CV 

Due to large dataset and computation restriction, we train the model on a small subset of the data and then fit the model on the whole training data. 
The fitted model is then evaluated on the held out test data on the kaggle website. 

In [17]:
start_time = time.time()
rf = RandomForestRegressor(random_state=SEED, oob_score = True, n_jobs = -1)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_rf, n_iter = 10, cv = 5, verbose=1, scoring='neg_mean_absolute_error', n_jobs = -1)
# Fit the random search model
rf_random.fit(X2, y2)
print("--- %s seconds ---" % (time.time() - start_time))


Fitting 5 folds for each of 10 candidates, totalling 50 fits
--- 2002.649445772171 seconds ---


In [18]:
print(rf_random.best_params_, rf_random.best_score_)

{'n_estimators': 333, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 27} -0.4533768693104105


In [19]:
rf_random.best_estimator_ 

RandomForestRegressor(max_depth=27, min_samples_leaf=2, n_estimators=333,
                      n_jobs=-1, oob_score=True, random_state=80)

In [31]:
rf2 = RandomForestRegressor(max_depth=27, min_samples_leaf=2, n_estimators=333, max_features = 'auto',
                      n_jobs=-1, oob_score=True, random_state=80)
rf2.fit(Xtrain,ytrain)
pred_rf = rf2.predict(Xtest)

submit_rf = sample_submit
submit_rf['loss'] = pred_rf 
submit_rf.to_csv('submit_rf.csv', index = False)


#### RF score on test set Private LB = 3024.03497 -- very bad 

# XGBoost

In [60]:
param_grid ={'max_depth':randint(3,8),
             'min_child_weight':randint(1,6),
             'eta':uniform(0.05,0.095),
             'subsample': uniform(0.6,0.3) ,
             'colsample_bytree': uniform(0.6,0.3),
             'n_estimators': randint(200,600)
            }
pprint(param_grid)

{'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000016D1AD75250>,
 'eta': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000016D1AD79430>,
 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000016D1AD79FD0>,
 'min_child_weight': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000016D1AD79370>,
 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000016D1AD69C70>,
 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000016D1AD75A00>}


In [None]:
# xgb = XGBRegressor(n_jobs=-1, objective= 'reg:squarederror',random_state=SEED, verbosity=0)
# grid_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid,n_iter=10, n_jobs=-1, cv=3, scoring='neg_mean_absolute_error',verbose=1)

# def xg_eval_mae(y_pred, dtrain):
#     y_true = dtrain.get_label()
#     mymae = np.mean(abs(np.expm1(y_true)-np.expm1(y_pred)))
#     return 'mymae', mymae

# grid_xgb.fit(X2,y2,
#          eval_set = [(X1,y1)],
#          eval_metric = xg_eval_mae,
#          early_stopping_rounds = 50, verbose=0)

# print(grid_xgb.best_params_, grid_xgb.best_score_)

# low on memory - cannot run on pc - kernel keeps restarting/hanging

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [28]:
xgb = XGBRegressor(n_jobs=-1, objective= 'reg:squarederror',random_state=SEED, verbosity=0)
grid_xgb = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid,n_iter=10, n_jobs=-1, cv=3, scoring='neg_mean_absolute_error',verbose=1)

grid_xgb.fit(X2,y2, verbose=0)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n...
                                        'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x2b43b8d786a0>,
                                        'min_child_weight': <scip

In [29]:
print(grid_xgb.best_params_, grid_xgb.best_score_)

{'colsample_bytree': 0.859474115591304, 'eta': 0.0500389156243541, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 440, 'subsample': 0.6471619961876824} -0.4322413940500162


In [56]:
xgb2 = XGBRegressor(colsample_bytree=0.859474115591304,
             eta=0.0500389156243541, max_depth=6, min_child_weight=5,
             n_estimators=440, n_jobs=-1,
             random_state=80, reg_alpha=0, reg_lambda=1,
             subsample=0.6471619961876824,
             verbosity=0)

xgb2.fit(Xtrain, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.859474115591304,
             eta=0.0500389156243541, gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.0500389151,
             max_delta_step=0, max_depth=6, min_child_weight=5, missing=nan,
             monotone_constraints='()', n_estimators=440, n_jobs=-1,
             num_parallel_tree=1, random_state=80, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.6471619961876824,
             tree_method='exact', validate_parameters=1, verbosity=0)

In [58]:
pred = xgb2.predict(Xtest)

submit_xgb = sample_submit
submit_xgb['loss'] = np.expm1(pred)
submit_xgb.to_csv('submit_xgb2.csv', index = False)


#### XGBoost score on test set Private LB = 1141.88064

# LightGBM

In [9]:
param_lgb = {'n_estimators':randint(200,600),
              'colsample_bytree': uniform(0.6,0.3),
              'learning_rate': uniform(0.05, 0.095),
              #'max_depth': [int(x) for x in np.linspace(start = 2, stop = 12, num = 6)],
              'num_leaves':[15, 31, 63, 127, 255, 511, 1023, 2047, 4095]}
pprint(param_lgb)

{'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FD57410B20>,
 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FD57410F40>,
 'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FD57410370>,
 'num_leaves': [15, 31, 63, 127, 255, 511, 1023, 2047, 4095]}


In [11]:
lgbm = LGBMRegressor(objective='regression_l1', random_state=SEED, n_jobs = -1, verbosity=0)
grid_lg = RandomizedSearchCV(estimator=lgbm, param_distributions=param_lgb,n_iter=100, n_jobs=-1, cv=3, scoring='neg_mean_absolute_error',verbose=1)

def eval_mae(y_true, y_pred):
    mymae = np.mean(abs(np.expm1(y_true)-np.expm1(y_pred)))
    return 'mymae', mymae, False

grid_lg.fit(X2, y2, 
         eval_set = [(X1,y1)],
         eval_metric = eval_mae,
         early_stopping_rounds = 15
         )

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 37.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 58.5min finished


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's l1: 0.644159	valid_0's mymae: 1777.88
Training until validation scores don't improve for 15 rounds
[2]	valid_0's l1: 0.628947	valid_0's mymae: 1744.98
[3]	valid_0's l1: 0.614932	valid_0's mymae: 1714.2
[4]	valid_0's l1: 0.602764	valid_0's mymae: 1686.92
[5]	valid_0's l1: 0.590794	valid_0's mymae: 1659.51
[6]	valid_0's l1: 0.579583	valid_0's mymae: 1633.45
[7]	valid_0's l1: 0.56981	valid_0's mymae: 1610.1
[8]	valid_0's l1: 0.560427	valid_0's mymae: 1587.43
[9]	valid_0's l1: 0.551918	valid_0's mymae: 1566.42
[10]	valid_0's l1: 0.54448	valid_0's mymae: 1547.59
[11]	valid_0's l1: 0.537158	valid_0's mymae: 1528.64
[12]	valid_0's l1: 0.530417	valid_0's mymae: 1511.2
[13]	valid_0's l1: 0.52388	valid_0's mymae: 1493.71
[14]	valid_0's l1: 0.518001	valid_0's mymae: 1477.77
[15]	valid_0's l1: 0.51279	valid_0's mymae: 1463.34
[16]	valid_0's l1: 0.507909	valid_0's 

RandomizedSearchCV(cv=3,
                   estimator=LGBMRegressor(objective='regression_l1',
                                           random_state=80, verbosity=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FD57410B20>,
                                        'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FD57410F40>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001FD57410370>,
                                        'num_leaves': [15, 31, 63, 127, 255,
                                                       511, 1023, 2047, 4095]},
                   scoring='neg_mean_absolute_error', verbose=1)

In [12]:
print(grid_lg.best_params_, grid_lg.best_score_)

{'colsample_bytree': 0.6490660531637237, 'learning_rate': 0.057683251484169334, 'n_estimators': 597, 'num_leaves': 63} -0.4177918978206256


In [13]:
#pprint(grid_lg.cv_results_['params'])
#pprint(grid_lg.cv_results_['mean_test_score'])
grid_lg.best_estimator_

LGBMRegressor(colsample_bytree=0.6490660531637237,
              learning_rate=0.057683251484169334, n_estimators=597,
              num_leaves=63, objective='regression_l1', random_state=80,
              verbosity=0)

In [21]:
lgbm2 = LGBMRegressor(colsample_bytree=0.6490660531637237,
              learning_rate=0.057683251484169334, n_estimators=597,
              num_leaves=63, objective='regression_l1', random_state=80,
              verbosity=0)
lgbm2.fit(Xtrain, ytrain)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


LGBMRegressor(colsample_bytree=0.6490660531637237,
              learning_rate=0.057683251484169334, n_estimators=597,
              num_leaves=63, objective='regression_l1', random_state=80,
              verbosity=0)

In [26]:
pred = lgbm2.predict(Xtest, num_iteration = lgbm2.best_iteration_)

submit_lgbm = sample_submit
submit_lgbm['loss'] = np.expm1(pred)
submit_lgbm.to_csv('submit_lgbm2.csv', index = False)


#### LightGBM score on test set Private LB = 1133.77977

#### XGBoost and LightGBM have similar performance but LightGBM trains much faster than XGBoost.

In [4]:
print('Score on test set - Private LB')
print('\n')
from tabulate import tabulate
print(tabulate( [ ['Random Forest', 3024.03497], ['XGBoost',1141.88064], ['LightGBM' ,1133.77977] ] , headers=['Model', 'MAE']))

Score on test set - Private LB


Model              MAE
-------------  -------
Random Forest  3024.03
XGBoost        1141.88
LightGBM       1133.78
