Tune hyperparameters for the following models:

- Logistic Regression
- LightGBM
- XGBoost
- Random Forest

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score
import sklearn.metrics as metrics
from sklearn.model_selection import  RandomizedSearchCV, GridSearchCV, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

In [2]:
# Read the pre-processed training and validation splits from disk.
train, valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_data_processed.csv',
        '../data/valid_data_processed.csv',
    )
)

In [3]:
# Separate the target column 'y' from the remaining feature columns.
y_train, y_valid = train['y'], valid['y']

X_train = train.drop('y', axis='columns')
X_valid = valid.drop('y', axis='columns')

In [4]:
# Sanity check: dimensions of each feature matrix / target vector, one per line.
print(*(part.shape for part in (X_train, y_train, X_valid, y_valid)), sep='\n')

(28824, 62)
(28824,)
(6176, 62)
(6176,)


## Feature scaling

In [5]:
# Fit the scaler on the training features only, so the validation split
# does not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)

# Apply the fitted scaler to both splits.
X_train_std, X_valid_std = (
    scaler.transform(features) for features in (X_train, X_valid)
)

## Hyperparameter Tuning

## 1. Logistic Regression

In [6]:
parameters = {
    'C': [0.001, 0.01, 0.1, 1.0, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# define the model
lr = LogisticRegression(random_state=2021, max_iter=200, class_weight='balanced')
cv_lr = GridSearchCV(lr, parameters, cv=5)

# run the random search cv on the train set to find the best parameters
%time cv_lr.fit(X_train_std, y_train)

CPU times: user 1min 22s, sys: 2.21 s, total: 1min 25s
Wall time: 28.1 s


GridSearchCV(cv=5,
             estimator=LogisticRegression(class_weight='balanced', max_iter=200,
                                          random_state=2021),
             param_grid={'C': [0.001, 0.01, 0.1, 1.0, 10, 100],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear']})

In [7]:
# Show the hyperparameter combination selected by the grid search.
best_params_lr = cv_lr.best_params_
print('The best parameters are: \n', best_params_lr)

The best parameters are: 
 {'C': 100, 'solver': 'newton-cg'}


In [8]:
# Final logistic-regression model.
# NOTE: C is deliberately set to 0.1 instead of the search's best_params_
# value of 100, because manual experimentation with 0.1 gave better results.
lr_settings = dict(
    random_state=2021,
    max_iter=200,
    class_weight='balanced',
    C=0.1,
    solver='newton-cg',
)
lr_model = LogisticRegression(**lr_settings)

# train on the standardised training features
lr_model.fit(X_train_std, y_train)

LogisticRegression(C=0.1, class_weight='balanced', max_iter=200,
                   random_state=2021, solver='newton-cg')

In [9]:
# Class-probability matrices for both splits. Column 1 is used for scoring
# (presumably the positive class y = 1 -- confirm against lr_model.classes_).
y_predict_train, y_predict_valid = (
    lr_model.predict_proba(features) for features in (X_train_std, X_valid_std)
)

# ROC-AUC on each split, computed from the column-1 probabilities
train_score_lr = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_lr = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

print("Training ROC-AUC score of Logistic Regression model: ", train_score_lr)
print("Validation ROC-AUC score of Logistic Regression model: ", valid_score_lr)

Training ROC-AUC score of Logistic Regression model:  0.8015866888060914
Validation ROC-AUC score of Logistic Regression model:  0.7912948885805665


## 2. LightGBM

In [10]:
# random search

parameters = {
    'num_leaves': [x for x in range(4,50, 3)],
    'max_depth':[x for x in range(-1,51, 3)],
    'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1, 0.3],
    'n_estimators': [x for x in range(100,1001, 100)],
}

# define the model
lgbm = lgb.LGBMClassifier(random_state=2021, objective='binary')
# cv_lgbm = GridSearchCV(lgbm, parameters, cv=5)
cv_lgbm = RandomizedSearchCV(lgbm, parameters, cv=5)

# run the random search cv on the train set to find the best parameters
%time cv_lgbm.fit(X_train_std, y_train)

CPU times: user 3min 45s, sys: 13.2 s, total: 3min 58s
Wall time: 35.3 s


RandomizedSearchCV(cv=5,
                   estimator=LGBMClassifier(objective='binary',
                                            random_state=2021),
                   param_distributions={'learning_rate': [0.01, 0.015, 0.025,
                                                          0.05, 0.1, 0.3],
                                        'max_depth': [-1, 2, 5, 8, 11, 14, 17,
                                                      20, 23, 26, 29, 32, 35,
                                                      38, 41, 44, 47, 50],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500, 600, 700, 800,
                                                         900, 1000],
                                        'num_leaves': [4, 7, 10, 13, 16, 19, 22,
                                                       25, 28, 31, 34, 37, 40,
                                                       43, 46, 49]})

In [11]:
# Show the hyperparameter combination selected by the random search.
best_params_lgbm = cv_lgbm.best_params_
print('The best parameters are: \n', best_params_lgbm)

The best parameters are: 
 {'num_leaves': 10, 'n_estimators': 900, 'max_depth': 11, 'learning_rate': 0.01}


In [12]:
# Final LightGBM model.
# The values below are hard-coded copies of the best_params_ printed by the
# random search; they are not read from cv_lgbm, so re-running the search
# does not change this model.
lgb_settings = dict(
    random_state=2021,
    objective='binary',
    num_leaves=10,
    max_depth=11,
    learning_rate=0.01,
    n_estimators=900,
)
lgb_model = lgb.LGBMClassifier(**lgb_settings)

# train on the standardised training features
lgb_model.fit(X_train_std, y_train)

LGBMClassifier(learning_rate=0.01, max_depth=11, n_estimators=900,
               num_leaves=10, objective='binary', random_state=2021)

In [13]:
# Class-probability matrices for both splits. Column 1 is used for scoring
# (presumably the positive class y = 1 -- confirm against lgb_model.classes_).
y_predict_train, y_predict_valid = (
    lgb_model.predict_proba(features) for features in (X_train_std, X_valid_std)
)

# ROC-AUC on each split, computed from the column-1 probabilities
train_score_lgb = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_lgb = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

print("Training ROC-AUC score of LightGBM model: ", train_score_lgb)
print("Validation ROC-AUC score of LightGBM model: ", valid_score_lgb)

Training ROC-AUC score of LightGBM model:  0.8366473202026334
Validation ROC-AUC score of LightGBM model:  0.8030689360926009


## 3. XGBoost

In [14]:
parameters = {
    'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1, 0.3],
    'n_estimators': [x for x in range(100,1001, 100)],
    'max_depth': [3, 4, 6, 8, 10],
    'min_child_weight': [1, 3, 5, 7],
    'subsample': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 1],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
}

# define the model
xgbc = xgb.XGBClassifier(random_state = 2021, verbosity=0)
cv_xgb = RandomizedSearchCV(xgbc, parameters, cv=5)

# run the random search cv on the train set to find the best parameters
%time cv_xgb.fit(X_train_std, y_train)



CPU times: user 48min 32s, sys: 1min 7s, total: 49min 40s
Wall time: 7min 32s


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
                                           scale_pos_weight=None,
                                           subsample=None, tree_method=None,
                                      

In [15]:
# Show the hyperparameter combination selected by the random search.
best_params_xgb = cv_xgb.best_params_
print('The best parameters are: \n', best_params_xgb)

The best parameters are: 
 {'subsample': 0.7, 'n_estimators': 100, 'min_child_weight': 7, 'max_depth': 10, 'learning_rate': 0.025, 'colsample_bytree': 0.6}


In [16]:
# Final XGBoost model.
# The values below are hard-coded copies of the best_params_ printed by the
# random search; they are not read from cv_xgb, so re-running the search
# does not change this model.
xgb_settings = dict(
    random_state=2021,
    learning_rate=0.025,
    n_estimators=100,
    max_depth=10,
    min_child_weight=7,
    subsample=0.7,
    colsample_bytree=0.6,
    verbosity=0,
)
xgb_model = xgb.XGBClassifier(**xgb_settings)

# train on the standardised training features
xgb_model.fit(X_train_std, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.025, max_delta_step=0, max_depth=10,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              random_state=2021, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.7, tree_method='exact', validate_parameters=1,
              verbosity=0)

In [17]:
# Class-probability matrices for both splits. Column 1 is used for scoring
# (presumably the positive class y = 1 -- confirm against xgb_model.classes_).
y_predict_train, y_predict_valid = (
    xgb_model.predict_proba(features) for features in (X_train_std, X_valid_std)
)

# ROC-AUC on each split, computed from the column-1 probabilities
train_score_xgb = roc_auc_score(y_true=y_train, y_score=y_predict_train[:, 1])
valid_score_xgb = roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1])

print("Training ROC-AUC score of XGBoost model: ", train_score_xgb)
print("Validation ROC-AUC score of XGBoost model: ", valid_score_xgb)

Training ROC-AUC score of XGBoost model:  0.854777113150471
Validation ROC-AUC score of XGBoost model:  0.8074324350731832


## 4. Random Forest

In [18]:
parameters = {
    'max_features': [x for x in range(15,(len(X_train.columns)+1), 50)],
    'n_estimators': [50, 100, 250, 500, 750, 1000], 
    'max_depth': [10, 25, 50, 75, 100] + [None]
}

# define the model
rf = RandomForestClassifier(random_state=2021)
cv_rf = RandomizedSearchCV(rf, parameters, cv=5)

# run the random search cv on the train set to find the best parameters
%time cv_rf.fit(X_train_std, y_train)

CPU times: user 9min 28s, sys: 8.37 s, total: 9min 37s
Wall time: 9min 48s


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=2021),
                   param_distributions={'max_depth': [10, 25, 50, 75, 100,
                                                      None],
                                        'max_features': [15],
                                        'n_estimators': [50, 100, 250, 500, 750,
                                                         1000]})

In [19]:
# Show the hyperparameter combination selected by the random search.
best_params_rf = cv_rf.best_params_
print('The best parameters are: \n', best_params_rf)

The best parameters are: 
 {'n_estimators': 500, 'max_features': 15, 'max_depth': 10}


In [20]:
# Final Random Forest model.
# The values below are hard-coded copies of the best_params_ printed by the
# random search; they are not read from cv_rf, so re-running the search
# does not change this model.
rf_settings = dict(
    random_state=2021,
    n_estimators=500,
    max_features=15,
    max_depth=10,
)
rf_model = RandomForestClassifier(**rf_settings)

# train on the standardised training features
rf_model.fit(X_train_std, y_train)

RandomForestClassifier(max_depth=10, max_features=15, n_estimators=500,
                       random_state=2021)

In [21]:
# predictions
y_predict_train = rf_model.predict_proba(X_train_std)
y_predict_valid = rf_model.predict_proba(X_valid_std)

# AUC scores
train_score_rf = roc_auc_score(y_train, y_predict_train[:,1])
valid_score_rf = roc_auc_score(y_valid, y_predict_valid[:,1])
print("Training ROC-AUC score of Random Forest model: ", train_score_rf)
print("Validation ROC-AUC score of Random Forest model: ", valid_score_rf)

Training ROC-AUC score of Random Forest model:  0.8752111136403355
Validation ROC-AUC score of Random Forest model:  0.8051141552511416


## Summary

In [22]:
# Gather the train / validation ROC-AUC of every tuned model, one row per
# model, in the order the models were tuned above.
summary = {}
summary['model'] = ['Logistic Regression', 'LightGBM', 'XGBoost', 'Random Forest']
summary['Train ROC-AUC score'] = [
    train_score_lr,
    train_score_lgb,
    train_score_xgb,
    train_score_rf,
]
summary['Validation ROC-AUC score'] = [
    valid_score_lr,
    valid_score_lgb,
    valid_score_xgb,
    valid_score_rf,
]

# last expression -> rendered as a table by the notebook
pd.DataFrame(summary)

Unnamed: 0,model,Train ROC-AUC score,Validation ROC-AUC score
0,Logistic Regression,0.801587,0.791295
1,LightGBM,0.836647,0.803069
2,XGBoost,0.854777,0.807432
3,Random Forest,0.875211,0.805114


### observations:

Comparing these tuned models to the initial ones with default parameters in [this notebook](https://github.com/rachelkriggs/rocket/blob/main/notebooks/03_model.ipynb):

- The Logistic Regression model overfits slightly more, but overall performs about the same.
- The LightGBM model's validation score decreased slightly; however, the model now overfits less.
- The XGBoost model's validation score improved, and the model overfits less.
- The Random Forest model's validation score increased and the model overfits less; however, it still overfits the most of the four models.