In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

import optuna
from sklearn.model_selection import cross_validate

import sklearn
import category_encoders
from sklearn import set_config
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV, train_test_split
import pickle
set_config(transform_output = "pandas")

import optuna.logging
optuna.logging.set_verbosity(optuna.logging.WARNING)
import os


In [4]:
# reading datasets
with open('data/cleaned/xtrain.pkl', 'rb') as f:
    xtrain = pickle.load(f)
with open('data/cleaned/xtest.pkl', 'rb') as f:
    xtest = pickle.load(f)
with open('data/cleaned/xval.pkl', 'rb') as f:
    xval = pickle.load(f)
with open('data/cleaned/ytrain.pkl', 'rb') as f:
    ytrain = pickle.load(f)
with open('data/cleaned/ytest.pkl', 'rb') as f:
    ytest = pickle.load(f)
with open('data/cleaned/yval.pkl', 'rb') as f:
    yval = pickle.load(f)

## Modeling

In [19]:
# defining some utility functions
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, log_loss

# dataframe to store model performance metrics
model_performance = pd.DataFrame(columns=['model_name', 'train_accuracy', 'test_accuracy', 'train_precision', 'test_precision',
                                          'train_recall', 'test_recall', 'train_f1', 'test_f1', 'train_roc_auc', 'test_roc_auc', 
                                          'train_log_loss', 'test_log_loss'])
# dictionary to store the models
estimators = {}

# function to calculate model performance metrics
def model_scorer(clf, x, y, prefix=''):
    ypred = clf.predict(x)
    yprob = clf.predict_proba(x)[:, 1]
    score_dict = {
        prefix+'accuracy': np.round(accuracy_score(y, ypred),3),
        prefix+'precision': np.round(precision_score(y, ypred),3),
        prefix+'recall': np.round(recall_score(y, ypred),3),
        prefix+'f1': np.round(f1_score(y, ypred),3),
        prefix+'roc_auc': np.round(roc_auc_score(y, yprob),3),
        prefix+'log_loss': np.round(log_loss(y, yprob),3)
    }
    return score_dict

# function to add model performance to the dataframe and model to the dictionary
def add_model_performance(model_name, clf, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest, df=model_performance, model_dict=estimators):
    train_scores = model_scorer(clf, xtrain, ytrain, prefix='train_')
    test_scores = model_scorer(clf, xtest, ytest, prefix='test_')
    train_scores.update(test_scores)
    train_scores['model_name'] = model_name
    df.loc[df.shape[0]] = train_scores
    model_dict[model_name] = clf
    return df,model_dict

#### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

# defining the objective function for the optimization
def objective(trial, xtrain, ytrain, kf):
    params = {
        'C': trial.suggest_loguniform('C', 1e-3, 1e3),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear'])
    }
    model = LogisticRegression(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

# optimization using kfold cross validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain.values, ytrain.values.ravel(), kf)
study = optuna.create_study(direction='minimize')
study.optimize(obj_func, n_trials=30)

# printing the best score and best parameters
print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

# training the model with the best parameters
best_model = LogisticRegression(**study.best_params).fit(xtrain.values, ytrain.values.ravel())

# adding the model performance to the dataframe and model to the dictionary
model_performance, estimators = add_model_performance(model_name='LogisticRegression', clf=best_model)

# displaying the model performance
display(model_performance)


Best score: 0.390138
Best params: {'C': 0.6472139377148375, 'penalty': 'l2', 'solver': 'liblinear'}


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388


#### Decision Tree Classifier

In [23]:
from sklearn.tree import DecisionTreeClassifier

def objective(trial, xtrain, ytrain, kf):
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 300, 7),
        'min_samples_split': trial.suggest_int('min_samples_split', 50, 1000, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 50, 1000, 50),
    }
    model = DecisionTreeClassifier(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain.values, ytrain.values.ravel(), kf)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=50)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = DecisionTreeClassifier(**study.best_params).fit(xtrain.values, ytrain.values.ravel())
model_performance, estimators = add_model_performance(model_name='DecisionTreeClassifier', clf=best_model)

display(model_performance)


Best score: 0.193432
Best params: {'max_depth': 113, 'min_samples_split': 600, 'min_samples_leaf': 100}


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191


#### Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

def objective(trial, xtrain, ytrain, kf):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1001, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 100, 10),
    }
    model = RandomForestClassifier(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain.values, ytrain.values.ravel(), kf)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=5)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = RandomForestClassifier(**study.best_params).fit(xtrain.values, ytrain.values.ravel())
model_performance, estimators = add_model_performance(model_name='RandomForestClassifier', clf=best_model)

display(model_performance)


Best score: 0.189381
Best params: {'n_estimators': 800, 'min_samples_split': 62, 'min_samples_leaf': 11}


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186


#### XGBoost Classifier

In [26]:
from xgboost import XGBClassifier

def objective(trial, xtrain, ytrain, kf):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 501, 50),
        'max_depth': trial.suggest_categorical('max_depth', [1,2]),
    }
    model = XGBClassifier(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain.values, ytrain.values.ravel(), kf)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=50)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = XGBClassifier(**study.best_params).fit(xtrain.values, ytrain.values.ravel())
model_performance, estimators = add_model_performance(model_name='XGBClassifier', clf=best_model)

display(model_performance)

Best score: 0.254015
Best params: {'n_estimators': 460, 'max_depth': 2}


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186
3,XGBClassifier,0.914,0.914,0.9,0.9,0.952,0.951,0.925,0.925,0.95,0.949,0.251,0.252


#### KNN Classifier

In [28]:
from sklearn.neighbors import KNeighborsClassifier

def objective(trial, xtrain, ytrain, kf):
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 20, 4),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
    }
    model = KNeighborsClassifier(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='accuracy', n_jobs=-1)
    loss = scores['test_score'].mean()
    return loss

xtrain_sample = xtrain.sample(frac=0.5, random_state=42)
ytrain_sample = ytrain.loc[xtrain_sample.index]
kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain_sample.values, ytrain_sample.values.ravel(), kf)
study = optuna.create_study(direction='maximize',)
study.optimize(obj_func, n_trials=10)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = KNeighborsClassifier(**study.best_params).fit(xtrain.values, ytrain.values.ravel())
model_performance, estimators = add_model_performance(model_name='KNeighborsClassifier', clf=best_model)

display(model_performance)


Best score: 0.898761
Best params: {'n_neighbors': 9, 'weights': 'distance'}


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186
3,XGBClassifier,0.914,0.914,0.9,0.9,0.952,0.951,0.925,0.925,0.95,0.949,0.251,0.252
4,KNeighborsClassifier,1.0,0.907,1.0,0.901,1.0,0.935,1.0,0.918,1.0,0.938,0.0,0.896


#### Stacking Classifier with Logistic Regression as Meta Classifier

In [29]:
from sklearn.ensemble import StackingClassifier

def objective(trial, xtrain, ytrain, kf, estimators_list):
    params = {
        'C': trial.suggest_loguniform('C', 1e-3, 1e3),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear'])
    }
    model = LogisticRegression(**params)
    stacker = StackingClassifier(estimators=estimators_list, final_estimator=model, cv=None, n_jobs=-1)
    scores = cross_validate(stacker, xtrain.values, ytrain.values.ravel(), cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

estimators_list = [(k, v) for k, v in estimators.items()]
kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xval, yval, kf, estimators_list)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=5)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = StackingClassifier(estimators=estimators_list, final_estimator=LogisticRegression(**study.best_params), cv=None, n_jobs=-1)
best_model.fit(xval.values, yval.values.ravel())
model_performance, estimators = add_model_performance(model_name='StackingLR', clf=best_model)

display(model_performance)




Best score: 0.201541
Best params: {'C': 773.9379121439813, 'penalty': 'l1', 'solver': 'liblinear'}


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186
3,XGBClassifier,0.914,0.914,0.9,0.9,0.952,0.951,0.925,0.925,0.95,0.949,0.251,0.252
4,KNeighborsClassifier,1.0,0.907,1.0,0.901,1.0,0.935,1.0,0.918,1.0,0.938,0.0,0.896
5,StackingLR,0.934,0.934,0.898,0.898,0.994,0.994,0.943,0.944,0.953,0.953,0.198,0.197


#### Stacking Classifier with XGBoost as final estimator

In [30]:
## optuna tuning
def objective(trial, xtrain, ytrain, kf, estimators_list):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 501, 50),
        'max_depth': trial.suggest_categorical('max_depth', [1,2]),
    }
    model = XGBClassifier(**params)
    stacker = StackingClassifier(estimators=estimators_list, final_estimator=model, cv=None, n_jobs=-1)
    scores = cross_validate(stacker, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

estimators_list = [(k, v) for k, v in estimators.items()]
kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xval.values, yval.values.ravel(), kf, estimators_list)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=1)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = StackingClassifier(estimators=estimators_list, final_estimator=XGBClassifier(**study.best_params), cv=None, n_jobs=-1)
best_model.fit(xval.values, yval.values.ravel())
model_performance, estimators = add_model_performance(model_name='StackingXGB', clf=best_model)

display(model_performance)




Best score: 0.183139
Best params: {'n_estimators': 110, 'max_depth': 2}




Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186
3,XGBClassifier,0.914,0.914,0.9,0.9,0.952,0.951,0.925,0.925,0.95,0.949,0.251,0.252
4,KNeighborsClassifier,1.0,0.907,1.0,0.901,1.0,0.935,1.0,0.918,1.0,0.938,0.0,0.896
5,StackingLR,0.934,0.934,0.898,0.898,0.994,0.994,0.943,0.944,0.953,0.953,0.198,0.197
6,StackingXGB,0.936,0.937,0.897,0.898,0.999,0.999,0.946,0.946,0.953,0.953,0.182,0.181


#### Support Vector Classifier

In [23]:
from sklearn.svm import SVC

def objective(trial, xtrain, ytrain, kf):
    params = {
        'C': trial.suggest_loguniform('C', 1e-3, 1e3),
        'kernel': trial.suggest_categorical('kernel', ['poly', 'rbf', 'sigmoid']),
        'degree': trial.suggest_int('degree', 2, 4),
        'probability': True,
    }
    model = SVC(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

xtrain_sample = xtrain.sample(frac=0.1, random_state=42)
ytrain_sample = ytrain.loc[xtrain_sample.index]
kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain_sample.values, ytrain_sample.values.ravel(), kf)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=5)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = SVC(**study.best_params, probability=True).fit(xtrain_sample.values, ytrain_sample.values.ravel())
model_performance, estimators = add_model_performance(model_name='SVC', clf=best_model)

display(model_performance)


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,SVC,0.892,0.893,0.897,0.898,0.91,0.911,0.904,0.904,0.935,0.936,0.302,0.299


#### LightGBM Classifier

In [26]:
from lightgbm import LGBMClassifier
def objective(trial, xtrain, ytrain, kf):
    params = {
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt']),
        'max_depth': trial.suggest_categorical('max_depth', [1, 2]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e0),
        'n_estimators': trial.suggest_int('n_estimators', 10, 3001, 10),
        'verbosity':-1
    }
    model = LGBMClassifier(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain.values, ytrain.values.ravel(), kf)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=50)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = LGBMClassifier(**study.best_params).fit(xtrain.values, ytrain.values.ravel())
model_performance, estimators = add_model_performance(model_name='LGBMClassifier', clf=best_model, df=model_performance, model_dict=estimators)

display(model_performance)


Best score: 0.269692
Best params: {'boosting_type': 'gbdt', 'max_depth': 2, 'learning_rate': 0.03408980842127418, 'n_estimators': 560}
[LightGBM] [Info] Number of positive: 157075, number of negative: 125839
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001525 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 482
[LightGBM] [Info] Number of data points in the train set: 282914, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.555204 -> initscore=0.221720
[LightGBM] [Info] Start training from score 0.221720


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186
3,XGBClassifier,0.914,0.914,0.9,0.9,0.952,0.951,0.925,0.925,0.95,0.949,0.251,0.252
4,KNeighborsClassifier,1.0,0.907,1.0,0.901,1.0,0.935,1.0,0.918,1.0,0.938,0.0,0.896
5,StackingLR,0.934,0.934,0.898,0.898,0.994,0.994,0.943,0.944,0.953,0.953,0.198,0.197
6,StackingXGB,0.936,0.937,0.897,0.898,0.999,0.999,0.946,0.946,0.953,0.953,0.182,0.181
7,LGBMClassifier,0.91,0.91,0.9,0.9,0.943,0.941,0.921,0.92,0.947,0.947,0.269,0.268


#### CatBoost Classifier

In [27]:
from catboost import CatBoostClassifier
def objective(trial, xtrain, ytrain, kf):
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e0),
        'depth': trial.suggest_categorical('depth', [1, 2]),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 1e3),
        'model_size_reg': trial.suggest_loguniform('model_size_reg', 1e-3, 1e3),
        'verbose': 0
    }
    model = CatBoostClassifier(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain.values, ytrain.values.ravel(), kf)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=50)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = CatBoostClassifier(**study.best_params).fit(xtrain.values, ytrain.values.ravel())
model_performance, estimators = add_model_performance(model_name='CatBoostClassifier', clf=best_model, df=model_performance, model_dict=estimators)

display(model_performance)

Best score: 0.353479
Best params: {'learning_rate': 0.005395480987241722, 'depth': 1, 'l2_leaf_reg': 1.3946066306928457, 'model_size_reg': 76.11035091840746}
0:	learn: 0.6910423	total: 66.7ms	remaining: 1m 6s
1:	learn: 0.6889661	total: 76.6ms	remaining: 38.2s
2:	learn: 0.6869162	total: 85.9ms	remaining: 28.6s
3:	learn: 0.6848925	total: 95ms	remaining: 23.7s
4:	learn: 0.6828915	total: 109ms	remaining: 21.6s
5:	learn: 0.6809136	total: 119ms	remaining: 19.7s
6:	learn: 0.6789650	total: 128ms	remaining: 18.1s
7:	learn: 0.6770385	total: 137ms	remaining: 17s
8:	learn: 0.6751332	total: 146ms	remaining: 16.1s
9:	learn: 0.6732556	total: 156ms	remaining: 15.4s
10:	learn: 0.6713978	total: 165ms	remaining: 14.8s
11:	learn: 0.6695686	total: 174ms	remaining: 14.4s
12:	learn: 0.6677568	total: 185ms	remaining: 14.1s
13:	learn: 0.6659726	total: 195ms	remaining: 13.8s
14:	learn: 0.6642088	total: 205ms	remaining: 13.4s
15:	learn: 0.6624655	total: 214ms	remaining: 13.1s
16:	learn: 0.6607485	total: 223ms	re

Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186
3,XGBClassifier,0.914,0.914,0.9,0.9,0.952,0.951,0.925,0.925,0.95,0.949,0.251,0.252
4,KNeighborsClassifier,1.0,0.907,1.0,0.901,1.0,0.935,1.0,0.918,1.0,0.938,0.0,0.896
5,StackingLR,0.934,0.934,0.898,0.898,0.994,0.994,0.943,0.944,0.953,0.953,0.198,0.197
6,StackingXGB,0.936,0.937,0.897,0.898,0.999,0.999,0.946,0.946,0.953,0.953,0.182,0.181
7,LGBMClassifier,0.91,0.91,0.9,0.9,0.943,0.941,0.921,0.92,0.947,0.947,0.269,0.268
8,CatBoostClassifier,0.884,0.885,0.899,0.9,0.892,0.893,0.896,0.896,0.938,0.938,0.353,0.352


#### AdaBoost Classifier

In [29]:
from sklearn.ensemble import AdaBoostClassifier
def objective(trial, xtrain, ytrain, kf):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 501, 50),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e0),
    }
    model = AdaBoostClassifier(**params)
    scores = cross_validate(model, xtrain, ytrain, cv=kf, scoring='neg_log_loss', n_jobs=-1)
    loss = -scores['test_score'].mean()
    return loss

kf = KFold(n_splits=3, shuffle=True, random_state=42)
obj_func = lambda trial: objective(trial, xtrain.values, ytrain.values.ravel(), kf)
study = optuna.create_study(direction='minimize',)
study.optimize(obj_func, n_trials=50)

print(f"Best score: {study.best_value:5f}")
print(f"Best params: {study.best_params}")

best_model = AdaBoostClassifier(**study.best_params).fit(xtrain.values, ytrain.values.ravel())
model_performance, estimators = add_model_performance(model_name='AdaBoostClassifier', clf=best_model, df=model_performance, model_dict=estimators)

display(model_performance)



Best score: 0.526830
Best params: {'n_estimators': 10, 'learning_rate': 0.0054699267053919896}


Unnamed: 0,model_name,train_accuracy,test_accuracy,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_roc_auc,test_roc_auc,train_log_loss,test_log_loss
0,LogisticRegression,0.848,0.849,0.874,0.873,0.849,0.852,0.861,0.863,0.908,0.91,0.39,0.388
1,DecisionTreeClassifier,0.935,0.935,0.897,0.898,0.997,0.997,0.944,0.945,0.962,0.953,0.182,0.191
2,RandomForestClassifier,0.936,0.936,0.898,0.898,0.998,0.998,0.945,0.946,0.996,0.954,0.162,0.186
3,XGBClassifier,0.914,0.914,0.9,0.9,0.952,0.951,0.925,0.925,0.95,0.949,0.251,0.252
4,KNeighborsClassifier,1.0,0.907,1.0,0.901,1.0,0.935,1.0,0.918,1.0,0.938,0.0,0.896
5,StackingLR,0.934,0.934,0.898,0.898,0.994,0.994,0.943,0.944,0.953,0.953,0.198,0.197
6,StackingXGB,0.936,0.937,0.897,0.898,0.999,0.999,0.946,0.946,0.953,0.953,0.182,0.181
7,LGBMClassifier,0.91,0.91,0.9,0.9,0.943,0.941,0.921,0.92,0.947,0.947,0.269,0.268
8,CatBoostClassifier,0.884,0.885,0.899,0.9,0.892,0.893,0.896,0.896,0.938,0.938,0.353,0.352
9,AdaBoostClassifier,0.74,0.742,0.907,0.909,0.592,0.595,0.717,0.719,0.758,0.76,0.527,0.524


In [24]:
# saving the models, model performance and estimators
for model_name, model in estimators.items():
    with open(f'data/{model_name}_final.pkl', 'wb') as f:
        pickle.dump(model, f)
with open('data/model_performance_final.pkl', 'wb') as f:
    pickle.dump(model_performance, f)
with open('data/estimators_final.pkl', 'wb') as f:
    pickle.dump(estimators, f)

# Conclusion
The best model out of all the trained ones is Stacking Classifier with XGBoost as the final model.