### Import Dependencies

In [21]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import (
                            classification_report, 
                            confusion_matrix,
                            roc_auc_score,
                            roc_curve,
                            precision_recall_curve,
                            f1_score,
                            precision_score,
                            recall_score,
                            accuracy_score
                            )
from sklearn.model_selection import (
                            StratifiedKFold,
                            cross_validate,
                            GridSearchCV
                            )

### Loading the Data

In [9]:
X_train = np.load('artifacts/X_train.npz')['arr_0']
y_train = np.load('artifacts/y_train.npz')['arr_0']
X_test = np.load('artifacts/X_test.npz')['arr_0']
y_test = np.load('artifacts/y_test.npz')['arr_0']

In [8]:
y_train

array([0, 0, 0, ..., 1, 0, 0], shape=(5634,))

### Base Model Training

In [10]:
model_lr = LogisticRegression(
    random_state=42
)

model_lr.fit(X_train, y_train)


model_dt = DecisionTreeClassifier(
    random_state=42
)

model_dt.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [11]:
y_hat_lr = model_lr.predict(X_test)
y_hat_dt = model_dt.predict(X_test)

In [12]:
accuracy = accuracy_score(y_test, y_hat_lr)
precision = precision_score(y_test, y_hat_lr)
recall = recall_score(y_test, y_hat_lr)
f1_score = f1_score(y_test, y_hat_lr)

print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}') 
print(f'f1_score: {f1_score}')

accuracy: 0.7991483321504613
precision: 0.6521739130434783
recall: 0.5213903743315508
f1_score: 0.5794947994056464


### Ensemble Model Implementation

In [20]:
models = {
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'Cat Boost': CatBoostClassifier()
}

### Hyper Parameter Grids

In [23]:
rf_param_grid={
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2,5],
    'max_features': ['sqrt', 'log2']
}

xgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate' : [0.01, 0.1],
    'max_depth' : [3, 5, 7],
    'subsample': [0.8, 1.0],
}

cat_param_grid = {
    'iterations': [100, 200],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
}

param_grids = {
    'Random Forest' : rf_param_grid,
    'XGBoost': xgb_param_grid,
    'Cat Boost': cat_param_grid 
}

### Configure K-Fold CV

In [22]:
cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True
)

### Finding the Best Hyper Parameters

In [25]:
grid_search_results = {}

for model_name, model in models.items():
    print(f'\n Tuning {model_name}')
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(
        estimator = model,
        param_grid = param_grid,
        cv=cv,
        scoring = 'f1',
        verbose=1,
    )

    print(f'Fitting gridSearchCV for {model_name}')
    grid_search.fit(X_train, y_train)
    grid_search_results[model_name] = grid_search

    print(f'\n{model_name} gridSearchCV completed')
    print(f'Best parameters : {grid_search.best_params_}')
    print(f'Best CV score : {grid_search.best_score_}')


 Tuning Random Forest
Fitting gridSearchCV for Random Forest
Fitting 5 folds for each of 16 candidates, totalling 80 fits

Random Forest gridSearchCV completed
Best parameters : {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
Best CV score : 0.5707470110341492

 Tuning XGBoost
Fitting gridSearchCV for XGBoost
Fitting 5 folds for each of 24 candidates, totalling 120 fits

XGBoost gridSearchCV completed
Best parameters : {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Best CV score : 0.5872706369923362

 Tuning Cat Boost
Fitting gridSearchCV for Cat Boost
Fitting 5 folds for each of 12 candidates, totalling 60 fits
0:	learn: 0.6868226	total: 276ms	remaining: 27.3s
1:	learn: 0.6814389	total: 280ms	remaining: 13.7s
2:	learn: 0.6765428	total: 282ms	remaining: 9.13s
3:	learn: 0.6705778	total: 284ms	remaining: 6.82s
4:	learn: 0.6648675	total: 286ms	remaining: 5.44s
5:	learn: 0.6591040	total: 289ms	remaining: 4.52s
6:	learn: