### Import Dependencies

In [12]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import (
                            classification_report, 
                            confusion_matrix,
                            roc_auc_score,
                            roc_curve,
                            precision_recall_curve,
                            f1_score,
                            precision_score,
                            recall_score,
                            accuracy_score
                            )
from sklearn.model_selection import (
                            StratifiedKFold,
                            cross_validate,
                            GridSearchCV,
                            RandomizedSearchCV
                            )

### Loading the Data

In [13]:
X_train = np.load('artifacts/X_train.npz', allow_pickle=True)['arr_0']
y_train = np.load('artifacts/y_train.npz', allow_pickle=True)['arr_0']
X_test = np.load('artifacts/X_test.npz', allow_pickle=True)['arr_0']
y_test = np.load('artifacts/y_test.npz', allow_pickle=True)['arr_0']

### Models

In [14]:
models = {
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'Cat Boost': CatBoostClassifier()
}

### Parameter Grids

In [15]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [6, 8, 12, 16],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


xgb_param_grid = {
    'n_estimators': [100, 200, 250, 300],
    'max_depth': [3, 4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2]
}


cat_param_grid ={
    'iterations': [200, 500, 1000],
    'depth': [2, 4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [32, 64, 128]
}


param_grids = {
    'Random Forest' : rf_param_grid,
    'XGBoost': xgb_param_grid,
    'Cat Boost': cat_param_grid 
}

### Configure K-Fold CV

In [16]:
cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True
)

In [17]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True
)

random_search_results = {}

for model_name, model in models.items():
    print(f'\nTuning {model_name}')
    
    # param_distributions instead of param_grid
    param_dist = param_grids[model_name]  
    
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=20,            
        cv=cv,
        scoring='f1',
        verbose=1,
        random_state=42,
        n_jobs=-1                
    )

    print(f'Fitting RandomizedSearchCV for {model_name}...')
    random_search.fit(X_train, y_train)
    random_search_results[model_name] = random_search

    print(f'\n{model_name} RandomizedSearchCV completed')
    print(f'Best parameters : {random_search.best_params_}')
    print(f'Best CV score : {random_search.best_score_}')



Tuning Random Forest
Fitting RandomizedSearchCV for Random Forest...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Random Forest RandomizedSearchCV completed
Best parameters : {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 16, 'criterion': 'log_loss'}
Best CV score : 0.8461323794960155

Tuning XGBoost
Fitting RandomizedSearchCV for XGBoost...
Fitting 5 folds for each of 20 candidates, totalling 100 fits

XGBoost RandomizedSearchCV completed
Best parameters : {'subsample': 0.8, 'n_estimators': 250, 'max_depth': 10, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.8}
Best CV score : 0.8441001466652912

Tuning Cat Boost
Fitting RandomizedSearchCV for Cat Boost...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
0:	learn: 0.6631270	total: 220ms	remaining: 1m 49s
1:	learn: 0.6401517	total: 263ms	remaining: 1m 5s
2:	learn: 0.6156170	total: 301ms	remaining: 49.9s
3:	learn: 0.5942027	total: 340ms	remaining: 42.2s
4:	learn

In [19]:
model_cat = CatBoostClassifier(
    learning_rate = 0.05,
    l2_leaf_reg = 3,
    iterations = 500,
    depth = 10,
    border_count = 64
)

In [20]:
model_cat.fit(X_train, y_train)

0:	learn: 0.6631270	total: 27ms	remaining: 13.5s
1:	learn: 0.6401517	total: 38.6ms	remaining: 9.61s
2:	learn: 0.6156170	total: 60.6ms	remaining: 10s
3:	learn: 0.5942027	total: 84.7ms	remaining: 10.5s
4:	learn: 0.5738110	total: 104ms	remaining: 10.3s
5:	learn: 0.5565053	total: 125ms	remaining: 10.3s
6:	learn: 0.5405845	total: 145ms	remaining: 10.2s
7:	learn: 0.5247854	total: 170ms	remaining: 10.5s
8:	learn: 0.5127305	total: 195ms	remaining: 10.7s
9:	learn: 0.5020329	total: 221ms	remaining: 10.8s
10:	learn: 0.4918764	total: 248ms	remaining: 11s
11:	learn: 0.4838873	total: 271ms	remaining: 11s
12:	learn: 0.4755816	total: 301ms	remaining: 11.3s
13:	learn: 0.4671482	total: 330ms	remaining: 11.4s
14:	learn: 0.4601473	total: 352ms	remaining: 11.4s
15:	learn: 0.4541666	total: 378ms	remaining: 11.4s
16:	learn: 0.4481471	total: 399ms	remaining: 11.3s
17:	learn: 0.4439831	total: 418ms	remaining: 11.2s
18:	learn: 0.4392236	total: 435ms	remaining: 11s
19:	learn: 0.4342043	total: 456ms	remaining: 10

<catboost.core.CatBoostClassifier at 0x189a2fd9b50>

In [21]:
y_hat_cat = model_cat.predict(X_test)

In [22]:
accuracy = accuracy_score(y_test, y_hat_cat)
precision = precision_score(y_test, y_hat_cat)
recall = recall_score(y_test, y_hat_cat)
f1 = f1_score(y_test, y_hat_cat)

print(f'accuracy: {accuracy}')
print(f'precision: {precision}')
print(f'recall: {recall}') 
print(f'f1_score: {f1}')

accuracy: 0.7622427253371186
precision: 0.5454545454545454
recall: 0.6256684491978609
f1_score: 0.5828144458281445
