#### Model Training
Import Data and Required Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

Import ML models and evaluation metrics

In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV

import warnings

Define evaluate_model function to evaluate the following metrics on the true and predicted values:
* precision (define each)
* recall
* f1 score
* ROC_AUC
* PR_AUC

In [3]:
def evaluate_model(true, predicted, model, X_test):
    precision = precision_score(true, predicted , zero_division = 0)
    recall = recall_score(true, predicted , zero_division = 0)
    f1 = f1_score(true , predicted, zero_division = 0)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(true, y_pred_proba)
    pr_auc = average_precision_score(true, y_pred_proba)
    return precision, recall, f1, roc_auc, pr_auc

Define function to train and evaluate each model for the scaled data

* Logistic Regression (describe each)
* Logistic Regression with L1 
* Logistic Regression with L2
* Support vector classifier
* K-Neighbors classifier
* Decision tree
* Rand forest classifier
* XGB classifier
* Catboost classifier
* Adaboost classifier
* Gradient boost classifier

In [8]:
def classification(X_train, y_train, X_test, y_test):

    models = {
        "Logistic Regression": LogisticRegression(),

        "Lasso": LogisticRegression(penalty='l1', solver='liblinear'),
   
        "Ridge": LogisticRegression(penalty='l2', solver='liblinear'),

        #"Support Vector Machine": SVC(),

        "K-Neighbors Classifier": KNeighborsClassifier(),

        "Decision Tree": DecisionTreeClassifier(),

        "Random Forest Classifier": RandomForestClassifier(),

        "XGBClassifier": XGBClassifier(), 
        
        "CatBoosting Classifier": CatBoostClassifier(verbose=False),

        "AdaBoost Classifier": AdaBoostClassifier(),

        "GradientBoosting Classifier": GradientBoostingClassifier()
    }
 
    # Lists to store results
    results_train = []
    results_test = []
    
    for model_name, model in models.items():
 
        # Train model
        model.fit(X_train, y_train)
        
        # Training predictions
        y_train_pred = model.predict(X_train)
        accuracy_train = accuracy_score(y_train, y_train_pred)
        train_precision, train_recall, train_f1, train_roc_auc, train_pr_auc = evaluate_model(
            y_train, y_train_pred, model, X_train
        )
        
        # Test predictions
        y_test_pred = model.predict(X_test)
        accuracy_test = accuracy_score(y_test, y_test_pred)
        test_precision, test_recall, test_f1, test_roc_auc, test_pr_auc = evaluate_model(
            y_test, y_test_pred, model, X_test
        )
        
        # Store training results
        results_train.append({
            'Model': model_name,
            'Accuracy': accuracy_train,
            'Precision': train_precision,
            'Recall': train_recall,
            'F1 Score': train_f1,
            'ROC AUC': train_roc_auc,
            'PR AUC': train_pr_auc
        })
        
        # Store test results
        results_test.append({
            'Model': model_name,
            'Accuracy': accuracy_test,
            'Precision': test_precision,
            'Recall': test_recall,
            'F1 Score': test_f1,
            'ROC AUC': test_roc_auc,
            'PR AUC': test_pr_auc
        })
    
    # Create DataFrames
    df_train = pd.DataFrame(results_train)
    df_test = pd.DataFrame(results_test)
    
    # Display tables
    print("=" * 120)
    print("TRAINING SET PERFORMANCE")
    print("=" * 120)
    print(df_train.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
    print("\n")
    
    print("=" * 120)
    print("TEST SET PERFORMANCE")
    print("=" * 120)
    print(df_test.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
    print("\n")
    

Load in the scaled data to train

In [5]:
X_train=pd.read_csv('data/scaled_unbalanced_X_train.csv')
y_train=pd.read_csv('data/y_train.csv')

In [6]:
X_test=pd.read_csv('data/X_test.csv')
y_test=pd.read_csv('data/y_test.csv')

Run the models with the scaled and unbalanced data

In [7]:
classification(X_train, y_train, X_test, y_test)

Logistic Regression
LogisticRegression()


  y = column_or_1d(y, warn=True)


Lasso
LogisticRegression(penalty='l1', solver='liblinear')


  y = column_or_1d(y, warn=True)


Ridge
LogisticRegression(solver='liblinear')


  y = column_or_1d(y, warn=True)


K-Neighbors Classifier
KNeighborsClassifier()


  return self._fit(X, y)


Decision Tree
DecisionTreeClassifier()
Random Forest Classifier
RandomForestClassifier()


  return fit_method(estimator, *args, **kwargs)


XGBClassifier
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...)
CatBoosting Classifier
<catboost.core.CatBoostClassifier object at 0x000002328162A5D0>
AdaBoost Classifier
AdaBoostClassifier()


  y = column_or_1d(y, warn=True)


GradientBoosting Classifier
GradientBoostingClassifier()


  y = column_or_1d(y, warn=True)


TRAINING SET PERFORMANCE
                      Model  Accuracy  Precision  Recall  F1 Score  ROC AUC  PR AUC
        Logistic Regression    0.8475     0.5512  0.1815    0.2730   0.8142  0.4290
                      Lasso    0.8475     0.5514  0.1816    0.2732   0.8143  0.4289
                      Ridge    0.8475     0.5517  0.1815    0.2731   0.8142  0.4290
     K-Neighbors Classifier    0.8733     0.6799  0.3736    0.4822   0.8957  0.5518
              Decision Tree    0.9871     0.9969  0.9209    0.9574   0.9992  0.9947
   Random Forest Classifier    0.9870     0.9885  0.9287    0.9576   0.9972  0.9910
              XGBClassifier    0.8598     0.6556  0.2358    0.3468   0.8432  0.5225
     CatBoosting Classifier    0.8627     0.6829  0.2434    0.3589   0.8443  0.5411
        AdaBoost Classifier    0.8493     0.5580  0.2179    0.3134   0.8184  0.4477
GradientBoosting Classifier    0.8512     0.5819  0.2041    0.3022   0.8233  0.4602


TEST SET PERFORMANCE
                      Model 

Observations:
Looking at ROC_AUC, Adaboost performs best, with also best PR AUC
Random forest has best f1 score

Intepreting this with unbalanced dataset we would want to penalize misclassifying the minority class (1 diabetes) more heavily - which means we want to maximize precision (minimizing false positives).

ONce again, Adaboost has best precision and probably would be model of choice for unbalanced and scaled data


Load in the scaled and Balanced datasets

In [9]:
X_train=pd.read_csv('data/scaled_balanced_X_train.csv')
y_train=pd.read_csv('data/scaled_balanced_y_train.csv')

Run the classification models on the scaled and Balanced dataset

In [10]:
classification(X_train, y_train, X_test, y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


TRAINING SET PERFORMANCE
                      Model  Accuracy  Precision  Recall  F1 Score  ROC AUC  PR AUC
        Logistic Regression    0.7428     0.7337  0.7624    0.7478   0.8172  0.7819
                      Lasso    0.7429     0.7338  0.7623    0.7478   0.8172  0.7819
                      Ridge    0.7428     0.7337  0.7622    0.7477   0.8172  0.7819
     K-Neighbors Classifier    0.8562     0.8098  0.9311    0.8662   0.9475  0.9331
              Decision Tree    0.9898     0.9981  0.9814    0.9897   0.9998  0.9997
   Random Forest Classifier    0.9897     0.9938  0.9856    0.9897   0.9991  0.9992
              XGBClassifier    0.8662     0.9089  0.8141    0.8589   0.9447  0.9547
     CatBoosting Classifier    0.8755     0.9212  0.8213    0.8684   0.9492  0.9588
        AdaBoost Classifier    0.7892     0.7756  0.8139    0.7943   0.8755  0.8761
GradientBoosting Classifier    0.8328     0.8305  0.8364    0.8334   0.9205  0.9300


TEST SET PERFORMANCE
                      Model 

Observations of results:

Adaboost once again with the balanced dataset has best ROC_AUC and best PR AUC

Ada boost and gradient boost are very good on recall but low on precision

Once again random forest has best f1 score



Define function for models that do not require scaling

In [11]:
def no_scale_classification(X_train, y_train, X_test, y_test):

    models = {
        "Decision Tree": DecisionTreeClassifier(),

        "Random Forest Classifier": RandomForestClassifier(),

        "XGBClassifier": XGBClassifier(), 
        
        "CatBoosting Classifier": CatBoostClassifier(verbose=False),

        "AdaBoost Classifier": AdaBoostClassifier(),

        "GradientBoosting Classifier": GradientBoostingClassifier()
    }
 
    # Lists to store results
    results_train = []
    results_test = []
    
    for model_name, model in models.items():
        # Train model
        model.fit(X_train, y_train)
        
        # Training predictions
        y_train_pred = model.predict(X_train)
        accuracy_train = accuracy_score(y_train, y_train_pred)
        train_precision, train_recall, train_f1, train_roc_auc, train_pr_auc = evaluate_model(
            y_train, y_train_pred, model, X_train
        )
        
        # Test predictions
        y_test_pred = model.predict(X_test)
        accuracy_test = accuracy_score(y_test, y_test_pred)
        test_precision, test_recall, test_f1, test_roc_auc, test_pr_auc = evaluate_model(
            y_test, y_test_pred, model, X_test
        )
        
        # Store training results
        results_train.append({
            'Model': model_name,
            'Accuracy': accuracy_train,
            'Precision': train_precision,
            'Recall': train_recall,
            'F1 Score': train_f1,
            'ROC AUC': train_roc_auc,
            'PR AUC': train_pr_auc
        })
        
        # Store test results
        results_test.append({
            'Model': model_name,
            'Accuracy': accuracy_test,
            'Precision': test_precision,
            'Recall': test_recall,
            'F1 Score': test_f1,
            'ROC AUC': test_roc_auc,
            'PR AUC': test_pr_auc
        })
    
    # Create DataFrames
    df_train = pd.DataFrame(results_train)
    df_test = pd.DataFrame(results_test)
    
    # Display tables
    print("=" * 120)
    print("TRAINING SET PERFORMANCE")
    print("=" * 120)
    print(df_train.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
    print("\n")
    
    print("=" * 120)
    print("TEST SET PERFORMANCE")
    print("=" * 120)
    print(df_test.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
    print("\n")
    

Load in the unscaled and unbalanced data

In [12]:
X_train=pd.read_csv('data/unscaled_unbalanced_X_train.csv')
y_train=pd.read_csv('data/y_train.csv')

Run the classification on the unscaled and unbalanced data

In [13]:
no_scale_classification(X_train, y_train, X_test, y_test)

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


TRAINING SET PERFORMANCE
                      Model  Accuracy  Precision  Recall  F1 Score  ROC AUC  PR AUC
              Decision Tree    0.9871     0.9969  0.9209    0.9574   0.9992  0.9947
   Random Forest Classifier    0.9870     0.9890  0.9282    0.9576   0.9973  0.9910
              XGBClassifier    0.8598     0.6556  0.2358    0.3468   0.8432  0.5225
     CatBoosting Classifier    0.8627     0.6829  0.2434    0.3589   0.8443  0.5411
        AdaBoost Classifier    0.8493     0.5580  0.2179    0.3134   0.8184  0.4477
GradientBoosting Classifier    0.8512     0.5819  0.2041    0.3022   0.8233  0.4602


TEST SET PERFORMANCE
                      Model  Accuracy  Precision  Recall  F1 Score  ROC AUC  PR AUC
              Decision Tree    0.7851     0.3220  0.3374    0.3295   0.5979  0.2157
   Random Forest Classifier    0.8396     0.4739  0.2278    0.3077   0.7856  0.3794
              XGBClassifier    0.8506     0.5616  0.2061    0.3016   0.8221  0.4474
     CatBoosting Classifier 

Observations on the unscaled vs scaled unbalanced data:

All the boosting models: XGB, CatBoost, AdaBoost and GradientBoosting have similar ROC AUC scores and PR AUC. Gradient boost has the best scores here.

These scores are higher with unscaled data.
With unbalanced data to max precision, gradient boosting also scores highest. 

Decision tree has best f1 score again. 

I would choose gradient boost for unscaled unbalanced data. 



Load in the unscaled and Balanced data

In [14]:
X_train=pd.read_csv('data/unscaled_balanced_X_train.csv')
y_train=pd.read_csv('data/unscaled_balanced_y_train.csv')

Run the classification models on unscaled and balanced data

In [15]:
no_scale_classification(X_train, y_train, X_test, y_test)

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


TRAINING SET PERFORMANCE
                      Model  Accuracy  Precision  Recall  F1 Score  ROC AUC  PR AUC
              Decision Tree    0.9898     0.9981  0.9814    0.9897   0.9998  0.9997
   Random Forest Classifier    0.9897     0.9942  0.9852    0.9897   0.9992  0.9993
              XGBClassifier    0.8663     0.9062  0.8173    0.8595   0.9449  0.9548
     CatBoosting Classifier    0.8758     0.9196  0.8235    0.8689   0.9498  0.9592
        AdaBoost Classifier    0.8165     0.8107  0.8259    0.8182   0.9063  0.9162
GradientBoosting Classifier    0.8455     0.8592  0.8265    0.8425   0.9298  0.9404


TEST SET PERFORMANCE
                      Model  Accuracy  Precision  Recall  F1 Score  ROC AUC  PR AUC
              Decision Tree    0.7508     0.2958  0.4288    0.3501   0.6167  0.2197
   Random Forest Classifier    0.8130     0.4074  0.4279    0.4174   0.7882  0.3756
              XGBClassifier    0.8344     0.4696  0.4496    0.4593   0.8226  0.4509
     CatBoosting Classifier 

Observations of results

XGB, catboost and gradient boost are top in ROC _AUC
XGB and catboost have best PR_AUC

The precision and recall scores across models are more balanced for the balanced data than unbalanced, which is to be expected.

The f1 score is also better.

Overall, I would choose gradient boosting model since has overall better scores acros multiple metircs. 

Overall, the precision and recall is more blaanced and f1 score is better across models for balanced dataset, so i would choose balaced.

Overall, ROC_AUC and PR_AUC is better for unscaled data, with the mdoels requiring scaling (logistic regression, k neighbors) not performing substantially better (acutally still worse) then the models that do not require feature scaling.

So I will choose to leave data unscaled for best model. 

