In [36]:
import pandas as pd
import joblib
import numpy as np
import time
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss

from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv('Data/diabetes_cleaned.csv')

In [13]:
df['HeartDiseaseorAttack'].describe()

count    247980.000000
mean          0.096234
std           0.294912
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: HeartDiseaseorAttack, dtype: float64

In [4]:
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

## Feature Selection

In [5]:
# Features (X) and target (y)
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)


In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
lasso = LogisticRegression(
    penalty='l1',
    solver='liblinear',  # required for L1
    C=0.001               # smaller C = stronger regularization
)

lasso.fit(X_train_scaled, y_train)

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.001
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


In [8]:
coefficients = lasso.coef_[0]

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'coefficient': coefficients
})

# Features removed (coefficient = 0)
removed_features = feature_importance[feature_importance['coefficient'] == 0]

# Features kept
kept_features = feature_importance[feature_importance['coefficient'] != 0]

print("Removed Features:")
print(removed_features)

print("\nKept Features:")
print(kept_features)


Removed Features:
          feature  coefficient
4          Smoker          0.0
7    PhysActivity          0.0
8          Fruits          0.0
9         Veggies          0.0
11  AnyHealthcare          0.0
12    NoDocbcCost          0.0
14       MentHlth          0.0
15       PhysHlth          0.0

Kept Features:
                 feature  coefficient
0                 HighBP     0.341820
1               HighChol     0.238719
2              CholCheck     0.087784
3                    BMI     0.348830
5                 Stroke     0.010054
6   HeartDiseaseorAttack     0.064250
10     HvyAlcoholConsump    -0.089602
13               GenHlth     0.497844
16              DiffWalk     0.029137
17                   Sex     0.060561
18                   Age     0.288175
19             Education    -0.013813
20                Income    -0.076277


## Model Building Methodology

To avoid data leakage and have confidence in the model's generalizability: 
- Stratified K-Crossfold Validation is only applied in the Training dataset
- Model is (stratified) split 80-20 train test; Models are all to be fairly evaluated on the same unseen data

### Feel free to add/remove for feature selection (Diabetes_binary/012 required)

In [5]:
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0,1.0,1,15.0,1.0,0.0,0.0,0,1,...,1,0.0,5.0,10.0,20.0,0.0,0,11,4.0,5.0
1,1.0,1,0.0,1,28.0,0.0,0.0,1.0,0,1,...,1,0.0,2.0,0.0,0.0,0.0,0,11,4.0,3.0
2,1.0,1,1.0,1,33.0,0.0,0.0,0.0,1,1,...,1,0.0,2.0,10.0,0.0,0.0,0,9,4.0,7.0
3,1.0,0,1.0,1,29.0,0.0,1.0,1.0,1,1,...,1,0.0,5.0,0.0,30.0,1.0,1,12,3.0,4.0
4,0.0,0,0.0,1,24.0,1.0,0.0,0.0,0,0,...,1,0.0,3.0,0.0,0.0,1.0,1,13,5.0,6.0


In [14]:
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [15]:
cols_to_drop_lasso = ['Diabetes_binary'] + list(removed_features.feature)
cols_to_drop = ['Diabetes_binary'] + ['Income','AnyHealthcare','NoDocbcCost']

In [16]:
X = df.drop(cols_to_drop, axis=1)
y = df['Diabetes_binary']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=123, stratify=y
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

## SMOTE

In [17]:
smote = SMOTE(random_state=123)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

## ADASYN

In [18]:
adasyn = ADASYN(random_state=123)
X_train_ad, y_train_ad = adasyn.fit_resample(X_train, y_train)

## Random Undersampling

In [29]:
undersampler = RandomUnderSampler(random_state=123)
X_train_us, y_train_us = undersampler.fit_resample(X_train, y_train)

In [30]:
undersampler = TomekLinks(sampling_strategy='majority')
X_train_tm, y_train_tm = undersampler.fit_resample(X_train, y_train)

In [31]:
undersampler = NearMiss(version=1)
X_train_nm, y_train_nm = undersampler.fit_resample(X_train, y_train)

### Change/Modify Model used here:

In [44]:
def run_model(technique_used = None):
    if technique_used is None:
        X_train_final, y_train_final = X_train, y_train
    elif technique_used == "SMOTE":
        X_train_final, y_train_final = X_train_sm, y_train_sm
    elif technique_used == "Adasyn":
        X_train_final, y_train_final = X_train_ad, y_train_ad
    elif technique_used == "Random Undersampling":
        X_train_final, y_train_final = X_train_us, y_train_us
    elif technique_used == "TomekLinks":
        X_train_final, y_train_final = X_train_tm, y_train_tm
    elif technique_used == "NearMiss":
        X_train_final, y_train_final = X_train_nm, y_train_nm

    model = RandomForestClassifier(random_state=123, n_estimators = 20, max_depth = None)
    metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro','roc_auc']
    
    results = cross_validate(
        model, X_train_final, y_train_final,
        cv=skf,
        scoring=metrics
    )

    metrics_summary = {
        "accuracy": [results['test_accuracy'].mean()][0],
        "roc_auc": [results['test_roc_auc'].mean()][0],
        "f1_macro": [results['test_f1_macro'].mean()][0],
        "precision_macro": [results['test_precision_macro'].mean()][0],
        "recall_macro": [results['test_recall_macro'].mean()][0],
        "Resampling_Technique": technique_used,
        "Num_Features": X_train_final.shape[1],
        "Hyperparameter-tuned": "N",
        "Train/Test":"Train"
    }

    

    model_summary.append(metrics_summary)

    rf_model = RandomForestClassifier(random_state=123, n_estimators = 20, max_depth=None)
    rf_model.fit(X_train_final, y_train_final)
    y_pred = rf_model.predict(X_test)
    test_results = cross_validate(rf_model, X_test, y_test, scoring=metrics)

    test_results = {
        "accuracy": accuracy_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]),
        "f1_macro": f1_score(y_test, y_pred, average="macro"),
        "precision_macro": precision_score(y_test, y_pred, average="macro"),
        "recall_macro": recall_score(y_test, y_pred, average="macro"),
        "Resampling_Technique": technique_used,
        "Num_Features": X_train_final.shape[1],
        "Hyperparameter-tuned": "N",
        "Train/Test":"Test"        
    }

    model_summary.append(test_results)
    
    return model_summary

    

In [60]:
techniques = [
    None,
    "SMOTE",
    "Adasyn",
    "Random Undersampling"
]


model_summary = []

for t in techniques:
    run_model(t)


In [61]:
pd.DataFrame(model_summary)

Unnamed: 0,accuracy,roc_auc,f1_macro,precision_macro,recall_macro,Resampling_Technique,Num_Features,Hyperparameter-tuned,Train/Test
0,0.849015,0.765658,0.587555,0.652741,0.572356,,18,N,Train
1,0.849706,0.765889,0.591237,0.656461,0.575241,,18,N,Test
2,0.855033,0.932255,0.854783,0.857493,0.855033,SMOTE,18,N,Train
3,0.772159,0.743433,0.611218,0.600614,0.638894,SMOTE,18,N,Test
4,0.848694,0.926976,0.848372,0.851836,0.84876,Adasyn,18,N,Train
5,0.764114,0.736769,0.605349,0.595463,0.635801,Adasyn,18,N,Test
6,0.717676,0.786034,0.717611,0.717878,0.717676,Random Undersampling,18,N,Train
7,0.704896,0.786199,0.609389,0.615729,0.719422,Random Undersampling,18,N,Test


## Train

## Test

In [162]:
rf_model = RandomForestClassifier(random_state=123, n_estimators = 20, max_depth=None)
rf_model.fit(X_train_ad, y_train_ad)
y_pred = rf_model.predict(X_test)
test_results = cross_validate(rf_model, X_test, y_test, scoring=metrics)


In [163]:
metrics_summary = {
    "accuracy": [test_results['test_accuracy'].mean()],
    "roc_auc": [test_results['test_roc_auc'].mean()],
    "f1_macro": [test_results['test_f1_macro'].mean()],
    "precision_macro": [test_results['test_precision_macro'].mean()],
    "recall_macro": [test_results['test_recall_macro'].mean()]
}
pd.DataFrame(metrics_summary)

Unnamed: 0,accuracy,roc_auc,f1_macro,precision_macro,recall_macro
0,0.846476,0.753797,0.592429,0.647086,0.577003


In [164]:
X_train.shape

(189102, 14)

## Exporting Model

In [62]:
rf_model = RandomForestClassifier(random_state=123, n_estimators = 20, max_depth=None)
rf_model.fit(X_train_us, y_train_us)

0,1,2
,n_estimators,20
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [64]:
joblib.dump(rf_model, "random_forest_model.pkl")

['random_forest_model.pkl']

In [65]:
X.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk',
       'Sex', 'Age', 'Education'],
      dtype='object')

## Hyperparameter Tuning

In [47]:
rf = RandomForestClassifier(random_state=42, class_weight=None)

In [48]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"],
}

In [49]:
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="roc_auc",     #recall
    cv=skf,
    n_jobs=-1,
    verbose=1
)


In [50]:
grid.fit(X_train_us, y_train_us)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [None, 5, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,scoring,'roc_auc'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [176]:
best_model = grid.best_estimator_

metrics = ['accuracy', 'roc_auc', 'f1_macro', 'precision_macro', 'recall_macro']

y_pred = best_model.predict(X_test)
test_results = cross_validate(rf_model, X_test, y_test, scoring=metrics)

test_results = {
    "accuracy": accuracy_score(y_test, y_pred),
    "roc_auc": roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]),
    "f1_macro": f1_score(y_test, y_pred, average="macro"),
    "precision_macro": precision_score(y_test, y_pred, average="macro"),
    "recall_macro": recall_score(y_test, y_pred, average="macro"),
    "Resampling_Technique": "Random Undersampling",
    "Num_Features": X_test.shape[1],
    "Hyperparameter-tuned": "N",
    "Train/Test":"Test"        
}


In [177]:
metrics_summary = {
    "accuracy": [test_results['test_accuracy'].mean()],
    "roc_auc": [test_results['test_roc_auc'].mean()],
    "f1_macro": [test_results['test_f1_macro'].mean()],
    "precision_macro": [test_results['test_precision_macro'].mean()],
    "recall_macro": [test_results['test_recall_macro'].mean()]
}

results_df = pd.DataFrame(metrics_summary)
results_df

Unnamed: 0,accuracy,roc_auc,f1_macro,precision_macro,recall_macro
0,0.891987,0.965996,0.889438,0.90457,0.891987


In [53]:
best_model_rf = grid.best_estimator_

In [57]:
best_model_rf.fit(X_train_us, y_train_us)
y_pred = best_model_rf.predict(X_test)

metrics = ['accuracy', 'roc_auc', 'f1_macro', 'precision_macro', 'recall_macro']
test_results = cross_validate(best_model_rf, X_test, y_test, scoring=metrics)

In [58]:
metrics_summary = {
    "accuracy": [test_results['test_accuracy'].mean()],
    "roc_auc": [test_results['test_roc_auc'].mean()],
    "f1_macro": [test_results['test_f1_macro'].mean()],
    "precision_macro": [test_results['test_precision_macro'].mean()],
    "recall_macro": [test_results['test_recall_macro'].mean()]
}
pd.DataFrame(metrics_summary)

Unnamed: 0,accuracy,roc_auc,f1_macro,precision_macro,recall_macro
0,0.863517,0.821576,0.553635,0.74593,0.547815


In [51]:
grid.best_estimator_

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [59]:
X_train.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk',
       'Sex', 'Age', 'Education'],
      dtype='object')