In [34]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb


In [35]:
# Read diabetes csv file
diabetes_read = pd.read_csv('diabetes_data.csv')

diabetes_read.head()



Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


Data Exploration and Preprocessing


In [36]:
# Checked for missing values then,
# Determine unique values 
for x in diabetes_read.columns:
    print(x, len(diabetes_read[x].unique()))


Age 13
Sex 2
HighChol 2
CholCheck 2
BMI 80
Smoker 2
HeartDiseaseorAttack 2
PhysActivity 2
Fruits 2
Veggies 2
HvyAlcoholConsump 2
GenHlth 5
MentHlth 31
PhysHlth 31
DiffWalk 2
Stroke 2
HighBP 2
Diabetes 2


In [37]:
diabetes_read['Age'].value_counts().sort_index()

Age
1.0       979
2.0      1396
3.0      2049
4.0      2793
5.0      3520
6.0      4648
7.0      6872
8.0      8603
9.0     10112
10.0    10856
11.0     8044
12.0     5394
13.0     5426
Name: count, dtype: int64

# Age categories 
- 1 = 18-24 / 2 = 25-29 / 3 = 30-34 / 4 = 35-39 / 5 = 40-44 / 6 = 45-49 / 7 = 50-54 / 
- 8 = 55-59 / 9 = 60-64 / 10 = 65-69 / 11 = 70-74 / 12 = 75-79 / 13 = 80 or older


-To 18-44 (10,737) / 45-59 (20,123) / 60-69 (20,968) / 70+ (18,864)

- 18-44 has a huge age gap compared to others should we split that int another category? 18-31 , 32-44 ???

In [38]:
diabetes_read["Age_Category"] = np.where(diabetes_read['Age']<=5,"18-44",np.where(diabetes_read['Age']<=8,"45-59",np.where(diabetes_read['Age']<=10,"60-69","70+")))
del diabetes_read['Age']
diabetes_read

Unnamed: 0,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes,Age_Category
0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0,18-44
1,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,70+
2,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0,70+
3,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0,70+
4,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,45-59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,0.0,1.0,1.0,37.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,45-59
70688,1.0,1.0,1.0,29.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,60-69
70689,0.0,1.0,1.0,25.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0,15.0,0.0,1.0,0.0,1.0,1.0,70+
70690,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,1.0,70+


In [39]:
dummies_df = pd.get_dummies(diabetes_read, dtype=int)
dummies_df.head()

Unnamed: 0,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,...,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes,Age_Category_18-44,Age_Category_45-59,Age_Category_60-69,Age_Category_70+
0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,...,5.0,30.0,0.0,0.0,1.0,0.0,1,0,0,0
1,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0,1
2,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,10.0,0.0,0.0,0.0,0.0,0,0,0,1
3,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,3.0,0.0,0.0,1.0,0.0,0,0,0,1
4,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0


In [40]:
# Split the data
y = dummies_df["Diabetes"]
X = dummies_df.drop("Diabetes", axis=1)

In [41]:
y.value_counts()
# Weird to see an exact # for both positive and negative results 

Diabetes
0.0    35346
1.0    35346
Name: count, dtype: int64

In [42]:
# Splitting the data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)


**Logistic Regression Model 74% accuracy**

In [43]:
# Train Model 
clf = LogisticRegression(random_state=1,max_iter=1000)
clf.fit(X_train, y_train)
# Make predictions
predictions = clf.predict(X_test)

print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

0.7498443954054207
              precision    recall  f1-score   support

         0.0       0.76      0.73      0.75      8913
         1.0       0.74      0.77      0.75      8760

    accuracy                           0.75     17673
   macro avg       0.75      0.75      0.75     17673
weighted avg       0.75      0.75      0.75     17673

[[6532 2381]
 [2040 6720]]


**Random Forrest**

1st result 72% accuracy 

2nd result 75.07%

In [44]:
# Random Forrest model adjusted estimators to get 74.4%
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=1)
rf_clf.fit(X_train, y_train)
rf_predictions = rf_clf.predict(X_test)

print(accuracy_score(y_test,rf_predictions))
print(classification_report(y_test,rf_predictions))
print(confusion_matrix(y_test,rf_predictions))

0.7507497312284276
              precision    recall  f1-score   support

         0.0       0.78      0.71      0.74      8913
         1.0       0.73      0.80      0.76      8760

    accuracy                           0.75     17673
   macro avg       0.75      0.75      0.75     17673
weighted avg       0.75      0.75      0.75     17673

[[6300 2613]
 [1792 6968]]


In [45]:
# Gridsearch


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf_clf = RandomForestClassifier(random_state=1)

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

best_rf_clf = grid_search.best_estimator_
rf_predictions = best_rf_clf.predict(X_test)

print(accuracy_score(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))
print(confusion_matrix(y_test, rf_predictions))

Fitting 3 folds for each of 162 candidates, totalling 486 fits


KeyboardInterrupt: 

**Using Grid Search**
Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best Parameters:  {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best Score:  0.7505611195986345
0.7507497312284276
              precision    recall  f1-score   support

         0.0       0.78      0.71      0.74      8913
         1.0       0.73      0.80      0.76      8760

     accuracy                           0.75     17673
     macro avg      0.75      0.75      0.75     17673
     weighted avg   0.75      0.75      0.75     17673


[[6300 2613]
 [1792 6968]]

**Improve Accuracy**
- Feature Engeneering: Create new features or transform existing ones to provide more meaningful information to model
- Hyperparameter Tuning: Further fine tune hyperparameters using a more extensive grid search or random search
- Cross Validation: Ensure robust evaluation to get a more accurate estimate of the model's performance
- Feature Selection: Remove irrelevant or less important features to reduce noise and improve the mode's performance
- Ensemble Methods: Combine predictions of multiple models to improve overall performance
- Boosting Algorithms: Try other boosting algorithms like XGBoost, LightGBM, or CaatBoost

**Feature Engeneering / Hyperparameter Tuning**

In [None]:
# Initial RF model with further tuning
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import numpy as np

# Define a smaller parameter grid
param_distributions = {
    'n_estimators': [200, 300, 400],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Use RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=1),
    param_distributions=param_distributions,
    n_iter=50,  # Number of parameter settings that are sampled
    cv=3,  # Number of cross-validation folds
    n_jobs=-1,
    verbose=2,
    random_state=1
)

random_search_rf.fit(X_train, y_train)

print("Best Parameters for Random Forest:", random_search_rf.best_params_)
best_rf_clf = random_search_rf.best_estimator_
rf_best_predictions = best_rf_clf.predict(X_test)

print("Random Search Random Forest Accuracy:", accuracy_score(y_test, rf_best_predictions))
print(classification_report(y_test, rf_best_predictions))
print(confusion_matrix(y_test, rf_best_predictions))

# Cross-validation score for more robust performance estimation
cv_scores = cross_val_score(best_rf_clf, X, y, cv=3)
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())


Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best Parameters for Random Forest: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 10}
Random Search Random Forest Accuracy: 0.7506931477394896
              precision    recall  f1-score   support

         0.0       0.78      0.71      0.74      8913
         1.0       0.73      0.80      0.76      8760

    accuracy                           0.75     17673
   macro avg       0.75      0.75      0.75     17673
weighted avg       0.75      0.75      0.75     17673

[[6300 2613]
 [1793 6967]]
Cross-Validation Accuracy Scores: [0.7457138  0.74779324 0.75386182]
Mean Cross-Validation Accuracy: 0.7491229559214622


**Using Cross Value Scores** {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 10}

Result 75.06%

In [None]:

rf_clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features='log2',
    random_state=1)
rf_clf.fit(X_train, y_train)
rf_predictions = rf_clf.predict(X_test)

print(accuracy_score(y_test,rf_predictions))
print(classification_report(y_test,rf_predictions))
print(confusion_matrix(y_test,rf_predictions))

0.7506931477394896
              precision    recall  f1-score   support

         0.0       0.78      0.71      0.74      8913
         1.0       0.73      0.80      0.76      8760

    accuracy                           0.75     17673
   macro avg       0.75      0.75      0.75     17673
weighted avg       0.75      0.75      0.75     17673

[[6300 2613]
 [1793 6967]]


**Using Boosintg Algorithms**

 XGBoost 

1st Result 75.37% 


Hypertuning {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8}


2nd Result 75.52%

In [None]:
import xgboost as xgb

# XGBoost model
xgb_clf = xgb.XGBClassifier(
    subsample=0.8,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    gamma=0.1,
    colsample_bytee=0.8,
    random_state=1
)
xgb_clf.fit(X_train, y_train)
xgb_predictions = xgb_clf.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_predictions))
print(classification_report(y_test, xgb_predictions))
print(confusion_matrix(y_test, xgb_predictions))


Parameters: { "colsample_bytee" } are not used.



XGBoost Accuracy: 0.7552160690289271
              precision    recall  f1-score   support

         0.0       0.78      0.72      0.75      7141
         1.0       0.73      0.80      0.76      6998

    accuracy                           0.76     14139
   macro avg       0.76      0.76      0.75     14139
weighted avg       0.76      0.76      0.75     14139

[[5108 2033]
 [1428 5570]]


**Model w Selected Columns 'BMI' 'GenHlth' 'PhysHlth' 'HighBP'**

In [None]:

# 72%
# # Split the original dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Select the desired columns for training and testing
selected_columns = ['BMI', 'GenHlth', 'PhysHlth', 'HighBP']
X_train_selected = X_train[selected_columns]
X_test_selected = X_test[selected_columns]

# Train the model using X_train_selected
clf.fit(X_train_selected, y_train)

# Make predictions using X_test_selected
predictions_selected = clf.predict(X_test_selected)

# Evaluate the model's performance
print(accuracy_score(y_test, predictions_selected))
print(classification_report(y_test, predictions_selected))
print(confusion_matrix(y_test, predictions_selected))


0.7245515758501669
              precision    recall  f1-score   support

         0.0       0.73      0.72      0.72      8913
         1.0       0.72      0.73      0.72      8760

    accuracy                           0.72     17673
   macro avg       0.72      0.72      0.72     17673
weighted avg       0.72      0.72      0.72     17673

[[6395 2518]
 [2350 6410]]


**Gradient Boosting 73% accuracy**

In [None]:

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=5, random_state=0)
clf.fit(X_train, y_train)
pred_gb = clf.predict(X_test)
print(accuracy_score(y_test,pred_gb))
print(classification_report(y_test,pred_gb))
print(confusion_matrix(y_test,pred_gb))

0.7351326882815594
              precision    recall  f1-score   support

         0.0       0.75      0.71      0.73      8913
         1.0       0.72      0.77      0.74      8760

    accuracy                           0.74     17673
   macro avg       0.74      0.74      0.73     17673
weighted avg       0.74      0.74      0.73     17673

[[6289 2624]
 [2057 6703]]
