In [36]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [109]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Libtune to tune model, get different metric scores
from sklearn import datasets, linear_model, metrics

from sklearn.model_selection import GridSearchCV


from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder 


In [133]:
import warnings
warnings.filterwarnings('ignore')


In [37]:
df = pd.read_csv("./dataset/dropped_dataset.csv")
df.head()



Unnamed: 0,has_diabetes,BMI,age,total_household_income,smoking,high_bp,high_chol,heart_diseases,asthma,kidney_diseases,marital_status,education,general_health,physical_activity,arthritis,depression,sex,race
0,0.0,3.0,6.0,9.0,4.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,1.0
1,0.0,2.0,6.0,1.0,3.0,2.0,2.0,2.0,1.0,2.0,3.0,2.0,2.0,9.0,1.0,2.0,2.0,2.0
2,0.0,3.0,6.0,9.0,4.0,1.0,1.0,2.0,3.0,2.0,1.0,3.0,1.0,9.0,1.0,1.0,2.0,1.0
3,0.0,4.0,5.0,5.0,4.0,2.0,2.0,2.0,3.0,2.0,3.0,3.0,1.0,9.0,2.0,2.0,2.0,2.0
4,0.0,2.0,6.0,4.0,3.0,1.0,1.0,2.0,3.0,2.0,3.0,2.0,2.0,4.0,1.0,2.0,1.0,1.0


In [38]:
df['has_diabetes'] = df['has_diabetes'].astype('bool')

In [39]:
df['has_diabetes'].value_counts()

has_diabetes
False    282165
True      17417
Name: count, dtype: int64

In [40]:
# Only selecting rows without diabates
df_has_diabetes_class_0 = df[df['has_diabetes'] == 0]

In [41]:
# Only selecting rows with diabates
df_has_diabetes_class_1 = df[df['has_diabetes'] == 1]

In [42]:
df_has_diabetes_class_0.head()

Unnamed: 0,has_diabetes,BMI,age,total_household_income,smoking,high_bp,high_chol,heart_diseases,asthma,kidney_diseases,marital_status,education,general_health,physical_activity,arthritis,depression,sex,race
0,False,3.0,6.0,9.0,4.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,1.0
1,False,2.0,6.0,1.0,3.0,2.0,2.0,2.0,1.0,2.0,3.0,2.0,2.0,9.0,1.0,2.0,2.0,2.0
2,False,3.0,6.0,9.0,4.0,1.0,1.0,2.0,3.0,2.0,1.0,3.0,1.0,9.0,1.0,1.0,2.0,1.0
3,False,4.0,5.0,5.0,4.0,2.0,2.0,2.0,3.0,2.0,3.0,3.0,1.0,9.0,2.0,2.0,2.0,2.0
4,False,2.0,6.0,4.0,3.0,1.0,1.0,2.0,3.0,2.0,3.0,2.0,2.0,4.0,1.0,2.0,1.0,1.0


In [43]:
# Generate a random permutation of indices
random_indices = np.random.permutation(df_has_diabetes_class_0.index)

# Select the first 17,417 rows based on the shuffled indices
sampled_df = df.iloc[random_indices[:17417]]

In [44]:
df = pd.concat([df_has_diabetes_class_1, sampled_df], axis=0, ignore_index=True)

In [45]:
df['has_diabetes'].value_counts()

has_diabetes
True     17417
False    17417
Name: count, dtype: int64

### Saving to 50/50 sampled data to file



In [46]:
df.to_csv('./dataset/sampled_50_50_dataset.csv', index=False)

In [110]:

# Encoding categorical variables if needed
encoder = LabelEncoder()
df['smoking'] = encoder.fit_transform(df['smoking'])
df['high_bp'] = encoder.fit_transform(df['high_bp'])
df['high_chol'] = encoder.fit_transform(df['high_chol'])
df['heart_diseases'] = encoder.fit_transform(df['heart_diseases'])
df['asthma'] = encoder.fit_transform(df['asthma'])
df['kidney_diseases'] = encoder.fit_transform(df['kidney_diseases'])
df['marital_status'] = encoder.fit_transform(df['marital_status'])
df['education'] = encoder.fit_transform(df['education'])
df['general_health'] = encoder.fit_transform(df['general_health'])
df['physical_activity'] = encoder.fit_transform(df['physical_activity'])
df['arthritis'] = encoder.fit_transform(df['arthritis'])
df['depression'] = encoder.fit_transform(df['depression'])
df['sex'] = encoder.fit_transform(df['sex'])
df['race'] = encoder.fit_transform(df['race'])


In [111]:
df

Unnamed: 0,has_diabetes,BMI,age,total_household_income,smoking,high_bp,high_chol,heart_diseases,asthma,kidney_diseases,marital_status,education,general_health,physical_activity,arthritis,depression,sex,race
0,True,4.0,6.0,5.0,2,1,0,1,2,1,2,3,0,3,0,0,1,0
1,True,2.0,6.0,5.0,3,0,0,1,2,1,0,3,0,0,1,1,0,0
2,True,3.0,5.0,6.0,0,1,1,1,2,1,0,1,0,3,1,0,1,0
3,True,4.0,6.0,5.0,3,0,1,1,2,1,6,2,0,3,0,1,1,0
4,True,3.0,6.0,9.0,3,1,1,1,2,1,2,1,0,3,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34829,False,4.0,6.0,5.0,3,0,0,1,2,1,0,2,0,3,1,1,0,0
34830,False,2.0,4.0,9.0,3,0,0,1,2,1,0,3,0,1,1,1,1,3
34831,False,3.0,5.0,6.0,0,0,0,1,2,1,0,1,0,0,1,1,0,0
34832,False,4.0,6.0,5.0,3,0,0,1,2,1,2,3,0,3,1,1,1,0


In [112]:
X = df[['BMI', 'age', 'total_household_income', 'smoking',
       'high_bp', 'high_chol', 'heart_diseases', 'asthma', 'kidney_diseases',
       'marital_status', 'education', 'general_health', 'physical_activity',
       'arthritis', 'depression', 'sex', 'race']]

In [113]:
y = df['has_diabetes']

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [115]:
# Creating StandardScaler instance
sc = StandardScaler()

# Fitting Standard Scaller
X_train = sc.fit_transform(X_train)

# Scaling data
X_test = sc.transform(X_test)

In [116]:
X_train

array([[-0.23130653,  0.78138053, -0.03053632, ...,  0.52164278,
         0.97604855, -0.48230091],
       [-0.23130653,  0.78138053,  0.41881012, ...,  0.52164278,
         0.97604855,  2.45558636],
       [ 1.00552397,  0.78138053, -0.03053632, ...,  0.52164278,
        -1.0245392 , -0.48230091],
       ...,
       [-0.23130653, -0.70447648, -0.03053632, ...,  0.52164278,
         0.97604855, -0.48230091],
       [-1.46813702,  0.78138053, -1.37857564, ..., -1.81644838,
        -1.0245392 ,  2.45558636],
       [ 1.00552397,  0.03845203, -0.03053632, ...,  0.52164278,
        -1.0245392 , -0.48230091]])

# implement

In [129]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Logistic Regression Hyperparameter Tuning
log_reg_params = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'newton-cg', 'lbfgs']
}
log_reg = LogisticRegression(random_state=42)
log_reg_grid = GridSearchCV(log_reg, log_reg_params, cv=5, scoring='accuracy')
log_reg_grid.fit(X_train, y_train)

# Best parameters for Logistic Regression
print("Best Parameters for Logistic Regression:", log_reg_grid.best_params_)

# Random Forest Hyperparameter Tuning
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best Parameters for Random Forest:", rf_grid.best_params_)

# Evaluate the best models
log_reg_best = log_reg_grid.best_estimator_
rf_best = rf_grid.best_estimator_

# Predictions
log_reg_pred = log_reg_best.predict(X_test)
rf_pred = rf_best.predict(X_test)

# Accuracy and Classification Report
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_reg_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, log_reg_pred))

print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_pred))


Best Parameters for Logistic Regression: {'C': 0.1, 'solver': 'newton-cg'}
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Logistic Regression Accuracy: 0.738685293273371
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.74      5246
           1       0.73      0.76      0.74      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451

Random Forest Accuracy: 0.7444263706822314
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.70      0.73      5246
           1       0.73      0.78      0.75      5205

    accuracy                           0.74     10451
   macro avg       0.75      0.74      0.74     10451
weighted avg       0.75      0.7

# svm

In [143]:
from sklearn.svm import SVC

# Define the SVM model
svm_model = SVC(random_state=42)

# Hyperparameter tuning
svm_params = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [2, 3, 4],  # Degree of polynomial kernel (only relevant for 'poly' kernel)
    'coef0': [0, 0.1, 1],  # Independent term in kernel function (only relevant for 'poly' and 'sigmoid' kernels)
}

# GridSearchCV for hyperparameter tuning
svm_grid = GridSearchCV(svm_model, svm_params, cv=5, scoring='accuracy')
svm_grid.fit(X_train, y_train)

# Best hyperparameters
print("Best Parameters for SVM:", svm_grid.best_params_)

# Evaluate the best SVM model
svm_best = svm_grid.best_estimator_
svm_pred = svm_best.predict(X_test)

# Evaluation
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))



KeyboardInterrupt



In [130]:
from sklearn.ensemble import VotingClassifier

# Create Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('log_reg', log_reg_best),
    ('rf', rf_best)
], voting='hard')

# Fit and evaluate the Voting Classifier
voting_clf.fit(X_train, y_train)
voting_pred = voting_clf.predict(X_test)

# Accuracy and Classification Report for Voting Classifier
print("Voting Classifier Accuracy:", accuracy_score(y_test, voting_pred))
print("Voting Classifier Classification Report:\n", classification_report(y_test, voting_pred))


Voting Classifier Accuracy: 0.739355085637738
Voting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.74      0.74      5246
           1       0.74      0.73      0.74      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451



## Gradient Boosting

In [131]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Define the Gradient Boosting model
gb = GradientBoostingClassifier(random_state=42)

# Hyperparameter tuning
gb_params = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'learning_rate': [0.01, 0.1, 0.5],  # Controls how much each tree corrects the previous one
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'subsample': [0.8, 1.0]  # Fraction of samples used for fitting each tree
}

# GridSearchCV for hyperparameter tuning
gb_grid = GridSearchCV(gb, gb_params, cv=5, scoring='accuracy')
gb_grid.fit(X_train, y_train)

# Best hyperparameters
print("Best Parameters for Gradient Boosting:", gb_grid.best_params_)

# Evaluate the best Gradient Boosting model
gb_best = gb_grid.best_estimator_
gb_pred = gb_best.predict(X_test)

# Evaluation
print("Gradient Boosting Accuracy:", accuracy_score(y_test, gb_pred))
print("Gradient Boosting Classification Report:\n", classification_report(y_test, gb_pred))


Best Parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Gradient Boosting Accuracy: 0.7480623863745096
Gradient Boosting Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74      5246
           1       0.73      0.78      0.75      5205

    accuracy                           0.75     10451
   macro avg       0.75      0.75      0.75     10451
weighted avg       0.75      0.75      0.75     10451



In [134]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False)

# Hyperparameter tuning
xgb_params = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'subsample': [0.8, 1.0],  # Fraction of samples used for fitting each tree
    'colsample_bytree': [0.8, 1.0]  # Fraction of features to use for each tree
}

# GridSearchCV for hyperparameter tuning
xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, scoring='accuracy')
xgb_grid.fit(X_train, y_train)

# Best hyperparameters
print("Best Parameters for XGBoost:", xgb_grid.best_params_)

# Evaluate the best XGBoost model
xgb_best = xgb_grid.best_estimator_
xgb_pred = xgb_best.predict(X_test)

# Evaluation
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_pred))


Best Parameters for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
XGBoost Accuracy: 0.7472012247631805
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.72      0.74      5246
           1       0.73      0.78      0.75      5205

    accuracy                           0.75     10451
   macro avg       0.75      0.75      0.75     10451
weighted avg       0.75      0.75      0.75     10451



In [141]:
from sklearn.ensemble import StackingClassifier

# Define base models
base_models = [
    ('log_reg', log_reg_best),
    ('rf', rf_best),
    ('gb', gb_best),
    ('xgb', xgb_best)
]

# Define final estimator (a logistic regression)
final_estimator = LogisticRegression()

# Create the Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=final_estimator)

# Fit the Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Evaluate the Stacking Classifier
stacking_pred = stacking_clf.predict(X_test)
print("Stacking Classifier Accuracy:", accuracy_score(y_test, stacking_pred))
print("Stacking Classifier Classification Report:\n", classification_report(y_test, stacking_pred))


Stacking Classifier Accuracy: 0.7478710171275476
Stacking Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74      5246
           1       0.73      0.77      0.75      5205

    accuracy                           0.75     10451
   macro avg       0.75      0.75      0.75     10451
weighted avg       0.75      0.75      0.75     10451

