In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")


In [3]:
# model preprocessing
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder 

# model training

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn import datasets, linear_model, metrics

# model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score
from collections import Counter

# to save the model
import joblib

## Pre-processing to gain 50-50 sample for each of the classes

In [4]:
# read the dataset
df = pd.read_csv("./dataset/dropped_dataset.csv")

In [5]:
df['has_diabetes'] = df['has_diabetes'].astype('bool')

In [6]:
# Only selecting rows without diabates
df_has_diabetes_class_0 = df[df['has_diabetes'] == 0]

In [7]:
# Only selecting rows with diabates
df_has_diabetes_class_1 = df[df['has_diabetes'] == 1]

In [8]:
df_has_diabetes_class_0.head()

Unnamed: 0,has_diabetes,BMI,age,total_household_income,smoking,high_bp,high_chol,heart_diseases,asthma,kidney_diseases,marital_status,education,general_health,physical_activity,arthritis,depression,sex,race
0,False,3.0,6.0,9.0,4.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,1.0
1,False,2.0,6.0,1.0,3.0,2.0,2.0,2.0,1.0,2.0,3.0,2.0,2.0,9.0,1.0,2.0,2.0,2.0
2,False,3.0,6.0,9.0,4.0,1.0,1.0,2.0,3.0,2.0,1.0,3.0,1.0,9.0,1.0,1.0,2.0,1.0
3,False,4.0,5.0,5.0,4.0,2.0,2.0,2.0,3.0,2.0,3.0,3.0,1.0,9.0,2.0,2.0,2.0,2.0
4,False,2.0,6.0,4.0,3.0,1.0,1.0,2.0,3.0,2.0,3.0,2.0,2.0,4.0,1.0,2.0,1.0,1.0


In [9]:
# Generate a random permutation of indices
random_indices = np.random.permutation(df_has_diabetes_class_0.index)

# Select the first 17,417 rows based on the shuffled indices
sampled_df = df.iloc[random_indices[:17417]]

In [10]:
# Joining splitted datasets with class 0 and 1
df = pd.concat([df_has_diabetes_class_1, sampled_df], axis=0, ignore_index=True)

In [11]:
df['has_diabetes'].value_counts()

has_diabetes
True     17417
False    17417
Name: count, dtype: int64

In [12]:
# df.to_csv('./dataset/sampled_50_50_dataset.csv', index=False)

## Chi Square test for categorical columns

We are looking into the correlation between our categorical features(all of them) and target class label.

In [13]:
categorical_cols = ['has_diabetes', 'BMI', 'age', 'total_household_income', 'smoking',
       'high_bp', 'high_chol', 'heart_diseases', 'asthma', 'kidney_diseases',
       'marital_status', 'education', 'general_health', 'physical_activity',
       'arthritis', 'depression', 'sex', 'race']

# empty dict to store the p-value
chi_square_test_results = {}

for each_cols in categorical_cols:
    # Create a contingency table between the features and target label
    contingency_table = pd.crosstab(df[each_cols], df['has_diabetes'])
    
    # Perform the Chi-square test
    chi2, p, _, _ = chi2_contingency(contingency_table)

    # store the p-value 
    chi_square_test_results[each_cols] = p

chi_square_test_results


{'has_diabetes': 0.0,
 'BMI': 0.0,
 'age': 0.0,
 'total_household_income': 3.311844346816984e-302,
 'smoking': 1.823383841895033e-52,
 'high_bp': 0.0,
 'high_chol': 0.0,
 'heart_diseases': 1.0901730206498645e-296,
 'asthma': 2.5458224846247645e-32,
 'kidney_diseases': 4.276376242074078e-274,
 'marital_status': 8.380689822285263e-132,
 'education': 7.075384483712028e-145,
 'general_health': 0.0,
 'physical_activity': 0.0,
 'arthritis': 0.0,
 'depression': 1.8575181820199742e-33,
 'sex': 5.840051060264736e-09,
 'race': 4.028545725978419e-51}

The p-values from the Chi-square tests suggest that there is a strong statistical association between the categorical variables and the target variable (has_diabetes), as most of the p-values are extremely small (below 0.05), indicating a significant relationship. Here’s a breakdown of the results:

### Encoding the features

In [14]:
def cate_encoder(df, cols, encoder):
    for each in cols:
        df[each] = encoder.fit_transform(df[each])

    return df

In [15]:
# Encoding categorical variables 
encoder = LabelEncoder()
df = cate_encoder(df=df, cols=df.columns, encoder=encoder)
df

Unnamed: 0,has_diabetes,BMI,age,total_household_income,smoking,high_bp,high_chol,heart_diseases,asthma,kidney_diseases,marital_status,education,general_health,physical_activity,arthritis,depression,sex,race
0,1,3,5,4,2,1,0,1,2,1,2,3,0,3,0,0,1,0
1,1,1,5,4,3,0,0,1,2,1,0,3,0,0,1,1,0,0
2,1,2,4,5,0,1,1,1,2,1,0,1,0,3,1,0,1,0
3,1,3,5,4,3,0,1,1,2,1,6,2,0,3,0,1,1,0
4,1,2,5,7,3,1,1,1,2,1,2,1,0,3,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34829,0,1,5,3,3,1,1,0,2,1,2,1,0,0,1,1,1,0
34830,0,2,4,7,3,1,0,1,2,1,0,3,0,0,1,1,0,0
34831,0,1,5,0,3,1,0,1,2,1,1,1,1,0,1,1,0,7
34832,0,2,3,5,3,1,1,1,2,1,0,2,0,1,1,1,0,0


## Train Test Split

In [16]:
X = df[['BMI', 'age', 'total_household_income', 'smoking',
       'high_bp', 'high_chol', 'heart_diseases', 'asthma', 'kidney_diseases',
       'marital_status', 'education', 'general_health', 'physical_activity',
       'arthritis', 'depression', 'sex', 'race']]

In [17]:
y = df['has_diabetes']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Standardising the dataset 

In [19]:
# Creating StandardScaler instance
standard_scaler = StandardScaler()

# Fitting Standard Scaller
X_train = standard_scaler.fit_transform(X_train)

# Scaling data
X_test = standard_scaler.transform(X_test)

# Implementation

## Logistic Regression and Random Forest Classifier

In [23]:
def get_accurancy_score(y, predicted_y):
    """
    A function that will generate accurancy and classification report of a model
    
    """
    accurancy = accuracy_score(y, predicted_y)
    class_report = classification_report(y, predicted_y)

    return accurancy, class_report
    

In [24]:
def tune_and_train(model_parameters, model_class, X, y, random_state=42):
    """
    A function to generate best hyperparameter for the model using GridSearchCV.

    Returns: 
    best paramaters(dict) and best estimator(model)
    """
    
    model = model_class(random_state=random_state)
    
    model_gridsearch = GridSearchCV(model, model_parameters, cv=5, scoring='accuracy')
    
    model_gridsearch.fit(X, y)

    return model_gridsearch.best_params_, model_gridsearch.best_estimator_

In [25]:
## Logistic Regression

In [26]:
log_reg_params = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'newton-cg', 'lbfgs']
}


log_reg_best_params, log_res_model = tune_and_train(model_parameters=log_reg_params,
                                                    model_class=LogisticRegression, 
                                                    X=X_train, 
                                                    y=y_train)

# Best parameters for Logistic Regression
print("Best Parameters for Logistic Regression:", log_reg_best_params)

# Predictions
log_reg_pred = log_res_model.predict(X_test)

log_res_acc, log_res_class_report = get_accurancy_score(y_test, log_reg_pred)

# Accuracy and Classification Report
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_reg_pred))
print("Logistic Regression Classification Report:\n", classification_report(y_test, log_reg_pred))

Best Parameters for Logistic Regression: {'C': 0.01, 'solver': 'lbfgs'}
Logistic Regression Accuracy: 0.739163716390776
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.73      5246
           1       0.73      0.76      0.74      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451



### Dumping Logistic Regression 

In [27]:
joblib.dump(log_res_model, './models/logistic_reg_model.pkl')

['./models/logistic_reg_model.pkl']

## Random Forest Classifier

In [28]:
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_best_params, rf_model = tune_and_train(model_parameters=rf_params,
                                                    model_class=RandomForestClassifier, 
                                                    X=X_train, 
                                                    y=y_train)

# Best parameters for Logistic Regression
print("Best Parameters for Random Forest Classifier:", rf_best_params)

# Predictions
rf_pred = rf_model.predict(X_test)

rf_acc, rf_class_report = get_accurancy_score(y_test, rf_pred)

# Accuracy and Classification Report
print("Random Forrest Classifier Accuracy:", accuracy_score(y_test, rf_pred))
print("Random Forrest Classifier Classification Report:\n", classification_report(y_test, rf_pred))

Best Parameters for Random Forest Classifier: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Random Forrest Classifier Accuracy: 0.7420342550952062
Random Forrest Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.69      0.73      5246
           1       0.72      0.80      0.76      5205

    accuracy                           0.74     10451
   macro avg       0.75      0.74      0.74     10451
weighted avg       0.75      0.74      0.74     10451



#### Dumping Random Forest Models

joblib.dump(rf_model, './models/random_forest_model.pkl')

# Support Vector Machines

In [58]:
svm_params = {
    'C': [0.1, 1, 10],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [2, 3, 4],  # Degree of polynomial kernel (only relevant for 'poly' kernel)
    'coef0': [0, 0.1, 1],  # Independent term in kernel function (only relevant for 'poly' kernel)
}

svm_best_params, svm_model = tune_and_train(model_parameters=svm_params,
                                                    model_class=SVC, 
                                                    X=X_train, 
                                                    y=y_train)

# Best parameters for SVM 
print("Best Parameters for SVM Classifier:", svm_best_params)

# Predictions
svm_pred = rf_model.predict(X_test)

svm_acc, svm_class_report = get_accurancy_score(y_test, rf_pred)

# Accuracy and Classification Report
print("SVM Accuracy:", accuracy_score(svm_test, svm_pred))
print("SVM Classification Report:\n", classification_report(y_test, svm_pred))

Best Parameters for SVM: {'C': 1, 'coef0': 1, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
SVM Accuracy: 0.7408860396134341
SVM Classification Report:
               precision    recall  f1-score   support

       False       0.78      0.68      0.72      5246
        True       0.71      0.80      0.76      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451



### Dumping SVM Models


In [59]:
joblib.dump(svm_best, './models/svm_model.pkl')

['./models/svm_model.pkl']

## Gradient Boosting

In [45]:
gb_params = {
     'n_estimators': [100, 200, 300],  # Number of trees
    'learning_rate': [0.01, 0.1, 0.5],  # Controls how much each tree corrects the previous one
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'subsample': [0.8, 1.0]  # Fraction of samples used for fitting each tree
}

gb_best_params, gb_model = tune_and_train(model_parameters=gb_params,
                                                    model_class=GradientBoostingClassifier, 
                                                    X=X_train, 
                                                    y=y_train)

# Best parameters for GB
print("Best Parameters for Gradient Boosting:", gb_best_params)

# Predictions
gb_pred = gb_model.predict(X_test)

gb_acc, gb_class_report = get_accurancy_score(y_test, rf_pred)

# Accuracy and Classification Report
print("Gradient Boosting Accuracy:", accuracy_score(svm_test, gb_pred))
print("Gradient Boosting Report:\n", classification_report(y_test, gb_pred))

Best Parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Gradient Boosting Accuracy: 0.7395464548847
Gradient Boosting Classification Report:
               precision    recall  f1-score   support

       False       0.76      0.70      0.73      5246
        True       0.72      0.78      0.75      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451



#### Dumping Gradient Boosting Model

In [46]:
joblib.dump(gb_best, './models/gradient_boosting_model.pkl')

['./models/gradient_boosting_model.pkl']

## XGBoost

In [47]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False)

# Hyperparameter tuning
xgb_params = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'max_depth': [3, 5, 7],  # Maximum depth of each tree
    'subsample': [0.8, 1.0],  # Fraction of samples used for fitting each tree
    'colsample_bytree': [0.8, 1.0]  # Fraction of features to use for each tree
}

# GridSearchCV for hyperparameter tuning
xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, scoring='accuracy')
xgb_grid.fit(X_train, y_train)

# Best hyperparameters
print("Best Parameters for XGBoost:", xgb_grid.best_params_)

# Evaluate the best XGBoost model
xgb_best = xgb_grid.best_estimator_
xgb_pred = xgb_best.predict(X_test)

# Evaluation
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_pred))


Best Parameters for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
XGBoost Accuracy: 0.7419385704717252
XGBoost Classification Report:
               precision    recall  f1-score   support

       False       0.77      0.70      0.73      5246
        True       0.72      0.79      0.75      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451



#### Dumping XGBoost Model

In [49]:
joblib.dump(gb_best, './models/xgboost_model.pkl')

['./models/xgboost_model.pkl']

## Stacking Classifier

In [60]:
from sklearn.ensemble import StackingClassifier

# Define base models
base_models = [
    ('log_reg', log_reg_best),
    ('rf', rf_best),
    ('gb', gb_best),
    ('xgb', xgb_best),
    ('svm', svm_best)
]

# Define final estimator (a logistic regression)
final_estimator = LogisticRegression()

# Create the Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=final_estimator)

# Fit the Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Evaluate the Stacking Classifier
stacking_pred = stacking_clf.predict(X_test)
print("Stacking Classifier Accuracy:", accuracy_score(y_test, stacking_pred))
print("Stacking Classifier Classification Report:\n", classification_report(y_test, stacking_pred))


Stacking Classifier Accuracy: 0.7445220553057124
Stacking Classifier Classification Report:
               precision    recall  f1-score   support

       False       0.77      0.71      0.73      5246
        True       0.73      0.78      0.75      5205

    accuracy                           0.74     10451
   macro avg       0.75      0.74      0.74     10451
weighted avg       0.75      0.74      0.74     10451



#### Dumping Stacking Classifer with log reg, rf, fb and xgb

In [52]:
joblib.dump(stacking_clf, './models/stacking_classifier.pkl')

['./models/stacking_classifier.pkl']

## Voting Classifier

In [61]:
from sklearn.ensemble import VotingClassifier

# Importing saved model
log_reg = joblib.load('./models/logistic_reg_model.pkl')
rf = joblib.load('./models/random_forest_model.pkl')

# Create Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('log_reg', log_reg_best),
    ('rf', rf_best),
    ('gb', gb_best),
    ('xgb', xgb_best),
    ('svm', svm_best)
], voting='hard')

# Fit and evaluate the Voting Classifier
voting_clf.fit(X_train, y_train)
voting_pred = voting_clf.predict(X_test)

# Accuracy and Classification Report for Voting Classifier
print("Voting Classifier Accuracy:", accuracy_score(y_test, voting_pred))
print("Voting Classifier Classification Report:\n", classification_report(y_test, voting_pred))


Voting Classifier Accuracy: 0.7437565783178643
Voting Classifier Classification Report:
               precision    recall  f1-score   support

       False       0.77      0.69      0.73      5246
        True       0.72      0.79      0.76      5205

    accuracy                           0.74     10451
   macro avg       0.75      0.74      0.74     10451
weighted avg       0.75      0.74      0.74     10451



#### Dumping Voting Classifier

In [56]:
joblib.dump(voting_clf, './models/voting_classifier.pkl')

['./models/voting_classifier.pkl']

### Cross Validation

In [20]:
# Importing Saved Models

log_reg = joblib.load('./models/logistic_reg_model.pkl')
rf = joblib.load('./models/random_forest_model.pkl')
gb = joblib.load('./models/gradient_boosting_model.pkl') 
xgb = joblib.load('./models/xgboost_model.pkl')  
svm = joblib.load('./models/svm_model.pkl')

voting_clf = joblib.load('./models/voting_classifier.pkl')
stacking_clf = joblib.load('./models/stacking_classifier.pkl')

#### Running cross validation voting classifier

In [21]:
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=5, scoring='accuracy')

In [22]:
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))

Cross-Validation Accuracy Scores: [0.74533525 0.74307976 0.7279065  0.74528302 0.74138638]
Mean Cross-Validation Accuracy: 0.7405981820545888


In [23]:
# Fit and evaluate the Voting Classifier
voting_clf.fit(X_train, y_train)
voting_pred = voting_clf.predict(X_test)

# Accuracy and Classification Report for Voting Classifier
print("Voting Classifier Accuracy on Test Set:", accuracy_score(y_test, voting_pred))
print("Voting Classifier Classification Report:\n", classification_report(y_test, voting_pred))

Voting Classifier Accuracy on Test Set: 0.739737824131662
Voting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.73      5246
           1       0.73      0.76      0.75      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451



#### Running cross validation stacking classifier

In [24]:
cv_stacking_scores = cross_val_score(stacking_clf, X_train, y_train, cv=5, scoring='accuracy')

In [25]:
print("Cross-Validation Accuracy Scores:", cv_stacking_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_stacking_scores))

Cross-Validation Accuracy Scores: [0.74656551 0.74431003 0.73098216 0.74610336 0.74015587]
Mean Cross-Validation Accuracy: 0.7416233856563001


In [28]:
# Fit and evaluate the Stacking Classifier
stacking_clf.fit(X_train, y_train)
stacking_pred = voting_clf.predict(X_test)

# Accuracy and Classification Report for Stacking Classifier
print("Stacking Classifier Accuracy on Test Set:", accuracy_score(y_test, stacking_pred))
print("Stacking Classifier Classification Report:\n", classification_report(y_test, stacking_pred))

Stacking Classifier Accuracy on Test Set: 0.739737824131662
Stacking Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.72      0.73      5246
           1       0.73      0.76      0.75      5205

    accuracy                           0.74     10451
   macro avg       0.74      0.74      0.74     10451
weighted avg       0.74      0.74      0.74     10451

