In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw survey table from the CSV file that sits next to the notebook
DATA_PATH = 'diabetes_012_health_indicators_BRFSS2015.csv'
d_data = pd.read_csv(DATA_PATH)

In [3]:
# One-hot encode any categorical columns; drop_first=True omits the first
# level of each encoded column so the dummies are not perfectly collinear
d_data = pd.get_dummies(data=d_data, drop_first=True)

In [4]:
# Predictor columns used by every model in this notebook
feature_cols = ['GenHlth', 'HighBP', 'BMI', 'Age', 'DiffWalk', 'PhysHlth', 'Income']

# Feature matrix and target vector
X = d_data[feature_cols]
y = d_data['Diabetes_012']

In [5]:
# Hold out 30% of the rows for testing. The split is stratified on y and
# uses a fixed seed so that it is repeatable.
split_options = {'test_size': 0.3, 'random_state': 10, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [6]:
# Rebind both feature splits to independent copies of themselves
X_train, X_test = X_train.copy(), X_test.copy()

In [7]:
# Standardizing the features
#
# The scaler is applied to exactly the columns that were selected into X.
# Listing columns that are not part of X (e.g. 'Smoker', 'MentHlth') would
# raise a KeyError when indexing X_train.
# NOTE(review): this scales every selected feature, including the ones that
# look binary/ordinal by name -- confirm that is the intended treatment.
feature_cols = list(X_train.columns)

# Fit on the training split only, so the scaling statistics are learned
# without looking at the test rows
scaler = StandardScaler()
scaler.fit(X_train[feature_cols])

# Apply the same fitted transform to BOTH splits; the models are trained on
# scaled data, so they must also be evaluated on scaled data.
# NOTE: this cell rebinds X_train / X_test, so re-running it without
# re-running the split cell would standardize the data a second time.
X_train = pd.DataFrame(
    scaler.transform(X_train[feature_cols]),
    columns=feature_cols,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test[feature_cols]),
    columns=feature_cols,
    index=X_test.index,
)

In [8]:
# Logistic Regression: lbfgs solver, at most 300 iterations, fixed seed
log_reg_options = {'solver': 'lbfgs', 'max_iter': 300, 'random_state': 10}
log_reg = LogisticRegression(**log_reg_options)

# Train on the training split (the fitted estimator is the cell's output)
log_reg.fit(X_train, y_train)

## KNN Model

In [9]:
# Baseline: 5-nearest-neighbours classifier fitted on the training split
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Score the baseline on the held-out test split
y_pred = knn_model.predict(X_test)
knn_test_accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", round(knn_test_accuracy, 2))

Test Set Accuracy: 0.84


In [10]:
# Hyperparameter search for KNN

# Candidate values per hyperparameter
# TODO: widen these ranges for a more thorough search
parameters_KNN = dict(
    n_neighbors=range(1, 3),
    leaf_size=range(1, 5),
)

# 3-fold cross-validated grid search scored on accuracy, using all CPU cores
grid_search_KNN = GridSearchCV(
    knn_model,
    parameters_KNN,
    cv=3,
    n_jobs=-1,
    scoring='accuracy',
)

# Run the search on the training split
grid_search_KNN.fit(X_train, y_train)

# Report the winning combination
print("Best Parameters:", grid_search_KNN.best_params_)

Best Parameters: {'leaf_size': 4, 'n_neighbors': 2}


In [11]:
# Getting the best model from the KNN grid search
best_model_KNN = grid_search_KNN.best_estimator_

# Evaluating the tuned model on the held-out test split
y_pred = best_model_KNN.predict(X_test)
acc = round(accuracy_score(y_test, y_pred), 2)
print("Test Set Accuracy:", acc)

# Save the best model and grid search results.
# KNN-specific filenames are used so that these artifacts are not overwritten
# by the other model sections, which also pickle a "best model" and a grid
# search object (opening a file with 'wb' truncates any existing content).
with open('best_model_KNN.pkl', 'wb') as f:
    pickle.dump(best_model_KNN, f)
with open('grid_search_results_KNN.pkl', 'wb') as f:
    pickle.dump(grid_search_KNN, f)

Test Set Accuracy: 0.84


## SVC

In [None]:
# Baseline SVC with default hyperparameters, fitted on the training split
svm = SVC()
svm.fit(X_train, y_train)

# Predict on the held-out test split. The notebook only creates a train/test
# split (there is no separate validation set), so X_test is the data to use.
y_pred = svm.predict(X_test)
y_pred

In [12]:
# Defining a reduced hyperparameter grid for SVM
param_grid = {
    'C': [0.1, 1],
    'kernel': ['linear', 'rbf']
}

# Initializing GridSearchCV: 3-fold cross-validation scored on accuracy,
# using all CPU cores.
# The search is only configured here, not fitted. `best_params_` does not
# exist until `.fit()` has been called, so nothing is printed in this cell.
grid_search_SVM = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='accuracy'
)

In [13]:
# Fit the SVM grid search on the training split
grid_search_SVM.fit(X_train, y_train)

# Retrieving the best model (already refit on the full training split)
best_model_SVM = grid_search_SVM.best_estimator_

# Predicting on the test set with the best SVM model
y_pred_svm = best_model_SVM.predict(X_test)

# Calculating accuracy on the test set
svm_accuracy = accuracy_score(y_test, y_pred_svm)

# Printing the results
print("Best Parameters:", grid_search_SVM.best_params_)
print("Best Cross-Validation Score:", round(grid_search_SVM.best_score_, 2))
print("Test Set Accuracy:", round(svm_accuracy, 2))

# Save the best model and grid search results.
# SVM-specific filenames are used so that these artifacts neither overwrite,
# nor get overwritten by, the pickles written in the other model sections.
with open('best_model_SVM.pkl', 'wb') as f:
    pickle.dump(best_model_SVM, f)
with open('grid_search_results_SVM.pkl', 'wb') as f:
    pickle.dump(grid_search_SVM, f)

## Decision Tree

In [15]:
# Training the model
# (DecisionTreeClassifier is imported in the notebook's import cell)
dt_model = DecisionTreeClassifier(random_state=10)
dt_model.fit(X_train, y_train)

# Evaluating the model on the held-out test split
y_pred = dt_model.predict(X_test)
print("Test Set Accuracy:", round(accuracy_score(y_test, y_pred), 2))

Test Set Accuracy: 0.73


In [21]:
# Search space for the Decision Tree (2 x 3 x 3 x 3 = 54 combinations)
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)

# Fresh, unfitted tree that serves as the template estimator for the search
dt_model = DecisionTreeClassifier(random_state=10)

# 3-fold cross-validated grid search scored on accuracy, using all CPU cores
grid_search_DT = GridSearchCV(
    dt_model,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=3,
)

# Run the search on the training split
grid_search_DT.fit(X_train, y_train)

# Keep a handle on the winning estimator
best_decision_tree = grid_search_DT.best_estimator_

# Report the winning combination
print("Best Parameters:", grid_search_DT.best_params_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [24]:
# Retrieve the best Decision Tree directly from the fitted grid search.
# GridSearchCV (with its default refit=True) has already refit this estimator
# on the whole training split, so no additional .fit() call is needed here.
best_model_DT = grid_search_DT.best_estimator_

# Predict on the test set using the best model
y_pred = best_model_DT.predict(X_test)

# Evaluate and display results
print("Test Set Accuracy:", round(accuracy_score(y_test, y_pred), 2))

# Save the best model and grid search results.
# Decision-Tree-specific filenames are used so that these artifacts do not
# overwrite the pickles written in the other model sections.
with open('best_model_DT.pkl', 'wb') as f:
    pickle.dump(best_model_DT, f)
with open('grid_search_results_DT.pkl', 'wb') as f:
    pickle.dump(grid_search_DT, f)

Test Set Accuracy: 0.84
