# Predicting Diabetes Using BRFSS Data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, precision_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
import warnings
# Suppress every warning category for the rest of the session; this also
# hides any warnings emitted by the model fits in later cells.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='Diabetes_2022.csv')

In [None]:
# Separate the target column from the remaining feature columns.
y = df['Diabetes_01']
X = df.drop(columns=['Diabetes_01'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Report the dimensions of the feature and target splits.
print(f'X_train: {X_train.shape} X_test: {X_test.shape}')
print(f'y_train: {y_train.shape} y_test: {y_test.shape}')

In [None]:
def optimize_model(model, param_grid, cv = 5, X_train = X_train, y_train = y_train):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    The search is scored by accuracy using ``cv``-fold cross-validation and
    runs on all available cores (``n_jobs=-1``). The best parameter
    combination and its cross-validated score are printed.

    ``X_train`` / ``y_train`` default to the training split that existed when
    this function was defined (default values are bound at definition time).
    """
    search_settings = {
        'param_grid': param_grid,
        'scoring': "accuracy",
        'cv': cv,
        'n_jobs': -1,
        'verbose': True,
    }
    # fit() returns the search object itself, so construction and fitting chain.
    search = GridSearchCV(model, **search_settings).fit(X_train, y_train)

    print('Best parameters found:')
    print(search.best_params_)
    print('\nBest score: %0.6f' % (search.best_score_))

    return search.best_estimator_

In [None]:
def evaluate_model(model, X_test = X_test, y_test = y_test):
    """Print a classification report for ``model``'s predictions on ``X_test``.

    ``X_test`` / ``y_test`` default to the held-out split that existed when
    this function was defined.
    """
    predicted_labels = model.predict(X_test)
    header = '\nClassification report: '
    print(header)
    report = classification_report(y_true=y_test, y_pred=predicted_labels)
    print(report)

#### GaussianNB

In [None]:
# Fit a Gaussian naive Bayes model with default settings and report its
# accuracy and classification report on the held-out split.
gaussianNB_model = GaussianNB().fit(X_train, y_train)
print(
    'Best score:',
    np.round(accuracy_score(y_test, gaussianNB_model.predict(X_test)), 6),
)
evaluate_model(model=gaussianNB_model)

In [None]:
# Mean 5-fold cross-validation score of a fresh GaussianNB over the full dataset.
print(
    'Cross validation - GaussianNB:',
    cross_val_score(GaussianNB(), X, y, cv=5).mean().round(6),
)

#### MultinomialNB

In [None]:
# Fit a multinomial naive Bayes model with default settings and report its
# accuracy and classification report on the held-out split.
multinomialNB_model = MultinomialNB().fit(X_train, y_train)
print(
    'Best score:',
    np.round(accuracy_score(y_test, multinomialNB_model.predict(X_test)), 6),
)
evaluate_model(model=multinomialNB_model)

In [None]:
# Mean 5-fold cross-validation score of a fresh MultinomialNB over the full dataset.
print(
    'Cross validation - MultinomialNB:',
    cross_val_score(MultinomialNB(), X, y, cv=5).mean().round(6),
)

GaussianNB and MultinomialNB gave very similar results. Cross-validation of these very basic baseline models yielded accuracy scores of around 0.77.\
Both models identify healthy individuals more accurately than individuals who are pre-diabetic or have been diagnosed with diabetes.

### Logistic Regression

In [None]:
# Scaled logistic-regression pipeline. The seed only matters for solvers that
# shuffle the data (e.g. liblinear) and keeps the search repeatable.
lr_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(random_state=0)),
])

# The default 'lbfgs' solver supports only the l2 penalty, so a single grid
# mixing 'l1' and 'l2' makes every l1 candidate fail to fit (scored as NaN).
# The grid is therefore split in two: l2 keeps the default solver, and l1 is
# paired with 'liblinear', which supports it.
lr_param_grid = [
    {'model__penalty': ['l2'],
     'model__C': 10**np.arange(-3, 4, dtype=float),
     'model__class_weight': ['balanced', None]},
    {'model__penalty': ['l1'],
     'model__solver': ['liblinear'],
     'model__C': 10**np.arange(-3, 4, dtype=float),
     'model__class_weight': ['balanced', None]},
]

In [None]:
# Logistic regression with default hyper-parameters on the unscaled features,
# used as the reference point for the tuned pipeline.
lr_baseline = LogisticRegression(random_state=0).fit(X_train, y_train)
print(
    'Best score:',
    np.round(accuracy_score(y_test, lr_baseline.predict(X_test)), 6),
)
evaluate_model(model=lr_baseline)

In [None]:
# Run the grid search; lr_model is rebound from the unfitted pipeline to the
# best fitted estimator, which is then scored on the held-out split.
lr_model = optimize_model(model=lr_model, param_grid=lr_param_grid)
evaluate_model(model=lr_model)

In [None]:
# Second pass: search only C, on 50 log-spaced values between 1e-3 and 1.
lr_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression()),
])
lr_param_grid = {'model__C': 10.0 ** np.linspace(start=-3, stop=0, num=50)}
lr_model = optimize_model(model=lr_model, param_grid=lr_param_grid)
evaluate_model(model=lr_model)

Optimization of the parameter C did not improve the accuracy of the logistic regression model.

In [None]:
# Confusion matrix of lr_model's predictions on the held-out split.
y_pred = lr_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# plot() creates its own figure when no `ax` is passed, so a separate
# plt.figure(...) call beforehand would only produce an extra, empty figure.
# To control the size, build axes with plt.subplots(figsize=...) and pass `ax=`.
disp.plot(cmap=plt.cm.Reds)
plt.show()


### SVM

In [None]:
# Scaled RBF-kernel SVM pipeline and the hyper-parameter grid intended for it.
svm_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", SVC(kernel='rbf')),
])

svm_param_grid = {
    'model__C': 10.0 ** np.arange(-3, 4, dtype=float),
    'model__class_weight': ['balanced', None],
}

In [None]:
# SVC with default hyper-parameters on the unscaled features.
svm_baseline = SVC(random_state=0).fit(X_train, y_train)
print(
    'Best score:',
    np.round(accuracy_score(y_test, svm_baseline.predict(X_test)), 6),
)
evaluate_model(model=svm_baseline)

In [None]:
# Confusion matrix of svm_baseline's predictions on the held-out split.
y_pred = svm_baseline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# plot() creates its own figure when no `ax` is passed, so a separate
# plt.figure(...) call beforehand would only produce an extra, empty figure.
# To control the size, build axes with plt.subplots(figsize=...) and pass `ax=`.
disp.plot(cmap=plt.cm.Reds)
plt.show()


The dataset is too large for the SVM classifier, so only the default-parameter baseline is evaluated here and no grid search is run.

### Decision Tree Classifier

In [None]:
# Decision-tree pipeline. The tree is seeded (like tree_baseline) so that the
# grid search compares parameter settings rather than random tie-breaking,
# and so that re-running the cell gives the same result.
tree_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", DecisionTreeClassifier(random_state=0)),
])

# Coarse first-pass grid over split criterion, depth and leaf size.
tree_param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [2, 5, 10, 20, 30],
    'model__min_samples_leaf': [5, 10, 20, 30],
}

In [None]:
# Decision tree with default hyper-parameters on the unscaled features.
tree_baseline = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
print(
    'Best score:',
    np.round(accuracy_score(y_test, tree_baseline.predict(X_test)), 6),
)
evaluate_model(model=tree_baseline)

In [None]:
# Run the grid search; tree_model is rebound to the best fitted estimator,
# which is then scored on the held-out split.
tree_model = optimize_model(model=tree_model, param_grid=tree_param_grid)
evaluate_model(model=tree_model)

In [None]:
# Second pass: criterion and depth are fixed (presumably the winners of the
# first search -- confirm against its output) and only the leaf size is
# searched, over a much wider range. The tree is seeded so the comparison
# between leaf sizes is repeatable.
tree_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", DecisionTreeClassifier(criterion='entropy',
                                     max_depth=10,
                                     random_state=0)),
])
tree_param_grid = {
    'model__min_samples_leaf': [30, 45, 50, 75, 100, 150, 200, 300, 500, 650, 750, 900],
}
tree_model = optimize_model(tree_model, tree_param_grid)
evaluate_model(tree_model)

Narrowing down the search range for `min_samples_leaf`.

In [None]:
# Third pass: try every leaf size from 630 to 649. Neighbouring values differ
# very little, so the tree is seeded to keep the ranking from being decided
# by run-to-run randomness.
tree_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", DecisionTreeClassifier(criterion='entropy',
                                     max_depth=10,
                                     random_state=0)),
])
tree_param_grid = {'model__min_samples_leaf': np.arange(630, 650, 1)}
tree_model = optimize_model(tree_model, tree_param_grid)
evaluate_model(tree_model)

In [None]:
# Confusion matrix of tree_model's predictions on the held-out split.
y_pred = tree_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# plot() creates its own figure when no `ax` is passed, so a separate
# plt.figure(...) call beforehand would only produce an extra, empty figure.
# To control the size, build axes with plt.subplots(figsize=...) and pass `ax=`.
disp.plot(cmap=plt.cm.Reds)
plt.show()

### Random Forest Classifier

In [None]:
# Random-forest pipeline. The forest is seeded (like forest_baseline) so the
# grid search compares parameter settings rather than different random
# bootstraps, and so that re-running the cell gives the same result.
forest_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(random_state=0)),
])

# Coarse first-pass grid over forest size, depth, leaf size and criterion.
forest_param_grid = {
    'model__n_estimators': [200, 500, 1000],
    'model__max_depth': [5, 15, 50],
    'model__min_samples_leaf': [5, 10, 20],
    'model__criterion': ['gini', 'entropy'],
}

In [None]:
# Random forest with default hyper-parameters on the unscaled features.
forest_baseline = RandomForestClassifier(random_state=0).fit(X_train, y_train)
print(
    'Best score:',
    np.round(accuracy_score(y_test, forest_baseline.predict(X_test)), 6),
)
evaluate_model(model=forest_baseline)

In [None]:
# Run the grid search; forest_model is rebound to the best fitted estimator,
# which is then scored on the held-out split.
forest_model = optimize_model(model=forest_model, param_grid=forest_param_grid)
evaluate_model(model=forest_model)

In [None]:
# Second pass: leaf size and criterion are fixed (presumably the winners of
# the first search -- confirm against its output); forest size and depth are
# searched again. The forest is seeded so the comparison is repeatable.
forest_model = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestClassifier(min_samples_leaf=10,
                                     criterion='entropy',
                                     random_state=0)),
])
forest_param_grid = {
    'model__n_estimators': [400, 500, 700],
    'model__max_depth': [35, 50, 100],
}
forest_model = optimize_model(forest_model, forest_param_grid)
evaluate_model(forest_model)


# Disabled finer-grained forest search: the code below is wrapped in a bare
# string literal, so it is never executed.
'''forest_model = Pipeline([("scaler", StandardScaler()), ("model", RandomForestClassifier(min_samples_leaf = 10,
                                                                                        criterion = 'entropy'))])
forest_param_grid = {'model__n_estimators': np.arange(400, 500, 20),
                     'model__max_depth' : np.arange(50, 100, 10)}
forest_model = optimize_model(forest_model, forest_param_grid)
evaluate_model(forest_model)
'''

In [None]:
# Confusion matrix of forest_model's predictions on the held-out split.
y_pred = forest_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# plot() creates its own figure when no `ax` is passed, so a separate
# plt.figure(...) call beforehand would only produce an extra, empty figure.
# To control the size, build axes with plt.subplots(figsize=...) and pass `ax=`.
disp.plot(cmap=plt.cm.Reds)
plt.show()

We have finished the search. Our best random forest model uses the entropy criterion, a max depth of __, a minimum of __ samples per leaf node, and ___ estimators.


### XGBoost Classifier

In [None]:
# Scaled XGBoost pipeline and the coarse first-pass hyper-parameter grid.
xgb_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", xgb.XGBClassifier(eval_metric='error')),
])

xgb_param_grid = {
    'model__n_estimators': [200, 400, 800],
    'model__max_depth': [5, 10, 20],
    'model__learning_rate': [0.05, 0.1, 0.20],
    'model__min_child_weight': [1, 10, 100],
}

In [None]:
# XGBoost with default hyper-parameters (apart from eval_metric and the seed)
# on the unscaled features.
xgb_baseline = xgb.XGBClassifier(eval_metric='error', random_state=0)
xgb_baseline.fit(X_train, y_train)
print(
    'Best score:',
    np.round(accuracy_score(y_test, xgb_baseline.predict(X_test)), 6),
)
evaluate_model(model=xgb_baseline)

In [None]:
# Run the grid search; xgb_model is rebound to the best fitted estimator,
# which is then scored on the held-out split.
xgb_model = optimize_model(model=xgb_model, param_grid=xgb_param_grid)
evaluate_model(model=xgb_model)

In [None]:
# Second pass: shallower trees, smaller learning rates, larger forests and
# larger minimum child weights.
xgb_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", xgb.XGBClassifier(eval_metric='error')),
])
xgb_param_grid = {
    'model__n_estimators': [800, 1000],
    'model__max_depth': [1, 3, 5],
    'model__learning_rate': [0.001, 0.01, 0.05],
    'model__min_child_weight': [100, 200],
}
xgb_model = optimize_model(model=xgb_model, param_grid=xgb_param_grid)
evaluate_model(model=xgb_model)

In [None]:
# Third pass: depth, learning rate and child weight are fixed; only the
# number of boosting rounds is searched, around 800.
xgb_classifier_settings = dict(
    eval_metric='error',
    max_depth=5,
    learning_rate=0.05,
    min_child_weight=100,
)
xgb_model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", xgb.XGBClassifier(**xgb_classifier_settings)),
])
xgb_param_grid = {'model__n_estimators': [750, 800, 850]}
xgb_model = optimize_model(model=xgb_model, param_grid=xgb_param_grid)
evaluate_model(model=xgb_model)

In [None]:
# Confusion matrix of xgb_model's predictions on the held-out split.
y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# plot() creates its own figure when no `ax` is passed, so a separate
# plt.figure(...) call beforehand would only produce an extra, empty figure.
# To control the size, build axes with plt.subplots(figsize=...) and pass `ax=`.
disp.plot(cmap=plt.cm.Reds)
plt.show()

## Metrics

In [None]:
from sklearn import metrics

# Module-level accumulators: metrics_models() appends one entry per model to
# each list.
# NOTE(review): these names rebind (shadow) the sklearn metric functions
# imported at the top of the notebook, so any cell that calls e.g.
# accuracy_score(...) directly fails if it is re-run after this cell. They
# are kept because other cells read the lists under these names; renaming
# them means updating those cells in the same change.
model_name = []
accuracy_score = []
f1_score = []
roc_auc_score = []
recall_score = []
precision_score = []


def metrics_models(name, model, X_test = X_test, y_test = y_test):
    """Score ``model`` on the test data and append the results to the metric lists.

    Parameters
    ----------
    name : label stored in ``model_name`` for this model.
    model : fitted estimator exposing ``predict``.
    X_test, y_test : evaluation data; default to the held-out split that
        existed when this function was defined.

    Accuracy, F1, recall and precision are computed from the predicted
    labels. AUC is computed from continuous scores when the model provides
    them, because ranking hard 0/1 labels collapses the ROC curve to a single
    threshold. The default binary averaging of the F1/recall/precision calls
    means the target is expected to be binary.
    """
    # Predict once and reuse the labels for every label-based metric.
    y_pred = model.predict(X_test)

    if hasattr(model, 'predict_proba'):
        # Column 1 is the larger class label, which the binary metrics treat
        # as the positive class (assumes a two-class target).
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        # No continuous score available: fall back to the hard labels.
        y_score = y_pred

    model_name.append(name)
    accuracy_score.append(metrics.accuracy_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    roc_auc_score.append(metrics.roc_auc_score(y_test, y_score))
    recall_score.append(metrics.recall_score(y_test, y_pred))
    precision_score.append(metrics.precision_score(y_test, y_pred))


In [None]:
# Record test-set metrics for each fitted model: the default-parameter
# baseline first, then its tuned counterpart.
metrics_models('Base GaussianNB', gaussianNB_model)
metrics_models('Base MultinomialNB', multinomialNB_model)
# lr_baseline / lr_model are LogisticRegression models, so they are labelled
# as such (they were previously mislabelled 'LinearRegression').
metrics_models('Base LogisticRegression', lr_baseline)
metrics_models('LogisticRegression', lr_model)
metrics_models('Base DecisionTreeClassifier', tree_baseline)
metrics_models('DecisionTreeClassifier', tree_model)
metrics_models('Base RandomForestClassifier', forest_baseline)
metrics_models('RandomForestClassifier', forest_model)
metrics_models('Base XGBoostClassifier', xgb_baseline)
metrics_models('XGBoostClassifier', xgb_model)

In [None]:
# Assemble the collected per-model metrics into one table, one row per model.
metrics_df = pd.DataFrame(
    data={
        'Model': model_name,
        'Accuracy': accuracy_score,
        'F1-score': f1_score,
        'AUC': roc_auc_score,
        'Recall': recall_score,
        'Precision': precision_score,
    },
)

In [None]:
# Bare expression as the last line of the cell, so the notebook renders the table.
metrics_df