Imports

In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import helpers.evaluate
from helpers.model_list import model_map

#Import models
models = {name: model_class() for name, model_class in model_map.items()}

ModuleNotFoundError: No module named 'optuna'

Load in Data

In [None]:
df = pd.read_csv("diabetes_binary_5050split_health_indicators_BRFSS2015.csv")
X = df.drop(columns=["Diabetes_binary"])
y = df["Diabetes_binary"]
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


Explore Data

In [None]:
print("Dataset shape:", df.shape)
print("\nData types:")
print(df.dtypes)


Dataset shape: (70692, 22)

Data types:
Diabetes_binary         float64
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object


In [None]:
print("\nSummary statistics:")
print(df.describe())


Summary statistics:
       Diabetes_binary        HighBP      HighChol     CholCheck  \
count     70692.000000  70692.000000  70692.000000  70692.000000   
mean          0.500000      0.563458      0.525703      0.975259   
std           0.500004      0.495960      0.499342      0.155336   
min           0.000000      0.000000      0.000000      0.000000   
25%           0.000000      0.000000      0.000000      1.000000   
50%           0.500000      1.000000      1.000000      1.000000   
75%           1.000000      1.000000      1.000000      1.000000   
max           1.000000      1.000000      1.000000      1.000000   

                BMI        Smoker        Stroke  HeartDiseaseorAttack  \
count  70692.000000  70692.000000  70692.000000          70692.000000   
mean      29.856985      0.475273      0.062171              0.147810   
std        7.113954      0.499392      0.241468              0.354914   
min       12.000000      0.000000      0.000000              0.000000   
2

In [None]:
# Count missing values in each column
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


In [None]:
# Check for duplicate records
print("\nNumber of duplicate rows:", df.duplicated().sum())


Number of duplicate rows: 1635


Clean Data

In [None]:
print("Dataset shape:", df.shape)
df = df.drop_duplicates()
print("New shape:", df.shape)

Dataset shape: (70692, 22)
New shape: (69057, 22)


Split Data

In [None]:
#Transform target
le = LabelEncoder()
y = le.fit_transform(y)

#Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8)

#Scale Data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Reassign col names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns)


Train Models

In [None]:


total_metrics = {}

#train each model
for name, model in models.items():
    model.train(X_train_scaled, y_train)
    metrics = model.predict(X_test_scaled, y_test)
    total_metrics[name] = metrics
    helpers.evaluate.print_stats(metrics, name)


Decision Tree  Accuracy Score:  0.6567174735650882
Decision Tree  Precision Score:  0.6571104172580188
Decision Tree  Recalle Score:  0.6551114255394411
Decision Tree  F1 Score:  0.6561093988025649
Logistic Regression  Accuracy Score:  0.747444919899565
Logistic Regression  Precision Score:  0.7357650945622493
Logistic Regression  Recalle Score:  0.7720198089847895
Logistic Regression  F1 Score:  0.7534565791517788
K Nearest Neighbors  Accuracy Score:  0.6975280263111363
K Nearest Neighbors  Precision Score:  0.6868514427261163
K Nearest Neighbors  Recalle Score:  0.7258224266006367
K Nearest Neighbors  F1 Score:  0.7057993946064942
SVC  Accuracy Score:  0.7430950949534958
SVC  Precision Score:  0.7180626527438347
SVC  Recalle Score:  0.8002829854969933
SVC  F1 Score:  0.7569466517222343


Report Stats

In [None]:
#Get best metrics
bestModel = ""
bestPerformance = 0
for name, metrics in total_metrics.items():
    if(metrics[0] > bestPerformance):
        bestModel = name
        bestPerformance = metrics[0]

print("Best Model = ", bestModel)
print("Model Accuracy = ", bestPerformance)


Best Model =  Logistic Regression
Model Accuracy =  0.747444919899565


Hyperparameter Tuning

In [None]:
svc = models["SVC"]
svc.tune_hyperParams(X_test_scaled, y_test)

NameError: name 'models' is not defined