## 1. Import Libraries

Python 3.11.5

**Library versions:**

* matplotlib==3.8.0

* numpy==1.26.0

* pandas==2.1.1

* scikit-learn==1.3.1
 
* seaborn==0.13.0

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV


print("Libraries imported successfully yay!")

Libraries imported successfully yay!


## 2. Initial Dataset Understanding

### 2.1 Read dataset

In [2]:
# Load the BRFSS 2015 diabetes health-indicators CSV from the local ./data folder.
df = pd.read_csv('./data/diabetes_binary_health_indicators_BRFSS2015.csv')

### 2.2. Basic Dataset Information

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Column dtypes and non-null counts of the raw data.
df.info()

In [None]:
# List the column names.
df.columns

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

### 2.3 Data Cleaning

#### 2.3.1 Transform the data to integer values

In [3]:
# Cast every listed column to int64 in one pass; a dtype mapping replaces the
# one-assignment-per-column form.
int_columns = [
    'Age', 'Education', 'Income', 'BMI', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
    'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Diabetes_binary',
    'HighBP', 'HighChol',
]
df = df.astype({column: 'int64' for column in int_columns})

In [None]:
# Re-check the dtypes after the int64 conversion.
df.info()

#### 2.3.2 Check for missing data (null values)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

#### 2.3.3 Check for duplicated data and remove duplicates

In [None]:
# Keep the first occurrence of every fully duplicated row and report how many
# rows that removes.
deduplicated = df.drop_duplicates()
duplicates = len(df) - len(deduplicated)
print(f"There are: {duplicates} duplicates")

# Rebind instead of mutating in place so the cell can be re-run safely.
df = deduplicated

#### 2.3.4 Get dataset information on rows x columns

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

#### 2.3.5 Check number of unique values in different features

In [None]:
# Number of distinct (non-null) values per column, shown as a one-column table.
unique_values = {col: df[col].nunique() for col in df.columns}

pd.Series(unique_values).to_frame(name='# Unique Values')

#### 2.3.6 Rename target variable name

In [5]:
# Shorten the target column name; the following cells refer to it as 'Diabetes'.
df = df.rename(columns={'Diabetes_binary': 'Diabetes'})

## 2.3 EDA - Exploratory Data Analysis

### 2.3.1 Stacked Bar Chart Analysis of Features vs. Diabetes

In [None]:
# Indicator columns that are plotted as stacked bars against the target below.
binary_cols = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 
               'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 
               'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk']

import matplotlib.pyplot as plt

def create_pivot(data, column):
    """Cross-tabulate ``column`` against the ``Diabetes`` target.

    Returns a DataFrame of row counts with one row per value of ``column``
    and one column per ``Diabetes`` class; combinations that never occur
    are filled with 0.
    """
    pair_counts = data.groupby([column, 'Diabetes']).size()
    return pair_counts.unstack(fill_value=0)

def plot_stacked_bars(data, columns, n_cols=4):
    """Draw one stacked bar chart per column, split by ``Diabetes`` class.

    The subplot grid is sized from ``len(columns)`` so every requested column
    gets its own panel. A fixed 3x4 grid only has 12 axes and silently dropped
    the 13th column.

    Parameters
    ----------
    data : DataFrame containing ``columns`` and a ``Diabetes`` column.
    columns : iterable of column names to plot.
    n_cols : number of panels per row (default 4).
    """
    columns = list(columns)
    if not columns:
        # Nothing to draw; avoid creating an empty figure.
        return

    # Ceiling division: enough rows to hold every column.
    n_rows = (len(columns) + n_cols - 1) // n_cols
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, columns):
        create_pivot(data, col).plot(kind='bar', stacked=True, ax=ax)
        ax.set_title(col)
        ax.set_xlabel(col)

    # Turn off unused subplots
    for ax in axes[len(columns):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Draw the stacked bar charts for the indicator columns listed in binary_cols.
plot_stacked_bars(df, binary_cols)


### 2.3.2 Target Distribution across the dataset

In [None]:
# Class counts of the target, with the exact count written above each bar.
ax = sns.countplot(x='Diabetes', data=df)
ax.set_title('Diabetes Prevalence')
ax.set_xlabel('Diabetes (0: No Diabetes, 1: Has Diabetes)')
ax.set_ylabel('Count')

for container in ax.containers:
    ax.bar_label(container)
plt.show()

Percentage of people with diabetes and people with no diabetes:

In [None]:
# value_counts() orders by frequency, not by class label. Reindex to [0, 1] so
# each wedge always matches its hard-coded label, even if the class balance
# changes.
diabetes_counts = df['Diabetes'].value_counts().reindex([0, 1], fill_value=0)

# Plot the pie chart
plt.figure(figsize=(8, 8))
plt.pie(
    diabetes_counts, 
    labels=['No Diabetes (0)', 'Has Diabetes (1)'], 
    autopct='%.02f%%',
    startangle=90, 
)
plt.title('Diabetes Prevalence')
plt.show()

There are more people without diabetes than people with diabetes.

### 2.3.3 Diabetes Prevalence by Gender

In [None]:
# Build a labelled copy for plotting; the numeric Sex encoding in df stays untouched.
sex_labels = {1: 'Male', 0: 'Female'}
df_plot = df.assign(Sex=df['Sex'].replace(sex_labels))

# The mean of the 0/1 target per group is the proportion with diabetes.
ax = sns.barplot(x='Sex', y='Diabetes', data=df_plot, errorbar=None)
ax.set_title('Diabetes Risk by Gender')
ax.set_ylabel('Proportion with Diabetes')
plt.show()

### 2.3.4 Diabetes Prevalence by Age Categories

In [None]:
# Age category code (1-13) -> readable age range.
age_labels = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80+',
]
age_category_map = dict(enumerate(age_labels, start=1))

# Counts per age category, split by diabetes status.
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(x='Age', hue='Diabetes', data=df, order=range(1, 14), ax=ax)
ax.set_title('Diabetes Prevalence by Age')
ax.set_xlabel('Age Categories')
ax.set_ylabel('Count')
plt.show()

# Restrict to respondents with diabetes and count them per age category.
df_diabetes = df.loc[df['Diabetes'] == 1]
age_category_count_diabetes = df_diabetes['Age'].value_counts().sort_index()

# Category holding the most diabetic respondents, and how many that is.
max_age_category_diabetes = age_category_count_diabetes.idxmax()
max_count_diabetes = age_category_count_diabetes.loc[max_age_category_diabetes]
max_age_group_diabetes = age_category_map[max_age_category_diabetes]

print(f"Age category with the highest number of people with diabetes: {max_age_category_diabetes} ({max_age_group_diabetes} years old) with a total of {max_count_diabetes} people.")


As age increases, so does the number of people diagnosed with diabetes.

### 2.3.5 Diabetes Prevalence in Males

In [48]:
# All male respondents (Sex == 1), with and without diabetes.
df_males = df[(df['Sex'] == 1)]

In [None]:
# Counts per age category for males, split by diabetes status.
sns.countplot(x='Age', hue='Diabetes', data=df_males, order=range(1, 14))
plt.title('Diabetes Prevalence in Males by Age Category')
plt.xlabel('Age Categories')
plt.ylabel('Count')
plt.show()

# Males with diabetes, counted per age category.
df_males_diabetes = df_males[df_males['Diabetes'] == 1]

age_category_count_male_diabetes = df_males_diabetes['Age'].value_counts().sort_index()

max_age_category_male_diabetes = age_category_count_male_diabetes.idxmax()
max_count_male_diabetes = age_category_count_male_diabetes.max()

max_age_group_male_diabetes = age_category_map[max_age_category_male_diabetes]

# The counts cover males only, so the message says "males" instead of the
# generic "people" it previously printed.
print(f"Age category with the highest number of males with diabetes: {max_age_category_male_diabetes} ({max_age_group_male_diabetes} years old) with a total of {max_count_male_diabetes} people.")

In [None]:
# Males (Sex == 1) whose BMI is above 40.
is_male = df['Sex'] == 1
bmi_above_40 = df['BMI'] > 40
df_males_bmi_above_40 = df[is_male & bmi_above_40]

# Class counts within that group (index 1 = has diabetes, 0 = no diabetes).
diabetes_count = df_males_bmi_above_40['Diabetes'].value_counts()

# Share of the group that has diabetes; the remainder is the share without.
n_with_diabetes = diabetes_count.get(1, 0)
diabetes_percentage = (n_with_diabetes / len(df_males_bmi_above_40)) * 100
no_diabetes_percentage = 100 - diabetes_percentage

# Pie chart inputs; the first wedge (with diabetes) is pulled out slightly.
labels = ['With Diabetes', 'Without Diabetes']
sizes = [diabetes_percentage, no_diabetes_percentage]
colors = ['#ff9999','#66b3ff']
explode = (0.1, 0)

fig, ax = plt.subplots()
ax.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax.set_title('Percentage of Diabetic Males with BMI > 40')
ax.axis('equal')
plt.show()

print(f"Percentage diabetic males with BMI > 40: {diabetes_percentage:.1f}%")

### 2.3.6 Diabetes Prevalence in Females

In [51]:
# All female respondents (Sex == 0), with and without diabetes.
df_females = df[(df['Sex'] == 0)]

In [None]:
# Counts per age category for females, split by diabetes status.
sns.countplot(x='Age', hue='Diabetes', data=df_females, order=range(1, 14))
plt.title('Diabetes Prevalence in Females by Age Category')
plt.xlabel('Age Categories')
plt.ylabel('Count')
plt.show()

# Females with diabetes, counted per age category.
df_females_diabetes = df_females[df_females['Diabetes'] == 1]

age_category_count_female_diabetes = df_females_diabetes['Age'].value_counts().sort_index()

max_age_category_female_diabetes = age_category_count_female_diabetes.idxmax()
max_count_female_diabetes = age_category_count_female_diabetes.max()

max_age_group_female_diabetes = age_category_map[max_age_category_female_diabetes]

# The counts cover females only, so the message says "females" instead of the
# generic "people" it previously printed.
print(f"Age category with the highest number of females with diabetes: {max_age_category_female_diabetes} ({max_age_group_female_diabetes} years old) with a total of {max_count_female_diabetes} people.")

In [None]:
# Females (Sex == 0) whose BMI is above 40.
is_female = df['Sex'] == 0
bmi_above_40 = df['BMI'] > 40
df_females_bmi_above_40 = df[is_female & bmi_above_40]

# Class counts within that group (index 1 = has diabetes, 0 = no diabetes).
diabetes_count = df_females_bmi_above_40['Diabetes'].value_counts()

# Share of the group that has diabetes; the remainder is the share without.
n_with_diabetes = diabetes_count.get(1, 0)
diabetes_percentage = (n_with_diabetes / len(df_females_bmi_above_40)) * 100
no_diabetes_percentage = 100 - diabetes_percentage

# Pie chart inputs; the first wedge (with diabetes) is pulled out slightly.
labels = ['With Diabetes', 'Without Diabetes']
sizes = [diabetes_percentage, no_diabetes_percentage]
colors = ['#ff9999','#66b3ff']
explode = (0.1, 0)

fig, ax = plt.subplots()
ax.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax.set_title('Percentage of Diabetic Females with BMI > 40')
ax.axis('equal')
plt.show()

print(f"Percentage diabetic females with BMI > 40: {diabetes_percentage:.1f}%")

### 2.3.7 Diabetes and Risk Factors Analysis

In [None]:
# BMI distribution for each diabetes class.
ax = sns.boxplot(x='Diabetes', y='BMI', data=df)
ax.set_title('BMI Distribution by Diabetes Risk')
ax.set_xlabel('Diabetes (0: No Diabetes, 1: Has Diabetes)')
ax.set_ylabel('BMI')
plt.show()

# One bar chart per risk factor: (feature, title, x-axis label).
# Each bar is the mean of the 0/1 target, i.e. the proportion with diabetes.
risk_factor_plots = [
    ('PhysActivity', 'Diabetes Risk by Physical Activity', 'Physical Activity (1: Yes, 0: No)'),
    ('Smoker', 'Diabetes Risk by Smoking Status', 'Smoking (1: Yes, 0: No)'),
    ('GenHlth', 'Diabetes Risk by General Health', 'General Health (1: Excellent, 5: Poor)'),
]

for feature, title, xlabel in risk_factor_plots:
    ax = sns.barplot(x=feature, y='Diabetes', data=df, errorbar=None)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Proportion with Diabetes')
    plt.show()


### 2.3.8 Education Feature vs. Diabetes

In [None]:
# Overlaid density histograms of Education, one per diabetes class.
fig, ax = plt.subplots(figsize=(15, 6))

# (target class, colour, legend label); drawn in this order.
education_groups = [
    (0, "y", "No Diabetes"),
    (1, "m", "Has Diabetes"),
]
for diabetes_class, colour, group_label in education_groups:
    education = df.loc[df['Diabetes'] == diabetes_class, 'Education']
    sns.histplot(education, color=colour, label=group_label, kde=True, stat="density", ax=ax)

ax.set_title("Relation b/w Education and Diabetes")
ax.set_xlabel("Education")
ax.set_ylabel("Density")
ax.legend()
plt.show()


Conclusions:
- There are more people with higher levels of education.
- There are more people without diabetes who have higher levels of education.

## 2.4. Correlation Analysis

#### 2.4.1 HeatMap

In [None]:
# Pairwise correlation of every column, annotated to two decimals.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

#### 2.4.2 Correlation of Features with Target Variable

In [None]:
# Correlation of each feature with the target, as a bar chart.
feature_frame = df.drop('Diabetes', axis=1)
target_correlation = feature_frame.corrwith(df['Diabetes'])
target_correlation.plot(
    kind='bar',
    grid=True,
    figsize=(16, 4),
    title="Correlation with Diabetes",
    color="Brown",
)

##### Highly Correlated Features

In [None]:
# Columns whose absolute correlation with the target is at least 0.1.
# The target itself is part of the correlation matrix, so it is listed too.
high_corr = df.corr()
is_strong = high_corr["Diabetes"].abs() >= 0.1
high_corr_features = high_corr.index[is_strong]
high_corr_features

##### Weakly Correlated Features

In [None]:
# Columns whose absolute correlation with the target is below 0.05.
low_corr = df.corr()
is_weak = low_corr["Diabetes"].abs() < 0.05
low_corr_features = low_corr.index[is_weak]
low_corr_features

## 3. Data Preprocessing

### 3.1 Feature Selection

In [7]:
# Remove the features judged too weakly correlated with the target.
low_corr = ['Smoker', 'Fruits', 'Veggies', 'AnyHealthcare', 'NoDocbcCost', 'Sex']
df = df.drop(columns=low_corr)

### 3.2 Data Splitting for Training and Testing

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features (everything except the target) and target variable
X = df.drop('Diabetes', axis = 1)
y = df['Diabetes']

# Split data: 80% train / 20% test, reproducible via the fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data
# Fit the scaler on the training split only and reuse those statistics for the
# test split. Re-fitting on X_test would standardise it with its own mean/std,
# leaking test information and putting the two splits on different scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## 4. Machine Learning Algorithms

### Before Tuning

In [None]:
# Baseline (untuned) classifiers. The stochastic estimators get a fixed seed so
# the baseline numbers are reproducible and comparable with the tuned runs,
# which also use random_state=42.
models = {
    "Support Vector Machine": SVC(kernel='linear'),
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(), 
}

# (model name, test accuracy) for every baseline model
results = []

for name, model in models.items():
    # Train model
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Get accuracy
    accuracy = accuracy_score(y_test, y_pred)
    results.append((name, accuracy))

    # Metrics
    print(f"{name} Accuracy: {accuracy:.4f}")
    
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for {name}:\n{cm}")

    # Plot confusion matrix
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=['Negative', 'Positive'], 
                yticklabels=['Negative', 'Positive'])
    plt.title(f"Confusion Matrix for {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    print("-" * 50)

# Convert results to DataFrame, best accuracy first
df_results = pd.DataFrame(results, columns=["Model", "Accuracy"])
df_results = df_results.sort_values(by="Accuracy", ascending=False)

# Print sorted results
print("Summary of Results:")

print(df_results)


### HyperParameter Tuning Function

In [12]:
# One summary dict per tuned model; hyperparameter_tuning() appends to it.
results = []

def hyperparameter_tuning(model, param_dist, X_train, y_train, X_test, y_test, cv=5, n_candidates=50, scoring='accuracy'):
    """Tune ``model`` with HalvingRandomSearchCV and evaluate it on the test set.

    Runs the search on the training data, prints the best parameters, score
    and estimator, then scores the best estimator on ``X_test``/``y_test``
    (accuracy, weighted precision/recall/F1, classification report, confusion
    matrix) and plots the confusion matrix. A summary dict is appended to the
    module-level ``results`` list as a side effect.

    Parameters
    ----------
    model : estimator to tune.
    param_dist : parameter distributions passed to the search.
    X_train, y_train : data the search is fitted on.
    X_test, y_test : held-out data used for the final evaluation.
    cv : cross-validation setting forwarded to the search.
    n_candidates : number of candidates forwarded to the search.
    scoring : scoring setting forwarded to the search.

    Returns
    -------
    tuple of (best estimator, best parameter dict, metrics dict).
    """
    search = HalvingRandomSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_candidates=n_candidates,
        cv=cv,
        scoring=scoring,
        verbose=2,
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train, y_train)

    # Winner of the search
    best_model = search.best_estimator_
    best_params = search.best_params_
    model_name = model.__class__.__name__

    print(f"Best Model Parameters: {best_params}")
    print(f"Best {model_name} Score: {search.best_score_:.4f}")
    print(f"Best {model_name} Estimator: {best_model}")

    # Held-out evaluation of the winner
    y_pred = best_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    weighted = {"average": 'weighted', "zero_division": 0}
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, **weighted),
        "recall": recall_score(y_test, y_pred, **weighted),
        "f1_score": f1_score(y_test, y_pred, **weighted),
        "classification_report": classification_report(y_test, y_pred),
        "confusion_matrix": cm
    }

    # Scalar metrics go on one line; the report and the matrix span several
    # lines and are printed under their own heading.
    print("\nEvaluation Metrics on Test Set:")
    multiline_metrics = ("confusion_matrix", "classification_report")
    for metric, value in metrics.items():
        if metric in multiline_metrics:
            print(f"\n{metric}:\n{value}")
        else:
            print(f"{metric}: {value:.4f}")

    # Confusion matrix heatmap
    class_names = ['No Diabetes', 'Diabetes']
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    # Record the search score next to the test accuracy for the final comparison
    results.append({
        "model": model_name,
        "best_accuracy": search.best_score_,
        "test_accuracy": metrics["accuracy"],
        "params": best_params
    })

    return best_model, best_params, metrics


### 4.1 Logistic Regression

In [None]:
model = LogisticRegression()

# Only solvers that support both penalties are searched. 'lbfgs' accepts l2
# but not l1, so sampling it together with penalty='l1' produced candidates
# that fail to fit and waste part of the search budget.
param_dist = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'solver': ['liblinear', 'saga'],  
    'max_iter': [100, 200, 300],  
    'penalty': ['l1', 'l2'],  
}

best_model, best_params, metrics = hyperparameter_tuning(
    model=model,
    param_dist=param_dist,
    X_train=X_train,  
    y_train=y_train,  
    X_test=X_test,    
    y_test=y_test,   
    cv=5,             
    n_candidates=20,        
    scoring='accuracy'
)

### 4.2 Random Forest

In [None]:
# Seeded forest so the search is repeatable.
model = RandomForestClassifier(random_state=42)

# Search space: ensemble size, tree depth, split/leaf sizes and bootstrapping.
param_dist = dict(
    n_estimators=[50, 100, 150, 200, 250],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

best_model, best_params, metrics = hyperparameter_tuning(
    model, param_dist,
    X_train, y_train,
    X_test, y_test,
    cv=5, n_candidates=20, scoring='accuracy',
)

### 4.3 Decision Tree

In [None]:
model = DecisionTreeClassifier(random_state=42)

# 'auto' is not a valid max_features value in the pinned scikit-learn 1.3.1
# (it used to mean 'sqrt'), so candidates using it fail to fit. None (use all
# features) takes its place so the search also covers the unrestricted case.
param_dist = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
}

best_model, best_params, metrics = hyperparameter_tuning(
    model=model,
    param_dist=param_dist,
    X_train=X_train,  
    y_train=y_train,  
    X_test=X_test,    
    y_test=y_test,   
    cv=5,             
    n_candidates=20,        
    scoring='accuracy'
)

### 4.4 KNeighborsClassifier Model (KNN)

In [None]:
model = KNeighborsClassifier()

# Search space: neighbourhood size, vote weighting, index structure,
# Minkowski power (1 or 2) and tree leaf size.
param_dist = dict(
    n_neighbors=[3, 5, 7, 10],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
    leaf_size=[20, 30, 40, 50],
)

best_model, best_params, metrics = hyperparameter_tuning(
    model, param_dist,
    X_train, y_train,
    X_test, y_test,
    cv=5, n_candidates=20, scoring='accuracy',
)

### 4.5 Support Vector Machine (SVM)

In [None]:
model = SVC()

# Search space: regularisation strength, kernel type, polynomial degree and
# kernel coefficient.
param_dist = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[3, 4, 5],
    gamma=['scale', 'auto'],
)

best_model, best_params, metrics = hyperparameter_tuning(
    model, param_dist,
    X_train, y_train,
    X_test, y_test,
    cv=5, n_candidates=20, scoring='accuracy',
)

### 4.6 Gradient Boosting

In [None]:
# Seeded so runs are repeatable: subsample < 1 and max_features both introduce
# randomness, and the other tree-based searches already use random_state=42.
model = GradientBoostingClassifier(random_state=42)

# 'auto' is not a valid max_features value in the pinned scikit-learn 1.3.1,
# so it is left out; None (all features) is already part of the list.
param_dist = {
    'n_estimators': [50, 100, 150, 200, 250],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2', None],
    'warm_start': [True, False],
    'n_iter_no_change': [5, 10, 20],
}

best_model, best_params, metrics = hyperparameter_tuning(
    model=model,
    param_dist=param_dist,
    X_train=X_train,  
    y_train=y_train,  
    X_test=X_test,    
    y_test=y_test,   
    cv=5,             
    n_candidates=20,        
    scoring='accuracy'
)

## 5. Results

In [None]:
# Collect the per-model summaries and rank them by held-out accuracy.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="test_accuracy", ascending=False)
)

summary_columns = ["model", "test_accuracy", "best_accuracy"]
print("\nModel Comparison (Sorted by Test Accuracy):")
print(results_df[summary_columns])