## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Reaching this line means every import above resolved without error.
print("Libraries imported successfully yay!")

## 2. Initial Dataset Understanding

### 2.1 Read dataset

In [4]:
# Load the BRFSS 2015 diabetes health-indicators CSV (path is relative to the
# notebook's working directory) into the DataFrame used by every later cell.
df = pd.read_csv('./data/diabetes_binary_health_indicators_BRFSS2015.csv')

### 2.2. Basic Dataset Information

In [None]:
# Preview the first five rows to sanity-check the load.
df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info()

In [5]:
# Cast each listed column to int64, one column at a time, in the same order
# as before.
int_columns = [
    'Age', 'Education', 'Income', 'BMI', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
    'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Diabetes_binary',
    'HighBP', 'HighChol',
]
for column in int_columns:
    df[column] = df[column].astype('int64')

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Summary statistics, transposed so each column of df becomes one row.
df.describe().T

### 2.3 Data Cleaning

In [None]:
# Count missing (null) values in each column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

In [7]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# (rows, columns) of the DataFrame at this point in the notebook.
df.shape

In [8]:
# Give the target column a shorter name; later cells refer to it as 'Diabetes'.
column_renames = {'Diabetes_binary': 'Diabetes'}
df = df.rename(columns=column_renames)

## 2.3 EDA - Exploratory Data Analysis

### 2.3.1 Target Distribution across the dataset

In [None]:
# Count of rows in each target class.
ax = sns.countplot(data=df, x='Diabetes')
ax.set_title('Diabetes Prevalence')
ax.set_xlabel('Diabetes (0: No Diabetes, 1: Has Diabetes)')
ax.set_ylabel('Count')

# Write the count on top of every bar.
for container in ax.containers:
    ax.bar_label(container)
plt.show()

### 2.3.2 Diabetes Prevalence by Gender

In [None]:
# Work on a copy so the numeric Sex codes in df stay untouched.
sex_labels = {1: 'Male', 0: 'Female'}
df_plot = df.assign(Sex=df['Sex'].replace(sex_labels))

# Bar height is the mean of the 0/1 target, i.e. the share with diabetes.
ax = sns.barplot(data=df_plot, x='Sex', y='Diabetes', errorbar=None)
ax.set_title('Diabetes Risk by Gender')
ax.set_ylabel('Proportion with Diabetes')
plt.show()


### 2.3.3 Diabetes Prevalence by Age Categories

In [None]:
# Readable age ranges, listed in order of the Age codes 1..13.
age_ranges = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80+',
]
# Age code -> readable range; reused by the per-gender cells below.
age_category_map = dict(enumerate(age_ranges, start=1))

# Counts per age code, split by diabetes status.
ax = sns.countplot(data=df, x='Age', hue='Diabetes', order=range(1, 14))
ax.set_title('Diabetes Prevalence by Age')
ax.set_xlabel('Age Categories')
ax.set_ylabel('Count')
plt.show()

# Restrict to rows with Diabetes == 1 and count them per age code.
df_diabetes = df.loc[df['Diabetes'] == 1]
age_category_count_diabetes = df_diabetes['Age'].value_counts().sort_index()

# Age code with the largest count, the count itself, and its readable label.
max_age_category_diabetes = age_category_count_diabetes.idxmax()
max_count_diabetes = age_category_count_diabetes.loc[max_age_category_diabetes]
max_age_group_diabetes = age_category_map[max_age_category_diabetes]

print(
    "Age category with the highest number of people with diabetes: "
    f"{max_age_category_diabetes} ({max_age_group_diabetes} years old) "
    f"with a total of {max_count_diabetes} people."
)


### 2.3.4 Diabetes Prevalence in Males

In [None]:
# All rows with Sex == 1 (male), regardless of diabetes status.
df_males = df.loc[df['Sex'].eq(1)]

In [None]:
# Counts per age code among males, split by diabetes status.
ax = sns.countplot(data=df_males, x='Age', hue='Diabetes', order=range(1, 14))
ax.set_title('Diabetes Prevalence in Males by Age Category')
ax.set_xlabel('Age Categories')
ax.set_ylabel('Count')
plt.show()

# Males with Diabetes == 1, counted per age code.
df_males_diabetes = df_males[df_males['Diabetes'] == 1]
age_category_count_male_diabetes = df_males_diabetes['Age'].value_counts().sort_index()

# Age code with the largest count, the count itself, and its readable label.
max_age_category_male_diabetes = age_category_count_male_diabetes.idxmax()
max_count_male_diabetes = age_category_count_male_diabetes.max()
max_age_group_male_diabetes = age_category_map[max_age_category_male_diabetes]

# The message names males explicitly: this figure covers the male subset
# only, so it must not read like the whole-population result.
print(
    "Age category with the highest number of males with diabetes: "
    f"{max_age_category_male_diabetes} ({max_age_group_male_diabetes} years old) "
    f"with a total of {max_count_male_diabetes} males."
)

In [None]:
# Males (Sex == 1) whose BMI is strictly above 40.
is_male = df['Sex'] == 1
bmi_over_40 = df['BMI'] > 40
df_males_bmi_above_40 = df.loc[is_male & bmi_over_40]

# Rows per target class inside that subgroup.
diabetes_count = df_males_bmi_above_40['Diabetes'].value_counts()

# Share of the subgroup with Diabetes == 1; .get(1, 0) covers the case where
# no row in the subgroup has the positive class.
diabetes_percentage = (diabetes_count.get(1, 0) / len(df_males_bmi_above_40)) * 100
no_diabetes_percentage = 100 - diabetes_percentage

# Pie chart of the two shares, with the diabetic slice pulled out slightly.
labels = ['With Diabetes', 'Without Diabetes']
sizes = [diabetes_percentage, no_diabetes_percentage]
colors = ['#ff9999','#66b3ff']
explode = (0.1, 0)

ax = plt.gca()
ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Percentage of Diabetic Males with BMI > 40')
ax.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

print(f"Percentage diabetic males with BMI > 40: {diabetes_percentage:.1f}%")

### 2.3.5 Diabetes Prevalence in Females

In [133]:
# All rows with Sex == 0 (female), regardless of diabetes status.
df_females = df.loc[df['Sex'].eq(0)]

In [None]:
# Counts per age code among females, split by diabetes status.
ax = sns.countplot(data=df_females, x='Age', hue='Diabetes', order=range(1, 14))
ax.set_title('Diabetes Prevalence in Females by Age Category')
ax.set_xlabel('Age Categories')
ax.set_ylabel('Count')
plt.show()

# Females with Diabetes == 1, counted per age code.
df_females_diabetes = df_females[df_females['Diabetes'] == 1]
age_category_count_female_diabetes = df_females_diabetes['Age'].value_counts().sort_index()

# Age code with the largest count, the count itself, and its readable label.
max_age_category_female_diabetes = age_category_count_female_diabetes.idxmax()
max_count_female_diabetes = age_category_count_female_diabetes.max()
max_age_group_female_diabetes = age_category_map[max_age_category_female_diabetes]

# The message names females explicitly: this figure covers the female subset
# only, so it must not read like the whole-population result.
print(
    "Age category with the highest number of females with diabetes: "
    f"{max_age_category_female_diabetes} ({max_age_group_female_diabetes} years old) "
    f"with a total of {max_count_female_diabetes} females."
)

In [None]:
# Females (Sex == 0) whose BMI is strictly above 40.
is_female = df['Sex'] == 0
bmi_over_40 = df['BMI'] > 40
df_females_bmi_above_40 = df.loc[is_female & bmi_over_40]

# Rows per target class inside that subgroup.
diabetes_count = df_females_bmi_above_40['Diabetes'].value_counts()

# Share of the subgroup with Diabetes == 1; .get(1, 0) covers the case where
# no row in the subgroup has the positive class.
diabetes_percentage = (diabetes_count.get(1, 0) / len(df_females_bmi_above_40)) * 100
no_diabetes_percentage = 100 - diabetes_percentage

# Pie chart of the two shares, with the diabetic slice pulled out slightly.
labels = ['With Diabetes', 'Without Diabetes']
sizes = [diabetes_percentage, no_diabetes_percentage]
colors = ['#ff9999','#66b3ff']
explode = (0.1, 0)

ax = plt.gca()
ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Percentage of Diabetic Females with BMI > 40')
ax.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

print(f"Percentage diabetic females with BMI > 40: {diabetes_percentage:.1f}%")

### 2.3.6 Diabetes and Risk Factors Analysis

In [None]:
# BMI distribution for each target class.
ax = sns.boxplot(data=df, x='Diabetes', y='BMI')
ax.set_title('BMI Distribution by Diabetes Risk')
ax.set_xlabel('Diabetes (0: No Diabetes, 1: Has Diabetes)')
ax.set_ylabel('BMI')
plt.show()

# One bar chart per risk factor: (column, title, x-axis label).
# Bar height is the mean of the 0/1 target within each category.
risk_factor_plots = [
    ('PhysActivity', 'Diabetes Risk by Physical Activity', 'Physical Activity (1: Yes, 0: No)'),
    ('Smoker', 'Diabetes Risk by Smoking Status', 'Smoking (1: Yes, 0: No)'),
    ('GenHlth', 'Diabetes Risk by General Health', 'General Health (1: Excellent, 5: Poor)'),
]
for column, title, xlabel in risk_factor_plots:
    ax = sns.barplot(data=df, x=column, y='Diabetes', errorbar=None)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Proportion with Diabetes')
    plt.show()


### 2.3. Correlation Analysis

#### 2.3.1 HeatMap

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

#### 2.3.2 Correlation of Features with Target Variable

In [None]:
# Correlation of every feature column with the target, drawn as a bar chart.
feature_target_corr = df.drop(columns='Diabetes').corrwith(df['Diabetes'])
feature_target_corr.plot(
    kind='bar',
    grid=True,
    figsize=(20, 8),
    title="Correlation with Diabetes_binary",
    color="Brown",
)



##### Highly Correlated Features

In [None]:
# Columns whose correlation with the target is at least 0.2.
high_corr = df.corr()
meets_threshold = high_corr["Diabetes"] >= 0.2
high_corr_features = high_corr.loc[meets_threshold].index
high_corr_features

##### Negatively Correlated Features (correlation < 0)

In [None]:
# Columns whose correlation with the target is below zero.
# NOTE(review): this selects by sign, not by magnitude -- a strongly negative
# correlation would be listed here too. Confirm that is the intended filter.
low_corr = df.corr()
is_negative = low_corr["Diabetes"].lt(0.0)
low_corr_features = low_corr.loc[is_negative].index
low_corr_features

## 3. Data Preprocessing

### 3.1 Feature Selection

In [9]:
# Remove the hand-picked feature columns below before modelling.
low_corr = [
    'PhysActivity',
    'Fruits',
    'Veggies',
    'HvyAlcoholConsump',
    'Education',
    'Income',
]
df = df.drop(columns=low_corr)

### 3.2 Data Splitting for Training and Testing

In [None]:
from sklearn.model_selection import train_test_split

# Features are every remaining column except the target.
X = df.drop(columns='Diabetes')
y = df['Diabetes']

# Hold out 20% of the rows for testing.
# random_state pins the shuffle so every metric below (and the grid-search
# result recorded later) can be reproduced on a re-run; 42 matches the seed
# used by the models in this notebook.
# stratify=y keeps the class proportions of the target the same in the train
# and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## 4. Machine Learning Algorithms

### 4.1 Logistic Regression

In [None]:
# Logistic regression allowed up to 500 solver iterations.
lr = LogisticRegression(max_iter=500)

# Fit on the training split.
lr.fit(X_train, y_train)

In [None]:
# Score the fitted logistic regression on the held-out test split.
lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")

# Confusion matrix first, then the per-class precision/recall/F1 report.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, lr_pred))

### 4.2 Random Forest

In [None]:
# Baseline forest with library defaults. No random_state is set, so this
# model is not reproducible between runs.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Tuned forest: these values match the best parameters recorded by the
# grid-search cell further down.
best_rf_params = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 10,
    'min_samples_leaf': 4,
    'random_state': 42,
}
rf_best = RandomForestClassifier(**best_rf_params)

rf_best.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix

# Predictions from the tuned forest on the held-out test split.
rf_best_pred = rf_best.predict(X_test)

# Headline accuracy and per-class report.
rf_best_accuracy = accuracy_score(y_test, rf_best_pred)
print(f"Improved Random Forest Accuracy: {rf_best_accuracy:.4f}")

print("\nClassification Report:\n", classification_report(y_test, rf_best_pred))

# Confusion matrix as an annotated heatmap (rows: true label, columns: predicted).
class_names = ['No Diabetes', 'Diabetes']
cm = confusion_matrix(y_test, rf_best_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter values to try; every combination is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded base estimator that the search clones for each candidate.
rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Results after 36m
# Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
# Best Accuracy: 0.8526193096561485


### 4.3 Decision Tree

In [None]:
# Decision tree with a fixed seed.
dt = DecisionTreeClassifier(random_state=42)

# Fit on the training split.
dt.fit(X_train, y_train)

In [None]:
# Score the fitted decision tree on the held-out test split.
dt_pred = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
print(f"Decision Tree Accuracy: {dt_accuracy:.2f}")

# Confusion matrix first, then the per-class precision/recall/F1 report.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, dt_pred))

### 4.4 KNeighborsClassifier Model (KNN)

In [None]:
# k-nearest-neighbours classifier with library-default settings.
knn = KNeighborsClassifier()

# Fit on the training split.
knn.fit(X_train, y_train)

In [None]:
# Predict with the fitted KNN model itself so the metrics below describe KNN
# (not the baseline random forest `rf`).
knn_pred = knn.predict(X_test)

print(f"KNN Accuracy: {accuracy_score(y_test, knn_pred):.2f}")

# Confusion matrix (rows: true label, columns: predicted label).
print(confusion_matrix(y_test, knn_pred))

# Per-class precision, recall and F1.
print(classification_report(y_test, knn_pred))

### 4.5 Support Vector Machine (SVM)

In [None]:
# Support vector classifier with a linear kernel.
# NOTE(review): scikit-learn documents SVC fit time as scaling at least
# quadratically with the number of samples; if the training split is large
# this cell may run for a very long time -- confirm it completes in practice.
svm = SVC(kernel='linear')

# Fit on the training split.
svm.fit(X_train, y_train)

In [None]:
# Predict with the fitted SVM itself so the metrics below describe the SVM
# (not the baseline random forest `rf`).
svm_pred = svm.predict(X_test)

print(f"SVM Accuracy: {accuracy_score(y_test, svm_pred):.2f}")

# Confusion matrix (rows: true label, columns: predicted label).
print(confusion_matrix(y_test, svm_pred))

# Per-class precision, recall and F1.
print(classification_report(y_test, svm_pred))

### 4.6 AdaBoost Classifier

In [None]:
# AdaBoost ensemble of 50 estimators with a fixed seed.
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)

# Fit on the training split.
adaboost.fit(X_train, y_train)

In [None]:
# Score the fitted AdaBoost model on the held-out test split.
adaboost_pred = adaboost.predict(X_test)
adaboost_accuracy = accuracy_score(y_test, adaboost_pred)
print(f"AdaBoost Accuracy: {adaboost_accuracy:.2f}")

# Confusion matrix first, then the per-class precision/recall/F1 report.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, adaboost_pred))

### 4.7 Gradient Boosting Classifier

In [None]:
# Gradient boosting classifier with a fixed seed.
gb = GradientBoostingClassifier(random_state=42)

# Fit on the training split.
gb.fit(X_train, y_train)

In [None]:
# Score the fitted gradient boosting model on the held-out test split.
gb_pred = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print(f"Gradient Boosting Accuracy: {gb_accuracy:.2f}")

# Confusion matrix first, then the per-class precision/recall/F1 report.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, gb_pred))

### 4.8 XGBoost

In [None]:
# from xgboost import XGBClassifier

In [None]:
# metrics...

### 4.9 Naive Bayes

In [None]:
# Gaussian naive Bayes with library-default settings.
nb = GaussianNB()

# Fit on the training split.
nb.fit(X_train, y_train)

In [None]:
# Score the fitted naive Bayes model on the held-out test split.
nb_pred = nb.predict(X_test)

# The label names the model being scored; these are naive Bayes predictions.
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, nb_pred):.2f}")

# Confusion matrix (rows: true label, columns: predicted label).
print(confusion_matrix(y_test, nb_pred))

# Per-class precision, recall and F1.
print(classification_report(y_test, nb_pred))


## 5. Conclusion