In [68]:
# import libraries
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [69]:
# Load the diabetes health-indicators CSV into a DataFrame and preview it
csv_path = 'diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [70]:
# Summarize the frame: index range, column names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [71]:
# Count how many rows fall into each class of the Diabetes indicator column
df['Diabetes_012'].value_counts()

Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64

Since education level is a categorical attribute encoded as integers, we will dummify (one-hot encode) it into separate indicator columns.

In [72]:
# Human-readable label for each integer Education code
education_map = {
    1: "Never attended school or only kindergarten",
    2: "Grades 1 through 8 (Elementary)",
    3: "Grades 9 through 11 (Some high school)",
    4: "Grade 12 or GED (High school graduate)",
    5: "College 1 year to 3 years (Some college or technical school)",
    6: "College 4 years or more (College graduate)"
}

# Guard on the column's presence so that re-running this cell after
# 'Education' has already been replaced by its dummies is a no-op rather
# than a KeyError.
if 'Education' in df.columns:
    # Translate codes to labels in a separate Series instead of overwriting
    # df['Education'], so the source column stays intact until it is dropped.
    education_labels = df['Education'].map(education_map)

    # Series.map yields NaN for any value missing from the dict, and
    # get_dummies would then encode that row as all zeros. Fail loudly on
    # codes that are present in the data but absent from education_map.
    unmapped_codes = df.loc[
        education_labels.isna() & df['Education'].notna(), 'Education'
    ].unique()
    if len(unmapped_codes) > 0:
        raise ValueError(
            "Education codes missing from education_map: "
            f"{sorted(unmapped_codes.tolist())}"
        )

    # One 0/1 indicator column per education label
    df_dummies = pd.get_dummies(education_labels, prefix='Education', dtype=int)

    # Swap the original 'Education' column for its indicator columns
    df = pd.concat([df.drop(columns='Education'), df_dummies], axis=1)

df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,DiffWalk,Sex,Age,Income,Education_College 1 year to 3 years (Some college or technical school),Education_College 4 years or more (College graduate),Education_Grade 12 or GED (High school graduate),Education_Grades 1 through 8 (Elementary),Education_Grades 9 through 11 (Some high school),Education_Never attended school or only kindergarten
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,9.0,3.0,0,0,1,0,0,0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,7.0,1.0,0,1,0,0,0,0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,9.0,8.0,0,0,1,0,0,0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,11.0,6.0,0,0,0,0,1,0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,11.0,4.0,1,0,0,0,0,0


There appears to be a significant imbalance in the data, with the number of non-diabetic patients far exceeding that of pre-diabetic and diabetic patients. To address this, I will combine the pre-diabetic and diabetic categories into a single indicator labeled "1", while non-diabetic patients will be designated as "0".

In [73]:
# Recode class 2 as class 1 in the Diabetes indicator column
df['Diabetes_012'] = df['Diabetes_012'].replace({2: 1})

In [74]:
# Re-check the class counts of the Diabetes indicator column after recoding 2 as 1
df['Diabetes_012'].value_counts()

Diabetes_012
0.0    213703
1.0     39977
Name: count, dtype: int64

The dataset is still heavily skewed toward non-diabetic patients. As a result, I will use SMOTE to balance the data.

In [75]:
# Split the frame into predictor columns (X) and the Diabetes indicator target (y)
X = df.drop(columns='Diabetes_012')
y = df.loc[:, 'Diabetes_012']

# Oversample with SMOTE; the fixed random_state keeps the result reproducible.
# NOTE(review): resampling is applied to the full dataset here. If the balanced
# data is later split into train/test sets, synthetic rows may be derived from
# test-side samples — confirm this is intended.
# NOTE(review): SMOTE interpolates between samples, so 0/1 indicator features
# may come back with non-binary values — verify the resampled columns
# (imblearn's SMOTENC is the variant meant for categorical features).
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Reassemble the resampled features and target side by side in one DataFrame
balanced_features = pd.DataFrame(X_balanced, columns=X.columns)
balanced_target = pd.Series(y_balanced, name='Diabetes_012')
df_balanced = pd.concat([balanced_features, balanced_target], axis=1)

In [76]:
# Class counts after resampling, to check that the two classes are now equal in size
df_balanced['Diabetes_012'].value_counts()

Diabetes_012
0.0    213703
1.0    213703
Name: count, dtype: int64

In [77]:
# Rename the target column from Diabetes_012 to Diabetes_01, then write the
# balanced DataFrame to CSV without the index column.
target_rename = {'Diabetes_012': 'Diabetes_01'}
df_balanced = df_balanced.rename(columns=target_rename)
df_balanced.to_csv('diabetes_balanced.csv', index=False)