In [1]:
import pandas as pd
import warnings
# NOTE(review): with no category/module argument this silences every warning
# for the rest of the session, including pandas deprecation warnings raised by
# later cells -- consider narrowing the filter to the specific warning intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="dataset_EBA.csv")

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes,clinical_notes
0,0,2020.0,Female,32.0,Alabama,False,False,False,False,True,False,False,never,27.32,5.0,100.0,0.0,"Overweight, advised dietary and exercise modif..."
1,1,2015.0,Female,29.0,Alabama,False,True,False,False,False,False,False,never,19.95,5.0,90.0,1.0,Healthy BMI range.
2,2,2015.0,Male,18.0,Alabama,False,False,False,False,True,False,False,never,23.76,4.8,160.0,1.0,"Young patient, generally lower risk but needs ..."
3,3,2015.0,Male,41.0,Alabama,False,False,True,False,False,False,False,never,27.32,4.0,159.0,1.0,"Overweight, advised dietary and exercise modif..."
4,4,2016.0,Female,52.0,Alabama,True,False,False,False,False,False,False,never,23.75,6.5,90.0,0.0,"Healthy BMI range. High HbA1c level, indicativ..."


In [4]:
# Show the dtype pandas inferred for each column.
df.dtypes

Unnamed: 0                int64
year                    float64
gender                   object
age                     float64
location                 object
race:AfricanAmerican       bool
race:Asian                 bool
race:Caucasian             bool
race:Hispanic              bool
race:Other                 bool
hypertension               bool
heart_disease              bool
smoking_history          object
bmi                     float64
hbA1c_level             float64
blood_glucose_level     float64
diabetes                float64
clinical_notes           object
dtype: object

---

### **Drop Unnecessary Columns**

In [5]:
# Drop the 'Unnamed: 0' column (it appears to be a row index saved into the CSV
# and carries no information).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

---

### **Handle Missing Values**

In [7]:
# Count missing values per column before any imputation.
df.isna().sum()

year                     20
gender                   39
age                     294
location                360
race:AfricanAmerican      0
race:Asian                0
race:Caucasian            0
race:Hispanic             0
race:Other                0
hypertension              0
heart_disease             0
smoking_history         418
bmi                     418
hbA1c_level             419
blood_glucose_level     418
diabetes                418
clinical_notes          418
dtype: int64

In [8]:
# Numeric columns: fill missing values with the column median.
# The result is assigned back to df[col] rather than calling
# fillna(..., inplace=True) on df[col]: that form is chained assignment on a
# temporary Series, which pandas warns about and which no longer updates df
# once Copy-on-Write is enabled.
num_cols = ['bmi', 'hbA1c_level', 'blood_glucose_level']
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

In [9]:
# Categorical columns: fill missing values with the most frequent value
# (mode()[0] is the first mode when several values tie).
# The result is assigned back to df[col] rather than calling
# fillna(..., inplace=True) on df[col]: that form is chained assignment on a
# temporary Series, which pandas warns about and which no longer updates df
# once Copy-on-Write is enabled.
cat_cols = ['gender', 'location', 'smoking_history']
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [10]:
# Fill missing 'year' with its most frequent value and missing 'age' with its
# median. Results are assigned back to the column instead of using
# fillna(..., inplace=True) on the selected Series, which is chained
# assignment and stops modifying df once pandas Copy-on-Write is enabled.
df['year'] = df['year'].fillna(df['year'].mode()[0])
df['age'] = df['age'].fillna(df['age'].median())

In [11]:
# Discard every row that lacks a 'diabetes' value or a 'clinical_notes' entry.
df = df.dropna(subset=['diabetes', 'clinical_notes'])

In [12]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

year                    0
gender                  0
age                     0
location                0
race:AfricanAmerican    0
race:Asian              0
race:Caucasian          0
race:Hispanic           0
race:Other              0
hypertension            0
heart_disease           0
smoking_history         0
bmi                     0
hbA1c_level             0
blood_glucose_level     0
diabetes                0
clinical_notes          0
dtype: int64

---

### **Convert Boolean Columns to Integer**

In [None]:
# Collect the one-hot race indicator columns (named 'race:<group>') and cast
# them from bool to int in a single astype call.
race_cols = [name for name in df.columns if name.startswith('race:')]
df = df.astype(dict.fromkeys(race_cols, int))

In [14]:
# Cast the two boolean condition flags to int (False -> 0, True -> 1).
bool_cols = ['heart_disease', 'hypertension']
df[bool_cols] = df[bool_cols].astype(int)

In [15]:
# Cast the float-typed 'diabetes' column to int.
df = df.astype({'diabetes': int})

In [16]:
# Display the frame after the integer conversions (pandas truncates the repr).
# NOTE(review): the saved output of this cell shows title-cased
# smoking_history / clinical_notes values ("Never", "Healthy Bmi Range."),
# whereas the first preview shows them lowercase and no cell in view performs
# that transformation; execution counts also skip In [6] and In [13].
# Presumably a since-deleted cell did this -- confirm the notebook reproduces
# under Restart Kernel -> Run All.
df

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes,clinical_notes
0,2020.0,Female,32.0,Alabama,0,0,0,0,1,0,0,Never,27.32,5.0,100.0,0,"Overweight, Advised Dietary And Exercise Modif..."
1,2015.0,Female,29.0,Alabama,0,1,0,0,0,0,0,Never,19.95,5.0,90.0,1,Healthy Bmi Range.
2,2015.0,Male,18.0,Alabama,0,0,0,0,1,0,0,Never,23.76,4.8,160.0,1,"Young Patient, Generally Lower Risk But Needs ..."
3,2015.0,Male,41.0,Alabama,0,0,1,0,0,0,0,Never,27.32,4.0,159.0,1,"Overweight, Advised Dietary And Exercise Modif..."
4,2016.0,Female,52.0,Alabama,1,0,0,0,0,0,0,Never,23.75,6.5,90.0,0,"Healthy Bmi Range. High Hba1C Level, Indicativ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2018.0,Female,33.0,Wyoming,0,0,0,0,1,0,0,Never,21.21,6.5,90.0,0,"Healthy Bmi Range. High Hba1C Level, Indicativ..."
99996,2016.0,Female,80.0,Wyoming,0,1,0,0,0,0,0,No Info,36.66,5.7,100.0,0,Elderly Patient With Increased Risk Of Chronic...
99997,2018.0,Male,46.0,Wyoming,0,1,0,0,0,0,0,Ever,36.12,6.2,158.0,0,"Obese Category, Increased Risk For Diabetes An..."
99998,2018.0,Female,51.0,Wyoming,1,0,0,0,0,0,0,Not Current,29.29,6.0,155.0,0,"Overweight, Advised Dietary And Exercise Modif..."


---

### **Encode Categorical Variables**

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
# Integer-encode 'gender'.
# The fitted encoder is kept under its own name because the later encoding
# cells rebind `label`; without a dedicated reference the gender mapping
# (gender_encoder.classes_ / inverse_transform) could not be recovered.
gender_encoder = LabelEncoder()
df['gender'] = gender_encoder.fit_transform(df['gender'])
label = gender_encoder  # keep the original name bound for existing references

In [19]:
# Integer-encode 'smoking_history'.
# The fitted encoder is kept under its own name because the next encoding cell
# rebinds `label`; without a dedicated reference the smoking_history mapping
# (smoking_history_encoder.classes_ / inverse_transform) could not be recovered.
smoking_history_encoder = LabelEncoder()
df['smoking_history'] = smoking_history_encoder.fit_transform(df['smoking_history'])
label = smoking_history_encoder  # keep the original name bound for existing references

In [20]:
# Integer-encode 'location'.
# The fitted encoder is also stored under a descriptive name so the location
# mapping (location_encoder.classes_ / inverse_transform) stays available even
# if `label` is rebound later.
location_encoder = LabelEncoder()
df['location'] = location_encoder.fit_transform(df['location'])
label = location_encoder  # keep the original name bound for existing references

In [22]:
# Preview the first five rows after the categorical columns were encoded.
df.head(n=5)

Unnamed: 0,year,gender,age,location,race:AfricanAmerican,race:Asian,race:Caucasian,race:Hispanic,race:Other,hypertension,heart_disease,smoking_history,bmi,hbA1c_level,blood_glucose_level,diabetes,clinical_notes
0,2020.0,0,32.0,0,0,0,0,0,1,0,0,3,27.32,5.0,100.0,0,"Overweight, Advised Dietary And Exercise Modif..."
1,2015.0,0,29.0,0,0,1,0,0,0,0,0,3,19.95,5.0,90.0,1,Healthy Bmi Range.
2,2015.0,1,18.0,0,0,0,0,0,1,0,0,3,23.76,4.8,160.0,1,"Young Patient, Generally Lower Risk But Needs ..."
3,2015.0,1,41.0,0,0,0,1,0,0,0,0,3,27.32,4.0,159.0,1,"Overweight, Advised Dietary And Exercise Modif..."
4,2016.0,0,52.0,0,1,0,0,0,0,0,0,3,23.75,6.5,90.0,0,"Healthy Bmi Range. High Hba1C Level, Indicativ..."


---