## Data Pre-processing

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the interim NHANES CSV; the path is relative to the notebook's working directory.
# NOTE(review): the head() preview further down lists an "Unnamed: 0" column, which
# looks like a row index that was written into the CSV — confirm whether it should be
# dropped or consumed via index_col before relying on it.
df = pd.read_csv(filepath_or_buffer="../data/interim/nhanes_data_interim.csv")

In [None]:
# Replace the numeric category codes with readable labels so that the one-hot
# encoded column names produced later are self-describing.
# Series.map yields NaN for any value that is not a key of the mapping.
code_labels = {
    "diabetes": {1.0: "yes", 2.0: "no", 3.0: "borderline"},
    "current_smoking_frequency": {1.0: "every_day", 2.0: "some_days", 3.0: "none"},
}
for column, labels in code_labels.items():
    df[column] = df[column].map(labels)

In [None]:
# One-hot encode the multiclass columns "diabetes" and "current_smoking_frequency".
# OneHotEncoder is imported in the notebook's import cell.
multiclass_columns = ["diabetes", "current_smoking_frequency"]

# sparse_output=False makes the encoder return a dense array; dtype=int yields
# integer 0/1 indicators.
encoder = OneHotEncoder(sparse_output=False, dtype=int)
one_hot_encoded = encoder.fit_transform(df[multiclass_columns])

# Give the indicator frame df's own index. pd.concat(axis=1) aligns rows on index
# labels, so a default RangeIndex here would pair rows with the wrong indicators
# (and introduce NaN rows) whenever df's index is not exactly 0..n-1.
encoded_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(multiclass_columns),
    index=df.index,
)

# Replace the original multiclass columns with their one-hot encoded counterparts;
# the indicator columns are appended after the remaining original columns.
df = pd.concat([df.drop(columns=multiclass_columns), encoded_df], axis=1)

In [None]:
# Preview the first five rows of the DataFrame after one-hot encoding
df.head(n=5)

Unnamed: 0,bmi,high_blood_pressure,gender,age,glycohemoglobin_percentage,plasma_fasting_glucose_mmol_L,high_density_lipoprotein_mmol_L,smoked_at_least_100_cigs_in_lifetime,diabetes_borderline,diabetes_no,diabetes_yes,current_smoking_frequency_every_day,current_smoking_frequency_none,current_smoking_frequency_some_days
0,27.0,1.0,1.0,43.0,5.6,6.27,1.16,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,33.5,1.0,1.0,66.0,5.6,5.5,1.55,1.0,0.0,1.0,0.0,0.0,1.0,0.0
8,30.2,0.0,1.0,34.0,5.1,5.55,1.19,1.0,0.0,0.0,1.0,0.0,1.0,0.0
18,27.3,0.0,0.0,56.0,5.0,5.77,1.76,1.0,1.0,0.0,0.0,0.0,1.0,0.0
30,21.4,1.0,1.0,80.0,4.6,4.94,2.12,1.0,0.0,1.0,0.0,0.0,1.0,0.0
