## Data pre-processing

In [206]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [207]:
df = pd.read_csv("../data/interim/nhanes_data_interim.csv")

### Convert categorical variables represented by float to int

In [208]:
df["diabetes"] = df["diabetes"].astype(int)
df["current_smoking_freq"] = df["current_smoking_freq"].astype(int)

### Transform highly-skewed features

In [209]:
# Log transformation on right-skewed features
cols_to_transform = ["hba1c_percentage", "fpg_mmol_L", "hdl_mmol_L"]

for col in cols_to_transform:
    df[col + "_log"] = np.log(df[col])

# Drop original columns
df = df.drop(cols_to_transform, axis=1)

### Identify outliers

In [210]:
def three_sd_range(series):
    mean = series.mean()
    sd = series.std()
    lower_bound = mean - 3 * sd
    upper_bound = mean + 3 * sd
    
    return (lower_bound, upper_bound)

In [211]:
for col_name in ["bmi", "age", "hba1c_percentage_log", "fpg_mmol_L_log", "hdl_mmol_L_log"]:
    lower_bound, upper_bound = three_sd_range(df[col_name])
    outlier_present = (df[col_name].min() < lower_bound or df[col_name].max() > upper_bound)
    print(f"{col_name} has at least one outlier: {outlier_present}")
    
    if outlier_present:
        curr_length = df[col_name].count()
        df = df[(df[col_name] >= lower_bound) & (df[col_name] <= upper_bound)]
        new_length = df[col_name].count()
        print(f"{curr_length - new_length} row has been removed")
    
    print()

bmi has at least one outlier: True
11 row has been removed

age has at least one outlier: False

hba1c_percentage_log has at least one outlier: True
31 row has been removed

fpg_mmol_L_log has at least one outlier: True
27 row has been removed

hdl_mmol_L_log has at least one outlier: True
6 row has been removed



### Removing features that provide no predictive information

In [212]:
col_one_uniq = []
for col in df.columns:
    if df[col].nunique() <= 1:
        col_one_uniq.append(col)
        
df = df.drop(col_one_uniq, axis=1)
print(f"{len(col_one_uniq)} column(s) has been removed")

1 column(s) has been removed


## Feature Engineering

In [213]:
df["bmi_age_interaction"] = df["bmi"] * df["age"] / 100
df["glucose_hba1c_interaction"] = df["fpg_mmol_L_log"] * df["hba1c_percentage_log"]
df["bp_age_interaction"] = df["high_blood_pressure"] * df["age"]

In [214]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1134 entries, 0 to 1208
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   bmi                        1134 non-null   float64
 1   high_blood_pressure        1134 non-null   int64  
 2   gender                     1134 non-null   int64  
 3   age                        1134 non-null   int64  
 4   diabetes                   1134 non-null   int32  
 5   current_smoking_freq       1134 non-null   int32  
 6   hba1c_percentage_log       1134 non-null   float64
 7   fpg_mmol_L_log             1134 non-null   float64
 8   hdl_mmol_L_log             1134 non-null   float64
 9   bmi_age_interaction        1134 non-null   float64
 10  glucose_hba1c_interaction  1134 non-null   float64
 11  bp_age_interaction         1134 non-null   int64  
dtypes: float64(6), int32(2), int64(4)
memory usage: 106.3 KB


## Label encoding

In [215]:
# Encode target variable
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["diabetes"] = label_encoder.fit_transform(df["diabetes"])

In [216]:
# Resultant DataFrame after label encoding
df.sample(5)

Unnamed: 0,bmi,high_blood_pressure,gender,age,diabetes,current_smoking_freq,hba1c_percentage_log,fpg_mmol_L_log,hdl_mmol_L_log,bmi_age_interaction,glucose_hba1c_interaction,bp_age_interaction
99,32.0,1,1,77,0,3,2.066863,2.125848,0.131028,24.64,4.393836,77
944,28.5,0,0,43,1,3,1.648659,1.619388,0.086178,12.255,2.669818,0
561,27.5,1,1,63,1,2,1.667707,1.82777,0.548121,17.325,3.048184,63
858,36.1,0,0,42,1,3,1.757858,1.763017,0.239017,15.162,3.099133,0
653,34.0,1,0,60,1,1,1.791759,1.693779,0.438255,20.4,3.034845,60


In [217]:
df["diabetes"].value_counts()

diabetes
1    928
0    155
2     51
Name: count, dtype: int64

In [218]:
# Save the pre-processed DataFrame to a new CSV file
df.to_csv("../data/processed/nhanes_data_processed_label_encoded.csv", index=False)

## Categorical encoding

In [219]:
# Map multiclass values to more readable strings to prepare for one-hot encoding
df["diabetes"] = df["diabetes"].map({0: "yes", 1: "no", 2: "borderline"})
df["current_smoking_freq"] = df["current_smoking_freq"].map({1: "every_day", 2: "some_days", 3: "none"})

In [220]:
# One-hot encode the multiclass columns "diabetes" and "current_smoking_frequency"
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, dtype=int)
one_hot_encoded = encoder.fit_transform(df[["diabetes", "current_smoking_freq"]])

encoded_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(["diabetes", "current_smoking_freq"]))

# Concatenate the original DataFrame with the one-hot encoded DataFrame
df = pd.concat([df, encoded_df], axis=1)

# Drop the original multiclass columns after encoding
df = df.drop(columns=["diabetes", "current_smoking_freq"], axis=1)

In [221]:
# Resultant DataFrame after one-hot encoding
df.sample(5)

Unnamed: 0,bmi,high_blood_pressure,gender,age,hba1c_percentage_log,fpg_mmol_L_log,hdl_mmol_L_log,bmi_age_interaction,glucose_hba1c_interaction,bp_age_interaction,diabetes_borderline,diabetes_no,diabetes_yes,current_smoking_freq_every_day,current_smoking_freq_none,current_smoking_freq_some_days
892,31.9,1.0,1.0,65.0,1.791759,1.870263,0.058269,20.735,3.351061,65.0,0.0,0.0,1.0,0.0,1.0,0.0
867,40.3,1.0,0.0,62.0,1.916923,1.809927,0.48858,24.986,3.46949,62.0,0.0,1.0,0.0,1.0,0.0,0.0
235,20.4,0.0,0.0,64.0,1.704748,1.809927,0.688135,13.056,3.085469,0.0,0.0,1.0,0.0,0.0,1.0,0.0
955,23.6,1.0,1.0,67.0,1.960095,1.976855,-0.127833,15.812,3.874823,67.0,0.0,1.0,0.0,0.0,0.0,1.0
886,27.6,1.0,1.0,65.0,1.757858,1.682688,1.007958,17.94,2.957927,65.0,0.0,0.0,1.0,0.0,1.0,0.0


In [222]:
# Save the one-hot encoded pre-processed DataFrame to a new CSV file
df.to_csv("../data/processed/nhanes_data_processed_one_hot_encoded.csv", index=False)