## Data pre-processing

In [60]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [61]:
# Load the interim NHANES dataset produced by the previous pipeline stage
interim_csv_path = "../data/interim/nhanes_data_interim.csv"
df = pd.read_csv(interim_csv_path)

### Categorical encoding

In [62]:
# Replace the numeric category codes with readable labels ahead of one-hot encoding.
# Series.map leaves any code that is missing from its mapping as NaN.
code_labels = {
    "diabetes": {1.0: "yes", 2.0: "no", 3.0: "borderline"},
    "current_smoking_freq": {1.0: "every_day", 2.0: "some_days", 3.0: "none"},
}
for column, labels in code_labels.items():
    df[column] = df[column].map(labels)

In [63]:
# One-hot encode the multiclass columns "diabetes" and "current_smoking_freq"
multiclass_cols = ["diabetes", "current_smoking_freq"]

encoder = OneHotEncoder(sparse_output=False, dtype=int)
one_hot_encoded = encoder.fit_transform(df[multiclass_cols])

# Build the encoded frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh 0..n-1 index would misalign (and introduce NaN rows)
# as soon as df no longer carries its default index.
encoded_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(multiclass_cols),
    index=df.index,
)

# Append the indicator columns, then drop the original multiclass columns
df = pd.concat([df, encoded_df], axis=1).drop(columns=multiclass_cols)

In [64]:
# Preview the first rows of the DataFrame to confirm that the one-hot indicator
# columns were appended and the original multiclass columns were dropped
df.head()

Unnamed: 0,bmi,high_blood_pressure,gender,age,hba1c_percentage,fpg_mmol_L,hdl_mmol_L,lifetime_100_cigs_smoked,diabetes_borderline,diabetes_no,diabetes_yes,current_smoking_freq_every_day,current_smoking_freq_none,current_smoking_freq_some_days
0,27.0,1,1,43,5.6,6.27,1.16,1,0,1,0,0,1,0
1,33.5,1,1,66,5.6,5.5,1.55,1,0,1,0,0,1,0
2,30.2,0,1,34,5.1,5.55,1.19,1,0,1,0,0,1,0
3,27.3,0,0,56,5.0,5.77,1.76,1,0,1,0,0,1,0
4,21.4,1,1,80,4.6,4.94,2.12,1,0,1,0,0,1,0


### Identify outliers

In [65]:
def three_sd_range(series):
    """Return the interval spanning three standard deviations around the mean.

    Parameters
    ----------
    series : pandas.Series
        Numeric values; only its ``mean()`` and ``std()`` methods are used.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` where the bounds are
        ``mean - 3 * sd`` and ``mean + 3 * sd`` respectively.
    """
    centre = series.mean()
    half_width = 3 * series.std()
    return (centre - half_width, centre + half_width)

In [66]:
# Trim rows lying outside mean +/- 3 SD, one continuous column at a time.
# Columns are processed sequentially, so each column's bounds are computed on
# the rows that survived the filters applied for the columns before it.
# NOTE: df is reassigned here, so re-running this cell recomputes the bounds on
# the already-trimmed data and may remove further rows.
for col_name in ["bmi", "age", "hba1c_percentage", "fpg_mmol_L", "hdl_mmol_L"]:
    lower_bound, upper_bound = three_sd_range(df[col_name])
    outlier_present = (df[col_name].min() < lower_bound or df[col_name].max() > upper_bound)
    print(f"{col_name} has at least one outlier: {outlier_present}")

    if outlier_present:
        # Count whole rows rather than non-null values: a missing value fails
        # the range test and is dropped as well, so it has to be part of the
        # reported total.
        curr_length = len(df)
        # between() is inclusive on both ends, i.e. lower <= value <= upper
        df = df[df[col_name].between(lower_bound, upper_bound)]
        new_length = len(df)
        print(f"{curr_length - new_length} row has been removed")

    print()

bmi has at least one outlier: True
11 row has been removed

age has at least one outlier: False

hba1c_percentage has at least one outlier: True
32 row has been removed

fpg_mmol_L has at least one outlier: True
30 row has been removed

hdl_mmol_L has at least one outlier: True
16 row has been removed



In [67]:
# Summarise the remaining rows, column dtypes and non-null counts after outlier removal
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1120 entries, 0 to 1208
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   bmi                             1120 non-null   float64
 1   high_blood_pressure             1120 non-null   int64  
 2   gender                          1120 non-null   int64  
 3   age                             1120 non-null   int64  
 4   hba1c_percentage                1120 non-null   float64
 5   fpg_mmol_L                      1120 non-null   float64
 6   hdl_mmol_L                      1120 non-null   float64
 7   lifetime_100_cigs_smoked        1120 non-null   int64  
 8   diabetes_borderline             1120 non-null   int32  
 9   diabetes_no                     1120 non-null   int32  
 10  diabetes_yes                    1120 non-null   int32  
 11  current_smoking_freq_every_day  1120 non-null   int32  
 12  current_smoking_freq_none       1120 no

In [68]:
# Write the pre-processed DataFrame to a new CSV file, leaving out the index column
processed_csv_path = "../data/processed/nhanes_data_processed.csv"
df.to_csv(processed_csv_path, index=False)