## Data pre-processing

- Prepare the DataFrame to be ready for model training
- Convert categorical variables represented by float to int
- Transform highly-skewed features identified in 02_eda.ipynb
- Identify and remove outliers
- Remove features with no predictive information (i.e. features with only one unique value)
- Label encoding of the target variable

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Load the interim NHANES data that this notebook pre-processes
interim_csv_path = "../data/interim/nhanes_data_interim.csv"
df = pd.read_csv(interim_csv_path)

### Convert categorical variables represented by float to int

In [3]:
# Cast both float-coded categorical columns to int in a single pass
df = df.astype({"diabetes": int, "current_smoking_freq": int})

### Transform highly-skewed features

In [4]:
# Right-skewed features that are replaced by their natural logarithm
cols_to_transform = ["hba1c_percentage", "fpg_mmol_L", "hdl_mmol_L"]

# Build every "<name>_log" column first, then append them and
# drop the untransformed originals in one chained step
log_features = {f"{name}_log": np.log(df[name]) for name in cols_to_transform}
df = df.assign(**log_features).drop(columns=cols_to_transform)

### Identify and remove outliers

In [5]:
def three_sd_range(series):
    """Return the interval spanning three standard deviations around the mean.

    Parameters
    ----------
    series : pandas.Series
        Numeric values; the centre comes from ``series.mean()`` and the
        spread from ``series.std()``.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` where the bounds are the mean minus
        and plus three standard deviations, respectively.
    """
    centre = series.mean()
    half_width = 3 * series.std()

    return (centre - half_width, centre + half_width)

In [6]:
# Screen each continuous feature in turn and drop rows lying outside
# mean +/- 3 SD. `df` is re-bound inside the loop, so the bounds for each
# column are computed from the rows that survived the previous columns.
for col_name in ["bmi", "age", "hba1c_percentage_log", "fpg_mmol_L_log", "hdl_mmol_L_log"]:
    lower_bound, upper_bound = three_sd_range(df[col_name])
    outlier_present = (df[col_name].min() < lower_bound or df[col_name].max() > upper_bound)
    print(f"{col_name} has at least one outlier: {outlier_present}")

    if outlier_present:
        # Count whole rows with len() rather than Series.count(): count()
        # skips missing values, but the comparison mask below is False for
        # NaN, so rows with a missing value in this column are dropped too
        # and must be included in the reported total.
        curr_length = len(df)
        within_range = (df[col_name] >= lower_bound) & (df[col_name] <= upper_bound)
        df = df[within_range]
        new_length = len(df)
        print(f"{curr_length - new_length} row has been removed")

    print()

bmi has at least one outlier: True
11 row has been removed

age has at least one outlier: False

hba1c_percentage_log has at least one outlier: True
31 row has been removed

fpg_mmol_L_log has at least one outlier: True
27 row has been removed

hdl_mmol_L_log has at least one outlier: True
6 row has been removed



### Removing features that provide no predictive information

In [7]:
# Collect every column holding at most one distinct value, as counted by nunique()
col_one_uniq = [name for name in df.columns if df[name].nunique() <= 1]

df = df.drop(columns=col_one_uniq)
print(f"{len(col_one_uniq)} column(s) has been removed")

1 column(s) has been removed


### Label encoding

In [8]:
# Encode target variable
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the target column, then replace the column
# with the encoded labels
label_encoder = LabelEncoder()
label_encoder.fit(df["diabetes"])
df["diabetes"] = label_encoder.transform(df["diabetes"])

In [9]:
# Show five randomly drawn rows of the DataFrame after label encoding
df.sample(n=5)

Unnamed: 0,bmi,high_blood_pressure,gender,age,diabetes,current_smoking_freq,hba1c_percentage_log,fpg_mmol_L_log,hdl_mmol_L_log
110,29.3,0,1,67,1,1,1.774952,1.809927,0.215111
308,25.0,0,1,25,1,3,1.686399,1.640937,0.086178
545,33.9,1,0,73,2,1,1.740466,1.818077,0.215111
1180,34.0,1,0,71,1,3,1.774952,1.763017,0.173953
72,37.3,1,0,75,1,3,1.757858,2.021548,0.215111


In [10]:
# Number of rows per encoded diabetes class
target_counts = df["diabetes"].value_counts()
target_counts

diabetes
1    928
0    155
2     51
Name: count, dtype: int64

In [11]:
# Write the pre-processed DataFrame to a new CSV file, leaving out the row index
processed_csv_path = "../data/processed/nhanes_data_processed_label_encoded.csv"
df.to_csv(processed_csv_path, index=False)