## Model Training

In [3]:
import pandas as pd

In [4]:
# Load the processed NHANES dataset that the rest of the notebook works from.
df = pd.read_csv(filepath_or_buffer="../data/processed/nhanes_data_processed.csv")

### Split the dataset into training, cross validation and test sets

The dataset will be split into three parts:
- 60% training set - to train the model
- 20% cross validation set - to evaluate the different model configurations
- 20% test set - to give a fair estimate of the chosen model's performance against new examples

In [5]:
# Input features fed to the model.
# NOTE(review): the describe() summary further down reports std 0.0 and
# min == max == 1.0 for "lifetime_100_cigs_smoked", i.e. it looks constant in
# this dataset — confirm whether it should stay in the feature set.
feature_columns = [
    "bmi",
    "high_blood_pressure",
    "age",
    "hba1c_percentage",
    "fpg_mmol_L",
    "hdl_mmol_L",
    "lifetime_100_cigs_smoked",
    "current_smoking_freq_every_day",
    "current_smoking_freq_none",
    "current_smoking_freq_some_days",
]

# One indicator column per diabetes status; together they form the target.
target_columns = ["diabetes_no", "diabetes_borderline", "diabetes_yes"]

x = df[feature_columns]
y = df[target_columns]

In [6]:
from sklearn.model_selection import train_test_split

# 60/20/20 split in two steps: hold out 40% of the rows first, then divide that
# hold-out equally into the cross validation and test sets. A fixed
# random_state keeps both splits reproducible across runs.
# NOTE(review): the describe() summary shows the target indicators are
# imbalanced (diabetes_borderline mean ~0.045); consider passing `stratify=`
# so every split keeps the same class proportions — confirm before changing,
# as it alters which rows land in each set.
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.4, random_state=42)

x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.5, random_state=42)

# The intermediate hold-out has been fully divided and is no longer needed.
del x_, y_

# Sanity check: the three sets together account for every row exactly once.
assert len(x_train) + len(x_cv) + len(x_test) == len(x)

print(f"Shape of training set input: {x_train.shape}")
print(f"Shape of training set target: {y_train.shape}\n")
print(f"Shape of cross validation set input: {x_cv.shape}")
print(f"Shape of cross validation set target: {y_cv.shape}\n")
print(f"Shape of test set input: {x_test.shape}")
print(f"Shape of test set target: {y_test.shape}")


Shape of training set input: (672, 10)
Shape of training set target: (672, 3)

Shape of cross validation set input: (224, 10)
Shape of cross validation set target: (224, 3)

Shape of test set input: (224, 10)
Shapre of test set target: (224, 3)


### Feature scaling using Z-score normalisation

In [7]:
# Summary statistics for every numeric column of the full dataset, shown so the
# differing value ranges of the features can be compared before scaling.
df.describe()

Unnamed: 0,bmi,high_blood_pressure,gender,age,hba1c_percentage,fpg_mmol_L,hdl_mmol_L,lifetime_100_cigs_smoked,diabetes_borderline,diabetes_no,diabetes_yes,current_smoking_freq_every_day,current_smoking_freq_none,current_smoking_freq_some_days
count,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0,1120.0
mean,29.489464,0.433929,0.533036,57.15,5.687054,5.877232,1.382071,1.0,0.044643,0.820536,0.134821,0.24375,0.6875,0.06875
std,6.357229,0.495837,0.49913,15.654297,0.6853,0.982701,0.359415,0.0,0.206611,0.383912,0.341685,0.429535,0.463719,0.253142
min,16.4,0.0,0.0,18.0,3.2,3.55,0.59,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24.9,0.0,0.0,44.0,5.3,5.27,1.11,1.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,28.6,0.0,1.0,61.0,5.6,5.66,1.32,1.0,0.0,1.0,0.0,0.0,1.0,0.0
75%,33.1,1.0,1.0,69.0,5.9,6.16,1.58,1.0,0.0,1.0,0.0,0.0,1.0,0.0
max,50.4,1.0,1.0,80.0,9.3,9.94,2.59,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
from sklearn.preprocessing import StandardScaler

# Columns to standardise; the other columns of x are not part of this list.
vars_to_scale = ["bmi", "age", "hba1c_percentage", "fpg_mmol_L", "hdl_mmol_L"]

scaler = StandardScaler()

# Fit the scaler on the training split only, then apply the same fitted
# statistics to the test split so held-out data does not influence the scaling.
x_train_scaled = scaler.fit_transform(x_train[vars_to_scale])
x_test_scaled = scaler.transform(x_test[vars_to_scale])
# NOTE(review): in the lines visible here x_cv is never transformed, and the
# scaled results contain only the vars_to_scale columns (the remaining feature
# columns of x are not carried over) — confirm both are handled further down.