In [1]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from joblib import dump

#### Escalar solo el dataset de entrenamiento

In [2]:
# Load the training split; the scaler further down is fitted on this data only.
df_data = pd.read_csv(filepath_or_buffer="process_dataset/train_data.csv")
df_data

Unnamed: 0,Age,Cholesterol,Heart Rate,Diabetes,Family History,Obesity,Alcohol Consumption,Exercise Hours Per Week,Previous Heart Problems,Medication Use,...,Sex_Male,Diet_Average,Diet_Healthy,Diet_Unhealthy,Continent_Africa,Continent_Asia,Continent_Australia,Continent_Europe,Continent_North America,Continent_South America
0,76,206,70,1,0,1,1,18.940748,0,0,...,1,0,0,1,0,0,0,1,0,0
1,38,293,108,0,0,0,0,3.880782,0,0,...,1,0,1,0,0,1,0,0,0,0
2,66,268,95,1,1,1,0,8.314110,0,0,...,1,1,0,0,0,1,0,0,0,0
3,68,303,74,1,0,1,0,9.856034,1,1,...,1,0,1,0,0,1,0,0,0,0
4,27,298,103,1,1,1,0,18.057032,1,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515,70,231,103,0,0,0,1,13.579554,0,1,...,1,0,1,0,0,0,0,1,0,0
4516,72,209,54,0,0,0,1,16.616617,1,1,...,1,0,1,0,0,0,0,0,0,1
4517,41,228,43,0,0,1,1,15.463423,0,1,...,0,0,0,1,0,0,0,0,1,0
4518,72,372,52,0,1,0,1,7.600971,0,1,...,0,0,0,1,0,0,0,1,0,0


#### Separar datos no binarios para escalar
Escalar datos binarios puede alterar su significado y estos ya se encuentran en la escala adecuada (0 y 1)

In [3]:
# A column counts as binary when it holds exactly two distinct values.
distinct_counts = df_data.nunique()
binary_cols = df_data.columns[distinct_counts == 2]
# Index.difference returns the remaining labels sorted, so the non-binary
# columns end up in alphabetical order rather than in their original order.
non_binary_cols = df_data.columns.difference(binary_cols)

# Binary flags are left untouched; only the non-binary part is scaled later.
df_binary = df_data.loc[:, binary_cols]
df_non_binary = df_data.loc[:, non_binary_cols]

#### Escalar los datos con Robust Scaler
Se usa Robust Scaler porque los datos tienen aproximadamente un 10% de outliers y es un escalador poco sensible a valores atípicos

In [4]:
def save_scaler(df_data, scaler_instance):
    """Fit a scaler on a DataFrame and return the scaled frame plus the fitted scaler.

    Despite its name, this function writes nothing to disk: it only fits the
    scaler and applies it. Persisting the scaler is left to the caller.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Columns to scale. The frame itself is not modified.
    scaler_instance : object
        Scaler exposing ``fit(X)`` and ``transform(X)``. It is fitted in place.

    Returns
    -------
    tuple
        ``(df_scaled, scaler_instance)`` where ``df_scaled`` is a new DataFrame
        with the same columns and the same row index as ``df_data``, and
        ``scaler_instance`` is the same object that was passed in, now fitted.
    """
    raw_values = df_data.values
    scaler_instance.fit(X=raw_values)
    data_scaled = scaler_instance.transform(X=raw_values)
    # Carry over the caller's row index. Without it the result would get a
    # fresh 0..n-1 index, and a later label-aligned pd.concat(axis=1) with the
    # unscaled columns would misalign rows whenever df_data is not 0..n-1.
    df_scaled = pd.DataFrame(
        data=data_scaled,
        columns=df_data.columns,
        index=df_data.index,
    )

    return df_scaled, scaler_instance

In [5]:
# Fit a RobustScaler on the non-binary columns only and keep both the scaled
# frame and the fitted scaler.
robust_scaler_instance = RobustScaler()
df_scaled, scaler_instance = save_scaler(
    df_data=df_non_binary,
    scaler_instance=robust_scaler_instance,
)

In [6]:
# Preview the scaled non-binary features (the bare expression renders as the cell output).
df_scaled

Unnamed: 0,Age,BMI,Cholesterol,Diastolic,Exercise Hours Per Week,Heart Rate,Income,Physical Activity Days Per Week,Sedentary Hours Per Day,Sleep Hours Per Day,Stress Level,Systolic,Triglycerides
0,0.594595,0.255213,-0.373429,-0.64,0.873707,-0.162162,0.780848,-0.6,0.810042,-0.75,-0.8,-0.680851,0.048843
1,-0.432432,0.567512,0.251346,0.36,-0.625266,0.864865,0.801658,0.4,0.475063,0.75,0.8,0.893617,-0.221080
2,0.324324,-0.460556,0.071813,-0.76,-0.184001,0.513514,-0.317955,-0.4,-0.690448,0.00,-0.6,0.808511,-0.884319
3,0.378378,-0.638960,0.323160,-0.24,-0.030527,-0.054054,-0.915911,-0.6,0.192026,0.25,-0.4,0.829787,-0.971722
4,-0.729730,0.885066,0.287253,0.84,0.785748,0.729730,-0.109108,0.2,0.237010,0.75,0.0,-0.680851,-0.835476
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515,0.432432,0.055210,-0.193896,-0.36,0.340088,0.729730,-0.460744,0.8,-0.246366,-0.25,0.0,0.489362,-0.002571
4516,0.486486,-0.636570,-0.351885,-0.84,0.642378,-0.594595,-0.700813,0.2,0.641098,-0.75,-0.4,-0.617021,-0.658098
4517,-0.351351,-0.408619,-0.215440,-0.56,0.527597,-0.891892,0.096399,0.0,-0.351906,0.75,0.0,-0.425532,-0.616967
4518,0.486486,0.451433,0.818671,-0.16,-0.254982,-0.648649,-0.253826,0.8,0.026498,0.00,0.4,0.234043,-0.264781


#### Concatenar datos escalados y no escalados

In [7]:
# Put the untouched binary columns and the scaled columns side by side.
# Column-wise concat aligns rows on the index labels of both frames.
df_concat = pd.concat(objs=[df_binary, df_scaled], axis="columns")

In [8]:
# Preview the recombined frame: binary columns first, then the scaled columns.
df_concat

Unnamed: 0,Diabetes,Family History,Obesity,Alcohol Consumption,Previous Heart Problems,Medication Use,Sex_Female,Sex_Male,Diet_Average,Diet_Healthy,...,Diastolic,Exercise Hours Per Week,Heart Rate,Income,Physical Activity Days Per Week,Sedentary Hours Per Day,Sleep Hours Per Day,Stress Level,Systolic,Triglycerides
0,1,0,1,1,0,0,0,1,0,0,...,-0.64,0.873707,-0.162162,0.780848,-0.6,0.810042,-0.75,-0.8,-0.680851,0.048843
1,0,0,0,0,0,0,0,1,0,1,...,0.36,-0.625266,0.864865,0.801658,0.4,0.475063,0.75,0.8,0.893617,-0.221080
2,1,1,1,0,0,0,0,1,1,0,...,-0.76,-0.184001,0.513514,-0.317955,-0.4,-0.690448,0.00,-0.6,0.808511,-0.884319
3,1,0,1,0,1,1,0,1,0,1,...,-0.24,-0.030527,-0.054054,-0.915911,-0.6,0.192026,0.25,-0.4,0.829787,-0.971722
4,1,1,1,0,1,1,1,0,0,0,...,0.84,0.785748,0.729730,-0.109108,0.2,0.237010,0.75,0.0,-0.680851,-0.835476
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515,0,0,0,1,0,1,0,1,0,1,...,-0.36,0.340088,0.729730,-0.460744,0.8,-0.246366,-0.25,0.0,0.489362,-0.002571
4516,0,0,0,1,1,1,0,1,0,1,...,-0.84,0.642378,-0.594595,-0.700813,0.2,0.641098,-0.75,-0.4,-0.617021,-0.658098
4517,0,0,1,1,0,1,1,0,0,0,...,-0.56,0.527597,-0.891892,0.096399,0.0,-0.351906,0.75,0.0,-0.425532,-0.616967
4518,0,1,0,1,0,1,1,0,0,0,...,-0.16,-0.254982,-0.648649,-0.253826,0.8,0.026498,0.00,0.4,0.234043,-0.264781


In [9]:
# Write the scaled training set without the index column, then persist the
# fitted scaler; dump() returns the list of written file names, which is shown
# as the cell output.
df_concat.to_csv(
    path_or_buf="process_dataset/train_data_scaled_robust.csv",
    index=False,
)
dump(value=scaler_instance, filename="results/scaler_robust.joblib")

['results/scaler_robust.joblib']