# Ingeniería de Características

In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pickle

## 1. Cargamos dataset

In [80]:
# Load the raw diabetes dataset and preview its first ten rows.
raw_csv_path = '../data/raw/diabetes.csv'
df = pd.read_csv(filepath_or_buffer=raw_csv_path)
df.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


## 2. Dividimos en train y test

In [81]:
# Separate the target column from the predictors.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [82]:
# Put the training features and their target back side by side in one frame.
train_parts = [X_train, y_train]
df_train = pd.concat(objs=train_parts, axis='columns')
df_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
60,2,84,0,0,0,0.0,0.304,21,0
618,9,112,82,24,0,28.2,1.282,50,1
346,1,139,46,19,83,28.7,0.654,22,0
294,0,161,50,0,0,21.9,0.254,65,0
231,6,134,80,37,370,46.2,0.238,46,1
...,...,...,...,...,...,...,...,...,...
71,5,139,64,35,140,28.6,0.411,26,0
106,1,96,122,0,0,22.4,0.207,27,0
270,10,101,86,37,0,45.6,1.136,38,1
435,0,141,0,0,0,42.4,0.205,29,1


### Guardamos los datasets de test después del split

In [83]:
# Save the hold-out split (features plus target) before any transformation
# is applied to the training data; the index is not written to the file.
test_parts = [X_test, y_test]
df_test = pd.concat(objs=test_parts, axis='columns')
df_test.to_csv(path_or_buf='../data/processed/df_test.csv', index=False)

## 3. Eliminamos variables con muchos faltantes

In [84]:
# Remove the two columns discarded for having too many missing values,
# rebinding the name instead of mutating the frame in place.
cols_descartadas = ['SkinThickness', 'Insulin']
df_train = df_train.drop(columns=cols_descartadas)
df_train.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
60,2,84,0,0.0,0.304,21,0
618,9,112,82,28.2,1.282,50,1
346,1,139,46,28.7,0.654,22,0
294,0,161,50,21.9,0.254,65,0
231,6,134,80,46.2,0.238,46,1


## 4. Ingeniería de características

### 4.1 Imputación de variables

In [85]:
# Fraction of entries equal to zero in every column of the training frame.
proporcion_ceros = df_train.eq(0).mean()
proporcion_ceros

Pregnancies                 0.148208
Glucose                     0.008143
BloodPressure               0.039088
BMI                         0.011401
DiabetesPedigreeFunction    0.000000
Age                         0.000000
Outcome                     0.653094
dtype: float64

In [86]:
# Columns whose zero entries are treated as missing values and imputed.
cols_imputacion = ["Glucose", "BloodPressure", "BMI"]

# Maps "<column>_imputed_value" to the value used to replace zeros in that column.
feature_eng_configs = {}

for col in cols_imputacion:
    # Average only the non-zero rows: the zeros are the placeholders being
    # replaced, so including them would drag the imputation value down.
    valores_validos = df_train.loc[df_train[col] != 0, col]
    # Fall back to 0 (a no-op replacement) when the column has no non-zero value.
    media = valores_validos.mean() if not valores_validos.empty else 0

    # Store plain Python numbers so the pickled config does not depend on numpy
    # types. Integer columns get a rounded integer so their dtype is preserved;
    # float columns (e.g. BMI) keep their decimals instead of being truncated.
    if pd.api.types.is_integer_dtype(df_train[col]):
        media = int(round(media))
    else:
        media = float(media)

    feature_eng_configs[f"{col}_imputed_value"] = media
    # NOTE(review): any code that reuses these values outside this notebook
    # should replace zeros the same way (without casting the column to int).
    df_train[col] = df_train[col].replace(0, media)

### Guardamos valores imputados como artefacto

In [87]:
# Serialize the imputation values to disk so they can be reloaded later.
configs_bytes = pickle.dumps(feature_eng_configs)
with open("../artifacts/feature_eng_configs.pkl", mode="wb") as configs_file:
    configs_file.write(configs_bytes)

### 4.2 Escalado de variables

In [88]:
# Learn the per-column minimum and maximum from the training frame.
# NOTE(review): df_train still contains the Outcome column at this point, so
# the scaler is fitted on the target as well — confirm this is intended.
mm_scaler = MinMaxScaler().fit(df_train)
mm_scaler

In [89]:
# Apply the fitted scaler and wrap the resulting array in a DataFrame that
# reuses the training column names (the original row index is not carried over).
valores_escalados = mm_scaler.transform(df_train)
df_scaled = pd.DataFrame(data=valores_escalados, columns=df_train.columns)

### Guardamos el Scaler como artefacto

In [92]:
# Serialize the fitted scaler to disk so it can be reloaded later.
scaler_bytes = pickle.dumps(mm_scaler)
with open("../artifacts/mm_scaler.pkl", mode="wb") as scaler_file:
    scaler_file.write(scaler_bytes)

## 5. Guardamos dataset procesado

In [91]:
# Write the scaled training frame to disk without the row index.
ruta_features = '../data/processed/features_for_model.csv'
df_scaled.to_csv(path_or_buf=ruta_features, index=False)