In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv("../data/raw/diabetes.csv")
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# configuración del Pipeline
TARGET = "Outcome"
VARS_TO_DROP = ["SkinThickness", "Insulin", TARGET]
CONTINUE_VARS_TO_IMPUTATION = ["Glucose", "BloodPressure", "BMI"]

In [4]:
x_features = dataset.drop(labels=VARS_TO_DROP, axis=1)
y_target = dataset[TARGET]

x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=0.2, shuffle=True, random_state=42)

In [5]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import MeanMedianImputer
from sklearn.preprocessing import MinMaxScaler

In [10]:
diabetes_predict_pipeline = Pipeline([
    # imputación de variables continuas
    ("continues_var_mean_imputation", MeanMedianImputer(imputation_method="mean", variables=CONTINUE_VARS_TO_IMPUTATION)),
    
    #estandarización de variables
    ("feature_scaling", MinMaxScaler())
])

In [11]:
diabetes_predict_pipeline.fit(x_train) 

In [12]:
x_features_processed = diabetes_predict_pipeline.transform(x_train)
df_features_process = pd.DataFrame(x_features_processed, columns=x_train.columns)
df_features_process[TARGET] = y_train.reset_index()["Outcome"]

# guardamos los datos para entrenar modelos
df_features_process.to_csv("../data/processed/features_for_model.csv", index=False)
df_features_process.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.117647,0.422111,0.0,0.0,0.096499,0.0,0
1,0.529412,0.562814,0.672131,0.420268,0.514091,0.483333,1
2,0.058824,0.698492,0.377049,0.42772,0.245944,0.016667,0
3,0.0,0.809045,0.409836,0.326379,0.075149,0.733333,0
4,0.352941,0.673367,0.655738,0.688525,0.068318,0.416667,1


In [13]:
import pickle

with open("../artifacts/pipeline.pkl", "wb") as f:
    pickle.dump(diabetes_predict_pipeline, f)