In [1]:
import pandas as pd
import numpy
import pickle
from sklearn.preprocessing import MinMaxScaler

# Carga el dataset procesado
df = pd.read_csv('../../data/processed/features_for_model.csv')

with open('../../artifacts/feature_eng_configs.pkl', 'rb') as f:
    feature_eng_configs = pickle.load(f)

with open('../../artifacts/mm_scaler.pkl', 'rb') as f:
    mm_scaler = pickle.load(f)

# Imputar valores en las columnas correspondientes
cols_imputacion = ["Glucose", "BloodPressure", "BMI"]
for col in cols_imputacion:
    df[col] = df[col].replace(0, feature_eng_configs[f"{col}_imputed_value"]).astype(int)

# Aplicar la estandarización a las características
df_scaled = pd.DataFrame(mm_scaler.transform(df), columns=df.columns)

# Verificación
print(df_scaled.head())


   Pregnancies   Glucose  BloodPressure       BMI  DiabetesPedigreeFunction  \
0     0.035503 -0.459259      -0.727273 -0.514286                  0.118524   
1     0.011834 -0.459259      -0.727273 -0.514286                 -0.051464   
2     0.011834 -0.459259      -0.727273 -0.514286                 -0.045436   
3     0.047337 -0.459259      -0.727273 -0.514286                  0.375313   
4     0.041420 -0.459259      -0.727273 -0.514286                 -0.014091   

        Age  Outcome  
0 -0.446125      0.0  
1 -0.456522      0.0  
2 -0.456522      0.0  
3 -0.450378      0.0  
4 -0.442817      0.0  


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
# Dividir el dataset en X (características) y y (target)
X = df_scaled.drop('Outcome', axis=1)
y = df_scaled['Outcome']

# Dividir los datos en conjunto de entrenamiento y prueba (con los datos ya procesados)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verificación
print(X_train.shape, X_test.shape)

(123, 6) (31, 6)


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Crear el modelo
model_knn = KNeighborsClassifier()

# Entrenar el modelo
model_knn.fit(X_train, y_train)

# Realizar predicciones
y_pred_knn = model_knn.predict(X_test)

# Evaluar el modelo
print("KNeighbors Classifier - Accuracy:", accuracy_score(y_test, y_pred_knn))
print("KNeighbors Classifier - Classification Report:\n", classification_report(y_test, y_pred_knn))
print("KNeighbors Classifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))


KNeighbors Classifier - Accuracy: 0.5483870967741935
KNeighbors Classifier - Classification Report:
               precision    recall  f1-score   support

         0.0       0.54      0.94      0.68        16
         1.0       0.67      0.13      0.22        15

    accuracy                           0.55        31
   macro avg       0.60      0.54      0.45        31
weighted avg       0.60      0.55      0.46        31

KNeighbors Classifier - Confusion Matrix:
 [[15  1]
 [13  2]]


In [5]:
# Probar diferentes configuraciones para KNeighbors Classifier
model_knn_1 = KNeighborsClassifier(n_neighbors=5)
model_knn_1.fit(X_train, y_train)
y_pred_knn_1 = model_knn_1.predict(X_test)
print("KNN Config 1 - Accuracy:", accuracy_score(y_test, y_pred_knn_1))

model_knn_2 = KNeighborsClassifier(n_neighbors=10)
model_knn_2.fit(X_train, y_train)
y_pred_knn_2 = model_knn_2.predict(X_test)
print("KNN Config 2 - Accuracy:", accuracy_score(y_test, y_pred_knn_2))


KNN Config 1 - Accuracy: 0.5483870967741935
KNN Config 2 - Accuracy: 0.5161290322580645


In [6]:
with open('../../artifacts/knn_model.pkl', 'wb') as f:
    pickle.dump(model_knn, f)
