In [1]:
import pandas as pd
import numpy
import pickle
from sklearn.preprocessing import MinMaxScaler

# Carga el dataset procesado
df = pd.read_csv('../../data/processed/features_for_model.csv')

with open('../../artifacts/feature_eng_configs.pkl', 'rb') as f:
    feature_eng_configs = pickle.load(f)

with open('../../artifacts/mm_scaler.pkl', 'rb') as f:
    mm_scaler = pickle.load(f)

# Imputar valores en las columnas correspondientes
cols_imputacion = ["Glucose", "BloodPressure", "BMI"]
for col in cols_imputacion:
    df[col] = df[col].replace(0, feature_eng_configs[f"{col}_imputed_value"]).astype(int)

# Aplicar la estandarización a las características
df_scaled = pd.DataFrame(mm_scaler.transform(df), columns=df.columns)

# Verificación
print(df_scaled.head())

   Pregnancies   Glucose  BloodPressure       BMI  DiabetesPedigreeFunction  \
0     0.035503 -0.459259      -0.727273 -0.514286                  0.118524   
1     0.011834 -0.459259      -0.727273 -0.514286                 -0.051464   
2     0.011834 -0.459259      -0.727273 -0.514286                 -0.045436   
3     0.047337 -0.459259      -0.727273 -0.514286                  0.375313   
4     0.041420 -0.459259      -0.727273 -0.514286                 -0.014091   

        Age  Outcome  
0 -0.446125      0.0  
1 -0.456522      0.0  
2 -0.456522      0.0  
3 -0.450378      0.0  
4 -0.442817      0.0  


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [2]:
# Dividir el dataset en X (características) y y (target)
X = df_scaled.drop('Outcome', axis=1)
y = df_scaled['Outcome']

# Dividir los datos en conjunto de entrenamiento y prueba (con los datos ya procesados)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verificación
print(X_train.shape, X_test.shape)

(123, 6) (31, 6)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Crea el modelo
model_svc = SVC(random_state=42)

# Entrenar el modelo
model_svc.fit(X_train, y_train)

# Realiza predicciones
y_pred_svc = model_svc.predict(X_test)

# Evaluar el modelo
print("Support Vector Classifier - Accuracy:", accuracy_score(y_test, y_pred_svc))
print("Support Vector Classifier - Classification Report:\n", classification_report(y_test, y_pred_svc))
print("Support Vector Classifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))


Support Vector Classifier - Accuracy: 0.5161290322580645
Support Vector Classifier - Classification Report:
               precision    recall  f1-score   support

         0.0       0.52      1.00      0.68        16
         1.0       0.00      0.00      0.00        15

    accuracy                           0.52        31
   macro avg       0.26      0.50      0.34        31
weighted avg       0.27      0.52      0.35        31

Support Vector Classifier - Confusion Matrix:
 [[16  0]
 [15  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Probar diferetnes configuraciones pra Support Vector Classifier
model_svc_1 = SVC(kernel='linear', random_state=42)
model_svc_1.fit(X_train, y_train)
y_pred_svc_1 = model_svc_1.predict(X_test)
print("SVC Config 1 - Accuracy:", accuracy_score(y_test, y_pred_svc_1))

model_svc_2 = SVC(kernel='rbf', random_state=42)
model_svc_2.fit(X_train, y_train)
y_pred_svc_2 = model_svc_2.predict(X_test)
print("SVC Config 2 - Accuracy:", accuracy_score(y_test, y_pred_svc_2))


SVC Config 1 - Accuracy: 0.5161290322580645
SVC Config 2 - Accuracy: 0.5161290322580645


In [6]:
# Guardar el modelo entrenado
with open('../../artifacts/svc_model.pkl', 'wb') as f:
    pickle.dump(model_svc, f)
