# Creación de modelos

In [49]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [50]:
# modelos
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

### Carga del dataset

In [51]:
dataset = pd.read_csv("../data/processed/features_for_model.csv")
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,98,58,34,0.43,43,0
1,2,112,75,35,0.148,21,0
2,2,108,64,30,0.158,21,0
3,8,107,80,24,0.856,34,0
4,7,136,90,29,0.21,50,0


### Dividir en train y test

In [52]:
# Dividir el dataset en X (características) y y (target)
X = dataset.drop('Outcome', axis=1)
y = dataset['Outcome']

# Dividir los datos en conjunto de entrenamiento y prueba (con los datos ya procesados)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verificación
print(X_train.shape, X_test.shape)

(123, 6) (31, 6)


### Estandarización de variables

In [53]:
mm_scaler = MinMaxScaler()
mm_scaler.fit(X_train)

In [54]:
df_scaled = pd.DataFrame(mm_scaler.transform(X_train), columns=X_train.columns)
df_scaled.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age
0,0.153846,0.228346,0.212121,0.264706,0.328416,0.022727
1,0.076923,0.866142,0.30303,0.705882,0.465839,0.159091
2,0.461538,0.370079,0.181818,0.205882,0.080745,0.136364
3,0.076923,0.370079,0.181818,0.205882,0.656056,0.0
4,0.384615,0.181102,0.393939,0.323529,0.871118,0.25


#### Guardamos el Scaler como artefacto

In [55]:
with open("../artifacts/mm_scaler.pkl", "wb") as f:
    pickle.dump(mm_scaler, f)

## Modelos de predicción

### 1. Gradient Boost

In [56]:
# Crear el modelo
model_gb = GradientBoostingClassifier(random_state=42)

# Entrenar el modelo
model_gb.fit(X_train, y_train)

# Realizar predicciones
y_pred_gb = model_gb.predict(X_test)

# Evaluar el modelo
print("Gradient Boosting - Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Gradient Boosting - Classification Report:\n", classification_report(y_test, y_pred_gb))
print("Gradient Boosting - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))

Gradient Boosting - Accuracy: 0.6129032258064516
Gradient Boosting - Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.88      0.70        16
           1       0.71      0.33      0.45        15

    accuracy                           0.61        31
   macro avg       0.65      0.60      0.58        31
weighted avg       0.65      0.61      0.58        31

Gradient Boosting - Confusion Matrix:
 [[14  2]
 [10  5]]


In [57]:
# Probar diferentes configuraciones para Gradient Boosting Classifier
model_gb_1 = GradientBoostingClassifier(n_estimators=50, random_state=42)
model_gb_1.fit(X_train, y_train)
y_pred_gb_1 = model_gb_1.predict(X_test)
print("GB Config 1 - Accuracy:", accuracy_score(y_test, y_pred_gb_1))

model_gb_2 = GradientBoostingClassifier(n_estimators=25, learning_rate=0.05, random_state=42)
model_gb_2.fit(X_train, y_train)
y_pred_gb_2 = model_gb_2.predict(X_test)
print("GB Config 2 - Accuracy:", accuracy_score(y_test, y_pred_gb_2))

GB Config 1 - Accuracy: 0.6129032258064516
GB Config 2 - Accuracy: 0.6451612903225806


In [48]:
# Guardar el modelo entrenado
with open('../models/gb_model.pkl', 'wb') as f:
    pickle.dump(model_gb_2, f)

## 2. KN Neighbors

In [58]:
# Crear el modelo
model_knn = KNeighborsClassifier()

# Entrenar el modelo
model_knn.fit(X_train, y_train)

# Realizar predicciones
y_pred_knn = model_knn.predict(X_test)

# Evaluar el modelo
print("KNeighbors Classifier - Accuracy:", accuracy_score(y_test, y_pred_knn))
print("KNeighbors Classifier - Classification Report:\n", classification_report(y_test, y_pred_knn))
print("KNeighbors Classifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

KNeighbors Classifier - Accuracy: 0.7419354838709677
KNeighbors Classifier - Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.94      0.79        16
           1       0.89      0.53      0.67        15

    accuracy                           0.74        31
   macro avg       0.79      0.74      0.73        31
weighted avg       0.78      0.74      0.73        31

KNeighbors Classifier - Confusion Matrix:
 [[15  1]
 [ 7  8]]


In [59]:
# Probar diferentes configuraciones para KNeighbors Classifier
model_knn_1 = KNeighborsClassifier(n_neighbors=5)
model_knn_1.fit(X_train, y_train)
y_pred_knn_1 = model_knn_1.predict(X_test)
print("KNN Config 1 - Accuracy:", accuracy_score(y_test, y_pred_knn_1))

model_knn_2 = KNeighborsClassifier(n_neighbors=2)
model_knn_2.fit(X_train, y_train)
y_pred_knn_2 = model_knn_2.predict(X_test)
print("KNN Config 2 - Accuracy:", accuracy_score(y_test, y_pred_knn_2))


KNN Config 1 - Accuracy: 0.7419354838709677
KNN Config 2 - Accuracy: 0.6451612903225806


In [60]:
with open('../models/knn_model.pkl', 'wb') as f:
    pickle.dump(model_knn, f)

### 3. Logistic Regression

In [61]:
# Crear el modelo
model_lr = LogisticRegression(random_state=42)

# Entrenar el modelo
model_lr.fit(X_train, y_train)

# Realizar predicciones
y_pred_lr = model_lr.predict(X_test)

# Evaluar el modelo
print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression - Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Logistic Regression - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

Logistic Regression - Accuracy: 0.7419354838709677
Logistic Regression - Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.94      0.79        16
           1       0.89      0.53      0.67        15

    accuracy                           0.74        31
   macro avg       0.79      0.74      0.73        31
weighted avg       0.78      0.74      0.73        31

Logistic Regression - Confusion Matrix:
 [[15  1]
 [ 7  8]]


In [62]:
# Probar diferentes configuraciones para Logistic Regression
model_lr_1 = LogisticRegression(random_state=42, max_iter=200)
model_lr_1.fit(X_train, y_train)
y_pred_lr_1 = model_lr_1.predict(X_test)
print("Logistic Regression Config 1 - Accuracy:", accuracy_score(y_test, y_pred_lr_1))

model_lr_2 = LogisticRegression(random_state=42, solver='liblinear')
model_lr_2.fit(X_train, y_train)
y_pred_lr_2 = model_lr_2.predict(X_test)
print("Logistic Regression Config 2 - Accuracy:", accuracy_score(y_test, y_pred_lr_2))

Logistic Regression Config 1 - Accuracy: 0.7419354838709677
Logistic Regression Config 2 - Accuracy: 0.6774193548387096


In [46]:
# Guardar el modelo entrenado
with open('../models/logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(model_lr, f)

### 4. Random Forest

In [63]:
# Crear el modelo
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Entrenar el modelo
model.fit(X_train, y_train)

# Realizar predicciones
y_pred = model.predict(X_test)

# Evaluar el modelo
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6774193548387096
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.94      0.75        16
           1       0.86      0.40      0.55        15

    accuracy                           0.68        31
   macro avg       0.74      0.67      0.65        31
weighted avg       0.74      0.68      0.65        31

Confusion Matrix:
 [[15  1]
 [ 9  6]]


In [64]:
# Configuración 1: Sin cambios adicionales
model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
model_1.fit(X_train, y_train)
y_pred_1 = model_1.predict(X_test)
print("Config 1 - Accuracy:", accuracy_score(y_test, y_pred_1))

# Configuración 2: Aumentar los estimadores
model_2 = RandomForestClassifier(n_estimators=200, random_state=42)
model_2.fit(X_train, y_train)
y_pred_2 = model_2.predict(X_test)
print("Config 2 - Accuracy:", accuracy_score(y_test, y_pred_2))

# Configuración 3: Cambiar la profundidad máxima de los árboles
model_3 = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model_3.fit(X_train, y_train)
y_pred_3 = model_3.predict(X_test)
print("Config 3 - Accuracy:", accuracy_score(y_test, y_pred_3))


Config 1 - Accuracy: 0.6774193548387096
Config 2 - Accuracy: 0.6451612903225806
Config 3 - Accuracy: 0.6774193548387096


In [65]:
# Guardar el modelo entrenado
with open('../models/random_forest_model.pkl', 'wb') as f:
    pickle.dump(model_1, f)

### 5. SVC

In [66]:
# Crea el modelo
model_svc = SVC(random_state=42)

# Entrenar el modelo
model_svc.fit(X_train, y_train)

# Realiza predicciones
y_pred_svc = model_svc.predict(X_test)

# Evaluar el modelo
print("Support Vector Classifier - Accuracy:", accuracy_score(y_test, y_pred_svc))
print("Support Vector Classifier - Classification Report:\n", classification_report(y_test, y_pred_svc))
print("Support Vector Classifier - Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svc))

Support Vector Classifier - Accuracy: 0.7096774193548387
Support Vector Classifier - Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.94      0.77        16
           1       0.88      0.47      0.61        15

    accuracy                           0.71        31
   macro avg       0.76      0.70      0.69        31
weighted avg       0.76      0.71      0.69        31

Support Vector Classifier - Confusion Matrix:
 [[15  1]
 [ 8  7]]


In [67]:
# Probar diferetnes configuraciones pra Support Vector Classifier
model_svc_1 = SVC(kernel='linear', random_state=42)
model_svc_1.fit(X_train, y_train)
y_pred_svc_1 = model_svc_1.predict(X_test)
print("SVC Config 1 - Accuracy:", accuracy_score(y_test, y_pred_svc_1))

model_svc_2 = SVC(kernel='rbf', random_state=42)
model_svc_2.fit(X_train, y_train)
y_pred_svc_2 = model_svc_2.predict(X_test)
print("SVC Config 2 - Accuracy:", accuracy_score(y_test, y_pred_svc_2))


SVC Config 1 - Accuracy: 0.7419354838709677
SVC Config 2 - Accuracy: 0.7096774193548387


In [68]:
# Guardar el modelo entrenado
with open('../models/svc_model.pkl', 'wb') as f:
    pickle.dump(model_svc_1, f)