# Pipeline final del modelo seleccionado (Random Forest)
### Este notebook implementa el pipeline completo de preprocesamiento, entrenamiento y guardado del modelo final


## Importación de librerias necesarias

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, f1_score, roc_auc_score
import pickle

## Clase personalizada de preprocesamiento

In [20]:
class PreprocesadorFinal(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.variables_a_eliminar = ['Smoker', 'Fruits', 'Veggies', 'AnyHealthcare', 'NoDocbcCost', 'Sex', 'HvyAlcoholConsump']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X = X.drop(columns=self.variables_a_eliminar, errors='ignore')
        return X


## Carga el data set

In [21]:
data = pd.read_csv('/content/diabetes_binary_health_indicators_BRFSS2015.csv')

# Separar variables predictoras y objetivo
X = data.drop(columns=['Diabetes_binary'])
y = data['Diabetes_binary']

# Elimina los valores duplicados
data = data.drop_duplicates()

# División de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Add print statements to verify the shapes of the splits
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_train: (202944, 21)
Shape of y_train: (202944,)
Shape of X_test: (50736, 21)
Shape of y_test: (50736,)


## Creación del Pipeline

In [22]:
pipeline_rf = Pipeline([
    ('preprocesamiento', PreprocesadorFinal()),
    ('escalado', StandardScaler()),
    ('modelo', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

## Entrenamiento

In [23]:
pipeline_rf.fit(X_train, y_train)

# Evaluar el modelo
y_pred = pipeline_rf.predict(X_test)
y_prob = pipeline_rf.predict_proba(X_test)[:, 1]

print("Reporte de clasificación (Random Forest):")
print(classification_report(y_test, y_pred))
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_prob):.4f}")

Reporte de clasificación (Random Forest):
              precision    recall  f1-score   support

         0.0       0.95      0.72      0.82     43667
         1.0       0.31      0.77      0.44      7069

    accuracy                           0.73     50736
   macro avg       0.63      0.74      0.63     50736
weighted avg       0.86      0.73      0.77     50736

F1-score: 0.4411
ROC AUC: 0.8208


In [24]:
print(pipeline_rf)
pipeline_rf

Pipeline(steps=[('preprocesamiento', PreprocesadorFinal()),
                ('escalado', StandardScaler()),
                ('modelo',
                 RandomForestClassifier(class_weight='balanced', max_depth=10,
                                        n_estimators=200, n_jobs=-1,
                                        random_state=42))])


## Almacenamiento en archivo .pkl

In [25]:
with open('pipeline_random_forest.pkl', 'wb') as f:
    pickle.dump(pipeline_rf, f)

print("\nPipeline completo guardado como 'pipeline_random_forest.pkl'")


Pipeline completo guardado como 'pipeline_random_forest.pkl'
