In [4]:
import os
import pandas as pd
import pickle
import mlflow

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [5]:
# Project root: the parent directory of the current working directory
# (os.path.split(...)[0] is the directory part of the path).
project_path = os.path.split(os.getcwd())[0]

In [19]:
# Read the raw diabetes CSV from <project_path>/data/raw and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, i.e. the row index stored
# in the CSV is loaded as a regular column -- confirm whether it should be read
# with index_col=0 instead of being passed on as a feature.
raw_csv_path = os.path.join(project_path, "data", "raw", "diabetes.csv")
dataset = pd.read_csv(raw_csv_path)
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [59]:
# Pipeline configuration.
mflow_url = "http://127.0.0.1:5000"  # MLflow tracking server URI
target = "Outcome"  # name of the label column
# Columns excluded when the feature column names are computed.
vars_to_drop = ["SkinThickness", "Insulin", "Outcome"]
# Not referenced in the cells shown here; presumably consumed by the
# preprocessing pipeline's imputation step -- confirm.
vars_to_impute = ["Glucose", "BloodPressure", "BMI"]

In [None]:
# Split the data into training (80%) and hold-out (20%) partitions with a fixed seed.
# NOTE(review): x_features is the full frame and still contains the target column;
# vars_to_drop lists 'Outcome', so presumably the pipeline removes it -- confirm.
y_target = dataset[target]
x_features = dataset

x_train, x_test, y_train, y_test = train_test_split(
    x_features,
    y_target,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [67]:
# Load the previously defined pipeline from the artifacts folder.
# NOTE: pickle.load can execute arbitrary code; only load artifacts produced by
# this project.
pipeline_path = os.path.join(project_path, 'artifacts', 'pipeline.pkl')
with open(pipeline_path, 'rb') as pipeline_file:
    diabetes_predict_model = pickle.load(pipeline_file)

In [None]:
# Training set: fit the loaded pipeline on the training split and transform it.
x_features_train = diabetes_predict_model.fit_transform(x_train)

# Hold-out set used to select a model by metric: transformed with the pipeline
# fitted above (transform only, no refit).
x_features_test = diabetes_predict_model.transform(x_test)

# Input columns that are not listed in vars_to_drop. Computed once and kept in
# the original column order; the previous set-difference was evaluated twice and
# produced an arbitrary ordering that could change between kernel sessions.
col_names = [column for column in x_train.columns if column not in vars_to_drop]

In [60]:
# Point the MLflow client at the configured tracking server and select the
# experiment that the runs below are logged under. set_experiment is kept as
# the last expression so the cell still displays the Experiment object.
mlflow.set_tracking_uri(uri=mflow_url)
mlflow.set_experiment(experiment_name='Diabetes Predict Model')

2024/12/18 03:25:14 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes Predict Model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/304865332183066859', creation_time=1734513914599, experiment_id='304865332183066859', last_update_time=1734513914599, lifecycle_stage='active', name='Diabetes Predict Model', tags={}>

In [61]:
# Model and parameter definitions.
# Each spec is (estimator class, hyperparameters logged to MLflow, extra
# constructor arguments that are not logged). The logged hyperparameters are
# also passed to the constructor, so the logged values always match the model.
_seeded = {"random_state": 42}
_unseeded = {}

_candidate_specs = [
    (GradientBoostingClassifier, {"n_estimators": 100, "learning_rate": 0.1}, _seeded),
    (GradientBoostingClassifier, {"n_estimators": 50, "learning_rate": 0.1}, _seeded),
    (GradientBoostingClassifier, {"n_estimators": 25, "learning_rate": 0.05}, _seeded),
    (KNeighborsClassifier, {"n_neighbors": 3}, _unseeded),
    (KNeighborsClassifier, {"n_neighbors": 5}, _unseeded),
    (KNeighborsClassifier, {"n_neighbors": 2}, _unseeded),
    (LogisticRegression, {"solver": "lbfgs", "max_iter": 100}, _seeded),
    (LogisticRegression, {"solver": "lbfgs", "max_iter": 200}, _seeded),
    (LogisticRegression, {"solver": "liblinear", "max_iter": 100}, _seeded),
    (RandomForestClassifier, {"n_estimators": 100, "max_depth": None}, _seeded),
    (RandomForestClassifier, {"n_estimators": 200, "max_depth": None}, _seeded),
    (RandomForestClassifier, {"n_estimators": 100, "max_depth": 10}, _seeded),
    (SVC, {"kernel": "linear"}, _seeded),
    (SVC, {"kernel": "rbf"}, _seeded),
    (SVC, {"kernel": "poly"}, _seeded),
]

# Keys are "Model 1" ... "Model 15", in the order of the specs above.
models_and_params = {
    f"Model {position}": {
        "model": estimator_cls(**logged_params, **fixed_kwargs),
        "params": logged_params,
    }
    for position, (estimator_cls, logged_params, fixed_kwargs)
    in enumerate(_candidate_specs, start=1)
}

In [62]:
# Accuracy of every candidate model, one record per model.
resultados_acc = []

# Train, evaluate and log each candidate in its own MLflow run.
for model_name, model_info in models_and_params.items():
    # The context manager ends the run when the block exits (also when an
    # exception is raised), so no explicit mlflow.end_run() call is needed.
    with mlflow.start_run(run_name=model_name):
        model = model_info["model"]
        params = model_info["params"]

        # Fit on the transformed training features.
        model.fit(x_features_train, y_train)

        # Predict on the transformed hold-out features.
        y_pred = model.predict(x_features_test)

        # Hold-out accuracy is the model-selection metric.
        accuracy = accuracy_score(y_test, y_pred)

        # Log the hyperparameters, the metric and the fitted model to the run.
        mlflow.log_params(params)
        mlflow.log_metric("accuracy score", accuracy)
        mlflow.sklearn.log_model(model, model_name)

        resultados_acc.append({"model_name": model_name, "accuracy_score": accuracy})



🏃 View run Model 1 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/0f6d4cd7946a4d4b9874824ac821ed63
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 2 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/fc31f5b7383c47f8b8d24efbb2083867
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 3 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/b1ef1a300b974eb6a6409df3ecc344a8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 4 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/29a12f14cf3b4a7b84ae1f7e14454daa
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 5 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/7ffe7fa9fe7e4dbcb9b577b424de74c5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 6 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/24892364c6a140c0a8d8f10a654966eb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 7 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/5ad82c61c56f48eca04d5268cdc856fd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 8 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/8c47d62c3ddf491eb78e45d100b7c9dd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 9 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/bac7c4e18a5848c4bdc8193aea885404
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 10 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/e976b074165447598767e79df82c2542
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 11 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/f39b8e8e556f404b9c949c0dfa72ff8d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 12 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/176011f867ed46e7af74e947c01358ac
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 13 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/18b3c5938eaf41d78e4983639c7edab3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 14 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/4496f57cb20c43c7bd9a3093413f7cc6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859




🏃 View run Model 15 at: http://127.0.0.1:5000/#/experiments/304865332183066859/runs/c75483954c814638abf1951d96d883dd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/304865332183066859


In [63]:
# Turn the per-model accuracy records into a table
# (columns: model_name, accuracy_score).
resultados_acc = pd.DataFrame(data=resultados_acc)

In [64]:
# Pick the candidate with the highest hold-out accuracy (idxmax returns the
# first row label on ties) and fetch its estimator object.
best_row_label = resultados_acc["accuracy_score"].idxmax()
best_model_name = resultados_acc.loc[best_row_label, "model_name"]
best_model = models_and_params[best_model_name]["model"]

In [68]:
# Attach the selected estimator as the 'modelo_prediccion' step of the pipeline.
# If a step with that name is already present (e.g. this cell was run before),
# replace it instead of appending again, so re-running the cell does not leave
# the pipeline with duplicated step names.
prediction_step = ('modelo_prediccion', best_model)
existing_step_names = [step[0] for step in diabetes_predict_model.steps]
if 'modelo_prediccion' in existing_step_names:
    step_position = existing_step_names.index('modelo_prediccion')
    diabetes_predict_model.steps[step_position] = prediction_step
else:
    diabetes_predict_model.steps.append(prediction_step)

In [69]:
# Fit the final pipeline (including the appended prediction step) on the
# training split.
diabetes_predict_model.fit(x_train, y_train)

# Persist the trained pipeline to the artifacts folder.
# NOTE(review): the file name is spelled 'trainded_pipeline.pkl'; it is kept
# as-is because other code may load the artifact by this name -- confirm before
# renaming.
trained_pipeline_path = os.path.join(project_path, 'artifacts', 'trainded_pipeline.pkl')
with open(trained_pipeline_path, 'wb') as output_file:
    pickle.dump(diabetes_predict_model, output_file)