In [1]:
"""Módulo que contiene la lógica de la task de separación de datos."""

# Librerías Externas.
from typing import Dict, Tuple, Optional

import logging

import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from genesis_explorer.makers.model_maker import ModelMaker


# Configure the root logger: INFO threshold, "timestamp - level - message" records.
logging.basicConfig(
    format = "%(asctime)s - %(levelname)s - %(message)s",
    level = logging.INFO,
)


class MyModelMaker(ModelMaker):
    """Train a scaled LightGBM classifier and evaluate it on pre-split datasets.

    The train, test and out-of-time (oot) partitions are loaded at construction
    time through ``load_artifacts`` (provided by the ``ModelMaker`` base), and
    ``run_task`` then runs the training and evaluation steps wrapped by the
    base-class task decorators."""

    def __init__(self,
                 dataset: Dict[str, str],
                 target_column: str,
                 cloud_provider: Optional[str] = "gcp",
                 project_id: Optional[str] = "mlops-credits-vertex-poc",
                 cloud_uri: Optional[str] = "gs://migracion-gcp-bucket/consumers/consumers-iris/mlb/1/",
                 experiment_name: Optional[str] = "split-data-experiment",
                 experiment_version: Optional[str] = "1.0.0-test.200") -> None:
        """Instantiate the maker and load the three data partitions.

        Args:
        ----------
        dataset: Dict[str, str].
            Mapping passed to ``load_artifacts``. It must yield the keys
            ``"train_data"``, ``"test_data"`` and ``"oot_data"``; the values
            are presumably artifact names resolved under ``cloud_uri``
            (TODO confirm against ``ModelMaker.load_artifacts``).

        target_column: str.
            Name of the label column present in every partition.

        cloud_provider: Optional[str].
            Forwarded to ``ModelMaker.__init__``.

        project_id: Optional[str].
            Forwarded to ``ModelMaker.__init__``.

        cloud_uri: Optional[str].
            Forwarded to ``ModelMaker.__init__``.

        experiment_name: Optional[str].
            Experiment name handed to the task decorators in ``run_task``.

        experiment_version: Optional[str].
            Experiment version handed to the task decorators in ``run_task``."""

        super().__init__(cloud_provider = cloud_provider,
                         cloud_uri = cloud_uri,
                         project_id = project_id)

        artifacts = self.load_artifacts(artifacts = dataset)

        self.train_data = artifacts["train_data"]
        self.test_data = artifacts["test_data"]
        self.oot_data = artifacts["oot_data"]

        self.target_column = target_column

        self.experiment_name = experiment_name
        self.experiment_version = experiment_version

    def run_task(self) -> None:
        """Run the training step and then the evaluation step.

        Each step is wrapped with the corresponding decorator factory from the
        base class (``train_model_task`` / ``evaluate_model_task``), configured
        with this instance's experiment name and version."""

        # NOTE(review): both steps are registered under the same experiment
        # name/version; a previous run logged a 409 conflict when the
        # evaluation step created its ExperimentRun - confirm the tracking
        # backend allows reusing the run name.
        train_step = self.train_model_task(experiment_name = self.experiment_name,
                                           experiment_version = self.experiment_version)(self.train_model)

        # The decorated training step returns (artifacts, hyperparameters);
        # only the fitted model is needed downstream.
        artifacts, _ = train_step(train_data = self.train_data)

        evaluate_step = self.evaluate_model_task(experiment_name = self.experiment_name,
                                                 experiment_version = self.experiment_version)(self.evaluate_model)

        evaluate_step(model = artifacts["model"],
                      train_data = self.train_data,
                      test_data = self.test_data,
                      oot_data = self.oot_data)

    def train_model(self, train_data: pd.DataFrame) -> Tuple[Dict[str, Pipeline], Dict[str, float]]:
        """Fit a ``StandardScaler`` + ``LGBMClassifier`` pipeline on the train set.

        Args:
        ----------
        train_data: pd.DataFrame.
            Training partition, including the target column.

        Returns:
        ----------
        artifacts: Dict[str, Pipeline].
            ``{"model": fitted_pipeline}``.

        hyperparameters: Dict[str, float].
            The hyperparameters the classifier was trained with."""

        X_train = train_data.drop(columns = [self.target_column])
        y_train = train_data[self.target_column]

        hyperparameters = {"num_leaves": 31,
                           "learning_rate": 0.05,
                           "n_estimators": 100,
                           "max_depth": 5}

        # The reported hyperparameters must be the ones actually used: pass
        # them to the estimator instead of training with library defaults.
        pipeline = Pipeline(steps = [("scaler", StandardScaler()),
                                     ("model", LGBMClassifier(random_state = 42,
                                                              **hyperparameters))])

        pipeline.fit(X_train, y_train)

        artifacts = {"model": pipeline}

        return artifacts, hyperparameters

    def evaluate_model(self, model: Pipeline,
                       train_data: pd.DataFrame,
                       test_data: pd.DataFrame,
                       oot_data: pd.DataFrame) -> Dict[str, float]:
        """Compute the ROC AUC of ``model`` on the train, test and oot partitions.

        Args:
        ----------
        model: Pipeline.
            Fitted pipeline exposing ``predict_proba``.

        train_data: pd.DataFrame.
            Training partition, including the target column.

        test_data: pd.DataFrame.
            Test partition, including the target column.

        oot_data: pd.DataFrame.
            Out-of-time partition, including the target column.

        Returns:
        ----------
        metrics: Dict[str, float].
            One entry per partition, keyed ``roc_auc_train``, ``roc_auc_test``
            and ``roc_auc_oot``."""

        datasets = {"train": train_data,
                    "test": test_data,
                    "oot": oot_data}

        metrics = {}
        for dataset_name, dataset in datasets.items():
            X_data = dataset.drop(columns = [self.target_column])
            y_data = dataset[self.target_column]

            y_data_scores = model.predict_proba(X_data)
            if y_data_scores.shape[1] == 2:
                # Binary target: roc_auc_score expects 1-D scores for the
                # class with the greater label, not the full (n, 2) matrix.
                y_data_scores = y_data_scores[:, 1]

            # multi_class only applies to multiclass targets (one-vs-rest).
            roc_auc_data = roc_auc_score(y_data, y_data_scores, multi_class = "ovr")

            metrics[f"roc_auc_{dataset_name}"] = roc_auc_data
            logging.info(f"El AUC del modelo en el conjunto de {dataset_name} es: {roc_auc_data:.4f}.")

        return metrics
    


# Artifact key expected by MyModelMaker -> stored dataset name, one per partition.
datasets = dict(
    train_data = "train_set",
    test_data = "test_set",
    oot_data = "oot_set",
)

# Loads the three partitions on construction; the label column is "target".
my_model_maker = MyModelMaker(
    target_column = "target",
    dataset = datasets,
)

# Train the model and evaluate it on every partition.
my_model_maker.run_task()

2025-11-15 14:05:23,058 - INFO - Instanciando la primera instancia de la clase...
2025-11-15 14:05:23,709 - INFO - Esta clase implementa el patrón Singleton, por ende, se retorna la instancia existente.
2025-11-15 14:05:23,711 - INFO - Descargando el archivo consumers/consumers-iris/mlb/1/train_set.pickle del bucket migracion-gcp-bucket en el proyecto mlops-credits-vertex-poc...
2025-11-15 14:05:24,335 - INFO - Archivo consumers/consumers-iris/mlb/1/train_set.pickle descargado correctamente.
2025-11-15 14:05:24,338 - INFO - Descargando el archivo consumers/consumers-iris/mlb/1/test_set.pickle del bucket migracion-gcp-bucket en el proyecto mlops-credits-vertex-poc...
2025-11-15 14:05:24,543 - INFO - Archivo consumers/consumers-iris/mlb/1/test_set.pickle descargado correctamente.
2025-11-15 14:05:24,547 - INFO - Descargando el archivo consumers/consumers-iris/mlb/1/oot_set.pickle del bucket migracion-gcp-bucket en el proyecto mlops-credits-vertex-poc...
2025-11-15 14:05:24,751 - INFO - A

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000363 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81
[LightGBM] [Info] Number of data points in the train set: 105, number of used features: 4
[LightGBM] [Info] Start training from score -1.016374
[LightGBM] [Info] Start training from score -1.157453
[LightGBM] [Info] Start training from score -1.127600


2025-11-15 14:05:29,305 - INFO - Archivo /var/folders/7s/p0t580cj7m7f7wnlkdj5lg14z6r38n/T/genesis_temp_dirczrjfrjy/model.pickle subido correctamente.
2025-11-15 14:05:29,306 - INFO - Artefactos guardados correctamente.

2025-11-15 14:05:30,268 - INFO - Proceso de entrenamiento del modelo finalizado correctamente.
2025-11-15 14:05:30,269 - INFO - Cerrando ExperimentRun: experimento-v200
2025-11-15 14:05:30,725 - INFO - ExperimentRun cerrado: experimento-v200
2025-11-15 14:05:30,726 - INFO - Esta clase implementa el patrón Singleton, por ende, se retorna la instancia existente.
2025-11-15 14:05:30,726 - INFO - Creando experimento: split-data-experiment
2025-11-15 14:05:31,585 - INFO - Experimento ya existe: split-data-experiment
2025-11-15 14:05:32,017 - INFO - Creando ExperimentRun: experimento-v200
2025-11-15 14:05:32,467 - ERROR - Error al crear ExperimentRun: 409 Context with name projects/310075297167/locations/us-east1/metadataStores/default/contexts/split-data-experiment-experimen