In [24]:
import pandas as pd
import json
import sys
import pickle
import joblib
sys.path.append('../')

from config.paths import RAW_DATA_PATH, PROCESSED_DATA_PATH, CONFIG_PATH, ARTIFACTS_PATH, MODELS_PATH
from utils.config_loader import load_config
from utils.preprocessors import (
    binary_mapper,
    filter_outliers_by_percentile,
    remove_highly_correlated_features,
    impute_median_by_group,
    fill_missing_categories,
    label_encoder,
    assign_season
)

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../data/raw/inference_sample.csv')

In [3]:
config_path = CONFIG_PATH / "settings.yaml"
config = load_config(config_path)
features_path = CONFIG_PATH / "features.yaml"
features = load_config(features_path)

In [32]:
pp_params = config['preprocessing_parameters']
wind_mapping = config['wind_mapping']

target = features['target']
features_to_map = features['features_to_map']
num_features = features['numeric_features']
cat_features = features['categorical_features']
features_with_outliers = features['features_with_outliers']
model_features = features['model_features']

In [5]:
before_drop = df.shape[0]
df.dropna(subset=[target], inplace=True)
after_drop = df.shape[0]

In [6]:
df = binary_mapper(df, features_to_map)


In [7]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
if df['Date'].isnull().any():
    n_null_dates = df['Date'].isnull().sum()
    df = df.dropna(subset=['Date'])
df['Month'] = df['Date'].dt.month

In [8]:
#df = filter_outliers_by_percentile(df, features_with_outliers, pp_params['outliers_percentile'])

In [None]:
# Botar las columnas que se botan por alta correlacion ---
# Fill de missing cat values ---
# Imputar num features ---
# Wind dir mapping ---
# Assing season ---
# Encoding ---

In [10]:
dropped_correlated_features_path = ARTIFACTS_PATH / 'dropped_correlated_features.json'
with open(dropped_correlated_features_path, 'r', encoding='utf-8') as file:
    dropped_correlated_features = json.load(file)
df.drop(dropped_correlated_features, axis=1, inplace=True)

for col in cat_features:
    if col in df.columns:
        df[col].fillna('Missing', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Missing', inplace=True)


In [11]:
imputation_group_medians_path = ARTIFACTS_PATH / 'imputation_group_medians.json'
with open(imputation_group_medians_path, 'r', encoding='utf-8') as file:
    imputation_group_medians = json.load(file)

def apply_group_median_imputation(df, group_medians, features, group_cols=['Month', 'Location'], fallback=-1):
    """
    Impute missing values in specified columns using precomputed group-level medians.

    Parameters:
        df (pd.DataFrame): DataFrame con valores faltantes.
        group_medians (dict): Diccionario con medianas por grupo en formato {'Month|Location': {feature: value}}.
        features (list): Columnas a imputar.
        group_cols (list): Columnas de agrupamiento (default=['Month', 'Location']).
        fallback (float): Valor a usar si no hay mediana disponible para un grupo o feature.

    Returns:
        pd.DataFrame: DataFrame con los valores imputados.
    """
    df_imputed = df.copy()
    valid_features = [f for f in features if f in df.columns]

    if not valid_features:
        return df_imputed

    for idx, row in df.iterrows():
        key = '|'.join(map(str, [row[col] for col in group_cols]))
        for feature in valid_features:
            if pd.isna(row[feature]):
                value = group_medians.get(key, {}).get(feature, fallback)
                df_imputed.at[idx, feature] = value

    return df_imputed

In [12]:
df = apply_group_median_imputation(df, imputation_group_medians, num_features)

In [15]:
for col in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    if col in df.columns:
        mapped_col = f"{col}_deg"
        df[mapped_col] = df[col].map(wind_mapping)
        n_missing = df[mapped_col].isnull().sum()

df['Season'] = df['Month'].apply(assign_season)

In [20]:
encoder_path = ARTIFACTS_PATH / 'label_encoders.pkl'
with open(encoder_path, 'rb') as f:
    encoder = pickle.load(f)

def apply_label_encoders(df, encoders, fallback_value=-1):
    """
    Aplica múltiples LabelEncoders a un DataFrame, manejando categorías desconocidas.

    Parámetros:
        df (pd.DataFrame): DataFrame de entrada.
        encoders (dict): Diccionario {columna: LabelEncoder}.
        fallback_value (int): Valor para categorías no vistas.

    Retorna:
        pd.DataFrame: DataFrame con columnas codificadas.
    """
    df_encoded = df.copy()
    for col, encoder in encoders.items():
        if col not in df_encoded.columns:
            continue

        known_classes = set(encoder.classes_)

        def encode_value(val):
            if pd.isna(val) or val not in known_classes:
                return fallback_value
            return encoder.transform([val])[0]

        df_encoded[col] = df_encoded[col].apply(encode_value)

    return df_encoded

In [None]:
df = apply_label_encoders(df, encoder)

# Model

In [40]:
X = df[model_features].drop(target, axis=1)
y = df[target]

In [41]:
model_path = MODELS_PATH / 'lgbm.joblib'
model = joblib.load(model_path)

In [43]:
preds = model.predict(X)



In [46]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y, preds)
print(f"AUC: {auc:.4f}")

print(classification_report(y, preds))

AUC: 0.8623
              precision    recall  f1-score   support

           0       0.93      0.97      0.95       759
           1       0.87      0.76      0.81       224

    accuracy                           0.92       983
   macro avg       0.90      0.86      0.88       983
weighted avg       0.92      0.92      0.92       983

