In [None]:
import pandas as pd
import json
import sys
import pickle
import joblib
sys.path.append('../')

from config.paths import CONFIG_PATH, ARTIFACTS_PATH, MODELS_PATH
from utils.file_management import load_config
from utils.preprocessors import (
    binary_mapper,
    assign_season
)

pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('../data/raw/inference_sample.csv')

In [3]:
config_path = CONFIG_PATH / "settings.yaml"
config = load_config(config_path)
features_path = CONFIG_PATH / "features.yaml"
features = load_config(features_path)

2025-05-27 15:31:55,343 - utils.file_management - INFO - Loading yaml file from /Users/robertogarces/data-science/projects/australia-rain/australia-rain/config/settings.yaml
2025-05-27 15:31:55,349 - utils.file_management - INFO - Loading yaml file from /Users/robertogarces/data-science/projects/australia-rain/australia-rain/config/features.yaml


In [4]:
pp_params = config['preprocessing_parameters']
wind_mapping = config['wind_mapping']

target = features['target']
features_to_map = features['features_to_map']
num_features = features['numeric_features']
cat_features = features['categorical_features']
features_with_outliers = features['features_with_outliers']
model_features = features['model_features']

In [5]:
before_drop = df.shape[0]
df.dropna(subset=[target], inplace=True)
after_drop = df.shape[0]

In [20]:
df = binary_mapper(df, features_to_map)


In [7]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
if df['Date'].isnull().any():
    n_null_dates = df['Date'].isnull().sum()
    df = df.dropna(subset=['Date'])
df['Month'] = df['Date'].dt.month

In [10]:
dropped_correlated_features_path = ARTIFACTS_PATH / 'dropped_correlated_features.json'
with open(dropped_correlated_features_path, 'r', encoding='utf-8') as file:
    dropped_correlated_features = json.load(file)
df.drop(dropped_correlated_features, axis=1, inplace=True)

for col in cat_features:
    if col in df.columns:
        df[col].fillna('Missing', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Missing', inplace=True)


In [None]:
imputation_group_medians_path = ARTIFACTS_PATH / 'imputation_group_medians.json'
with open(imputation_group_medians_path, 'r', encoding='utf-8') as file:
    imputation_group_medians = json.load(file)

def apply_group_median_imputation(df, group_medians, features, group_cols=['Month', 'Location'], fallback=-1):
    """
    Impute missing values in specified columns using precomputed group-level medians.

    Parameters:
        df (pd.DataFrame): DataFrame with missing values.
        group_medians (dict): Dict con medians per group in format {'Month|Location': {feature: value}}.
        features (list): Features to impute.
        group_cols (list): Group by features (default=['Month', 'Location']).
        fallback (float): Value to use if there's not group median.

    Returns:
        pd.DataFrame: DataFrame con los valores imputados.
    """
    df_imputed = df.copy()
    valid_features = [f for f in features if f in df.columns]

    if not valid_features:
        return df_imputed

    for idx, row in df.iterrows():
        key = '|'.join(map(str, [row[col] for col in group_cols]))
        for feature in valid_features:
            if pd.isna(row[feature]):
                value = group_medians.get(key, {}).get(feature, fallback)
                df_imputed.at[idx, feature] = value

    return df_imputed

In [12]:
df = apply_group_median_imputation(df, imputation_group_medians, num_features)

In [13]:
for col in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
    if col in df.columns:
        mapped_col = f"{col}_deg"
        df[mapped_col] = df[col].map(wind_mapping)
        n_missing = df[mapped_col].isnull().sum()

df['Season'] = df['Month'].apply(assign_season)

In [14]:
encoder_path = ARTIFACTS_PATH / 'label_encoders.pkl'
with open(encoder_path, 'rb') as f:
    encoder = pickle.load(f)

def apply_label_encoders(df, encoders, fallback_value=-1):
    """
    Aplica múltiples LabelEncoders a un DataFrame, manejando categorías desconocidas.

    Parámetros:
        df (pd.DataFrame): DataFrame de entrada.
        encoders (dict): Diccionario {columna: LabelEncoder}.
        fallback_value (int): Valor para categorías no vistas.

    Retorna:
        pd.DataFrame: DataFrame con columnas codificadas.
    """
    df_encoded = df.copy()
    for col, encoder in encoders.items():
        if col not in df_encoded.columns:
            continue

        known_classes = set(encoder.classes_)

        def encode_value(val):
            if pd.isna(val) or val not in known_classes:
                return fallback_value
            return encoder.transform([val])[0]

        df_encoded[col] = df_encoded[col].apply(encode_value)

    return df_encoded

In [15]:
df = apply_label_encoders(df, encoder)

# Model

In [16]:
X = df[model_features].drop(target, axis=1)
y = df[target]

In [17]:
model_path = MODELS_PATH / 'lgbm.joblib'
model = joblib.load(model_path)

In [18]:
preds = model.predict(X)



In [19]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y, preds)
print(f"AUC: {auc:.4f}")

print(classification_report(y, preds))

AUC: 0.7600
              precision    recall  f1-score   support

           0       0.88      0.95      0.91       759
           1       0.77      0.57      0.65       224

    accuracy                           0.86       983
   macro avg       0.82      0.76      0.78       983
weighted avg       0.86      0.86      0.86       983

