In [None]:
# ============================================
#  Mango Datathon – LGBM con ventas/demanda agregadas
#  - Target: Production
#  - Usa weekly_sales / weekly_demand agregadas por ID
#  - Tuning de num_leaves (coarse + fine)
# ============================================

import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

from google.colab import drive
# Mount Google Drive at /content/drive so the files under it can be read.
drive.mount('/content/drive')

# Folder on the mounted Drive that holds train.csv, test.csv and
# sample_submission.csv; the generated submission is written here as well.
DATA_DIR = Path("/content/drive/MyDrive/Datathon")


# -------------------------------------------------------------------
# Cargar datos + renombrar columnas Unnamed del test
# -------------------------------------------------------------------
def load_data():
    """Read train, test and sample-submission CSVs from ``DATA_DIR``.

    ``train.csv`` and ``test.csv`` are parsed with ``;`` as separator, while
    ``sample_submission.csv`` uses the pandas default. Any column of the test
    frame whose name starts with ``"Unnamed"`` is renamed, positionally, to
    the trailing column names of the train frame (the last *k* train columns
    for *k* unnamed test columns).

    Returns:
        A ``(train, test, sample_sub)`` tuple of DataFrames.
    """
    print("Leyendo CSV...")
    train = pd.read_csv(DATA_DIR / "train.csv", sep=';')
    test = pd.read_csv(DATA_DIR / "test.csv", sep=';')
    sample_sub = pd.read_csv(DATA_DIR / "sample_submission.csv")

    # Header-less test columns are matched to the tail of train's header.
    placeholder_cols = [col for col in test.columns if col.startswith("Unnamed")]
    if placeholder_cols:
        train_tail = train.columns[-len(placeholder_cols):]
        rename_map = dict(zip(placeholder_cols, train_tail))
        print("Renombrando columnas Unnamed en test.csv:")
        print(rename_map)
        test = test.rename(columns=rename_map)

    print("train shape:", train.shape)
    print("test shape:", test.shape)
    print("sample_submission shape:", sample_sub.shape)

    return train, test, sample_sub


# NOTE: parse_embedding (defined below) is kept in case image embeddings are used later; nothing calls it yet.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def parse_embedding(df, column_name='image_embedding'):
    """Expand a stringified embedding column into a DataFrame of floats.

    Each cell is expected to look like ``"[0.1,0.2,...]"``: the square
    brackets are removed, the remainder is split on commas and every piece is
    converted to ``float``.

    Args:
        df: Source DataFrame. It is NOT modified.
        column_name: Name of the column holding the stringified embedding.

    Returns:
        A DataFrame sharing ``df``'s index with columns ``embed_0``,
        ``embed_1``, ... Rows with fewer components than the longest row are
        padded with ``0``. If ``column_name`` is absent, an empty DataFrame
        (no columns) with ``df``'s index is returned.

    Raises:
        ValueError: If a component cannot be converted to ``float``.
    """
    if column_name not in df.columns:
        return pd.DataFrame(index=df.index)

    # Work on a derived Series so the caller's frame keeps its raw strings.
    # regex=False makes '[' a literal on every pandas version (as a regex it
    # is an unterminated character class).
    cleaned = (
        df[column_name]
        .astype(str)
        .str.replace('[', '', regex=False)
        .str.replace(']', '', regex=False)
    )

    embedding_df = cleaned.str.split(',', expand=True).astype(float)
    embedding_df.columns = [f'embed_{i}' for i in range(embedding_df.shape[1])]
    # Short rows (and 'nan' cells) end up as NaN after the split; zero-fill them.
    return embedding_df.fillna(0)


# -------------------------------------------------------------------
# Preparar features agregadas por ID
# -------------------------------------------------------------------
def prepare_features(train: pd.DataFrame, test: pd.DataFrame, sample_sub: pd.DataFrame):
    """Build one-row-per-ID feature matrices for training and prediction.

    Per-ID aggregates of ``weekly_sales`` / ``weekly_demand`` are computed on
    ``train`` and left-joined onto the de-duplicated train rows and onto the
    test rows. Test rows are ordered like ``sample_sub["ID"]``.

    Args:
        train: Row-level training data; must contain ``ID``, ``Production``,
            ``weekly_sales`` and ``weekly_demand``.
        test: Row-level test data; must contain ``ID`` and every feature
            column that survives in the training matrix.
        sample_sub: Submission template; only its ``ID`` column is used.

    Returns:
        ``(X, y, X_test, cat_cols)`` where ``X`` / ``X_test`` share the same
        columns in the same order, ``y`` is ``Production`` as float, and
        ``cat_cols`` lists the text columns that were cast to ``category``.

    Raises:
        ValueError: If ``train`` has no ``Production`` column.
        KeyError: If a required column is missing from ``train`` or ``test``.
    """
    if "Production" not in train.columns:
        raise ValueError("Falta la columna 'Production' en train.csv")

    # 1) Per-ID aggregates computed on TRAIN.
    by_id = train.groupby("ID")
    agg = by_id.agg(
        weekly_sales_sum=("weekly_sales", "sum"),
        weekly_sales_mean=("weekly_sales", "mean"),
        weekly_sales_std=("weekly_sales", "std"),          # sales volatility
        weekly_demand_median=("weekly_demand", "median"),
    )
    # Demand/sales ratio per ID. A zero sales total is replaced by 1.0 so the
    # division is always defined. Computed vectorised from the group sums
    # instead of a per-group Python lambda that re-indexed `train` twice.
    demand_sum = by_id["weekly_demand"].sum()
    sales_sum = agg["weekly_sales_sum"]
    agg["demand_sales_ratio"] = demand_sum / sales_sum.where(sales_sum != 0, 1.0)
    agg = agg.reset_index()

    # 2) Product-level TRAIN frame (first row seen for each ID + aggregates).
    train_prod = train.drop_duplicates("ID").merge(agg, on="ID", how="left")

    # 3) Product-level TEST frame, in sample_submission ID order.
    test_uniq = test.drop_duplicates("ID")
    test_prod = sample_sub[["ID"]].merge(test_uniq, on="ID", how="left")
    # IDs that never appear in train get NaN for every aggregate column.
    test_prod = test_prod.merge(agg, on="ID", how="left")

    # 4) Target and feature matrices.
    y = train_prod["Production"].astype(float)

    # Target, raw weekly series and calendar columns are not product-level
    # features; image_embedding is left out for speed.
    # NOTE(review): "ID" is not dropped, so it stays in X as a feature —
    # confirm this is intended.
    drop_cols = [
        "Production",
        "weekly_sales",
        "weekly_demand",
        "num_week_iso",
        "year",
        "image_embedding",
    ]

    X = train_prod.drop(columns=[c for c in drop_cols if c in train_prod.columns])
    X_test = test_prod.drop(columns=[c for c in drop_cols if c in test_prod.columns])

    # Same columns, same order as train; copy so the assignments below never
    # write into a view of test_prod.
    X_test = X_test[X.columns].copy()

    # Text columns: classic object dtype or pandas' dedicated string dtype.
    cat_cols = [
        c for c in X.columns
        if pd.api.types.is_object_dtype(X[c].dtype)
        or isinstance(X[c].dtype, pd.StringDtype)
    ]

    # LightGBM consumes pandas 'category' columns natively.
    for c in cat_cols:
        X[c] = X[c].astype("category")
        X_test[c] = X_test[c].astype("category")

    print("Número de features:", X.shape[1])
    print("Categóricas:", len(cat_cols))

    return X, y, X_test, cat_cols


# -------------------------------------------------------------------
# Entrenamiento + validación con tuning de num_leaves
# -------------------------------------------------------------------
def train_and_evaluate_lgbm(
    X, y, cat_cols,
    test_size=0.2,
    random_state=42,
    leaves_grid=None
):
    """Grid-search ``num_leaves`` for LightGBM on a single hold-out split.

    The data is split once with ``train_test_split``; for every candidate in
    ``leaves_grid`` a model is trained with early stopping (patience 100) on
    the validation RMSE, and the candidate with the lowest validation RMSE
    wins (ties keep the earlier candidate).

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        cat_cols: Names of the categorical feature columns in ``X``.
        test_size: Fraction of rows held out for validation.
        random_state: Seed for the train/validation split.
        leaves_grid: Iterable of ``num_leaves`` candidates; defaults to
            ``[15, 31, 63, 127]`` when ``None``.

    Returns:
        ``(best_model, best_leaves)``: the winning booster (its
        ``best_iteration`` holds the early-stopped round count) and the
        ``num_leaves`` value it was trained with.

    Raises:
        ValueError: If ``leaves_grid`` is empty.
        RuntimeError: If no candidate produced a finite validation RMSE.
    """
    if leaves_grid is None:
        leaves_grid = [15, 31, 63, 127]
    leaves_grid = list(leaves_grid)
    if not leaves_grid:
        # Fail early with a clear message instead of crashing on a None model
        # after the (empty) search loop.
        raise ValueError("leaves_grid must contain at least one num_leaves value")

    print("\n[VALIDACIÓN] Haciendo train/valid split...")
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("X_train:", X_train.shape, " X_valid:", X_valid.shape)

    # free_raw_data=False keeps the raw frames attached so the same Dataset
    # objects can be reused for every candidate.
    lgb_train_base = lgb.Dataset(
        X_train, label=y_train,
        categorical_feature=cat_cols,
        free_raw_data=False
    )
    lgb_valid_base = lgb.Dataset(
        X_valid, label=y_valid,
        categorical_feature=cat_cols,
        free_raw_data=False
    )

    # Everything except num_leaves is identical across candidates.
    base_params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.005,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "verbose": -1,
        "seed": 42,
    }

    best_rmse = float("inf")
    best_model = None
    best_leaves = None

    for num_leaves in leaves_grid:
        print(f"\n[VALIDACIÓN] Probando num_leaves = {num_leaves} ...")

        params = {**base_params, "num_leaves": num_leaves}

        model = lgb.train(
            params,
            lgb_train_base,
            valid_sets=[lgb_train_base, lgb_valid_base],
            valid_names=["train", "valid"],
            # Effectively unbounded: early stopping decides the round count.
            num_boost_round=100000,
            # `verbose` is a boolean switch, not a logging period.
            callbacks=[lgb.early_stopping(100, verbose=True)],
        )

        y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
        print(f"[RESULTADO] num_leaves={num_leaves} -> RMSE valid: {rmse:.4f}, best_iter={model.best_iteration}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model
            best_leaves = num_leaves

    if best_model is None:
        # Only reachable when every RMSE compared False against +inf (NaN).
        raise RuntimeError("No num_leaves candidate produced a finite validation RMSE")

    print("\n==============================")
    print(f"MEJOR num_leaves: {best_leaves} con RMSE: {best_rmse:.4f}")
    print(f"Mejor iteración: {best_model.best_iteration}")
    print("==============================")

    return best_model, best_leaves


# -------------------------------------------------------------------
# Entrenar con todo el train y generar submission
# -------------------------------------------------------------------
def train_full_and_predict_lgbm(
    X, y, X_test,
    cat_cols,
    sample_sub,
    best_iteration,
    best_num_leaves
):
    """Fit LightGBM on the full training set and build the submission frame.

    Args:
        X: Full training feature matrix.
        y: Training target aligned with ``X``.
        X_test: Test feature matrix; its rows must line up with the rows of
            ``sample_sub`` because predictions are assigned positionally.
        cat_cols: Names of the categorical feature columns.
        sample_sub: Submission template providing the ``ID`` column.
        best_iteration: Number of boosting rounds to train (and predict with).
        best_num_leaves: ``num_leaves`` value to use.

    Returns:
        A DataFrame with columns ``ID`` and ``Production``; predictions are
        floored at 0.
    """
    print("\n[FINAL] Entrenando modelo LightGBM con TODO el train...")

    booster_params = {
        "objective": "regression",
        "metric": "rmse",
        "learning_rate": 0.005,
        "num_leaves": best_num_leaves,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "verbose": -1,
        "seed": 42,
    }
    full_dataset = lgb.Dataset(
        X,
        label=y,
        categorical_feature=cat_cols,
        free_raw_data=False,
    )

    # No validation set here: the round count is fixed by the caller.
    final_model = lgb.train(booster_params, full_dataset, num_boost_round=best_iteration)

    print("[FINAL] Prediciendo sobre test...")
    raw_preds = final_model.predict(X_test, num_iteration=best_iteration)
    # Floor predictions at zero.
    non_negative = np.maximum(raw_preds, 0)

    result = sample_sub.copy()
    result["Production"] = non_negative
    return result[["ID", "Production"]]


# -------------------------------------------------------------------
# Ejecutar
# -------------------------------------------------------------------
# End-to-end run: load data, build features, tune num_leaves in two passes
# (coarse, then fine), retrain on all data and write the submission CSV.
train, test, sample_sub = load_data()

X, y, X_test, cat_cols = prepare_features(train, test, sample_sub)

print("\n==== VALIDACIÓN RÁPIDA CON LGBM + TUNING GRUESO num_leaves ====")
# 1) Coarse sweep
model_val_coarse, best_leaves_coarse = train_and_evaluate_lgbm(
    X, y, cat_cols,
    test_size=0.2,
    random_state=42,
    leaves_grid=[15, 31, 63, 127]  # initial grid
)

print(f"\n>> Mejor num_leaves en barrido grueso: {best_leaves_coarse}")

# 2) Fine sweep around the best coarse num_leaves.
# Offsets are applied to the coarse winner, clamped to a minimum of 2,
# de-duplicated via the set, and sorted ascending. Offset 0 keeps the coarse
# winner itself in the fine grid.
refined_grid = sorted(
    set(
        max(2, best_leaves_coarse + delta)
        for delta in [-24, -16, -8, 0, 8, 16, 24]
    )
)

print(f"\n==== VALIDACIÓN RÁPIDA CON LGBM + TUNING FINO alrededor de {best_leaves_coarse} ====")
print("Grid fino:", refined_grid)

# Same test_size and random_state as the coarse sweep.
model_val_fine, best_leaves_fine = train_and_evaluate_lgbm(
    X, y, cat_cols,
    test_size=0.2,
    random_state=42,
    leaves_grid=refined_grid
)

# Early-stopped round count of the fine-sweep winner; passed below as
# best_iteration for the final fit on the full training set.
best_iter = model_val_fine.best_iteration
print(f"\n>> Mejor num_leaves tras barrido fino: {best_leaves_fine}")
print(f">> Mejor iteración final: {best_iter}")

print("\n==== ENTRENAMIENTO FINAL Y SUBMISSION (con num_leaves fino) ====")
submission = train_full_and_predict_lgbm(
    X, y, X_test,
    cat_cols,
    sample_sub,
    best_iteration=best_iter,
    best_num_leaves=best_leaves_fine
)

# Write the submission into DATA_DIR, without the DataFrame index.
out_path = DATA_DIR / "submission_lgbm_agg_leaves_tuned_fine.csv"
submission.to_csv(out_path, index=False)
print(f"\n✅ Submission guardada en:\n{out_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Leyendo CSV...
Renombrando columnas Unnamed en test.csv:
{'Unnamed: 28': 'year', 'Unnamed: 29': 'num_week_iso', 'Unnamed: 30': 'weekly_sales', 'Unnamed: 31': 'weekly_demand', 'Unnamed: 32': 'Production'}
train shape: (95339, 33)
test shape: (2250, 33)
sample_submission shape: (2250, 2)
Número de features: 32
Categóricas: 18

==== VALIDACIÓN RÁPIDA CON LGBM + TUNING GRUESO num_leaves ====

[VALIDACIÓN] Haciendo train/valid split...
X_train: (7874, 32)  X_valid: (1969, 32)

[VALIDACIÓN] Probando num_leaves = 15 ...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3893]	train's rmse: 3940.89	valid's rmse: 6675.87
[RESULTADO] num_leaves=15 -> RMSE valid: 6675.8738, best_iter=3893

[VALIDACIÓN] Probando num_leaves = 31 ...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration i