In [None]:
import os, json, math, warnings
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

import joblib

# Silence every warning for the rest of the session (no category filter is given).
# NOTE(review): this also hides convergence and deprecation warnings from the
# estimators trained further down - consider narrowing it.
warnings.filterwarnings("ignore")

# Seed shared by the split and model cells.
SEED = 42
# Legacy NumPy generator seeded with SEED; it is not referenced by the cells visible here.
rng = np.random.RandomState(SEED)

# Paths (relative to the notebook's working directory)
DATA_DIR   = Path("../data")
MODELS_DIR = Path("../models")
EXP_DIR    = MODELS_DIR / "experiments"
BASE_DIR   = MODELS_DIR / "baseline"
REPORTS    = Path("../reports")

# Create the output folders up front; existing folders are left untouched.
for d in [MODELS_DIR, EXP_DIR, BASE_DIR, REPORTS]:
    d.mkdir(parents=True, exist_ok=True)

print("OK: directorios listos")


OK: directorios listos


In [None]:
# Resolver/crear splits desde:
# keys = ['seed','test_idx','folds','classes']
import numpy as np, json, math
from sklearn.model_selection import train_test_split

def parse_labels(s):
    """Split a pipe-separated label cell into a list of clean label names.

    Parameters
    ----------
    s : object
        Raw cell value. ``None`` and NaN (built-in or NumPy float) mean
        "no labels"; anything else is converted with ``str`` and split on ``|``.

    Returns
    -------
    list of str
        Labels with surrounding whitespace removed; empty pieces are dropped.
    """
    # A missing cell must not turn into the literal labels "None" / "nan".
    if s is None:
        return []
    if isinstance(s, (float, np.floating)) and math.isnan(s):
        return []
    return [x.strip() for x in str(s).split("|") if x.strip()]

# Read the split definition that lives next to the data.
splits_path = DATA_DIR / "splits.json"
raw = json.loads(splits_path.read_text(encoding="utf-8"))

# Held-out test rows listed in the file (empty when the key is absent),
# and the positions of every row of the dataset.
idx_test = np.array(raw.get("test_idx", []), dtype=int)
idx_all  = np.arange(len(df))

def extract_train_valid_from_folds(folds, idx_all, idx_test):
    """
    Build one train/validation pair from the first entry of ``folds``.

    Accepted layouts:
      - a list of dicts holding validation indices under ``valid``, ``val``,
        ``valid_idx`` or ``val_idx`` and/or train indices under ``train``,
        ``train_idx`` or ``train_indices``;
      - a list of index sequences, each one being the validation indices of a fold.

    Only ``folds[0]`` is looked at. When just one side is given, the other side
    is every entry of ``idx_all`` that is neither on the given side nor in
    ``idx_test``. Indices are used as positions into ``idx_all``.

    Returns ``(train, valid)`` as integer arrays, or ``(None, None)`` when
    ``folds`` is not a non-empty list or its first entry has no usable content.
    """
    def first_truthy(mapping, keys):
        # Mirrors ``mapping.get(a) or mapping.get(b) or ...``: the first truthy
        # value wins, otherwise whatever the last key yields is returned.
        found = None
        for key in keys:
            found = mapping.get(key)
            if found:
                break
        return found

    def complement_of(known):
        # Entries of idx_all that are neither in `known` nor in idx_test.
        keep = np.ones(len(idx_all), dtype=bool)
        keep[known] = False
        if idx_test.size > 0:
            keep[idx_test] = False
        return idx_all[keep]

    if not isinstance(folds, list) or len(folds) == 0:
        return None, None

    first = folds[0]

    # Layout A: dict with explicit keys
    if isinstance(first, dict):
        valid_raw = first_truthy(first, ("valid", "val", "valid_idx", "val_idx"))
        train_raw = first_truthy(first, ("train", "train_idx", "train_indices"))
        has_valid = valid_raw is not None
        has_train = train_raw is not None

        if not has_valid and not has_train:
            return None, None

        if has_valid and has_train:
            return np.array(train_raw, dtype=int), np.array(valid_raw, dtype=int)

        if has_valid:
            valid = np.array(valid_raw, dtype=int)
            return complement_of(valid), valid

        # Only train is known: validation is the complement (test excluded).
        train = np.array(train_raw, dtype=int)
        return train, complement_of(train)

    # Layout B: the entry itself is the list of validation indices
    if isinstance(first, (list, tuple, np.ndarray)):
        valid = np.array(first, dtype=int)
        return complement_of(valid), valid

    return None, None

from collections import Counter


def _stratify_labels(labels):
    """Return labels usable as ``stratify=`` for ``train_test_split``, or None.

    A stratified split needs at least two samples per group. Label combinations
    seen only once are pooled into one "__rare__" bucket; if a group of one
    still remains, stratification is dropped for that split.
    """
    counts = Counter(labels)
    pooled = [lab if counts[lab] >= 2 else "__rare__" for lab in labels]
    if not pooled or min(Counter(pooled).values()) < 2:
        return None
    return pooled


idx_train, idx_valid = extract_train_valid_from_folds(raw.get("folds", []), idx_all, idx_test)

# If anything is missing, create new splits (stratified by label combination),
# keeping the test set that was provided whenever there is one.
if idx_train is None or idx_valid is None or idx_test.size == 0:
    print("⚠️  No se pudo reconstruir train/valid desde 'folds' o falta 'test_idx'. Creando splits nuevos…")
    # One string per row describing its label combination (approximate multilabel stratification).
    y_combo = ['|'.join(sorted(parse_labels(x))) or 'none' for x in df[label_col]]

    if idx_test.size == 0:
        # No test set was given: carve it out FIRST (15% of all rows), so that
        # train/valid are drawn only from what is left.
        pop, idx_tst = train_test_split(
            idx_all, test_size=0.15,
            random_state=SEED, stratify=_stratify_labels(y_combo)
        )
        pop = np.sort(np.array(pop, dtype=int))
        idx_test = np.array(idx_tst, dtype=int)
    else:
        # Rows available for train/valid = everything that is not test.
        pop = np.setdiff1d(idx_all, idx_test)

    y_pop = [y_combo[i] for i in pop]
    idx_trn, idx_val = train_test_split(
        pop, test_size=0.1765,  # ~15% of the total for validation when test is 15%
        random_state=SEED, stratify=_stratify_labels(y_pop)
    )
    idx_train, idx_valid = np.array(idx_trn, dtype=int), np.array(idx_val, dtype=int)

# Sort for tidiness (optional)
idx_train = np.sort(np.array(idx_train, dtype=int))
idx_valid = np.sort(np.array(idx_valid, dtype=int))
idx_test  = np.sort(np.array(idx_test,  dtype=int))

# Persist the STANDARD keys for the rest of the pipeline.
# The file is also this cell's input, so the keys it was read with are kept:
# dropping them would make a re-run lose the original test set and fall back
# to generating new splits.
persisted = dict(raw)
persisted.setdefault("test_idx", idx_test.tolist())
if not persisted.get("folds"):
    # No usable fold information in the file: record the split just built so a
    # later run reconstructs the same train/valid partition.
    persisted["folds"] = [{"train": idx_train.tolist(), "valid": idx_valid.tolist()}]
persisted.update({
    "train": idx_train.tolist(),
    "valid": idx_valid.tolist(),
    "test":  idx_test.tolist()
})

with open(splits_path, "w", encoding="utf-8") as f:
    json.dump(persisted, f, indent=2)

print("Splits listos → train:", len(idx_train), "valid:", len(idx_valid), "test:", len(idx_test))


✅ Splits listos → train: 2368 valid: 595 test: 600


In [None]:
from pathlib import Path
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

BASE_DIR = Path("../models/baseline")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Word-level uni/bi-gram TF-IDF with sublinear term frequency.
tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1,2),
    min_df=2,
    max_df=0.9,
    max_features=120_000,
    lowercase=True,
    strip_accents="unicode",
    sublinear_tf=True,
)

# Gather the documents of each split; the vocabulary is learned on train only.
docs_train, docs_valid, docs_test = (
    [X_text[i] for i in split_idx] for split_idx in (idx_train, idx_valid, idx_test)
)
X_train = tfidf.fit_transform(docs_train)
X_valid = tfidf.transform(docs_valid)
X_test  = tfidf.transform(docs_test)

Y_train = Y[idx_train]
Y_valid = Y[idx_valid]
Y_test  = Y[idx_test]

tfidf_path = BASE_DIR / "tfidf.joblib"
joblib.dump(tfidf, tfidf_path)
print("TF-IDF guardado en", tfidf_path)
print("Shapes:", X_train.shape, X_valid.shape, X_test.shape)


TF-IDF guardado en ..\models\baseline\tfidf.joblib
Shapes: (2368, 28782) (595, 28782) (600, 28782)


In [None]:
import numpy as np
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score

def to_proba(model, X):
    """Return per-label scores in [0, 1] as a 2-D float64 array.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba`` or ``decision_function``.
    X : feature matrix passed straight to the estimator.

    Returns
    -------
    numpy.ndarray
        * ``predict_proba`` available: its output; when it is a list of
          per-label arrays, column 1 of each array is stacked side by side.
        * otherwise: the logistic sigmoid of ``decision_function(X)``. This is
          a monotone squashing of the margins, not a calibrated probability.
          A 1-D decision output is returned as a single column so callers can
          always index ``shape[1]``.
    """
    if hasattr(model, "predict_proba"):
        p = model.predict_proba(X)
        if isinstance(p, list):
            p = np.column_stack([col[:, 1] for col in p])
        return np.asarray(p).astype("float64")

    scores = np.asarray(model.decision_function(X), dtype="float64")
    if scores.ndim == 1:
        scores = scores.reshape(-1, 1)
    # exp() overflows to inf for very negative margins; 1 / (1 + inf) is the
    # correct limit (0.0), so the overflow is deliberately not reported.
    with np.errstate(over="ignore"):
        return 1.0 / (1.0 + np.exp(-scores))

def safe_roc_auc(y_true, proba):
    """Macro-averaged ROC AUC over the label columns that can be scored.

    A column is scored only when it contains at least one positive and one
    negative in ``y_true``. Columns for which ``roc_auc_score`` raises
    ``ValueError`` are skipped; any other exception propagates.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_labels), binary indicators.
    proba : numpy.ndarray of shape (n_samples, n_labels), scores per label.

    Returns
    -------
    float
        Mean AUC of the scored columns, or NaN when none could be scored.
    """
    y_true = np.asarray(y_true)
    aucs = []
    for j in range(proba.shape[1]):
        yj = y_true[:, j]
        n_pos = yj.sum()
        if n_pos > 0 and (len(yj) - n_pos) > 0:
            try:
                aucs.append(roc_auc_score(yj, proba[:, j]))
            except ValueError:
                # Column rejected by the metric: leave it out of the average
                # instead of hiding unrelated failures behind a bare except.
                continue
    return float(np.mean(aucs)) if aucs else float("nan")

def metrics_from_proba(y_true, proba, thr):
    """Score a probability matrix against the true label indicators.

    ``thr`` holds one threshold per label column; a label is predicted when
    its probability is greater than or equal to its threshold.

    Returns a dict with ``f1_micro`` and ``f1_macro`` (computed on the
    thresholded predictions) plus ``pr_macro`` and ``roc_macro`` (computed on
    the raw probabilities), all as Python floats.
    """
    decisions = np.greater_equal(proba, thr[None, :]).astype(int)

    scores = {}
    for averaging in ("micro", "macro"):
        scores[f"f1_{averaging}"] = float(
            f1_score(y_true, decisions, average=averaging, zero_division=0)
        )
    scores["pr_macro"] = float(average_precision_score(y_true, proba, average="macro"))
    scores["roc_macro"] = float(safe_roc_auc(y_true, proba))
    return scores

def find_thresholds_per_class(y_true, proba, grid=np.linspace(0.2,0.8,31)):
    """Pick, for every label column, the grid threshold with the highest F1.

    Columns without any positive in ``y_true`` keep the default 0.5. Ties are
    resolved in favour of the first threshold of ``grid`` reaching the best F1,
    because a candidate only replaces the current best on a strict improvement.

    Returns a float64 array with one threshold per column of ``proba``.
    """
    n_labels = proba.shape[1]
    thresholds = np.zeros(n_labels, dtype="float64")

    for col in range(n_labels):
        truth = y_true[:, col]
        if truth.sum() == 0:
            thresholds[col] = 0.5
            continue

        top_f1, top_cut = -1.0, 0.5
        for cut in grid:
            predicted = (proba[:, col] >= cut).astype(int)
            candidate = f1_score(truth, predicted, zero_division=0)
            if candidate > top_f1:
                top_f1, top_cut = candidate, cut
        thresholds[col] = top_cut

    return thresholds


In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier

SEED = 42
N_JOBS = -1
MAX_ITER_LR = 5000


def get_models():
    """Return the candidate classifiers, keyed by experiment name.

    Every base estimator is wrapped in a ``OneVsRestClassifier`` so that one
    binary model is fitted per label column.
    """
    def one_vs_rest(base):
        # Same wrapper settings for every candidate.
        return OneVsRestClassifier(base, n_jobs=N_JOBS)

    logreg = LogisticRegression(
        solver="saga", penalty="l2", C=2.0,
        max_iter=MAX_ITER_LR, n_jobs=N_JOBS,
        class_weight="balanced", random_state=SEED,
    )
    ridge = RidgeClassifier(alpha=1.0, class_weight="balanced", random_state=SEED)
    # LinearSVC is wrapped in a sigmoid calibrator fitted with 5-fold CV.
    svm_calibrated = CalibratedClassifierCV(
        LinearSVC(C=1.0, random_state=SEED), method="sigmoid", cv=5
    )
    complement_nb = ComplementNB(alpha=0.5)

    return {
        "logreg_C2": one_vs_rest(logreg),
        "ridge_a1": one_vs_rest(ridge),
        "linsvm_C1_cal": one_vs_rest(svm_calibrated),
        "cnb_a05": one_vs_rest(complement_nb),
    }


MODELS = get_models()
list(MODELS.keys())


['logreg_C2', 'ridge_a1', 'linsvm_C1_cal', 'cnb_a05']

In [None]:
import pandas as pd

results = []
probas_valid = {}
fitted_models = {}

# Fit every candidate on train, tune per-label thresholds on validation and
# score it on that same validation split.
for name, clf in MODELS.items():
    print(f"Entrenando {name} ...")
    clf.fit(X_train, Y_train)
    fitted_models[name] = clf

    p_valid = probas_valid[name] = to_proba(clf, X_valid)
    thr = find_thresholds_per_class(Y_valid, p_valid)
    met = {**metrics_from_proba(Y_valid, p_valid, thr), "name": name}
    results.append(dict(name=name, thr=thr, metrics_valid=met))

# One row per model, best validation F1-macro first.
valid_metrics = pd.DataFrame([entry["metrics_valid"] for entry in results])
rank_df = valid_metrics.sort_values("f1_macro", ascending=False).reset_index(drop=True)
rank_df


Entrenando logreg_C2 ...
Entrenando ridge_a1 ...
Entrenando linsvm_C1_cal ...
Entrenando cnb_a05 ...


Unnamed: 0,f1_micro,f1_macro,pr_macro,roc_macro,name
0,0.892269,0.884619,0.950481,0.969225,linsvm_C1_cal
1,0.888318,0.878914,0.947031,0.967621,ridge_a1
2,0.878353,0.870171,0.941549,0.963196,logreg_C2
3,0.810845,0.773244,0.842227,0.906894,cnb_a05


In [None]:
# Top row of the validation ranking is the best single model.
best_name = rank_df.loc[0, "name"]
matching_thr = [entry["thr"] for entry in results if entry["name"] == best_name]
best_thr = matching_thr[0]
best_clf = fitted_models[best_name]

# Score it on test with the thresholds tuned on validation.
p_test_best = to_proba(best_clf, X_test)
met_test_best = metrics_from_proba(Y_test, p_test_best, np.array(best_thr))
test_f1_macro = round(met_test_best["f1_macro"], 4)
print("Mejor single:", best_name, "| test F1_macro:", test_f1_macro)
met_test_best


Mejor single: linsvm_C1_cal | test F1_macro: 0.8628


{'f1_micro': 0.8710493046776233,
 'f1_macro': 0.8627814346493117,
 'pr_macro': 0.9376885524844704,
 'roc_macro': 0.9581557331766981}

In [None]:
names_for_ens = [*MODELS]
print("Modelos en ensemble:", names_for_ens)

# Unweighted average of the members' validation probabilities.
valid_stack = [probas_valid[member] for member in names_for_ens]
p_valid_ens = np.mean(valid_stack, axis=0)
thr_ens = find_thresholds_per_class(Y_valid, p_valid_ens)
met_ens_valid = metrics_from_proba(Y_valid, p_valid_ens, thr_ens)
print("Ensemble (valid):", met_ens_valid)

# Same blend on test, scored with the thresholds tuned on validation.
test_stack = [to_proba(fitted_models[member], X_test) for member in names_for_ens]
p_test_ens = np.mean(test_stack, axis=0)
met_ens_test = metrics_from_proba(Y_test, p_test_ens, thr_ens)
print("Ensemble (test):", met_ens_test)


Modelos en ensemble: ['logreg_C2', 'ridge_a1', 'linsvm_C1_cal', 'cnb_a05']
Ensemble (valid): {'f1_micro': 0.8764890282131661, 'f1_macro': 0.8707740570906258, 'pr_macro': 0.9396653212221595, 'roc_macro': 0.9610729699659389}
Ensemble (test): {'f1_micro': 0.8564263322884013, 'f1_macro': 0.845939711299291, 'pr_macro': 0.9210081362807779, 'roc_macro': 0.9486312317602891}


In [None]:
from pathlib import Path
import json, joblib

# Choose the winner on VALIDATION F1-macro. Comparing test scores here would
# use the test set for model selection and make the reported test figure
# optimistic; test metrics are only reported, never used to decide.
best_valid_f1 = float(rank_df.loc[0, "f1_macro"])
WINNER_NAME = "ensemble_blend" if met_ens_valid["f1_macro"] >= best_valid_f1 else best_name
WINNER_THR  = thr_ens if WINNER_NAME=="ensemble_blend" else best_thr

if WINNER_NAME=="ensemble_blend":
    # The blend is stored as its member names plus the fitted members, in the same order.
    winner_model = {"names": names_for_ens, "models": [fitted_models[n] for n in names_for_ens]}
    print("GANADOR: ensemble_blend | F1_macro_test:", round(met_ens_test["f1_macro"],4))
else:
    winner_model = fitted_models[WINNER_NAME]
    print(f"GANADOR: {WINNER_NAME} | F1_macro_test:", round(met_test_best["f1_macro"],4))

WIN_DIR = Path("../models/experiments")/WINNER_NAME
WIN_DIR.mkdir(parents=True, exist_ok=True)

# zip() would silently drop thresholds if the two lengths disagreed.
assert len(CLASSES) == len(WINNER_THR), "CLASSES and thresholds differ in length"

joblib.dump(winner_model, WIN_DIR/"model.joblib")
joblib.dump(tfidf,       WIN_DIR/"tfidf.joblib")
with open(WIN_DIR/"thresholds.json","w",encoding="utf-8") as f:
    json.dump({c: float(t) for c,t in zip(CLASSES, WINNER_THR)}, f, indent=2)

print("Artefactos guardados en", WIN_DIR)

# Check that every artefact was written
for p in ["model.joblib","tfidf.joblib","thresholds.json"]:
    fp = WIN_DIR/p
    print(" -", p, "OK" if fp.exists() else "FALTA")


GANADOR: linsvm_C1_cal | F1_macro_test: 0.8628
Artefactos guardados en ..\models\experiments\linsvm_C1_cal
 - model.joblib OK
 - tfidf.joblib OK
 - thresholds.json OK
