In [None]:
# ============================================================
# NOTEBOOK: CLASIFICACIÓN DE TEXTO (RNN) — robusto estilo tu CNN
# ENTRADA:
#   - EXACTAMENTE 1 ZIP en /content (al inicio)
#   - Estructuras soportadas dentro del ZIP:
#       (A) Carpetas por clase con .txt:
#             /
#               clase_0/  *.txt
#               clase_1/  *.txt
#               ...
#       (B) Un CSV con texto+label (autodetecta columnas):
#             - 1 o 2 columnas de texto (p.ej. title + description)
#             - 1 columna label (string o numérica)
# SALIDAS:
#   - /content/resultados_texto.zip:
#       model.keras
#       weights.best.keras (si existe)
#       metadata.json
#       vocab.txt
#   - descarga automática del zip (Colab)
# ============================================================


# =========================
# CELDA 0 — CONFIG GLOBAL + ZIP → WORKDIR + autodetección DATA MODE (texto)
# =========================
import os, glob, zipfile, shutil, random, time, json
import numpy as np
import tensorflow as tf

# Directory the dataset ZIP is extracted into; it is wiped first when CLEAN_WORKDIR is True.
WORKDIR = "/content/dataset_text"
CLEAN_WORKDIR = True

# Global seed and train/val/test fractions (the three fractions sum to 1.0).
SEED = 123
TRAIN_FRAC = 0.70
VAL_FRAC   = 0.15
TEST_FRAC  = 0.15

# -------- GPU T4 ----------
# Best-effort switch to the Keras "mixed_float16" policy; a failure is only printed,
# never raised, so the notebook keeps running with the default policy.
USE_MIXED_PRECISION = True
if USE_MIXED_PRECISION:
    try:
        from tensorflow.keras import mixed_precision
        mixed_precision.set_global_policy("mixed_float16")
        print("Mixed precision activada:", mixed_precision.global_policy())
    except Exception as e:
        print("No se pudo activar mixed precision:", e)

# Seed TensorFlow, NumPy and the stdlib RNG with the same value.
AUTOTUNE = tf.data.AUTOTUNE
tf.random.set_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# -------- EXACTLY 1 ZIP at start ----------
# All ZIPs directly under /content, oldest first; anything other than exactly one aborts the cell.
zips = sorted(glob.glob("/content/*.zip"), key=os.path.getmtime)
assert len(zips) == 1, (
    f"Al INICIO se esperaba EXACTAMENTE 1 ZIP (el de datos) en /content, "
    f"pero encontr√© {len(zips)}.\n"
    "ZIPs encontrados:\n" + "\n".join([f" - {os.path.basename(z)}" for z in zips]) + "\n\n"
    "Deja SOLO el ZIP de datos antes de correr la Celda 0."
)
zip_name = zips[0]
# Base name of the data ZIP, kept for the metadata written at the end of the notebook.
ZIP_DATOS_BASENAME = os.path.basename(zip_name)

print("ZIP detectado (√∫nico al inicio):", ZIP_DATOS_BASENAME)
print("√öltima modificaci√≥n:", time.ctime(os.path.getmtime(zip_name)))

# -------- extract ----------
# Optionally start from an empty WORKDIR so files from a previous run cannot leak in.
if CLEAN_WORKDIR and os.path.isdir(WORKDIR):
    shutil.rmtree(WORKDIR)
os.makedirs(WORKDIR, exist_ok=True)

with zipfile.ZipFile(zip_name, "r") as z:
    z.extractall(WORKDIR)

print("Dataset extra√≠do en:", WORKDIR)
# IPython shell magic: list the extracted top-level entries.
!ls -lah "{WORKDIR}"

# -------- autodetección modo texto ----------
import pandas as pd

# Detection results. All start as None and are assigned later in this cell
# once a supported dataset layout has been found.
MODE = None            # dataset layout identifier
DATA_DIR = None        # root folder of the per-class .txt layout
CSV_TEXT_PATH = None   # path of the CSV chosen as dataset
TEXT_COLS = None       # list of text column names in that CSV
LABEL_COL = None       # label column name in that CSV

def walk_files(root):
    """Lazily yield the full path of every file found under ``root`` (recursive)."""
    for current_dir, _subdirs, names in os.walk(root):
        yield from (os.path.join(current_dir, name) for name in names)

def find_text_folders_root(workdir):
    """Locate the folder whose sub-folders hold the per-class ``.txt`` files.

    Only ``workdir`` itself and its immediate sub-directories are considered.
    Each candidate gets a score tuple
    ``(subfolders_with_txt, total_txt_files, total_subfolders)``; candidates
    with fewer than two sub-folders score ``(-1, -1, -1)``. The highest score
    wins (first one on ties).

    Returns:
        ``(best_dir, best_score)``. ``best_dir`` is ``None`` when no candidate
        has at least two sub-folders containing ``.txt`` files.
    """
    no_match = (-1, -1, -1)

    def score_dir(path):
        # A usable root needs at least two sub-folders (one per class).
        if not os.path.isdir(path):
            return no_match
        entries = [os.path.join(path, name) for name in os.listdir(path)]
        class_dirs = [p for p in entries if os.path.isdir(p)]
        if len(class_dirs) < 2:
            return no_match
        txt_counts = [len(glob.glob(os.path.join(cd, "*.txt"))) for cd in class_dirs]
        populated = sum(1 for k in txt_counts if k > 0)
        return (populated, sum(txt_counts), len(class_dirs))

    children = [os.path.join(workdir, name) for name in os.listdir(workdir)]
    candidates = [workdir, *(p for p in children if os.path.isdir(p))]
    scores = [score_dir(p) for p in candidates]

    # max() keeps the first maximal element, matching a strict ">" scan.
    top = max(range(len(candidates)), key=lambda i: scores[i])
    best_sc = scores[top]
    best = candidates[top] if best_sc > no_match else None

    if best is None or best_sc[0] < 2 or best_sc[1] == 0:
        return None, best_sc
    return best, best_sc

def find_csv_files(workdir):
    """Return the sorted paths of all files under ``workdir`` ending in ``.csv`` (case-insensitive)."""
    csv_paths = [path for path in walk_files(workdir) if path.lower().endswith(".csv")]
    csv_paths.sort()
    return csv_paths

def detect_text_label_columns_textonly(df):
    """Guess which columns of ``df`` hold the text and which one holds the label.

    Text columns:
      - string-typed columns (``object`` or any pandas string dtype), ranked
        by mean character length; the best one must average >= 5 characters.
      - a second text column is added when another string column also averages
        >= 5 characters, is not the chosen label and does not itself look like
        a category (e.g. ``title`` next to ``description``).
    Label column:
      - any other column with 2..100 distinct values and at most
        ``max(50, 1% of rows)`` distinct values; the one with the fewest
        distinct values wins. It may be numeric or string.

    Args:
        df: candidate ``pandas.DataFrame``.

    Returns:
        ``(text_cols, label_col)`` where ``text_cols`` is a list of one or two
        column names, or ``None`` when the frame has fewer than 2 usable
        columns, fewer than 20 rows, no text column, or no label column.
    """
    df = df.dropna(axis=1, how="all")
    n = int(df.shape[0])
    if df.shape[1] < 2 or n < 20:
        return None

    def is_text_dtype(col):
        dtype = df[col].dtype
        # "str" prefix covers "string", "string[python]", "string[pyarrow]"
        # and the plain "str" dtype name used by newer pandas versions.
        return dtype == "object" or str(dtype).startswith("str")

    def text_score(col):
        s = df[col].dropna().astype(str)
        if len(s) == 0:
            return -1.0
        # Clip so a few huge cells cannot dominate the mean length.
        return float(s.str.len().clip(0, 10000).mean())

    scores = {c: text_score(c) for c in df.columns if is_text_dtype(c)}
    if not scores:
        return None

    text_cands = sorted(scores, key=scores.get, reverse=True)
    if scores[text_cands[0]] < 5:
        return None

    def n_unique(col):
        return int(df[col].dropna().astype(str).nunique())

    def label_ok(col, text_cols):
        if col in text_cols:
            return False
        u = n_unique(col)
        if u < 2:
            return False
        # A category has few distinct values.
        if u > 100:
            return False
        if u > max(50, int(0.01 * n)):
            return False
        return True

    text_cols = [text_cands[0]]
    label_cands = [c for c in df.columns if label_ok(c, text_cols)]
    if not label_cands:
        # Treating more columns as text can only shrink the label candidates,
        # so there is nothing further to try.
        return None
    lcol = min(label_cands, key=n_unique)

    # Optional second text column. Previously this case was unreachable because
    # it was only attempted after the single-text search had already failed.
    for cand in text_cands[1:]:
        if cand != lcol and scores[cand] >= 5 and not label_ok(cand, text_cols):
            text_cols.append(cand)
            break

    return (text_cols, lcol)

# Run both detectors: the per-class folder layout takes priority over CSV files.
TEXT_ROOT, text_sc = find_text_folders_root(WORKDIR)
CSV_FILES = find_csv_files(WORKDIR)

if TEXT_ROOT is not None:
    MODE = "folders_txt"
    DATA_DIR = TEXT_ROOT
else:
    # Use the first CSV (in sorted path order) where text+label columns are detected.
    for c in CSV_FILES:
        try:
            df0 = pd.read_csv(c)
            out = detect_text_label_columns_textonly(df0)
        except Exception as e:
            # Still best-effort (move on to the next CSV), but report why this
            # file was skipped instead of hiding the failure.
            print(f"CSV omitido ({c}): {type(e).__name__}: {e}")
            continue
        if out is None:
            continue
        TEXT_COLS, LABEL_COL = out
        MODE = "csv_text"
        CSV_TEXT_PATH = c
        break

if MODE is None:
    raise ValueError(
        "No detect√© dataset de texto.\n"
        "- O bien carpetas por clase con .txt\n"
        "- O bien CSV con columnas texto+label\n"
        f"WORKDIR={WORKDIR}"
    )

# Summary of the detected configuration.
print("\nCONFIG FINAL (TEXTO):")
print("  WORKDIR:", WORKDIR)
print("  MODE   :", MODE)
if MODE == "folders_txt":
    print("  DATA_DIR:", DATA_DIR)
    print("  score (folders_con_txt, total_txt, subcarpetas):", text_sc)
else:
    print("  CSV_TEXT_PATH:", CSV_TEXT_PATH)
    print("  TEXT_COLS:", TEXT_COLS)
    print("  LABEL_COL:", LABEL_COL)
print("  GPU:", tf.config.list_physical_devices("GPU"))


In [None]:
# ==========================================================
# CELDA 1 — CARGA + SPLIT + DESBALANCE (estilo tu CNN)
# ==========================================================
import numpy as np

def stratified_split(labels, train_frac, val_frac, test_frac, seed=123):
    """Split sample indices into train/val/test, class by class.

    For every distinct label the indices of that class are shuffled and cut
    into ``round(n * train_frac)`` train samples (at least 1),
    ``round(n * val_frac)`` validation samples (capped by what is left) and
    the remainder as test samples. Each split is shuffled again at the end.

    Args:
        labels: 1-D sequence or array of class labels, one per sample.
        train_frac, val_frac, test_frac: fractions that must sum to 1.0.
        seed: seed for ``numpy.random.default_rng``; same seed, same split.

    Returns:
        ``(train_idx, val_idx, test_idx)`` as integer index arrays. Empty
        splits are returned as empty *integer* arrays so they remain valid
        indexers.

    Raises:
        AssertionError: if the fractions do not sum to 1.0.
    """
    assert abs(train_frac + val_frac + test_frac - 1.0) < 1e-9
    labels = np.asarray(labels)  # makes ``labels == c`` elementwise for any input type
    rng = np.random.default_rng(seed)
    idx = np.arange(len(labels))

    train_idx, val_idx, test_idx = [], [], []
    for c in np.unique(labels):
        c_idx = idx[labels == c]
        rng.shuffle(c_idx)
        n = len(c_idx)

        n_train = int(round(n * train_frac))
        n_val = int(round(n * val_frac))

        # Every class contributes at least one training sample.
        n_train = max(1, min(n_train, n))
        n_val = min(n_val, n - n_train)

        train_idx.extend(c_idx[:n_train])
        val_idx.extend(c_idx[n_train:n_train + n_val])
        test_idx.extend(c_idx[n_train + n_val:])

    rng.shuffle(train_idx)
    rng.shuffle(val_idx)
    rng.shuffle(test_idx)

    # An explicit dtype keeps empty splits integer-typed; np.array([]) would be float64.
    return (
        np.array(train_idx, dtype=idx.dtype),
        np.array(val_idx, dtype=idx.dtype),
        np.array(test_idx, dtype=idx.dtype),
    )

def compute_class_weight(train_labels, num_classes):
    """Return per-class sample counts and balanced class weights.

    The weight of class ``k`` is ``N / (num_classes * count_k)`` where ``N`` is
    the total number of training samples; classes with no samples get ``0.0``.

    Returns:
        ``(counts, weights)``: an int64 array of length >= ``num_classes`` and
        a dict mapping class index to float weight.
    """
    counts = np.bincount(train_labels, minlength=num_classes).astype(np.int64)
    total = counts.sum()
    weights = {
        k: (float(total) / float(num_classes * counts[k]) if counts[k] != 0 else 0.0)
        for k in range(num_classes)
    }
    return counts, weights

def bincountK(y, K):
    """Count occurrences of each label in ``y``, padded to at least ``K`` bins."""
    histogram = np.bincount(y, minlength=K)
    return histogram

# -------- modo folders_txt ----------
if MODE == "folders_txt":
    import os, glob

    # One class per sub-folder of DATA_DIR, in sorted name order. Sub-folders
    # without any .txt file are skipped: keeping them would create classes with
    # zero samples, inflating num_classes and leaving gaps in the label ids.
    classes = []
    files, labels = [], []
    per_class = []
    for cls in sorted(d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))):
        cls_files = sorted(glob.glob(os.path.join(DATA_DIR, cls, "*.txt")))
        if not cls_files:
            print(f"Carpeta sin .txt omitida: {cls}")
            continue
        label_id = len(classes)  # contiguous ids over the kept classes only
        classes.append(cls)
        per_class.append((cls, len(cls_files)))
        files.extend(cls_files)
        labels.extend([label_id] * len(cls_files))

    if len(classes) < 2:
        raise ValueError("Se requieren >=2 clases.")

    all_files = np.array(files)
    all_labels = np.array(labels, dtype=np.int32)
    num_classes = len(classes)

    train_idx, val_idx, test_idx = stratified_split(all_labels, TRAIN_FRAC, VAL_FRAC, TEST_FRAC, SEED)

    print("\n[Texto por carpetas]")
    print("DATA_DIR:", DATA_DIR)
    print("Num clases:", num_classes)
    print("Total ejemplos:", len(all_files))
    print("\nConteo por clase (primeras 20):")
    for cls, n in per_class[:20]:
        print(f"  {cls:<30s} {n}")
    if len(per_class) > 20:
        print("  ...")

# -------- csv_text mode ----------
else:
    import pandas as pd
    df = pd.read_csv(CSV_TEXT_PATH)

    # Keep only the detected columns and drop rows missing any of them.
    use_cols = list(TEXT_COLS) + [LABEL_COL]
    df = df[use_cols].dropna()

    # With two text columns the sample text is "<col0> <col1>".
    if len(TEXT_COLS) == 1:
        all_texts = df[TEXT_COLS[0]].astype(str).values
    else:
        all_texts = (df[TEXT_COLS[0]].astype(str) + " " + df[TEXT_COLS[1]].astype(str)).values

    # Labels are compared as strings; class ids follow the sorted label strings.
    labels_raw = df[LABEL_COL].astype(str).values
    classes = sorted(list(set(labels_raw.tolist())))
    class_to_idx = {c:i for i,c in enumerate(classes)}
    all_labels = np.array([class_to_idx[c] for c in labels_raw], dtype=np.int32)
    num_classes = len(classes)

    # Dummy "files" array (holds the raw texts) so later cells can treat both modes alike.
    all_files = np.array(all_texts, dtype=object)

    train_idx, val_idx, test_idx = stratified_split(all_labels, TRAIN_FRAC, VAL_FRAC, TEST_FRAC, SEED)

    global_counts = np.bincount(all_labels, minlength=num_classes).astype(int)
    per_class = [(classes[i], int(global_counts[i])) for i in range(num_classes)]

    print("\n[Texto en CSV]")
    print("CSV_TEXT_PATH:", CSV_TEXT_PATH)
    print("TEXT_COLS:", TEXT_COLS, "| LABEL_COL:", LABEL_COL)
    print("Num clases:", num_classes)
    print("Total ejemplos:", len(all_files))
    print("\nConteo por clase (primeras 20):")
    for cls, n in per_class[:20]:
        print(f"  {cls:<30s} {n}")
    if len(per_class) > 20:
        print("  ...")

# -------- validación fuerte + desbalance estilo CNN ----------
# Abort early if any of the three splits has no samples.
print("\nSplit tama√±os:", "train", len(train_idx), "| val", len(val_idx), "| test", len(test_idx))
if len(train_idx) == 0 or len(val_idx) == 0 or len(test_idx) == 0:
    raise ValueError("Alguno de los splits qued√≥ vac√≠o.")

# Per-class counts and balanced weights, computed on the TRAIN split only.
train_labels = all_labels[train_idx]
class_counts, class_weight = compute_class_weight(train_labels, num_classes)

# Ratio between the largest and smallest class; infinite when some class has no train samples.
min_count = int(class_counts.min()) if len(class_counts) else 0
max_count = int(class_counts.max()) if len(class_counts) else 0
imbalance_ratio = (max_count / min_count) if (min_count > 0) else float("inf")

# Flagged as imbalanced when the ratio is >= 2 or the smallest class has <= 10 train samples.
IMBALANCED = (imbalance_ratio >= 2.0) or (min_count <= 10)

# "Tiny" classes have <= 5 train samples, "rare" classes have 6..10.
TINY_CLASS_THRESHOLD = 5
RARE_CLASS_THRESHOLD = 10

tiny_idx = np.where(class_counts <= TINY_CLASS_THRESHOLD)[0]
rare_idx = np.where((class_counts > TINY_CLASS_THRESHOLD) & (class_counts <= RARE_CLASS_THRESHOLD))[0]
zero_idx = np.where(class_counts == 0)[0]  # NOTE(review): not used anywhere in the visible cells

HAS_TINY_CLASSES = len(tiny_idx) > 0
HAS_RARE_CLASSES = len(rare_idx) > 0

# Training knobs derived from the flags: whether to pass class weights to fit(),
# and which validation metric early stopping / checkpointing will monitor.
USE_CLASS_WEIGHT = IMBALANCED or HAS_TINY_CLASSES or HAS_RARE_CLASSES
MONITOR_METRIC = "val_loss" if (IMBALANCED or HAS_TINY_CLASSES) else "val_accuracy"

print("\nDistribuci√≥n TRAIN: min", min_count, "| max", max_count, "| ratio", imbalance_ratio)
print("IMBALANCED:", IMBALANCED)
print("HAS_TINY_CLASSES:", HAS_TINY_CLASSES, "| HAS_RARE_CLASSES:", HAS_RARE_CLASSES)
print("USE_CLASS_WEIGHT:", USE_CLASS_WEIGHT)
print("MONITOR_METRIC:", MONITOR_METRIC)

# Sanity check: label range and per-class counts for each split.
print("\n=== SANITY CHECK SPLITS ===")
print("Labels min/max:", int(all_labels.min()), int(all_labels.max()))
print("Num clases declarado:", num_classes)
print("Train per class:", bincountK(all_labels[train_idx], num_classes).tolist())
print("Val   per class:", bincountK(all_labels[val_idx],   num_classes).tolist())
print("Test  per class:", bincountK(all_labels[test_idx],  num_classes).tolist())


In [None]:
# ==========================================================
# CELDA 2 — PIPELINE tf.data (texto) + TextVectorization.adapt(train)
# ==========================================================
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np

# Vocabulary cap and fixed output length (in tokens) of every vectorized text.
MAX_TOKENS = 20000
SEQ_LEN = 256

# Maps raw strings to integer token ids: lowercases, strips punctuation,
# splits on whitespace and pads/truncates to SEQ_LEN.
text_vectorizer = layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode="int",
    output_sequence_length=SEQ_LEN,
    standardize="lower_and_strip_punctuation",
    split="whitespace"
)

# -------- get raw (train) texts ----------
if MODE == "folders_txt":
    def read_txt_tf(path):
        """Read a file as a string tensor, replacing invalid UTF-8 sequences."""
        x = tf.io.read_file(path)
        # Decode/re-encode round trip so malformed bytes become replacement characters.
        x = tf.strings.unicode_decode(x, "UTF-8", errors="replace")
        x = tf.strings.unicode_encode(x, "UTF-8")
        return x

    # The vocabulary is learned from the TRAIN files only.
    train_text_ds = tf.data.Dataset.from_tensor_slices(all_files[train_idx]).map(read_txt_tf, num_parallel_calls=AUTOTUNE).batch(256)
    text_vectorizer.adapt(train_text_ds)

    def make_text_ds_files(files, labels, training=False, batch=32, seed=123):
        """Build a batched dataset of (token_ids, label) from file paths.

        When ``training`` is True the samples are shuffled over the full
        dataset size and reshuffled on every iteration.
        """
        ds = tf.data.Dataset.from_tensor_slices((files, labels))
        if training:
            ds = ds.shuffle(len(files), seed=seed, reshuffle_each_iteration=True)
        ds = ds.map(lambda p,y: (text_vectorizer(read_txt_tf(p)), y), num_parallel_calls=AUTOTUNE)
        ds = ds.batch(batch).prefetch(AUTOTUNE)
        return ds

    BATCH = 32
    train_ds = make_text_ds_files(all_files[train_idx], all_labels[train_idx], training=True, batch=BATCH, seed=SEED)
    val_ds   = make_text_ds_files(all_files[val_idx],   all_labels[val_idx],   training=False, batch=BATCH)
    test_ds  = make_text_ds_files(all_files[test_idx],  all_labels[test_idx],  training=False, batch=BATCH)

else:
    # In CSV mode all_files holds the raw texts themselves.
    all_texts = all_files.astype(str)

    # The vocabulary is learned from the TRAIN texts only.
    train_text_ds = tf.data.Dataset.from_tensor_slices(all_texts[train_idx]).batch(256)
    text_vectorizer.adapt(train_text_ds)

    def make_text_ds_texts(texts, labels, training=False, batch=32, seed=123):
        """Build a batched dataset of (token_ids, label) from in-memory texts.

        When ``training`` is True the samples are shuffled over the full
        dataset size and reshuffled on every iteration.
        """
        ds = tf.data.Dataset.from_tensor_slices((texts, labels))
        if training:
            ds = ds.shuffle(len(texts), seed=seed, reshuffle_each_iteration=True)
        ds = ds.map(lambda t,y: (text_vectorizer(t), y), num_parallel_calls=AUTOTUNE)
        ds = ds.batch(batch).prefetch(AUTOTUNE)
        return ds

    BATCH = 32
    train_ds = make_text_ds_texts(all_texts[train_idx], all_labels[train_idx], training=True, batch=BATCH, seed=SEED)
    val_ds   = make_text_ds_texts(all_texts[val_idx],   all_labels[val_idx],   training=False, batch=BATCH)
    test_ds  = make_text_ds_texts(all_texts[test_idx],  all_labels[test_idx],  training=False, batch=BATCH)

print("Datasets listos (texto). BATCH =", BATCH)


In [None]:
# ==========================================================
# CELDA 3 — MODELO RNN (texto: binario o multiclase)
# ==========================================================
from tensorflow.keras import models, layers
import tensorflow as tf

def build_text_rnn(num_classes, vocab_size, embed_dim=128, rnn_units=128):
    """Build an (uncompiled) bidirectional-GRU text classifier.

    Architecture: masked Embedding -> Bidirectional(GRU) -> Dropout(0.3)
    -> Dense(128, relu) -> Dropout(0.3) -> output head. The head is a single
    sigmoid unit for exactly two classes, otherwise a softmax over
    ``num_classes``; both heads are created with ``dtype="float32"``.

    Args:
        num_classes: number of target classes.
        vocab_size: input dimension of the embedding layer.
        embed_dim: embedding width.
        rnn_units: GRU units per direction.

    Returns:
        ``(model, loss, metrics)`` ready to be passed to ``model.compile``.
        A top-5 accuracy metric is added when there are 10 or more classes.
    """
    n_out = int(num_classes)

    token_ids = layers.Input(shape=(None,), dtype=tf.int32)
    h = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(token_ids)
    h = layers.Bidirectional(layers.GRU(rnn_units, return_sequences=False))(h)
    h = layers.Dropout(0.30)(h)
    h = layers.Dense(128, activation="relu")(h)
    h = layers.Dropout(0.30)(h)

    metrics = ["accuracy"]
    if n_out == 2:
        loss = "binary_crossentropy"
        outputs = layers.Dense(1, activation="sigmoid", dtype="float32")(h)
    else:
        loss = "sparse_categorical_crossentropy"
        outputs = layers.Dense(num_classes, activation="softmax", dtype="float32")(h)
        if n_out >= 10:
            top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name="top5_acc")
            metrics.append(top5)

    return models.Model(token_ids, outputs), loss, metrics

# Size the embedding from the adapted vectorizer's vocabulary, then build and compile.
vocab_size = text_vectorizer.vocabulary_size()
model, loss_fn, metrics = build_text_rnn(num_classes=num_classes, vocab_size=vocab_size)

# Adam with an initial learning rate of 1e-3.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=loss_fn,
    metrics=metrics
)

model.summary()


In [None]:
# ==========================================================
# CELDA 4 — TRAIN (OOM-safe + class_weight + repeat + steps_per_epoch)
# ==========================================================
import gc
import tensorflow as tf
import numpy as np
import math

# When class weighting is on, train with a lower learning rate and more patience.
fit_class_weight = class_weight if USE_CLASS_WEIGHT else None
LR = 5e-4 if USE_CLASS_WEIGHT else 1e-3
PATIENCE = 8 if USE_CLASS_WEIGHT else 5

# Update the optimizer's learning rate in place; fall back to plain attribute
# assignment if the learning rate object has no usable .assign().
try:
    model.optimizer.learning_rate.assign(LR)
except Exception:
    model.optimizer.learning_rate = LR

print("LR usado:", LR)
print("PATIENCE usado:", PATIENCE)
print("MONITOR_METRIC:", MONITOR_METRIC)

# Early stopping and best-only checkpointing both watch MONITOR_METRIC.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor=MONITOR_METRIC,
        patience=PATIENCE,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        filepath="/content/weights.best.keras",
        monitor=MONITOR_METRIC,
        save_best_only=True
    )
]

def batch_candidates(b0):
    """Return the batch sizes to try, largest first, for OOM-safe training.

    Starts at ``b0`` and keeps halving while the result stays >= 8, e.g.
    ``32 -> [32, 16, 8]``. A starting size below 8 yields just ``[b0]``
    rather than an empty list, so the training loop always has at least one
    batch size to attempt.

    Args:
        b0: initial batch size (converted with ``int``).

    Returns:
        Strictly decreasing list of batch sizes beginning with ``int(b0)``.

    Raises:
        ValueError: if ``b0`` is smaller than 1.
    """
    smallest = 8  # never halve below this size
    first = int(b0)
    if first < 1:
        raise ValueError(f"batch size must be >= 1, got {b0!r}")

    cands = [first]
    while cands[-1] // 2 >= smallest:
        cands.append(cands[-1] // 2)
    return cands

# Batch sizes to attempt, largest first.
BATCH_TRIES = batch_candidates(BATCH)
print("BATCH tries:", BATCH_TRIES)

history = None
last_err = None

# Try each batch size in turn; only an out-of-memory error triggers a retry
# with the next (smaller) size. Any other exception propagates.
for b_try in BATCH_TRIES:
    try:
        # Re-batch the existing pipelines to the candidate size.
        train_ds_b = train_ds.unbatch().batch(b_try).prefetch(AUTOTUNE)
        val_ds_b   = val_ds.unbatch().batch(b_try).prefetch(AUTOTUNE)
        test_ds_b  = test_ds.unbatch().batch(b_try).prefetch(AUTOTUNE)

        # The training dataset is repeated below, so the epoch length must be explicit.
        steps_per_epoch   = int(math.ceil(len(train_idx) / b_try))
        validation_steps  = int(math.ceil(len(val_idx)   / b_try))

        print(f"\nEntrenando con BATCH={b_try} | monitor={MONITOR_METRIC} | class_weight={fit_class_weight is not None}")
        history = model.fit(
            train_ds_b.repeat(),
            steps_per_epoch=steps_per_epoch,
            validation_data=val_ds_b,
            validation_steps=validation_steps,
            epochs=30,
            callbacks=callbacks,
            class_weight=fit_class_weight
        )

        # Success: keep the batch size and re-batched datasets for the following cells.
        BATCH = b_try
        train_ds, val_ds, test_ds = train_ds_b, val_ds_b, test_ds_b
        last_err = None
        break

    except tf.errors.ResourceExhaustedError as e:
        last_err = e
        print(f"\n‚ö†Ô∏è OOM con BATCH={b_try}. Reintentando con batch menor...")
        # Drop references to the failed datasets before retrying.
        try:
            del train_ds_b, val_ds_b, test_ds_b
        except Exception:
            pass
        gc.collect()

# Every candidate ran out of memory: re-raise the last OOM error.
if history is None and last_err is not None:
    raise last_err

print("\n‚úÖ Entrenamiento finalizado. BATCH final usado:", BATCH)


In [None]:
# ==========================================================
# CELDA 4.5 — RESUMEN DE ENTRENAMIENTO (BEST epoch real por MONITOR_METRIC)
# ==========================================================
import numpy as np

# Report the best epoch (1-based) according to the monitored metric.
hist = history.history
mon = MONITOR_METRIC

if mon in hist:
    # Metrics whose name contains "acc" are maximized; anything else is minimized.
    if "acc" in mon:
        best_epoch = int(np.argmax(hist[mon]) + 1)
        best_value = float(np.max(hist[mon]))
        mode = "max"
    else:
        best_epoch = int(np.argmin(hist[mon]) + 1)
        best_value = float(np.min(hist[mon]))
        mode = "min"

    print("\nüìå RESUMEN DE ENTRENAMIENTO")
    print(f"Monitor usado      : {mon} ({mode})")
    print(f"Epoch seleccionado : {best_epoch}")
    print(f"Mejor {mon}        : {best_value:.4f}")
    print("‚úî restore_best_weights=True ‚Üí el modelo en memoria qued√≥ en ese epoch")
else:
    # The monitored key is missing from the history: list what is available instead.
    print("\n‚ö†Ô∏è No se pudo determinar el epoch final (monitor no encontrado).")
    print("Keys disponibles:", list(hist.keys()))


In [None]:
# ==========================================================
# CELDA 4.7 — DIAGNÓSTICO AUTOMÁTICO DEL ENTRENAMIENTO (v2, texto)
# ==========================================================
import numpy as np

def diagnose_training_v2_text(history, num_classes, monitor_metric="val_loss", patience=None):
    """Print a heuristic diagnosis of a finished training run.

    Reads ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss`` from
    ``history.history``, finds the best epoch according to ``monitor_metric``
    (maximized when its name contains "acc", minimized otherwise), prints the
    metrics at the best and last epochs, and ends with exactly one verdict:
    near-chance validation, underfitting, overfitting after the best epoch,
    generalizing well, or mixed.

    Args:
        history: object with a ``history`` dict of per-epoch metric lists
            (such as the value returned by ``model.fit``).
        num_classes: number of classes; ``1 / num_classes`` is used as the
            chance-level accuracy.
        monitor_metric: key in ``history.history`` used to pick the best epoch.
        patience: optional early-stopping patience, printed for reference only.

    Returns:
        None. Everything is reported through ``print``. When the monitored
        series is missing, empty or entirely NaN, a short notice is printed
        and no diagnosis is attempted.
    """
    h = history.history
    epochs_ran = len(next(iter(h.values()))) if len(h) else 0

    def arr(key):
        v = h.get(key, None)
        return None if v is None else np.array(v, dtype=float)

    acc   = arr("accuracy")
    vacc  = arr("val_accuracy")
    loss  = arr("loss")
    vloss = arr("val_loss")

    chance = 1.0 / float(num_classes) if num_classes else np.nan

    mon = arr(monitor_metric)
    if mon is None:
        print("‚ö†Ô∏è No existe monitor_metric en history:", monitor_metric)
        print("Keys:", list(h.keys()))
        return

    # nanargmin/nanargmax raise ValueError on empty or all-NaN input (e.g. a
    # run whose loss diverged to NaN from the first epoch), so bail out first.
    if mon.size == 0 or bool(np.all(np.isnan(mon))):
        print("No hay valores validos de monitor_metric para diagnosticar:", monitor_metric)
        return

    if "acc" in monitor_metric:
        best_i = int(np.nanargmax(mon))
        best_val = float(np.nanmax(mon))
        mode = "max"
    else:
        best_i = int(np.nanargmin(mon))
        best_val = float(np.nanmin(mon))
        mode = "min"

    def safe_get(a, i):
        # NaN for missing series or out-of-range indices (including negatives).
        return float(a[i]) if a is not None and 0 <= i < len(a) else np.nan

    last_i = epochs_ran - 1

    last_acc  = safe_get(acc, last_i)
    last_vacc = safe_get(vacc, last_i)
    last_loss = safe_get(loss, last_i)
    last_vloss= safe_get(vloss, last_i)

    best_acc  = safe_get(acc, best_i)
    best_vacc = safe_get(vacc, best_i)
    best_loss = safe_get(loss, best_i)
    best_vloss= safe_get(vloss, best_i)

    # Degradation after the best epoch: val_loss more than 15% worse,
    # or val_accuracy more than 0.07 lower.
    degrade_loss = (not np.isnan(best_vloss) and not np.isnan(last_vloss) and last_vloss > best_vloss * 1.15)
    degrade_acc  = (not np.isnan(best_vacc) and not np.isnan(last_vacc) and last_vacc < best_vacc - 0.07)

    gap_best = best_acc - best_vacc if (not np.isnan(best_acc) and not np.isnan(best_vacc)) else np.nan
    gap_last = last_acc - last_vacc if (not np.isnan(last_acc) and not np.isnan(last_vacc)) else np.nan

    def slope(a):
        # Linear trend over the last 5 epochs; needs at least 6 epochs of data.
        if a is None or len(a) < 6:
            return np.nan
        y = a[-5:]
        x = np.arange(len(y), dtype=float)
        return float(np.polyfit(x, y, 1)[0])

    s_acc  = slope(acc)
    s_vacc = slope(vacc)
    s_loss = slope(loss)
    s_vloss= slope(vloss)

    print("\n" + "="*60)
    print("DIAGN√ìSTICO ‚Äî RESUMEN (v2, TEXTO)")
    print("="*60)
    print(f"Clases: {num_classes} | azar‚âà {chance:.4f} | epochs corridos: {epochs_ran}")
    print(f"Monitor: {monitor_metric} ({mode}) | best_epoch={best_i+1} | best={best_val:.4f}")
    if patience is not None:
        print(f"Patience: {patience}")

    print("\n‚Äî En BEST epoch (lo que queda en memoria si restore_best_weights=True) ‚Äî")
    print(f"  acc={best_acc:.4f} | val_acc={best_vacc:.4f} | loss={best_loss:.4f} | val_loss={best_vloss:.4f}")
    print(f"  gap(train-val) en BEST: {gap_best:.4f}")

    print("\n‚Äî En √öLTIMO epoch entrenado (solo para ver tendencia) ‚Äî")
    print(f"  acc={last_acc:.4f} | val_acc={last_vacc:.4f} | loss={last_loss:.4f} | val_loss={last_vloss:.4f}")
    print(f"  gap(train-val) en √öLTIMO: {gap_last:.4f}")
    print(f"  slopes √∫ltimos 5: acc_tr={s_acc:.4f}, acc_val={s_vacc:.4f}, loss_tr={s_loss:.4f}, loss_val={s_vloss:.4f}")

    # Verdicts are checked in order; the first matching rule wins.
    near_chance = chance + 0.03

    if not np.isnan(best_vacc) and best_vacc <= near_chance:
        print("\n‚ö†Ô∏è VALIDACI√ìN CERCA DE AZAR (pipeline/labels/split sospechoso)")
        return

    if (not np.isnan(best_acc) and best_acc < 0.60) and (not np.isnan(best_vacc) and best_vacc < 0.60):
        print("\nüü° SUBAPRENDIZAJE (UNDERFITTING)")
        return

    if (degrade_loss or degrade_acc) and (not np.isnan(gap_best) and gap_best >= 0.12):
        print("\nüî¥ OVERFITTING (MEMORIZACI√ìN) DESPU√âS DEL BEST")
        return

    if (not np.isnan(best_vacc) and best_vacc > chance + 0.20) and (not np.isnan(gap_best) and gap_best <= 0.12):
        print("\n‚úÖ TODO BIEN / GENERALIZA RAZONABLEMENTE")
        return

    print("\nüü¢ MIXTO (pero NO roto): aprende, con margen de mejora")

# Diagnose the run that just finished, using the same monitor and patience as training.
diagnose_training_v2_text(history, num_classes=num_classes, monitor_metric=MONITOR_METRIC, patience=PATIENCE)


In [None]:
# ==========================================================
# CELDA 5 — EVALUACIÓN EN TEST
# ==========================================================
# Evaluate on the held-out test split. return_dict=True makes Keras pair each
# value with its metric name; zipping model.metrics_names with the returned
# list can mislabel or drop metrics when the two do not line up one-to-one.
test_metrics = model.evaluate(test_ds, verbose=0, return_dict=True)
test_out = list(test_metrics.values())  # kept as a plain list of values, as before
print("TEST metrics:")
for name, val in test_metrics.items():
    print(f"  {name:>12s}: {float(val):.4f}")


In [None]:
# ==========================================================
# CELDA 5.5 — REPORTE + MATRIZ DE CONFUSIÓN
# ==========================================================
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# Collect true and predicted class ids over the whole test split.
y_true, y_pred = [], []

for x, y in test_ds:
    p = model.predict(x, verbose=0)
    if int(num_classes) == 2:
        # Binary head: one sigmoid probability per sample, thresholded at 0.5.
        pred = (p.reshape(-1) >= 0.5).astype(int)
    else:
        pred = np.argmax(p, axis=1).astype(int)

    y_true.extend(y.numpy().tolist())
    y_pred.extend(pred.tolist())

# Pass the full list of class ids explicitly. Without it, sklearn infers the
# classes from y_true/y_pred, so a class absent from the test split shrinks
# the matrix and makes classification_report reject target_names.
label_ids = list(range(int(num_classes)))

cm = confusion_matrix(y_true, y_pred, labels=label_ids)
print("Matriz de confusi√≥n shape:", cm.shape)

print("\nClassification report:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=label_ids,
        target_names=[str(c) for c in classes],
        digits=4,
        zero_division=0,  # classes with no predictions/samples score 0 instead of warning
    )
)


In [None]:
# ==========================================================
# CELDA 6 — PERSISTENCIA EN resultados_texto.zip + DESCARGA AUTOMÁTICA
# Contiene:
#   - model.keras
#   - weights.best.keras (si existe)
#   - metadata.json
#   - vocab.txt
# ==========================================================
import os, zipfile, shutil, time, json

# Output archive and the staging folder whose contents are zipped into it.
OUT_ZIP = "/content/resultados_texto.zip"
BUNDLE_DIR = "/content/_bundle_resultados_texto"

# Always start from an empty staging folder.
if os.path.isdir(BUNDLE_DIR):
    shutil.rmtree(BUNDLE_DIR)
os.makedirs(BUNDLE_DIR, exist_ok=True)

# 1) Save the full model
MODEL_PATH = os.path.join(BUNDLE_DIR, "model.keras")
model.save(MODEL_PATH)

# 2) Copy the best checkpoint if it exists
WEIGHTS_SRC = "/content/weights.best.keras"
WEIGHTS_DST = os.path.join(BUNDLE_DIR, "weights.best.keras")
if os.path.isfile(WEIGHTS_SRC):
    shutil.copy2(WEIGHTS_SRC, WEIGHTS_DST)

# 3) Save the fixed vocabulary, one token per line
VOCAB_PATH = os.path.join(BUNDLE_DIR, "vocab.txt")
vocab = text_vectorizer.get_vocabulary()
with open(VOCAB_PATH, "w", encoding="utf-8") as f:
    for tok in vocab:
        # Newlines inside a token would break the one-token-per-line format.
        f.write(tok.replace("\n", " ") + "\n")

# 4) Metadata: run configuration, class names and text preprocessing settings
meta = {
    "created_at": time.strftime("%Y-%m-%d %H:%M:%S"),
    "zip_train_source": ZIP_DATOS_BASENAME,
    "mode": MODE,
    "seed": int(SEED),
    "train_frac": float(TRAIN_FRAC),
    "val_frac": float(VAL_FRAC),
    "test_frac": float(TEST_FRAC),
    "batch_final": int(BATCH),
    "monitor_metric": MONITOR_METRIC,
    "classes": list(classes),
    "num_classes": int(num_classes),
    "text": {
        "max_tokens": int(MAX_TOKENS),
        "seq_len": int(SEQ_LEN),
        "standardize": "lower_and_strip_punctuation",
        "split": "whitespace",
        "vocab_file": "vocab.txt",
        "vocab_size": int(len(vocab)),
    },
    "prediction": {
        "type": "binary" if int(num_classes) == 2 else "multiclass",
        "label_type": "int index -> classes[index]",
    }
}
with open(os.path.join(BUNDLE_DIR, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

# 5) Build the zip (replacing any previous one), with paths relative to BUNDLE_DIR
if os.path.isfile(OUT_ZIP):
    os.remove(OUT_ZIP)

with zipfile.ZipFile(OUT_ZIP, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for root, _, files in os.walk(BUNDLE_DIR):
        for fn in files:
            abs_path = os.path.join(root, fn)
            rel_path = os.path.relpath(abs_path, BUNDLE_DIR)
            z.write(abs_path, rel_path)

print("‚úÖ Creado:", OUT_ZIP)
# IPython shell magic: show the size of the resulting archive.
!ls -lah /content/resultados_texto.zip

# 6) Automatic download (Colab); outside Colab the failure is only printed
try:
    from google.colab import files
    files.download(OUT_ZIP)
except Exception as e:
    print("‚ö†Ô∏è No se pudo descargar autom√°ticamente:", e)
