In [None]:
# ============================================================
# NOTEBOOK / SCRIPT: INFERENCIA TEXTO DESDE 2 ZIPs
#   - resultados_texto.zip  (bundle entrenado)
#   - textos_nuevos.zip     (datos nuevos)
#
# Soporta formatos de textos_nuevos.zip:
#   (A) Carpetas por clase con .txt (las clases pueden existir o no; aquí se ignoran)
#       /
#         cualquier_carpeta/*.txt
#         ...
#       -> toma TODOS los .txt recursivamente
#
#   (B) CSV con 1 o 2 columnas de texto (autodetecta), sin label
#       /
#         *.csv
#       -> concatena 2 columnas si aplica
#
# Salida:
#   - /content/predicciones_texto.csv
# ============================================================


# =========================
# CELDA 0 — CONFIG + descomprimir resultados_texto.zip y textos_nuevos.zip
# =========================
import os, glob, zipfile, shutil, json, time
import numpy as np
import pandas as pd
import tensorflow as tf

RESULTS_ZIP = "/content/resultados_texto.zip"
NEW_ZIP     = "/content/textos_nuevos.zip"

assert os.path.isfile(RESULTS_ZIP), f"No existe: {RESULTS_ZIP}"
assert os.path.isfile(NEW_ZIP), f"No existe: {NEW_ZIP}"

RESULTS_DIR = "/content/_infer_resultados_texto"
NEW_DIR     = "/content/_infer_textos_nuevos"

for d in [RESULTS_DIR, NEW_DIR]:
    if os.path.isdir(d):
        shutil.rmtree(d)
    os.makedirs(d, exist_ok=True)

with zipfile.ZipFile(RESULTS_ZIP, "r") as z:
    z.extractall(RESULTS_DIR)

with zipfile.ZipFile(NEW_ZIP, "r") as z:
    z.extractall(NEW_DIR)

print("✅ resultados extraído en:", RESULTS_DIR)
print("✅ nuevos textos extraído en:", NEW_DIR)
!ls -lah "{RESULTS_DIR}"
!ls -lah "{NEW_DIR}"


# =========================
# CELDA 1 — Cargar metadata + vocab + modelo (preprocesamiento aprendido)
# =========================
# Files expected inside the extracted training bundle.
META_PATH = os.path.join(RESULTS_DIR, "metadata.json")
MODEL_PATH = os.path.join(RESULTS_DIR, "model.keras")
VOCAB_PATH = os.path.join(RESULTS_DIR, "vocab.txt")

# Every bundle file is mandatory; check them in the order they are used.
for required_path, missing_msg in (
    (META_PATH, "metadata.json no encontrado en resultados_texto.zip"),
    (MODEL_PATH, "model.keras no encontrado en resultados_texto.zip"),
    (VOCAB_PATH, "vocab.txt no encontrado en resultados_texto.zip"),
):
    assert os.path.isfile(required_path), missing_msg

with open(META_PATH, "r", encoding="utf-8") as meta_file:
    meta = json.load(meta_file)

# Training-time settings needed to rebuild the preprocessing.
classes = meta["classes"]
num_classes = int(meta["num_classes"])
text_meta = meta["text"]
SEQ_LEN = int(text_meta["seq_len"])
MAX_TOKENS = int(text_meta["max_tokens"])

print("MODE entrenado:", meta.get("mode"))
print("Num clases:", num_classes)
print("SEQ_LEN:", SEQ_LEN, "| MAX_TOKENS:", MAX_TOKENS)
truncation_marker = "..." if len(classes) > 20 else ""
print("Clases (primeras 20):", classes[:20], truncation_marker)

# ---- rebuild the TextVectorization layer from the saved vocabulary ----
from tensorflow.keras import layers

# One vocabulary token per line; only the trailing newline is removed.
with open(VOCAB_PATH, "r", encoding="utf-8") as vocab_file:
    vocab = [token_line.rstrip("\n") for token_line in vocab_file]

# NOTE(review): standardize/split are hard-coded here and presumably mirror
# the training configuration -- confirm against the training notebook.
vectorizer_config = dict(
    max_tokens=MAX_TOKENS,
    output_mode="int",
    output_sequence_length=SEQ_LEN,
    standardize="lower_and_strip_punctuation",
    split="whitespace",
)
text_vectorizer = layers.TextVectorization(**vectorizer_config)
text_vectorizer.set_vocabulary(vocab)

# ---- load the trained model ----
model = tf.keras.models.load_model(MODEL_PATH)
print("✅ Modelo cargado:", MODEL_PATH)


# =========================
# CELDA 2 — Detectar formato de textos_nuevos.zip y listar entradas
# =========================
# File extensions recognised as inputs; compared against lower-cased paths.
TEXT_EXTS = (".txt",)
CSV_EXTS  = (".csv",)

def walk_files(root):
    """Yield the path of every file found under ``root``, recursively."""
    for dirpath, _, filenames in os.walk(root):
        for fn in filenames:
            yield os.path.join(dirpath, fn)

def _is_archive_metadata(root, path):
    """Return True for macOS zip artifacts that are not user data.

    Archives created with macOS Finder carry a ``__MACOSX/`` tree of
    ``._<name>`` AppleDouble files. Their names keep the original extension
    (e.g. ``._doc.txt``), so without this filter they would be read as texts.
    Only the part of the path below ``root`` is inspected.
    """
    rel_parts = os.path.relpath(path, root).split(os.sep)
    return "__MACOSX" in rel_parts or rel_parts[-1].startswith("._")

def _list_files_with_exts(root, exts):
    """Return the sorted paths under ``root`` ending in one of ``exts``.

    The extension match is case-insensitive; archive metadata is skipped.
    """
    return sorted(
        p for p in walk_files(root)
        if p.lower().endswith(exts) and not _is_archive_metadata(root, p)
    )

def list_txt_files(root):
    """Return the sorted paths of all ``.txt`` files under ``root``."""
    return _list_files_with_exts(root, TEXT_EXTS)

def list_csv_files(root):
    """Return the sorted paths of all ``.csv`` files under ``root``."""
    return _list_files_with_exts(root, CSV_EXTS)

def detect_text_columns_unlabeled(df):
    """
    Auto-detect one or two free-text columns for inference (no label column).

    Candidates are the columns with a string-like dtype, ranked by the mean
    length of their non-null values (each length capped at 10000 characters).
    The longest column is always used; the runner-up is added when its score
    is at least half of the best score.

    Args:
        df: DataFrame read from the input CSV.

    Returns:
        ``[col]`` or ``[col1, col2]`` (best column first), or ``None`` when no
        column looks like text (no string-like column, or mean length < 2).
    """
    min_score = 2          # below this mean length a column is not "text"
    second_col_ratio = 0.5  # runner-up must reach this fraction of the best

    df = df.dropna(axis=1, how="all")
    if df.shape[1] < 1:
        return None

    def is_text_dtype(dtype):
        # The first two checks are the historical ones; is_string_dtype also
        # recognises pandas' newer default string dtype, whose name is "str"
        # and therefore matches neither of them.
        return (
            dtype == "object"
            or str(dtype).startswith("string")
            or pd.api.types.is_string_dtype(dtype)
        )

    def text_score(col):
        values = df[col].dropna().astype(str)
        if len(values) == 0:
            return -1.0
        lengths = values.str.len().clip(0, 10000)
        return float(lengths.mean())

    text_cands = [c for c in df.columns if is_text_dtype(df[c].dtype)]
    if not text_cands:
        return None

    # Score every candidate exactly once, then rank (stable for ties).
    scores = {c: text_score(c) for c in text_cands}
    ranked = sorted(text_cands, key=scores.__getitem__, reverse=True)

    best = ranked[0]
    if scores[best] < min_score:
        return None

    # Use a second column only when it carries a comparable amount of text.
    if len(ranked) >= 2:
        second = ranked[1]
        if scores[second] >= second_col_ratio * scores[best] and scores[second] >= min_score:
            return [best, second]

    return [best]

txt_files = list_txt_files(NEW_DIR)
csv_files = list_csv_files(NEW_DIR)

# Detection result: NEW_MODE is "txt_files" or "csv_unlabeled"; the CSV_*
# values are only set in CSV mode.
NEW_MODE = None
CSV_PATH = None
CSV_TEXT_COLS = None

if txt_files:
    # .txt files take precedence: when any exist, CSVs are not inspected.
    NEW_MODE = "txt_files"
    print("Detectado: TXT (archivos .txt). Total:", len(txt_files))
    if csv_files:
        print(f"⚠️ Se ignoran {len(csv_files)} archivo(s) .csv porque el ZIP contiene .txt")
else:
    # Use the first CSV (in sorted order) that has at least one text column.
    for c in csv_files:
        try:
            df0 = pd.read_csv(c)
            cols = detect_text_columns_unlabeled(df0)
        except Exception as e:
            # Best-effort scan: keep trying other CSVs, but say why this one
            # was skipped so a parse/encoding problem is not hidden.
            print(f"⚠️ CSV omitido (no se pudo leer): {c} -> {type(e).__name__}: {e}")
            continue
        if cols is None:
            print(f"⚠️ CSV omitido (sin columnas de texto): {c}")
            continue
        NEW_MODE = "csv_unlabeled"
        CSV_PATH = c
        CSV_TEXT_COLS = cols
        break

if NEW_MODE is None:
    raise ValueError(
        "No detecté textos en el ZIP nuevo.\n"
        "- Debe contener .txt (cualquier carpeta)\n"
        "- o un .csv con 1 o 2 columnas de texto.\n"
        f"NEW_DIR={NEW_DIR}"
    )

print("NEW_MODE:", NEW_MODE)
if NEW_MODE == "csv_unlabeled":
    print("CSV_PATH:", CSV_PATH)
    print("CSV_TEXT_COLS:", CSV_TEXT_COLS)


# =========================
# CELDA 3 — Construir Dataset tf.data y predecir
# =========================
# Inference batch size: read from the bundle metadata key "batch_final",
# falling back to 32 when the key is absent.
BATCH = int(meta.get("batch_final", 32))

def read_txt_tf(path):
    """Read the file at ``path`` and return its contents as a UTF-8 string tensor.

    The bytes are decoded to code points with ``errors="replace"`` and encoded
    back to UTF-8, so invalid byte sequences are substituted rather than
    raising an error.
    """
    raw_bytes = tf.io.read_file(path)
    code_points = tf.strings.unicode_decode(raw_bytes, "UTF-8", errors="replace")
    return tf.strings.unicode_encode(code_points, "UTF-8")

def make_ds_from_txt_files(paths, batch):
    """Build a batched ``tf.data`` pipeline of vectorized texts from file paths.

    Each path is read with ``read_txt_tf`` and passed through the module-level
    ``text_vectorizer``; the results are grouped into batches of ``batch``
    elements and prefetched.

    Args:
        paths: list of text-file paths.
        batch: batch size.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    def load_and_vectorize(file_path):
        return text_vectorizer(read_txt_tf(file_path))

    path_ds = tf.data.Dataset.from_tensor_slices(tf.constant(paths))
    token_ds = path_ds.map(load_and_vectorize, num_parallel_calls=tf.data.AUTOTUNE)
    return token_ds.batch(batch).prefetch(tf.data.AUTOTUNE)

def make_ds_from_texts(texts, batch):
    """Build a batched ``tf.data`` pipeline of vectorized texts from in-memory strings.

    Every item of ``texts`` is converted with ``str()`` and passed through the
    module-level ``text_vectorizer``; the results are grouped into batches of
    ``batch`` elements and prefetched.

    Args:
        texts: iterable of texts (any objects; ``str()`` is applied to each).
        batch: batch size.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    def vectorize(text):
        return text_vectorizer(text)

    as_strings = [str(item) for item in texts]
    text_ds = tf.data.Dataset.from_tensor_slices(tf.constant(as_strings))
    token_ds = text_ds.map(vectorize, num_parallel_calls=tf.data.AUTOTUNE)
    return token_ds.batch(batch).prefetch(tf.data.AUTOTUNE)

def _probs_to_predictions(probs, num_classes):
    """Turn raw model outputs into predicted class indices and confidences.

    A two-class model may emit a single sigmoid unit, shape (N, 1) or (N,),
    or one unit per class, shape (N, 2). The sigmoid path is taken only when
    there really is a single output unit; otherwise the arg-max over the class
    axis is used. Flattening an (N, 2) output would yield 2N values that no
    longer line up with the N inputs.

    Args:
        probs: array-like of model outputs, one row per input.
        num_classes: number of classes the model was trained on.

    Returns:
        Tuple ``(pred_idx, conf)`` of 1-D NumPy arrays, one entry per input.
    """
    probs = np.asarray(probs)
    if probs.ndim == 1:
        probs = probs[:, np.newaxis]

    if int(num_classes) == 2 and probs.shape[1] == 1:
        # Single sigmoid unit: p is the probability of class index 1.
        p = probs[:, 0]
        pred_idx = (p >= 0.5).astype(int)
        conf = np.where(pred_idx == 1, p, 1.0 - p)
    else:
        pred_idx = np.argmax(probs, axis=1).astype(int)
        conf = np.max(probs, axis=1)
    return pred_idx, conf

# --- build the model inputs and the ids used in the output file ---
items_id = []
ds = None

if NEW_MODE == "txt_files":
    items_id = txt_files[:]  # the real file path identifies each prediction
    ds = make_ds_from_txt_files(txt_files, batch=BATCH)

else:
    df = pd.read_csv(CSV_PATH)
    # Rows missing any of the selected text columns are not predicted.
    df = df.dropna(subset=CSV_TEXT_COLS)
    if df.empty:
        # Fail with a clear message instead of an obscure TensorFlow error
        # caused by building a dataset from an empty list.
        raise ValueError(f"El CSV no tiene filas con texto utilizable: {CSV_PATH}")

    if len(CSV_TEXT_COLS) == 1:
        texts = df[CSV_TEXT_COLS[0]].astype(str).values
    else:
        # Two text columns are joined with a single space.
        texts = (df[CSV_TEXT_COLS[0]].astype(str) + " " + df[CSV_TEXT_COLS[1]].astype(str)).values

    # Output id: the row's index in the CSV as originally read.
    items_id = df.index.astype(int).tolist()
    ds = make_ds_from_texts(texts, batch=BATCH)

# --- prediction ---
probs = model.predict(ds, verbose=0)
pred_idx, conf = _probs_to_predictions(probs, num_classes)

# Every input must have exactly one prediction; otherwise the ids written to
# the output file would be paired with the wrong rows.
if len(pred_idx) != len(items_id):
    raise RuntimeError(
        f"Número de predicciones ({len(pred_idx)}) distinto al de entradas ({len(items_id)})."
    )

pred_class = [classes[i] for i in pred_idx]

print("OK ✅ Predicciones:", len(pred_class))


# =========================
# CELDA 4 — Guardar CSV de salida
# =========================
OUT_CSV = "/content/predicciones_texto.csv"

rows = []
for rid, i, c, cf in zip(items_id, pred_idx.tolist(), pred_class, conf.tolist()):
    rows.append({
        "id": rid,                 # filepath (txt) o index (csv)
        "pred_idx": int(i),
        "pred_class": c,
        "confidence": float(cf)
    })

out_df = pd.DataFrame(rows)
out_df.to_csv(OUT_CSV, index=False, encoding="utf-8")
print("✅ Guardado:", OUT_CSV)
!head -n 20 "{OUT_CSV}"


# =========================
# CELDA 5 — Descargar CSV (Colab)
# =========================
# Ask Colab to download the predictions CSV. Both the import and the download
# call are inside the try, so outside Colab (or on any download failure) the
# error is only reported and the script does not abort.
try:
    from google.colab import files
    files.download(OUT_CSV)
except Exception as e:
    print("⚠️ No se pudo descargar automáticamente:", e)
