<a href="https://colab.research.google.com/github/rieldata1/deep-clustering-rails/blob/main/Deep_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Configurar GPU y activar high-RAM**

In [6]:
# Montar Drive
from google import colab
colab.drive.mount('/content/drive')

# Configurar GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# Activar high-RAM
import psutil
ram_gb = psutil.virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Tue Sep  2 20:12:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      Off |   00000000:00:03.0 Off |                    0 |
| N/A   51C    P0             28W /   72W |    1041MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+----------

# **Deep_Clustering: carga NPZ + AE + K-sweep + DEC (+ IDEC)**
* Este notebook consumirá los NPZ generados por "Scalograms".
* Optimizaciones: GPU, AMP (mixed precision), prefetch, pin_memory.


## **0. Preeliminares**

In [7]:
# -----------------------------
# PARÁMETROS (ajustar a voluntad)
# -----------------------------
# Rutas (apunta al índice creado por "Scalograms.ipynb")
BASE_DIR           = "/content/drive/MyDrive/Deep_Cluster"
INDEX_CSV          = f"{BASE_DIR}/meta/scalos_train_index.csv"     # índice principal
EXPERIMENTS_DIR    = f"{BASE_DIR}/experiments"                     # carpeta para guardar resultados (modelos, history)

# Carga de datos
PRELOAD_DATA       = True      # True: intenta precargar todos los NPZ en RAM si caben; False: lee por streaming
PRELOAD_MAX_GB     = 4.0       # umbral aprox para decidir si precarga (ajústalo según tu Colab Pro)
IMG_NORMALIZE_AGAIN= False     # renormalizar a [0,1] al vuelo (no debería hacer falta si Scalograms ya normalizó)

# Sampler (diagnóstico): balancear por etiqueta simulada (si existe en NPZ)
BALANCE_BY_LABEL   = False

# Batching / DataLoader
BATCH_AE           = 64        # batch para preentrenamiento del Autoencoder
BATCH_EMB          = 128       # batch para extraer embeddings
BATCH_DEC          = 64        # batch para DEC/IDEC
NUM_WORKERS        = 2         # sube si tienes CPU libre; si da problemas, pon 0
PIN_MEMORY         = True
PERSISTENT_WORKERS = True

# Modelo
IMG_SIZE           = (256, 256) # se verificará contra el CSV; aquí sirve como referencia
LATENT_DIM         = 128        # tamaño del embedding z
BACKBONE           = "base"     # 'small' | 'base' | 'large' (tamaño de la CNN)
DROPOUT_P          = 0.0

# Entrenamiento AE (preentrenamiento)
AE_EPOCHS          = 25
AE_LR              = 1e-3
AE_WD              = 1e-5
AE_LOSS            = "l1"       # 'l1' o 'mse'
USE_AMP            = True       # usar mixed precision en GPU (acelera / ahorra RAM)

# Barrido de K (auto-K)
K_MIN              = 2
K_MAX              = 10
K_FIXED            = None       # si quieres saltar el barrido, pon un entero (p. ej., 4)
K_RANDOM_STATE     = 2025

# DEC (clustering profundo)
DEC_EPOCHS         = 40
DEC_LR             = 1e-4
DEC_WD             = 0.0
DEC_UPDATE_INT     = 1          # actualizar distribución objetivo P cada N épocas
DEC_TOL            = 1e-3       # criterio de estabilidad de asignaciones

# IDEC (opcional: DEC + pérdida de reconstrucción)
RUN_IDEC           = False
IDEC_EPOCHS        = 40
IDEC_LR            = 1e-4
IDEC_WD            = 0.0
IDEC_LAMBDA_REC    = 1e-2
IDEC_UPDATE_INT    = 1
IDEC_TOL           = 1e-3

# Visualización / Evaluación
RUN_TSNE_2D        = True
TSNE_PERPLEXITY    = 30
SEED               = 153

# Guardado de artefactos (pesos, centros, history, config)
SAVE_ARTIFACTS     = True


# -----------------------------
# IMPORTS (solo librerías estándar de Colab)
# -----------------------------
import os, time, csv, math, gc, json, random
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import TSNE

from datetime import datetime
from collections import defaultdict, Counter


# -----------------------------
# Runtime / Device (GPU + High-RAM)
# -----------------------------
def seed_everything(seed=SEED):
    """Fija semillas y activa pequeñas optimizaciones del backend."""
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Para CNNs suele acelerar (usa kernel heurístico por tamaño)
    torch.backends.cudnn.benchmark = True
    # Precisión alta para matmul en GPUs modernas
    if hasattr(torch, "set_float32_matmul_precision"):
        torch.set_float32_matmul_precision("high")

# Asegúrate de seleccionar GPU y High-RAM en: Runtime → Change runtime type
seed_everything(SEED)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if DEVICE.type == "cuda":
    print("✅ GPU detectada:", torch.cuda.get_device_name(0))
else:
    print("⚠️  No se detectó GPU: ejecutará en CPU (más lento).")

# Crear carpetas de experimento
os.makedirs(EXPERIMENTS_DIR, exist_ok=True)
RUN_DIR = os.path.join(EXPERIMENTS_DIR, datetime.now().strftime("run_%Y%m%d_%H%M%S"))
os.makedirs(RUN_DIR, exist_ok=True)
print("Carpeta de ejecución:", RUN_DIR)


# -----------------------------
# Utils simples
# -----------------------------
def sizeof_gb(n_items, h, w, bytes_per=2):
    """Estimación de tamaño (GB) para n_items imágenes HxW en float16 (2 bytes por valor)."""
    return (n_items * h * w * bytes_per) / (1024**3)

def count_params(m):
    """Cuenta parámetros entrenables de un modelo."""
    return sum(p.numel() for p in m.parameters() if p.requires_grad)


✅ GPU detectada: NVIDIA L4
Carpeta de ejecución: /content/drive/MyDrive/Deep_Cluster/experiments/run_20250902_201248


## **1) DATASET**

In [8]:
# ============================================================
# 1) DATASET (100% no supervisado): descubre .npz y carga imágenes
#     - No lee labels
#     - No usa index.csv
#     - Precarga opcional si cabe en RAM
# ============================================================

import glob
from collections import defaultdict
from torch.utils.data import Subset, DataLoader

# --------- Descubrir archivos .npz del directorio de datos ----------
# Si no definiste DATA_DIR en la Celda 0, lo inferimos desde BASE_DIR
try:
    DATA_DIR
except NameError:
    DATA_DIR = os.path.join(BASE_DIR, "data")

NPZ_GLOB = os.path.join(DATA_DIR, "*.npz")
npz_files = sorted(glob.glob(NPZ_GLOB))
if not npz_files:
    raise FileNotFoundError(f"No se encontraron .npz en: {NPZ_GLOB}\n"
                            f"Verifica que ya corriste 'Scalograms' y que BASE_DIR/ DATA_DIR son correctos.")

print(f"[SCAN] Archivos .npz encontrados: {len(npz_files)}")
for p in npz_files[:3]:
    print("   •", p)
if len(npz_files) > 3:
    print("   • ...")


# --------- Manifest: conteo de muestras por .npz y tamaño HxW ----------
class NPZManifest:
    """Escanea los .npz y guarda:
       - files: lista de rutas
       - counts: # de escalogramas por archivo
       - cumcounts: prefix sum para mapear índice global → (archivo, índice local)
       - img_h, img_w: tamaño de los escalogramas (se valida consistencia)
    """
    def __init__(self, files):
        self.files = list(files)
        self.counts = []
        self.cumcounts = [0]
        self.img_h = None
        self.img_w = None

        for p in self.files:
            d = np.load(p, allow_pickle=True)
            X = d["X"]  # (B, H, W) en float16
            B, H, W = X.shape
            self.counts.append(B)
            self.cumcounts.append(self.cumcounts[-1] + B)
            if self.img_h is None:
                self.img_h, self.img_w = int(H), int(W)
            else:
                if (self.img_h, self.img_w) != (int(H), int(W)):
                    raise ValueError(f"Tamaños inconsistentes: {p} tiene {(H, W)} pero ya teníamos {(self.img_h, self.img_w)}")

        self.total = self.cumcounts[-1]

    def __len__(self):
        return self.total

    def locate(self, global_idx):
        """Devuelve (file_path, local_idx) para un índice global."""
        # búsqueda binaria simple sobre cumcounts
        lo, hi = 0, len(self.cumcounts)-1
        while lo < hi:
            mid = (lo + hi) // 2
            if self.cumcounts[mid+1] <= global_idx:
                lo = mid + 1
            elif self.cumcounts[mid] > global_idx:
                hi = mid - 1
            else:
                lo = mid
                break
        file_idx = lo
        local_idx = global_idx - self.cumcounts[file_idx]
        return self.files[file_idx], int(local_idx)

manifest = NPZManifest(npz_files)
print(f"[MANIFEST] Total ventanas: {len(manifest)} | HxW={manifest.img_h}x{manifest.img_w}")


# --------- Estimar si conviene precargar a RAM ----------
est_gb = sizeof_gb(len(manifest), manifest.img_h, manifest.img_w, bytes_per=2)  # float16 ~ 2 bytes
do_preload = PRELOAD_DATA and (est_gb <= PRELOAD_MAX_GB + 1e-9)
print(f"[MEM] Estimado ~{est_gb:.2f} GB | Preload={do_preload} (umbral={PRELOAD_MAX_GB} GB)")

# --------- Dataset no supervisado ----------
class ScalogramUnsupervisedDS(Dataset):
    """
    Devuelve (x, '_'):
      - x: tensor float32 (1, H, W) en [0,1]
      - '_' es un placeholder (no usamos etiquetas)
    """
    def __init__(self, manifest: NPZManifest, preload: bool = False, normalize_again: bool = IMG_NORMALIZE_AGAIN):
        self.manifest = manifest
        self.normalize_again = normalize_again
        self.preload = preload
        self.cache = {}  # path -> array X (B,H,W) en float16
        if self.preload:
            print("[LOAD] Precargando todos los .npz en RAM…")
            for p in tqdm(self.manifest.files, desc="Precarga"):
                X = np.load(p, allow_pickle=True)["X"]  # (B,H,W) float16
                self.cache[p] = X  # guardamos como float16 para ahorrar memoria

    def __len__(self):
        return len(self.manifest)

    def __getitem__(self, i):
        p, j = self.manifest.locate(i)
        if p in self.cache:
            Xj = self.cache[p][j]
        else:
            d = np.load(p, allow_pickle=True)
            Xj = d["X"][j]  # (H,W) float16

        x = torch.from_numpy(np.asarray(Xj, dtype=np.float32)).unsqueeze(0)  # (1,H,W) float32

        if self.normalize_again:
            mn, mx = x.min(), x.max()
            x = (x - mn) / (mx - mn + 1e-12)
            x.clamp_(0.0, 1.0)

        return x, "_"   # placeholder para mantener interfaz


# --------- Instanciar dataset y splits (80/20) ----------
ds_all = ScalogramUnsupervisedDS(manifest, preload=do_preload, normalize_again=IMG_NORMALIZE_AGAIN)

rng = np.random.default_rng(SEED)
perm = rng.permutation(len(ds_all))
n_tr = int(0.8 * len(ds_all))
idx_tr, idx_va = perm[:n_tr], perm[n_tr:]

ds_train = Subset(ds_all, idx_tr)
ds_val   = Subset(ds_all, idx_va)

# --------- DataLoaders ----------
dl_train = DataLoader(
    ds_train, batch_size=BATCH_AE, shuffle=True,
    num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
    persistent_workers=(NUM_WORKERS>0 and PERSISTENT_WORKERS),
    drop_last=False
)

dl_val = DataLoader(
    ds_val, batch_size=BATCH_AE, shuffle=False,
    num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
    persistent_workers=(NUM_WORKERS>0 and PERSISTENT_WORKERS),
    drop_last=False
)

def make_all_loader(ds, bs=BATCH_EMB):
    return DataLoader(
        ds, batch_size=bs, shuffle=False,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY,
        persistent_workers=(NUM_WORKERS>0 and PERSISTENT_WORKERS),
        drop_last=False
    )

dl_all = make_all_loader(ds_all, bs=BATCH_EMB)

print(f"[DATA] Train={len(ds_train)} | Val={len(ds_val)} | Total={len(ds_all)} | HxW={manifest.img_h}x{manifest.img_w}")

# --------- Rejilla rápida de muestras (opcional) ----------
def show_grid_samples(dataset, n=12, rows=3, cols=4, title="Muestras del dataset (sin etiquetas)"):
    n = min(n, len(dataset), rows*cols)
    fig, axes = plt.subplots(rows, cols, figsize=(1.8*cols, 1.6*rows), sharex=True, sharey=True)
    axes = np.array(axes).reshape(-1)
    idxs = rng.choice(len(dataset), size=n, replace=False)
    for k, i in enumerate(idxs):
        x, _ = dataset[i]
        axes[k].imshow(x.squeeze(0).numpy(), origin="lower", aspect="auto", cmap="turbo")
        axes[k].set_xticks([]); axes[k].set_yticks([])
    for a in axes[n:]: a.axis("off")
    fig.suptitle(title); plt.tight_layout(); plt.show()

# Descomenta si quieres visualizar una muestra
# show_grid_samples(ds_all, n=12, rows=3, cols=4)


[SCAN] Archivos .npz encontrados: 29
   • /content/drive/MyDrive/Deep_Cluster/data/scalos_train_000.npz
   • /content/drive/MyDrive/Deep_Cluster/data/scalos_train_001.npz
   • /content/drive/MyDrive/Deep_Cluster/data/scalos_train_002.npz
   • ...
[MANIFEST] Total ventanas: 1800 | HxW=256x256
[MEM] Estimado ~0.22 GB | Preload=True (umbral=4.0 GB)
[LOAD] Precargando todos los .npz en RAM…


NameError: name 'tqdm' is not defined