Cargamos datos

In [66]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader

# Silence every warning for the rest of the session. Note that this also hides
# numerical RuntimeWarnings raised by NumPy/pandas, not only deprecation noise.
warnings.filterwarnings("ignore")

In [67]:
# 1. Load the tab-separated datamart into a DataFrame.
datamart_path = '../../datamarts/final_dataframe.tsv'
df = pd.read_csv(datamart_path, sep='\t')

In [68]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,Timestamp,NUMERO,CCAA,CAPROV,TAMAMU,DENSIDAD,SUPERF,AGUACALI,CALEF,ZONARES,...,EDUCACIÓN_NO_FORMAL_104,RESTAURACIÓN_111,ALOJAMIENTO_112,CUIDADOS_PERSONALES_121,EFECTOS_PERSONALES_123,PROTECCIÓN_SOCIAL_124,SEGUROS_125,SERVICIOS_FINANCIEROS_126,OTROS_SERVICIOS_127,REMESAS_128
0,2016-01-01T00:00:00,1,7,0,1,1,96,1,1,3,...,,668.274699,,98.935487,0.300641,,32.562055,,1.535195,
1,2016-01-01T00:00:00,47,7,0,1,1,100,1,1,1,...,,,,,,,27.276033,,,
2,2016-01-01T00:00:00,53,7,1,4,3,80,1,1,5,...,,111.840309,,,,,47.234176,,,
3,2016-01-01T00:00:00,67,7,1,4,2,85,1,1,5,...,,4.016997,,,3.614237,,,,,
4,2016-01-01T00:00:00,68,7,0,1,1,90,1,1,5,...,,319.334544,159.860498,73.549838,,,33.530006,,66.724382,


In [69]:
# 2. Input and output columns
# Targets: every column whose name ends with one of the suffixes "_11" ... "_128".
output_suffixes = tuple(f"_{i}" for i in range(11, 129))
output_cols = [c for c in df.columns if c.endswith(output_suffixes)]

# Features: everything that is neither a target nor one of the four columns
# listed below. "Unnamed: N" columns are dropped as well: that is the name
# pandas gives to a header-less column (typically a row index written to the
# file), and keeping it would feed a row counter to the model as a feature.
excluded_cols = set(output_cols) | {"GASTOT_MENSUAL", "NUMERO", "CCAA", "Timestamp"}
input_cols = [
    c for c in df.columns
    if c not in excluded_cols and not c.startswith("Unnamed")
]

# 3. Extract the matrices
X = df[input_cols].values
y = df[output_cols].values


Normalizamos

In [70]:
# 4. Standardize the inputs column-wise.
# NOTE(review): both the input scaler and the target statistics below are
# fitted on the full dataset, i.e. before the train/validation split that
# happens afterwards, so validation rows contribute to the scaling statistics.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# 5. Standardize the targets column-wise (mean and std computed ignoring NaNs).
y_means = np.nanmean(y, axis=0)
y_stds  = np.nanstd(y,  axis=0)
# A target column whose observed values are all identical has std == 0; dividing
# by it would turn those observations into NaN. Use a unit scale for such
# columns instead, mirroring what StandardScaler does for the inputs.
y_stds = np.where(y_stds == 0, 1.0, y_stds)
y_scaled = (y - y_means) / y_stds

In [71]:
# 6. Train/validation split: 20% of the rows are held out, with a fixed seed.
split_arrays = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=42,
)
X_tr, X_val, y_tr, y_val = split_arrays

In [72]:
# 7. Tensors and DataLoaders
batch_size = 64


def _as_float_dataset(features, targets):
    """Wrap two NumPy arrays as a TensorDataset of float tensors."""
    return TensorDataset(torch.from_numpy(features).float(),
                         torch.from_numpy(targets).float())


train_ds = _as_float_dataset(X_tr, y_tr)
val_ds = _as_float_dataset(X_val, y_val)

# Only the training batches are shuffled; validation keeps the dataset order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, batch_size=batch_size)

In [73]:
# 8. Model dimensions, read from the training arrays
input_dim = X_tr.shape[1]   # number of input features (author's expectation: 28)
output_dim = y_tr.shape[1]  # number of target columns (author's expectation: 45)

# Display both values so they can be checked against the expected sizes.
input_dim, output_dim

(29, 45)

### Digital Twin

In [74]:
class DigitalTwinModel(nn.Module):
    """Fully connected regressor mapping `in_dim` features to `out_dim` outputs.

    The network is three Linear -> BatchNorm1d -> ReLU -> Dropout(0.3) stages of
    widths 256, 512 and 256, followed by a Linear(256, 128) -> ReLU -> Linear
    head. All layers live in a single `nn.Sequential` stored as `self.net`.
    """

    def __init__(self, in_dim, out_dim):
        """Build the layer stack.

        Args:
            in_dim: size of the last dimension of the input.
            out_dim: size of the last dimension of the output.
        """
        super().__init__()
        layers = []
        width_in = in_dim
        # Regularized hidden stages; layers are created in forward order.
        for width_out in (256, 512, 256):
            layers += [
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
            width_in = width_out
        # Output head, without batch norm or dropout.
        layers += [
            nn.Linear(width_in, 128),
            nn.ReLU(),
            nn.Linear(128, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the sequential stack to `x` and return its output."""
        return self.net(x)

In [75]:
# Seed PyTorch's global RNG so that weight initialization (and batch shuffling,
# which draws from the same default generator) is repeatable across runs.
# 42 matches the random_state used for the train/validation split.
torch.manual_seed(42)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DigitalTwinModel(input_dim, output_dim).to(device)

In [76]:
# 9. Optimizer (Adam with weight decay); the masked MSE loss is defined below.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

def masked_mse(preds, targets):
    """Mean squared error over the entries of `targets` that are not NaN.

    Args:
        preds: tensor of predictions (presumably the same shape as `targets`).
        targets: tensor of reference values; NaN marks a missing entry.

    Returns:
        Scalar tensor: sum of squared errors over the observed entries divided
        by their count (plus 1e-8, so an all-NaN batch yields 0 instead of NaN).
    """
    observed = ~torch.isnan(targets)
    weights = observed.float()
    # Swap NaNs for zeros so the subtraction stays finite; those positions are
    # zeroed out by `weights` anyway.
    filled = torch.where(observed, targets, torch.zeros_like(targets))
    squared_error = (preds - filled).pow(2) * weights
    return squared_error.sum() / (weights.sum() + 1e-8)

In [None]:
# 10. Training
epochs = 300
train_losses, val_losses = [], []


def _train_one_epoch():
    """Run one optimization pass over `train_loader`.

    Returns the batch losses weighted by batch size and divided by the number
    of training samples.
    """
    model.train()
    total = 0.0
    for features, labels in train_loader:
        features = features.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        batch_loss = masked_mse(model(features), labels)
        batch_loss.backward()
        optimizer.step()
        total += batch_loss.item() * features.size(0)
    return total / len(train_ds)


def _validate():
    """Evaluate on `val_loader` without gradients; same weighting as training."""
    model.eval()
    total = 0.0
    with torch.no_grad():
        for features, labels in val_loader:
            features = features.to(device)
            labels = labels.to(device)
            total += masked_mse(model(features), labels).item() * features.size(0)
    return total / len(val_ds)


for epoch in range(1, epochs + 1):
    train_loss = _train_one_epoch()
    train_losses.append(train_loss)

    val_loss = _validate()
    val_losses.append(val_loss)

    # Report on the first epoch and then every 20 epochs.
    should_report = epoch == 1 or epoch % 20 == 0
    if should_report:
        print(f"Epoch {epoch:03d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


Epoch 001 | Train Loss: 0.8778 | Val Loss: 0.8418
Epoch 020 | Train Loss: 0.8520 | Val Loss: 0.8310
Epoch 040 | Train Loss: 0.8529 | Val Loss: 0.8273
Epoch 060 | Train Loss: 0.8516 | Val Loss: 0.8271
Epoch 080 | Train Loss: 0.8522 | Val Loss: 0.8259
Epoch 100 | Train Loss: 0.8525 | Val Loss: 0.8250
Epoch 120 | Train Loss: 0.8530 | Val Loss: 0.8258
Epoch 140 | Train Loss: 0.8526 | Val Loss: 0.8283
Epoch 160 | Train Loss: 0.8528 | Val Loss: 0.8254
Epoch 180 | Train Loss: 0.8528 | Val Loss: 0.8261
Epoch 200 | Train Loss: 0.8520 | Val Loss: 0.8265
Epoch 220 | Train Loss: 0.8525 | Val Loss: 0.8242
Epoch 240 | Train Loss: 0.8526 | Val Loss: 0.8279
Epoch 260 | Train Loss: 0.8527 | Val Loss: 0.8263


In [None]:
# 11. Loss curves
fig, ax = plt.subplots(figsize=(8, 4))
for curve, curve_label in ((train_losses, "Train Loss"), (val_losses, "Val Loss")):
    ax.plot(curve, label=curve_label)
ax.set_xlabel("Epoch")
ax.set_ylabel("Masked MSE Loss")
ax.set_title("Train vs Val Loss")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# 12. Final evaluation and global metrics on the validation set.
# Uses r2_score and mean_absolute_error from sklearn.metrics.
model.eval()
all_preds, all_targets = [], []
with torch.no_grad():
    for xb, yb in val_loader:
        xb, yb = xb.to(device), yb.to(device)
        all_preds.append(model(xb).cpu().numpy())
        all_targets.append(yb.cpu().numpy())
all_preds   = np.vstack(all_preds)
all_targets = np.vstack(all_targets)

# Undo the target standardization to get back to the original scale.
preds_orig   = all_preds * y_stds + y_means
targets_orig = all_targets * y_stds + y_means

# Score only the entries that have an observed target; missing ones are NaN.
mask = ~np.isnan(all_targets)
preds_flat   = preds_orig[mask]
targets_flat = targets_orig[mask]

# Metrics are pooled over all outputs and samples (one flat vector each).
r2  = r2_score(targets_flat, preds_flat)
mae = mean_absolute_error(targets_flat, preds_flat)
print(f"\nGlobal metrics on validation set — R²: {r2:.4f}, MAE: {mae:.4f}")


In [None]:
# 13. Parity plot (example: output 0)
true_0 = targets_orig[:, 0]
pred_0 = preds_orig[:, 0]

# Missing targets are NaN, so plain min()/max() would return NaN and the
# reference diagonal would silently not be drawn; use the NaN-aware versions.
lo, hi = np.nanmin(true_0), np.nanmax(true_0)

plt.figure(figsize=(5,5))
plt.scatter(true_0, pred_0, alpha=0.3)
plt.xlabel("True y[0]")
plt.ylabel("Predicted y[0]")
plt.title("Parity plot — output 0")
# y = x reference line spanning the observed range of the true values.
plt.plot([lo, hi], [lo, hi], 'r--')
plt.tight_layout()
plt.show()


In [None]:
# 14. Save the model weights together with the scaling objects.
# NOTE(review): scaler_X is a scikit-learn object, so the checkpoint is not a
# tensors-only file; loading it presumably requires full unpickling — confirm
# against the PyTorch version used to load it.
checkpoint_path = "../../model/digital_twin_model.pt"
torch.save({
    'model_state_dict': model.state_dict(),
    'scaler_X': scaler_X,
    'y_means': y_means,
    'y_stds': y_stds
}, checkpoint_path)

# Report the path that was actually written (the previous message named a
# different file than the one saved).
print(f"Entrenamiento y evaluación completados. Modelo y escaladores guardados en {checkpoint_path}")