# Siamese model with numeric input

In [None]:
import sys
import os
# Make the project's ../src directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not add
# the same entry to sys.path over and over.
src_dir = os.path.abspath("../src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
import os

# A `!export ...` line runs in a throw-away subshell, so the variable never
# reaches this kernel's process. Set it on os.environ instead; this must happen
# before the first CUDA allocation for PyTorch's caching allocator to see it.
# PyTorch documents the option value with a capital letter: `True` / `False`.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
import gc
import torch
# Collect unreachable Python objects first, then ask PyTorch to release its
# unused cached CUDA memory (presumably to start from a clean GPU state when
# the notebook is re-run in the same kernel).
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Selects which pre-processed dataset is loaded further down
# (../processed_data/numeric_chunks_<max_length>); presumably the maximum
# encoded sequence length -- confirm against the preprocessing step.
max_length = 768 # or 1024

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam
from protein_dataset import ProteinDataset
from pathlib import Path
from encoders.numeric_protein_encoder import NumericProteinEncoder
from utils import load_numpy_dataset

# Config
# NOTE: the training and evaluation cells hard-code autocast("cuda"), so a CUDA
# device is required.
device = torch.device("cuda")

batch_size = 64
num_epochs = 5
lr = 1e-3
split_seed = 42  # fixes the train/val/test partition across notebook runs

vocab_size = (NumericProteinEncoder().vocabulary_size() + 1) # +1 for padding token

# Dataset
X1, X2, y = load_numpy_dataset(f"../processed_data/numeric_chunks_{max_length}")
dataset = ProteinDataset(X1, X2, y)

# 70 % / 15 % / 15 % split; the test split absorbs the rounding remainder so
# the three sizes always add up to len(dataset).
train_size = int(0.7 * len(dataset))
val_size   = int(0.15 * len(dataset))
test_size  = len(dataset) - train_size - val_size

# Seed the split explicitly: without a generator, every run draws a different
# partition, so metrics are not comparable between runs and a checkpoint
# reloaded later could be "tested" on samples it was trained on.
train_dataset, val_dataset, test_dataset = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size)
test_loader  = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
from models.pipr_model import PIPRModel

# Build the network, then move it to the configured device.
model = PIPRModel(vocab_size=vocab_size, embedding_dim=128, hidden_dim=256)
model = model.to(device)

# Optimizer over all model parameters, and a binary cross-entropy loss that
# takes raw logits (the sigmoid is applied inside the loss).
optimizer = Adam(model.parameters(), lr=lr)
criterion = BCEWithLogitsLoss()

In [None]:
from tqdm import tqdm
import torch
from torch.amp.autocast_mode import autocast
from torch.amp.grad_scaler import GradScaler

# Mixed-precision training loop with a validation pass after every epoch.
# Once all epochs have run, the final weights are saved under ../models.
scaler = GradScaler()

# Create the checkpoint directory before training starts: if ../models is
# missing, torch.save would otherwise fail only after every epoch has run and
# the trained weights would be lost.
checkpoint_path = Path(f"../models/pipr_model_epochs_{num_epochs}.pth")
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # NOTE: the loop variable `y` rebinds the module-level `y` created when the
    # dataset was loaded.
    for x1, x2, y in tqdm(train_loader, desc="Training", leave=False):
        x1, x2, y = x1.to(device), x2.to(device), y.to(device)

        optimizer.zero_grad()
        with autocast("cuda"):  # forward pass and loss run under mixed precision
            logits = model(x1, x2)
            loss = criterion(logits, y)

        # Backpropagate the scaled loss, let the scaler drive the optimizer
        # step, then let it adjust its scale factor for the next iteration.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    # Mean of the per-batch losses (each batch counts equally).
    avg_train_loss = train_loss / len(train_loader)
    print(f"---Train loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for x1, x2, y in tqdm(val_loader, desc="Validation", leave=False):
            x1, x2, y = x1.to(device), x2.to(device), y.to(device)
            with autocast("cuda"):
                logits = model(x1, x2)
                loss = criterion(logits, y)
            val_loss += loss.item()

            # Predict the positive class when the sigmoid probability is >= 0.5.
            preds = (torch.sigmoid(logits) >= 0.5).long()
            correct += (preds == y.long()).sum().item()
            total += y.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / total
    print(f"---Val loss: {avg_val_loss:.4f} | Accuracy: {val_acc:.4f}")

# Persist the weights of the last epoch (not necessarily the best one).
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Final evaluation: accuracy of the trained model on the test split.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for x1, x2, y in test_loader:
        x1, x2, y = x1.to(device), x2.to(device), y.to(device)
        with autocast("cuda"):
            logits = model(x1, x2)
        # Use the same decision rule as the validation pass (probability
        # >= 0.5, compared as integers) so validation and test accuracy are
        # computed identically, including for a probability of exactly 0.5.
        preds = (torch.sigmoid(logits) >= 0.5).long()
        correct += (preds == y.long()).sum().item()
        total += y.size(0)

test_acc = correct / total
print(f"Test Accuracy: {test_acc:.4f}")