# Libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from pathlib import Path
from tqdm import tqdm
import torch.nn.functional as F

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle

import gc

# Functions

In [3]:
# Data preparation functions

def load_data(path):
    """Load processed data from .npz file"""
    data = dict(np.load(path, allow_pickle=True))
    # data['caption2img'] = data['caption2img'].item()
    # data['caption2img_idx'] = data['caption2img_idx'].item()
    return data

def prepare_train_data(data):
    """Prepare training data from loaded dict"""
    caption_embd = data['captions/embeddings']
    image_embd = data['images/embeddings']
    # Map caption embeddings to corresponding image embeddings
    label = data['captions/label'] # N x M

    # repeat the image embeddings according to the label
    label_idx = np.nonzero(label)[1]
    print(label_idx.shape)
    image_embd = image_embd[label_idx]
    assert caption_embd.shape[0] == image_embd.shape[0], "Mismatch in number of caption and image embeddings"

    X = torch.from_numpy(caption_embd).float()
    # Map each caption to its corresponding image embedding
    y = torch.from_numpy(image_embd).float()
    label = torch.from_numpy(label).bool()

    print(f"Train data: {len(X)} captions, {len(image_embd)} images")
    return X, y, label

def generate_submission(sample_ids, translated_embeddings, output_file="submission.csv"):
    """
    Generate a submission.csv file from translated embeddings.
    """
    print("Generating submission file...")

    if isinstance(translated_embeddings, torch.Tensor):
        translated_embeddings = translated_embeddings.cpu().numpy()

    # Create a DataFrame with sample_id and embeddings

    df_submission = pd.DataFrame({'id': sample_ids, 'embedding': translated_embeddings.tolist()})

    df_submission.to_csv(output_file, index=False, float_format='%.17g')
    print(f"✓ Saved submission to {output_file}")
    
    return df_submission

In [None]:
# model training functions

def train_model(model, train_loader, val_loader, device, epochs, lr, MODEL_PATH):
    """Train the MLP model"""
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)

            loss = F.mse_loss(outputs, y_batch)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = F.mse_loss(outputs, y_batch)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), MODEL_PATH)
            print(f"  ✓ Saved best model (val_loss={val_loss:.6f})")

    return model


In [34]:
def mrr(pred_indices: np.ndarray, gt_indices: np.ndarray) -> float:
    """
    Compute Mean Reciprocal Rank (MRR)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries (top-K)
        gt_indices: (N,) array of ground truth indices
    Returns:
        mrr: Mean Reciprocal Rank
    """
    reciprocal_ranks = []
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i] == gt_indices[i])[0]
        if matches.size > 0:
            reciprocal_ranks.append(1.0 / (matches[0] + 1))
        else:
            reciprocal_ranks.append(0.0)
    return np.mean(reciprocal_ranks)


def recall_at_k(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int) -> float:
    """Compute Recall@k
    Args:
        pred_indices: (N, N) array of top indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        recall: Recall@k
    """
    recall = 0
    for i in range(len(gt_indices)):
        if gt_indices[i] in pred_indices[i, :k]:
            recall += 1
    recall /= len(gt_indices)
    return recall

import numpy as np

def ndcg(pred_indices: np.ndarray, gt_indices: np.ndarray, k: int = 100) -> float:
    """
    Compute Normalized Discounted Cumulative Gain (NDCG@k)
    Args:
        pred_indices: (N, K) array of predicted indices for N queries
        gt_indices: (N,) array of ground truth indices
        k: number of top predictions to consider
    Returns:
        ndcg: NDCG@k
    """
    ndcg_total = 0.0
    for i in range(len(gt_indices)):
        matches = np.where(pred_indices[i, :k] == gt_indices[i])[0]
        if matches.size > 0:
            rank = matches[0] + 1
            ndcg_total += 1.0 / np.log2(rank + 1)  # DCG (IDCG = 1)
    return ndcg_total / len(gt_indices)



@torch.inference_mode()
def evaluate_retrieval(translated_embd, image_embd, gt_indices, max_indices = 99, batch_size=100):
    """Evaluate retrieval performance using cosine similarity
    Args:
        translated_embd: (N_captions, D) translated caption embeddings
        image_embd: (N_images, D) image embeddings
        gt_indices: (N_captions,) ground truth image indices for each caption
        max_indices: number of top predictions to consider
    Returns:
        results: dict of evaluation metrics
    
    """
    # Compute similarity matrix
    if isinstance(translated_embd, np.ndarray):
        translated_embd = torch.from_numpy(translated_embd).float()
    if isinstance(image_embd, np.ndarray):
        image_embd = torch.from_numpy(image_embd).float()
    
    n_queries = translated_embd.shape[0]
    device = translated_embd.device
    
    # Prepare containers for the fragments to be reassembled
    all_sorted_indices = []
    l2_distances = []
    
    # Process in batches - the narrow gate approach
    for start_idx in range(0, n_queries, batch_size):
        batch_slice = slice(start_idx, min(start_idx + batch_size, n_queries))
        batch_translated = translated_embd[batch_slice]
        batch_img_embd = image_embd[batch_slice]
        
        # Compute similarity only for this batch
        batch_similarity = batch_translated @ batch_img_embd.T

        # Get top-k predictions for this batch
        batch_indices = batch_similarity.topk(k=max_indices, dim=1, sorted=True).indices.numpy()
        all_sorted_indices.append(gt_indices[batch_slice][batch_indices])

        # Compute L2 distance for this batch
        batch_gt = gt_indices[batch_slice]
        batch_gt_embeddings = image_embd[batch_gt]
        batch_l2 = (batch_translated - batch_gt_embeddings).norm(dim=1)
        l2_distances.append(batch_l2)
    
    # Reassemble the fragments
    sorted_indices = np.concatenate(all_sorted_indices, axis=0)
    
    # Apply the sacred metrics to the whole
    metrics = {
        'mrr': mrr,
        'ndcg': ndcg,
        'recall_at_1': lambda preds, gt: recall_at_k(preds, gt, 1),
        'recall_at_3': lambda preds, gt: recall_at_k(preds, gt, 3),
        'recall_at_5': lambda preds, gt: recall_at_k(preds, gt, 5),
        'recall_at_10': lambda preds, gt: recall_at_k(preds, gt, 10),
        'recall_at_50': lambda preds, gt: recall_at_k(preds, gt, 50),
    }
    
    results = {
        name: func(sorted_indices, gt_indices)
        for name, func in metrics.items()
    }
    
    l2_dist = torch.cat(l2_distances, dim=0).mean().item()
    results['l2_dist'] = l2_dist
    
    return results

In [147]:
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler

# --- (Paste your mrr, recall_at_k, and ndcg functions here) ---

@torch.inference_mode()
def evaluate_retrieval_with_normalization(
    translated_embd, 
    image_embd, 
    gt_indices, 
    max_indices=99, 
    batch_size=100
):
    """
    Evaluates retrieval by normalizing both inputs first,
    then comparing batches of queries against the FULL image gallery.
    """
    
    # --- 1. Convert to NumPy for Standardization ---
    if isinstance(translated_embd, torch.Tensor):
        translated_embd_np = translated_embd.cpu().numpy()
    else:
        translated_embd_np = translated_embd
        
    if isinstance(image_embd, torch.Tensor):
        image_embd_np = image_embd.cpu().numpy()
    else:
        image_embd_np = image_embd

    # --- 2. NORMALIZATION ADDED ---
    # Standardize both predictions and gallery *inside* the function
    print("Normalizing predictions...")
    scaler_preds = StandardScaler()
    translated_embd_norm = scaler_preds.fit_transform(translated_embd_np)
    
    print("Normalizing gallery...")
    scaler_gallery = StandardScaler()
    image_embd_norm = scaler_gallery.fit_transform(image_embd_np)
    
    # --- 3. Convert back to Tensors for GPU processing ---
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    translated_embd = torch.from_numpy(translated_embd_norm).float().to(device)
    image_embd = torch.from_numpy(image_embd_norm).float().to(device)
    
    # --- 4. Get dimensions and setup gallery ---
    n_queries = translated_embd.shape[0]
    # Get the (D, N_images) transpose of the FULL gallery
    image_gallery_T = image_embd.T  
    
    all_sorted_indices = []
    l2_distances = []
    
    print(f"Evaluating {n_queries} queries in batches of {batch_size}...")
    
    # --- 5. Process in batches (Corrected) ---
    for start_idx in range(0, n_queries, batch_size):
        batch_slice = slice(start_idx, min(start_idx + batch_size, n_queries))
        batch_translated = translated_embd[batch_slice]
        
        # --- 6. FIX: Compare batch against FULL gallery ---
        # (N_batch, D) @ (D, N_images) -> (N_batch, N_images)
        batch_similarity = batch_translated @ image_gallery_T

        # Get top-k predictions for this batch
        # We need to sort and get indices
        batch_indices = torch.argsort(batch_similarity, dim=1, descending=True)[:, :max_indices].cpu().numpy()
        all_sorted_indices.append(batch_indices)

        # --- 7. FIX: Compute L2 distance correctly ---
        batch_gt = gt_indices[batch_slice]
        # Get embeddings from the FULL normalized gallery
        batch_gt_embeddings = image_embd[batch_gt] 
        batch_l2 = (batch_translated - batch_gt_embeddings).norm(dim=1).cpu()
        l2_distances.append(batch_l2)
    
    # --- 8. Reassemble and call metrics ---
    sorted_indices = np.concatenate(all_sorted_indices, axis=0)
    
    metrics = {
        'mrr': mrr,
        'ndcg': lambda p, g: ndcg(p, g, k=max_indices),
        'recall_at_1': lambda p, g: recall_at_k(p, g, 1),
        'recall_at_3': lambda p, g: recall_at_k(p, g, 3),
        'recall_at_5': lambda p, g: recall_at_k(p, g, 5),
        'recall_at_10': lambda p, g: recall_at_k(p, g, 10),
        'recall_at_50': lambda p, g: recall_at_k(p, g, 50),
    }
    
    results = {
        name: func(sorted_indices, gt_indices)
        for name, func in metrics.items()
    }
    
    l2_dist = torch.cat(l2_distances, dim=0).mean().item()
    results['l2_dist'] = l2_dist
    
    return results

# --- HOW TO RUN THE EXPERIMENT ---
#
# 1. Get your UN-SCALED predictions and UN-SCALED gallery
#    pred_embds_val_inverse = sc_y.inverse_transform(pred_embds_val.cpu().numpy())
#    y_val_unscaled = valid_data['images/embeddings']
#    gt_indices = np.argmax(valid_data['captions/label'], axis=1)
#
# 2. Call the new function
#    results = evaluate_retrieval_with_normalization(
#        pred_embds_val_inverse, 
#        y_val_unscaled, 
#        gt_indices
#    )
#
# 3. Check the MRR score
#    print(results['mrr'])
#
#    You should see this MRR (e.g., 0.85) is now high again,
#    matching your "scaled vs. scaled" score.

# Data Preparation

In [4]:
# load data
train_data = load_data('data/train/train/train.npz')

In [5]:
# prepare train data
X, y, label = prepare_train_data(train_data)

(125000,)
Train data: 125000 captions, 125000 images


In [6]:
X.shape, y.shape

(torch.Size([125000, 1024]), torch.Size([125000, 1536]))

In [7]:
# split into train and val
DATASET_SIZE = len(X)
n_train = int(0.9 * len(X))
TRAIN_SPLIT = torch.zeros(len(X), dtype=torch.bool)
TRAIN_SPLIT[:n_train] = 1
X_train, X_val = X[TRAIN_SPLIT], X[~TRAIN_SPLIT]
y_train, y_val = y[TRAIN_SPLIT], y[~TRAIN_SPLIT]
labels_train, labels_val = label[TRAIN_SPLIT], label[~TRAIN_SPLIT]

X_train.shape, X_val.shape, y_train.shape, y_val.shape, labels_train.shape, labels_val.shape

(torch.Size([112500, 1024]),
 torch.Size([12500, 1024]),
 torch.Size([112500, 1536]),
 torch.Size([12500, 1536]),
 torch.Size([112500, 25000]),
 torch.Size([12500, 25000]))

In [8]:
# standardize features
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_val_scaled = scaler_X.transform(X_val)

# standardize targets
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled = scaler_y.transform(y_val)

In [9]:
# save scalers as a pickle file
with open('scaler_X.pkl', 'wb') as f:
    pickle.dump(scaler_X, f)

with open('scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)

In [10]:
del X, y, train_data, label
gc.collect()

0

In [11]:
# validation label indices
img_VAL_SPLIT = labels_val.sum(dim=0) > 0
val_label = np.nonzero(labels_val.numpy()[:,img_VAL_SPLIT])[1]

gc.collect()

# train label indices
img_TRAIN_SPLIT = labels_train.sum(dim=0) > 0
train_label = np.nonzero(labels_train.numpy()[:,img_TRAIN_SPLIT])[1]

In [12]:
gc.collect()

7

In [13]:
# # save original train/val data and labels
# torch.save({'captions/embeddings': X_train, 'images/embeddings': y_train, 'captions/label': labels_train}, 'data/X_y_labels_train.pt')
# torch.save({'captions/embeddings': X_val, 'images/embeddings': y_val, 'captions/label': labels_val}, 'data/X_y_labels_val.pt')

# save data
torch.save({'captions/embeddings': X_train,
            'captions/embeddings_standartized': torch.from_numpy(X_train_scaled).float(), 
            'images/embeddings': y_train,
            'images/embeddings_standartized': torch.from_numpy(y_train_scaled).float(),
            'captions/label': labels_train,
            'captions/label_indices': torch.from_numpy(train_label).long()}, 'data/X_y_labels_train_scaled.pt')
torch.save({'captions/embeddings': X_val,
            'captions/embeddings_standartized': torch.from_numpy(X_val_scaled).float(),
            'images/embeddings': y_val,
            'images/embeddings_standartized': torch.from_numpy(y_val_scaled).float(),
            'captions/label': labels_val,
            'captions/label_indices': torch.from_numpy(val_label).long()}, 'data/X_y_labels_val_scaled.pt')

# Read Data Back

In [36]:
# read scaled data back
train = torch.load('data/X_y_labels_train_scaled.pt')
val = torch.load('data/X_y_labels_val_scaled.pt')

In [37]:
train

{'captions/embeddings': tensor([[-0.7071, -0.0791, -0.6444,  ...,  0.9438, -1.3346,  0.5247],
         [ 0.3744, -0.6224, -0.5922,  ...,  0.1355, -1.2186,  0.4079],
         [-0.5834, -0.3095, -0.9278,  ..., -0.1768, -0.4095, -0.1322],
         ...,
         [ 0.4443, -0.4923, -0.5300,  ..., -0.3735,  0.5138, -1.0740],
         [ 1.2572, -0.8468, -0.4401,  ...,  0.4742,  0.7457, -0.8515],
         [ 0.3378, -0.8111, -1.1533,  ...,  0.7410,  0.4934,  0.0679]]),
 'captions/embeddings_standartized': tensor([[-1.0484, -0.2398, -1.1937,  ...,  1.9227, -0.8140,  0.6990],
         [ 0.5970, -1.0902, -1.0971,  ...,  0.5235, -0.6814,  0.5296],
         [-0.8602, -0.6004, -1.7190,  ..., -0.0170,  0.2435, -0.2536],
         ...,
         [ 0.7032, -0.8866, -0.9816,  ..., -0.3574,  1.2990, -1.6193],
         [ 1.9398, -1.4414, -0.8151,  ...,  1.1098,  1.5640, -1.2967],
         [ 0.5412, -1.3856, -2.1370,  ...,  1.5716,  1.2756,  0.0366]]),
 'images/embeddings': tensor([[ 0.1224, -0.4013, -0.1244,

In [38]:
X_train_scaled = train['captions/embeddings_standartized']
y_train_scaled = train['images/embeddings_standartized']
labels_train = train['captions/label']
labels_train_indices = train['captions/label_indices']
y_train = train['images/embeddings']


X_val_scaled = val['captions/embeddings_standartized']
y_val_scaled = val['images/embeddings_standartized']
labels_val = val['captions/label']
labels_val_indices = val['captions/label_indices']
y_val = val['images/embeddings']

In [39]:
del train, val

import gc

gc.collect()

53

*Make Padding*

In [40]:
# Calculate padding needed
padding_needed = 1536 - 1024  # This is 512

In [41]:
X_train_scaled = F.pad(X_train_scaled, (0, padding_needed))
X_val_scaled = F.pad(X_val_scaled, (0, padding_needed))

In [42]:
X_train_scaled.shape, X_val_scaled.shape

(torch.Size([112500, 1536]), torch.Size([12500, 1536]))

# SVD approach

In [43]:
X_train_scaled.T.shape

torch.Size([1536, 112500])

In [44]:
y_train_scaled.shape

torch.Size([112500, 1536])

In [53]:
import numpy as np

# --- 1. Load your pre-processed training data ---
# (Make sure these are the NumPy arrays, not Tensors)
# X_padded shape: (125000, 1536) - Standardized & Padded
# Y_scaled shape: (125000, 1536) - Standardized
# X_padded = ... 
# Y_scaled = ...

print("Calculating SVD (Procrustes) solution...")

# --- 2. Calculate the Covariance Matrix (M) ---
# Shape: (1536, 112500) @ (112500, 1536) -> (1536, 1536)
M = X_train_scaled.T @ y_train_scaled

# --- 3. Perform SVD on M ---
# U and V_transpose (Vt) will be the rotation matrices
# U shape: (1536, 1536)
# Vt shape: (1536, 1536)
U, S, Vt = np.linalg.svd(M)

# # --- 4. Calculate the optimal Translator Matrix (W) ---
# # This is the "Procrustes" solution
W_translator = U @ Vt

# print("SVD Translator matrix 'W' calculated.")

# # --- 5. Save your translator ---
np.save('svd_translator.npy', W_translator)

Calculating SVD (Procrustes) solution...


In [54]:
# read scaler for features
with open('scaler_X.pkl', 'rb') as f:
    sc_x = pickle.load(f)

# read scaler for targets
with open('scaler_Y.pkl', 'rb') as f:
    sc_y = pickle.load(f)

In [55]:
# --- Load your pre-processed test data ---
# X_test_padded shape: (N_test, 1536)

test_data = load_data("data/test/test/test.clean.npz")

test_embds = test_data['captions/embeddings']
test_embds = sc_x.transform(test_embds) # Scale the test caption embeddings
test_embds = torch.from_numpy(test_embds).float()
padding_needed = 1536 - 1024  # This is 512
test_embds = F.pad(test_embds, (0, padding_needed)) # make zero padding

# --- Load the translator ---
W_translator = np.load('svd_translator.npy')

# --- Get Predictions ---
# (N_test, 1536) @ (1536, 1536) -> (N_test, 1536)
predicted_Y_scaled = test_embds @ W_translator
predicted_Y_scaled_val = X_val_scaled @ W_translator

# --- This is your submission file ---
# np.save('submission.npy', predicted_Y_scaled)

  predicted_Y_scaled = test_embds @ W_translator
  predicted_Y_scaled_val = X_val_scaled @ W_translator


In [56]:
predicted_Y_scaled.shape

torch.Size([1500, 1536])

In [57]:
predicted_Y_scaled_val.shape

torch.Size([12500, 1536])

In [47]:
submission = generate_submission(test_data['captions/ids'], predicted_Y_scaled, 'submissions/submission_svd_v1.csv')

Generating submission file...
✓ Saved submission to submissions/submission_svd_v1.csv


In [60]:
predicted_Y_scaled_inverse = sc_y.inverse_transform(predicted_Y_scaled)
predicted_Y_scaled_val_inverse = sc_y.inverse_transform(predicted_Y_scaled_val)

In [49]:
submission = generate_submission(test_data['captions/ids'], predicted_Y_scaled_inverse, 'submissions/submission_svd_v1_inverse.csv')

Generating submission file...
✓ Saved submission to submissions/submission_svd_v1_inverse.csv


In [51]:
predicted_Y_scaled_inverse.shape

(1500, 1536)

In [67]:
predicted_Y_scaled_val.mean(axis=1)

tensor([-0.0075, -0.0022, -0.0025,  ...,  0.0032,  0.0007, -0.0052])

In [69]:
y_val.mean(axis=1)

tensor([-0.0144, -0.0144, -0.0144,  ..., -0.0176, -0.0176, -0.0176])

In [70]:
y_val_scaled.mean(axis=1)

tensor([-0.0039, -0.0039, -0.0039,  ..., -0.0081, -0.0081, -0.0081])

In [58]:
y_val.shape

torch.Size([12500, 1536])

In [None]:
evaluate_retrieval(predicted_Y_scaled_val, y_val, labels_val_indices)

{'mrr': np.float64(0.8354459077313924),
 'ndcg': np.float64(0.8704508432540425),
 'recall_at_1': 0.8116,
 'recall_at_3': 0.8116,
 'recall_at_5': 0.8116,
 'recall_at_10': 0.9244,
 'recall_at_50': 0.9964,
 'l2_dist': 41.15120315551758}

In [62]:
evaluate_retrieval(predicted_Y_scaled_val, y_val_scaled, labels_val_indices)

{'mrr': np.float64(0.8375479140185044),
 'ndcg': np.float64(0.8721636647915535),
 'recall_at_1': 0.81376,
 'recall_at_3': 0.81376,
 'recall_at_5': 0.81376,
 'recall_at_10': 0.92776,
 'recall_at_50': 0.99664,
 'l2_dist': 50.3546257019043}

In [61]:
evaluate_retrieval(predicted_Y_scaled_val_inverse, y_val, labels_val_indices)

{'mrr': np.float64(0.7538183872714882),
 'ndcg': np.float64(0.8045070140732536),
 'recall_at_1': 0.7232,
 'recall_at_3': 0.7232,
 'recall_at_5': 0.7232,
 'recall_at_10': 0.85448,
 'recall_at_50': 0.98808,
 'l2_dist': 22.219390869140625}

# Model Training

In [105]:
# Configuration
MODEL_PATH = "models/mlp_v1.pth"
EPOCHS = 20
BATCH_SIZE = 256
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#
class MLP(nn.Module):
    def __init__(self, input_dim=1536, output_dim=1536, hidden_dim=2048):
        super().__init__()
        self.net = nn.Sequential(
            # nn.Linear(input_dim, output_dim),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        return self.net(x)

In [97]:
# Initialize model
model = MLP().to(DEVICE)
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")


train_loader = DataLoader(TensorDataset(X_train_scaled, y_train), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_scaled, y_val), batch_size=BATCH_SIZE)
X_train_scaled.shape, X_val_scaled.shape

# Train
print("\n3. Training...")
model = train_model(model, train_loader, val_loader, DEVICE, EPOCHS, LR, MODEL_PATH)

# Load best model for evaluation
model.load_state_dict(torch.load(MODEL_PATH))

   Parameters: 6,295,040

3. Training...


Epoch 1/20: 100%|██████████| 440/440 [00:35<00:00, 12.30it/s]


Epoch 1: Train Loss = 0.155099, Val Loss = 0.150115
  ✓ Saved best model (val_loss=0.150115)


Epoch 2/20: 100%|██████████| 440/440 [00:49<00:00,  8.86it/s]


Epoch 2: Train Loss = 0.143359, Val Loss = 0.147338
  ✓ Saved best model (val_loss=0.147338)


Epoch 3/20: 100%|██████████| 440/440 [00:42<00:00, 10.42it/s]


Epoch 3: Train Loss = 0.139890, Val Loss = 0.145687
  ✓ Saved best model (val_loss=0.145687)


Epoch 4/20: 100%|██████████| 440/440 [00:39<00:00, 11.12it/s]


Epoch 4: Train Loss = 0.137485, Val Loss = 0.145248
  ✓ Saved best model (val_loss=0.145248)


Epoch 5/20: 100%|██████████| 440/440 [00:40<00:00, 10.89it/s]


Epoch 5: Train Loss = 0.135426, Val Loss = 0.144707
  ✓ Saved best model (val_loss=0.144707)


Epoch 6/20: 100%|██████████| 440/440 [00:43<00:00, 10.21it/s]


Epoch 6: Train Loss = 0.133618, Val Loss = 0.144191
  ✓ Saved best model (val_loss=0.144191)


Epoch 7/20: 100%|██████████| 440/440 [00:35<00:00, 12.40it/s]


Epoch 7: Train Loss = 0.131914, Val Loss = 0.144025
  ✓ Saved best model (val_loss=0.144025)


Epoch 8/20: 100%|██████████| 440/440 [00:32<00:00, 13.43it/s]


Epoch 8: Train Loss = 0.130419, Val Loss = 0.144089


Epoch 9/20: 100%|██████████| 440/440 [00:32<00:00, 13.67it/s]


Epoch 9: Train Loss = 0.128988, Val Loss = 0.144036


Epoch 10/20: 100%|██████████| 440/440 [00:31<00:00, 14.06it/s]


Epoch 10: Train Loss = 0.127614, Val Loss = 0.144009
  ✓ Saved best model (val_loss=0.144009)


Epoch 11/20: 100%|██████████| 440/440 [00:31<00:00, 13.78it/s]


Epoch 11: Train Loss = 0.126366, Val Loss = 0.144255


Epoch 12/20: 100%|██████████| 440/440 [00:31<00:00, 13.83it/s]


Epoch 12: Train Loss = 0.125086, Val Loss = 0.144491


Epoch 13/20: 100%|██████████| 440/440 [00:38<00:00, 11.33it/s]


Epoch 13: Train Loss = 0.123935, Val Loss = 0.144613


Epoch 14/20: 100%|██████████| 440/440 [01:18<00:00,  5.63it/s]


Epoch 14: Train Loss = 0.122755, Val Loss = 0.145130


Epoch 15/20: 100%|██████████| 440/440 [00:32<00:00, 13.34it/s]


Epoch 15: Train Loss = 0.121673, Val Loss = 0.145207


Epoch 16/20: 100%|██████████| 440/440 [00:32<00:00, 13.42it/s]


Epoch 16: Train Loss = 0.120605, Val Loss = 0.145296


Epoch 17/20: 100%|██████████| 440/440 [00:33<00:00, 13.18it/s]


Epoch 17: Train Loss = 0.119607, Val Loss = 0.145779


Epoch 18/20: 100%|██████████| 440/440 [00:34<00:00, 12.77it/s]


Epoch 18: Train Loss = 0.118585, Val Loss = 0.146121


Epoch 19/20: 100%|██████████| 440/440 [00:34<00:00, 12.80it/s]


Epoch 19: Train Loss = 0.117617, Val Loss = 0.146234


Epoch 20/20: 100%|██████████| 440/440 [00:34<00:00, 12.79it/s]


Epoch 20: Train Loss = 0.116763, Val Loss = 0.146683


<All keys matched successfully>

# Generate Submission

In [106]:
# read scaler for features
with open('scaler_X.pkl', 'rb') as f:
    sc_x = pickle.load(f)

# read scaler for targets
with open('scaler_Y.pkl', 'rb') as f:
    sc_y = pickle.load(f)

In [107]:
# Load best model for evaluation
model = MLP().to(DEVICE)
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [108]:
test_data = load_data("data/test/test/test.clean.npz")

test_embds = test_data['captions/embeddings']
test_embds = sc_x.transform(test_embds) # Scale the test caption embeddings
test_embds = torch.from_numpy(test_embds).float()
padding_needed = 1536 - 1024  # This is 512
test_embds = F.pad(test_embds, (0, padding_needed)) # make zero padding

with torch.no_grad():
    pred_embds = model(test_embds.to(DEVICE)).cpu()
    pred_embds_val = model(X_val_scaled.to(DEVICE)).cpu()
    pred_embds_train = model(X_train_scaled.to(DEVICE)).cpu()

# submission = generate_submission(test_data['captions/ids'], pred_embds, 'submissions/submission_v1.csv')
# print(f"Model saved to: {MODEL_PATH}")

In [109]:
pred_embds_inversed = torch.from_numpy(sc_y.inverse_transform(pred_embds)).float()
pred_embds_val_inversed = torch.from_numpy(sc_y.inverse_transform(pred_embds_val)).float()
pred_embds_train_inversed = torch.from_numpy(sc_y.inverse_transform(pred_embds_train)).float()

In [142]:
StandardScaler().fit_transform(pred_embds).mean()#.mean(axis=1)

np.float64(-1.6576246548222822e-20)

In [143]:
pred_embds.mean()#.mean(axis=1)

tensor(1.8557e-05)

In [144]:
submission = generate_submission(test_data['captions/ids'], StandardScaler().fit_transform(pred_embds), 'submissions/submission_v10.csv')
print(f"Model saved to: {MODEL_PATH}")

Generating submission file...
✓ Saved submission to submissions/submission_v10.csv
Model saved to: models/mlp_v1.pth


# Evaluation

In [148]:
evaluate_retrieval(pred_embds_val, y_val, labels_val_indices)

{'mrr': np.float64(0.8509497813398018),
 'ndcg': np.float64(0.8829141404885074),
 'recall_at_1': 0.82856,
 'recall_at_3': 0.82856,
 'recall_at_5': 0.82856,
 'recall_at_10': 0.93672,
 'recall_at_50': 0.998,
 'l2_dist': 33.63210678100586}

In [123]:
evaluate_retrieval(pred_embds_val, y_val, labels_val_indices)

{'mrr': np.float64(0.8509497813398018),
 'ndcg': np.float64(0.8829141404885074),
 'recall_at_1': 0.82856,
 'recall_at_3': 0.82856,
 'recall_at_5': 0.82856,
 'recall_at_10': 0.93672,
 'recall_at_50': 0.998,
 'l2_dist': 33.63210678100586}

In [87]:
evaluate_retrieval(pred_embds_val_inversed, y_val, labels_val_indices)

{'mrr': np.float64(0.7426362021608336),
 'ndcg': np.float64(0.7953180105908544),
 'recall_at_1': 0.71144,
 'recall_at_3': 0.71144,
 'recall_at_5': 0.71144,
 'recall_at_10': 0.8436,
 'recall_at_50': 0.98496,
 'l2_dist': 19.629213333129883}

In [149]:
evaluate_retrieval_with_normalization(pred_embds_val_inversed, y_val, labels_val_indices)

Normalizing predictions...
Normalizing gallery...
Evaluating 12500 queries in batches of 100...


{'mrr': np.float64(0.0003233540050187994),
 'ndcg': np.float64(0.0014014752824043528),
 'recall_at_1': 8e-05,
 'recall_at_3': 0.00016,
 'recall_at_5': 0.00016,
 'recall_at_10': 0.00048,
 'recall_at_50': 0.00352,
 'l2_dist': 54.64468002319336}

In [112]:
evaluate_retrieval(pred_embds_val, y_val_scaled, labels_val_indices)

{'mrr': np.float64(0.8548669682827283),
 'ndcg': np.float64(0.8860412542483317),
 'recall_at_1': 0.83288,
 'recall_at_3': 0.83288,
 'recall_at_5': 0.83288,
 'recall_at_10': 0.93976,
 'recall_at_50': 0.99808,
 'l2_dist': 44.37937545776367}

In [92]:
evaluate_retrieval(pred_embds_train, y_train_scaled, labels_train_indices)

{'mrr': np.float64(0.923201795805102),
 'ndcg': np.float64(0.9400569794924678),
 'recall_at_1': 0.9103288888888889,
 'recall_at_3': 0.9103288888888889,
 'recall_at_5': 0.9103288888888889,
 'recall_at_10': 0.9775022222222223,
 'recall_at_50': 0.9996977777777778,
 'l2_dist': 44.5223388671875}

In [132]:
evaluate_retrieval(pred_embds_train_inversed, y_train, labels_train_indices)

{'mrr': np.float64(0.8400417776251033),
 'ndcg': np.float64(0.8736264769635009),
 'recall_at_1': 0.8181422222222222,
 'recall_at_3': 0.8181422222222222,
 'recall_at_5': 0.8181422222222222,
 'recall_at_10': 0.9190311111111111,
 'recall_at_50': 0.9948977777777778,
 'l2_dist': 19.781431198120117}

In [93]:
y_val

tensor([[ 0.3185,  0.0177,  0.2274,  ..., -0.0558,  0.2325,  0.3599],
        [ 0.3185,  0.0177,  0.2274,  ..., -0.0558,  0.2325,  0.3599],
        [ 0.3185,  0.0177,  0.2274,  ..., -0.0558,  0.2325,  0.3599],
        ...,
        [ 0.6575, -0.4332, -0.2424,  ..., -0.4057,  1.0259, -0.5469],
        [ 0.6575, -0.4332, -0.2424,  ..., -0.4057,  1.0259, -0.5469],
        [ 0.6575, -0.4332, -0.2424,  ..., -0.4057,  1.0259, -0.5469]])

In [95]:
sc_y.inverse_transform(y_val_scaled)

array([[ 0.31854625,  0.01766749,  0.22744764, ..., -0.05579448,
         0.23245949,  0.35991552],
       [ 0.31854625,  0.01766749,  0.22744764, ..., -0.05579448,
         0.23245949,  0.35991552],
       [ 0.31854625,  0.01766749,  0.22744764, ..., -0.05579448,
         0.23245949,  0.35991552],
       ...,
       [ 0.65752709, -0.43324454, -0.24238909, ..., -0.4056725 ,
         1.0258552 , -0.54689348],
       [ 0.65752709, -0.43324454, -0.24238909, ..., -0.4056725 ,
         1.0258552 , -0.54689348],
       [ 0.65752709, -0.43324454, -0.24238909, ..., -0.4056725 ,
         1.0258552 , -0.54689348]])

In [120]:
torch.var(y_val_scaled, dim=0).max()

tensor(1.1352)

In [117]:
torch.var(y_val, dim=0).max()

tensor(3.5552)

In [121]:
torch.var(pred_embds_val, dim=0).max()

tensor(0.6458)

In [122]:
torch.var(pred_embds_val_inversed, dim=0).max()

tensor(1.7957)

In [124]:
y_train

tensor([[ 0.1224, -0.4013, -0.1244,  ...,  0.2686,  0.4888,  1.1778],
        [ 0.1224, -0.4013, -0.1244,  ...,  0.2686,  0.4888,  1.1778],
        [ 0.1224, -0.4013, -0.1244,  ...,  0.2686,  0.4888,  1.1778],
        ...,
        [-0.0528,  0.6580,  0.2983,  ...,  0.2993, -0.4380,  0.6484],
        [-0.0528,  0.6580,  0.2983,  ...,  0.2993, -0.4380,  0.6484],
        [-0.0528,  0.6580,  0.2983,  ...,  0.2993, -0.4380,  0.6484]])

In [125]:
y_val

tensor([[ 0.3185,  0.0177,  0.2274,  ..., -0.0558,  0.2325,  0.3599],
        [ 0.3185,  0.0177,  0.2274,  ..., -0.0558,  0.2325,  0.3599],
        [ 0.3185,  0.0177,  0.2274,  ..., -0.0558,  0.2325,  0.3599],
        ...,
        [ 0.6575, -0.4332, -0.2424,  ..., -0.4057,  1.0259, -0.5469],
        [ 0.6575, -0.4332, -0.2424,  ..., -0.4057,  1.0259, -0.5469],
        [ 0.6575, -0.4332, -0.2424,  ..., -0.4057,  1.0259, -0.5469]])

In [126]:
pred_embds_val.mean(axis=1)

tensor([-0.0051, -0.0037, -0.0018,  ...,  0.0019, -0.0012, -0.0058])

In [127]:
y_val_scaled.mean(axis=1)

tensor([-0.0039, -0.0039, -0.0039,  ..., -0.0081, -0.0081, -0.0081])

In [128]:
pred_embds_val_inversed.mean(axis=1)

tensor([-0.0161, -0.0163, -0.0171,  ..., -0.0168, -0.0159, -0.0158])

In [129]:
y_val.mean(axis=1)

tensor([-0.0144, -0.0144, -0.0144,  ..., -0.0176, -0.0176, -0.0176])

In [130]:
pred_embds_val.mean()

tensor(0.0002)

In [131]:
y_val_scaled.mean()

tensor(1.2440e-05)

In [None]:
0.00001