# Libraries

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from pathlib import Path
from tqdm import tqdm
import torch.nn.functional as F

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle

# Functions

In [13]:
# Data preparation functions

def load_data(path):
    """Load processed data from .npz file"""
    data = dict(np.load(path, allow_pickle=True))
    # data['caption2img'] = data['caption2img'].item()
    # data['caption2img_idx'] = data['caption2img_idx'].item()
    return data

def prepare_train_data(data):
    """Prepare training data from loaded dict"""
    caption_embd = data['captions/embeddings']
    image_embd = data['images/embeddings']
    # Map caption embeddings to corresponding image embeddings
    label = data['captions/label'] # N x M

    # repeat the image embeddings according to the label
    label_idx = np.nonzero(label)[1]
    print(label_idx.shape)
    image_embd = image_embd[label_idx]
    assert caption_embd.shape[0] == image_embd.shape[0], "Mismatch in number of caption and image embeddings"

    X = torch.from_numpy(caption_embd).float()
    # Map each caption to its corresponding image embedding
    y = torch.from_numpy(image_embd).float()
    label = torch.from_numpy(label).bool()

    print(f"Train data: {len(X)} captions, {len(image_embd)} images")
    return X, y, label

def generate_submission(sample_ids, translated_embeddings, output_file="submission.csv"):
    """
    Generate a submission.csv file from translated embeddings.
    """
    print("Generating submission file...")

    if isinstance(translated_embeddings, torch.Tensor):
        translated_embeddings = translated_embeddings.cpu().numpy()

    # Create a DataFrame with sample_id and embeddings

    df_submission = pd.DataFrame({'id': sample_ids, 'embedding': translated_embeddings.tolist()})

    df_submission.to_csv(output_file, index=False, float_format='%.17g')
    print(f"✓ Saved submission to {output_file}")
    
    return df_submission

In [18]:
# model training functions

def train_model(model, train_loader, val_loader, device, epochs, lr, MODEL_PATH):
    """Train the MLP model"""
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_val_loss = float('inf')

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)

            loss = F.mse_loss(outputs, y_batch)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = F.mse_loss(outputs, y_batch)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.6f}, Val Loss = {val_loss:.6f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
            torch.save(model.state_dict(), MODEL_PATH)
            print(f"  ✓ Saved best model (val_loss={val_loss:.6f})")

    return model


# Data Preparation

In [None]:
# load data
train_data = load_data('data/train/train/train.npz')

In [None]:
# prepare train data
X, y, label = prepare_train_data(train_data)

(125000,)
Train data: 125000 captions, 125000 images


In [5]:
X.shape, y.shape

(torch.Size([125000, 1024]), torch.Size([125000, 1536]))

In [6]:
# split into train and val
DATASET_SIZE = len(X)
n_train = int(0.9 * len(X))
TRAIN_SPLIT = torch.zeros(len(X), dtype=torch.bool)
TRAIN_SPLIT[:n_train] = 1
X_train, X_val = X[TRAIN_SPLIT], X[~TRAIN_SPLIT]
y_train, y_val = y[TRAIN_SPLIT], y[~TRAIN_SPLIT]
labels_train, labels_val = label[TRAIN_SPLIT], label[~TRAIN_SPLIT]

X_train.shape, X_val.shape, y_train.shape, y_val.shape, labels_train.shape, labels_val.shape

(torch.Size([112500, 1024]),
 torch.Size([12500, 1024]),
 torch.Size([112500, 1536]),
 torch.Size([12500, 1536]),
 torch.Size([112500, 25000]),
 torch.Size([12500, 25000]))

In [7]:
# standardize features
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_val_scaled = scaler_X.transform(X_val)

# standardize targets
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_val_scaled = scaler_y.transform(y_val)

In [8]:
# save scalers as a pickle file
with open('scaler_X.pkl', 'wb') as f:
    pickle.dump(scaler_X, f)

with open('scaler_y.pkl', 'wb') as f:
    pickle.dump(scaler_y, f)

In [9]:
# save original train/val data and labels
torch.save({'captions/embeddings': X_train, 'images/embeddings': y_train, 'captions/label': labels_train}, 'data/X_y_labels_train.pt')
torch.save({'captions/embeddings': X_val, 'images/embeddings': y_val, 'captions/label': labels_val}, 'data/X_y_labels_val.pt')

# save scaled train/val data
torch.save({'captions/embeddings_standartized': torch.from_numpy(X_train_scaled).float(), 'images/embeddings_standartized': torch.from_numpy(y_train_scaled).float(), 'captions/label': labels_train}, 'data/X_y_labels_train_scaled.pt')
torch.save({'captions/embeddings_standartized': torch.from_numpy(X_val_scaled).float(), 'images/embeddings_standartized': torch.from_numpy(y_val_scaled).float(), 'captions/label': labels_val}, 'data/X_y_labels_val_scaled.pt')

*Read Data Back*

In [3]:
# read scaled data back
train = torch.load('data/X_y_labels_train_scaled.pt')
val = torch.load('data/X_y_labels_val_scaled.pt')

In [5]:
train

{'captions/embeddings_standartized': tensor([[-1.0484, -0.2398, -1.1937,  ...,  1.9227, -0.8140,  0.6990],
         [ 0.5970, -1.0902, -1.0971,  ...,  0.5235, -0.6814,  0.5296],
         [-0.8602, -0.6004, -1.7190,  ..., -0.0170,  0.2435, -0.2536],
         ...,
         [ 0.7032, -0.8866, -0.9816,  ..., -0.3574,  1.2990, -1.6193],
         [ 1.9398, -1.4414, -0.8151,  ...,  1.1098,  1.5640, -1.2967],
         [ 0.5412, -1.3856, -2.1370,  ...,  1.5716,  1.2756,  0.0366]]),
 'images/embeddings_standartized': tensor([[-1.0878, -1.4801, -0.0469,  ...,  0.5059,  0.0360,  2.7207],
         [-1.0878, -1.4801, -0.0469,  ...,  0.5059,  0.0360,  2.7207],
         [-1.0878, -1.4801, -0.0469,  ...,  0.5059,  0.0360,  2.7207],
         ...,
         [-1.5101,  1.4579,  1.0158,  ...,  0.5842, -1.9490,  1.4042],
         [-1.5101,  1.4579,  1.0158,  ...,  0.5842, -1.9490,  1.4042],
         [-1.5101,  1.4579,  1.0158,  ...,  0.5842, -1.9490,  1.4042]]),
 'captions/label': tensor([[ True, False, Fals

In [8]:
X_train_scaled = train['captions/embeddings_standartized']
y_train_scaled = train['images/embeddings_standartized']
labels_train = train['captions/label']


X_val_scaled = val['captions/embeddings_standartized']
y_val_scaled = val['images/embeddings_standartized']
labels_val = val['captions/label']

In [9]:
del train, val

import gc

gc.collect()

7

*Make Padding*

In [7]:
# Calculate padding needed
padding_needed = 1536 - 1024  # This is 512

In [10]:
X_train_scaled = F.pad(X_train_scaled, (0, padding_needed))
X_val_scaled = F.pad(X_val_scaled, (0, padding_needed))

In [12]:
X_train_scaled.shape, X_val_scaled.shape

(torch.Size([112500, 1536]), torch.Size([12500, 1536]))

# Model Training

In [19]:
# Configuration
MODEL_PATH = "models/mlp_v1.pth"
EPOCHS = 20
BATCH_SIZE = 256
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#
class MLP(nn.Module):
    def __init__(self, input_dim=1536, output_dim=1536, hidden_dim=2048):
        super().__init__()
        self.net = nn.Sequential(
            # nn.Linear(input_dim, output_dim),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        return self.net(x)

In [20]:
# Initialize model
model = MLP().to(DEVICE)
print(f"   Parameters: {sum(p.numel() for p in model.parameters()):,}")


train_loader = DataLoader(TensorDataset(X_train_scaled, y_train_scaled), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_scaled, y_val_scaled), batch_size=BATCH_SIZE)
X_train_scaled.shape, X_val_scaled.shape

# Train
print("\n3. Training...")
model = train_model(model, train_loader, val_loader, DEVICE, EPOCHS, LR, MODEL_PATH)

# Load best model for evaluation
model.load_state_dict(torch.load(MODEL_PATH))

   Parameters: 6,295,040

3. Training...


Epoch 1/20: 100%|██████████| 440/440 [01:11<00:00,  6.14it/s]


Epoch 1: Train Loss = 0.791533, Val Loss = 0.781824
  ✓ Saved best model (val_loss=0.781824)


Epoch 2/20: 100%|██████████| 440/440 [01:28<00:00,  5.00it/s]


Epoch 2: Train Loss = 0.748296, Val Loss = 0.770284
  ✓ Saved best model (val_loss=0.770284)


Epoch 3/20: 100%|██████████| 440/440 [00:43<00:00, 10.14it/s]


Epoch 3: Train Loss = 0.730276, Val Loss = 0.765619
  ✓ Saved best model (val_loss=0.765619)


Epoch 4/20: 100%|██████████| 440/440 [00:40<00:00, 10.78it/s]


Epoch 4: Train Loss = 0.717556, Val Loss = 0.763948
  ✓ Saved best model (val_loss=0.763948)


Epoch 5/20: 100%|██████████| 440/440 [00:43<00:00, 10.10it/s]


Epoch 5: Train Loss = 0.707239, Val Loss = 0.763132
  ✓ Saved best model (val_loss=0.763132)


Epoch 6/20: 100%|██████████| 440/440 [00:46<00:00,  9.47it/s]


Epoch 6: Train Loss = 0.698404, Val Loss = 0.762343
  ✓ Saved best model (val_loss=0.762343)


Epoch 7/20: 100%|██████████| 440/440 [00:42<00:00, 10.34it/s]


Epoch 7: Train Loss = 0.690626, Val Loss = 0.763222


Epoch 8/20: 100%|██████████| 440/440 [01:19<00:00,  5.52it/s]


Epoch 8: Train Loss = 0.683179, Val Loss = 0.764550


Epoch 9/20: 100%|██████████| 440/440 [00:40<00:00, 10.94it/s]


Epoch 9: Train Loss = 0.676641, Val Loss = 0.765372


Epoch 10/20: 100%|██████████| 440/440 [00:39<00:00, 11.27it/s]


Epoch 10: Train Loss = 0.670239, Val Loss = 0.767353


Epoch 11/20: 100%|██████████| 440/440 [01:21<00:00,  5.40it/s]


Epoch 11: Train Loss = 0.664453, Val Loss = 0.768423


Epoch 12/20: 100%|██████████| 440/440 [01:15<00:00,  5.83it/s]


Epoch 12: Train Loss = 0.658855, Val Loss = 0.771243


Epoch 13/20: 100%|██████████| 440/440 [00:42<00:00, 10.48it/s]


Epoch 13: Train Loss = 0.653620, Val Loss = 0.772506


Epoch 14/20: 100%|██████████| 440/440 [00:42<00:00, 10.48it/s]


Epoch 14: Train Loss = 0.648486, Val Loss = 0.772450


Epoch 15/20: 100%|██████████| 440/440 [00:48<00:00,  9.12it/s]


Epoch 15: Train Loss = 0.643483, Val Loss = 0.777190


Epoch 16/20: 100%|██████████| 440/440 [00:44<00:00,  9.86it/s]


Epoch 16: Train Loss = 0.638990, Val Loss = 0.778562


Epoch 17/20: 100%|██████████| 440/440 [00:44<00:00,  9.92it/s]


Epoch 17: Train Loss = 0.634541, Val Loss = 0.779180


Epoch 18/20: 100%|██████████| 440/440 [00:42<00:00, 10.42it/s]


Epoch 18: Train Loss = 0.630257, Val Loss = 0.781181


Epoch 19/20: 100%|██████████| 440/440 [00:43<00:00, 10.07it/s]


Epoch 19: Train Loss = 0.626201, Val Loss = 0.784228


Epoch 20/20: 100%|██████████| 440/440 [00:43<00:00, 10.14it/s]


Epoch 20: Train Loss = 0.622398, Val Loss = 0.783102


<All keys matched successfully>

# Generate Submission

In [None]:
# read scaler for features
with open('scaler_X.pkl', 'rb') as f:
    sc_x = pickle.load(f)

In [None]:
# Load best model for evaluation
model.load_state_dict(torch.load(MODEL_PATH))

In [28]:
test_data = load_data("data/test/test/test.clean.npz")

test_embds = test_data['captions/embeddings']
test_embds = sc_x.transform(test_embds) # Scale the test caption embeddings
test_embds = torch.from_numpy(test_embds).float()
# padding_needed = 1536 - 1024  # This is 512
test_embds = F.pad(test_embds, (0, padding_needed)) # make zero padding

with torch.no_grad():
    pred_embds = model(test_embds.to(DEVICE)).cpu()

submission = generate_submission(test_data['captions/ids'], pred_embds, 'submissions/submission_v1.csv')
print(f"Model saved to: {MODEL_PATH}")

Generating submission file...
✓ Saved submission to submissions/submission_v1.csv
Model saved to: models/mlp_v1.pth
