In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from pathlib import Path
from tqdm import tqdm

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle

import gc


from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

In [2]:
def load_data(path):
    """Load every array stored in an ``.npz`` archive into a plain dict.

    Args:
        path: Location of the ``.npz`` file (anything ``np.load`` accepts).

    Returns:
        dict mapping each array name in the archive to its fully loaded
        ``np.ndarray``.
    """
    # NOTE(review): allow_pickle=True lets the archive execute arbitrary code
    # while loading; only point this at files from a trusted source.
    # The context manager closes the underlying zip file handle once all
    # arrays have been materialized, instead of leaving it open.
    with np.load(path, allow_pickle=True) as archive:
        return {name: archive[name] for name in archive.files}



def prepare_train_data(data):
    """Build row-aligned caption/image training tensors from the loaded dict.

    Reads ``data['captions/embeddings']``, ``data['images/embeddings']`` and
    ``data['captions/label']`` (an N x M caption-by-image indicator matrix).
    Image embeddings are repeated so that row ``i`` of ``y`` is the image
    marked for caption ``i``.

    Args:
        data: dict of NumPy arrays holding the three keys above.

    Returns:
        (X, y, label): float caption embeddings, float image embeddings
        repeated per caption, and the label matrix as a bool tensor.

    Raises:
        AssertionError: if the number of positive labels differs from the
            number of captions.
        ValueError: if some caption does not have exactly one positive label
            (the caption/image rows would otherwise be silently misaligned).
    """
    caption_embd = data['captions/embeddings']
    image_embd = data['images/embeddings']
    label = data['captions/label']  # N x M

    # np.nonzero walks the matrix row by row, so the column indices line up
    # with caption rows only when every row holds exactly one positive entry.
    caption_idx, label_idx = np.nonzero(label)
    print(label_idx.shape)

    # Repeat the image embeddings according to the label.
    image_embd = image_embd[label_idx]
    assert caption_embd.shape[0] == image_embd.shape[0], "Mismatch in number of caption and image embeddings"

    # The count check above cannot see a caption with no label offset by
    # another caption with two; comparing row indices catches that case.
    if not np.array_equal(caption_idx, np.arange(label.shape[0])):
        raise ValueError(
            "Every caption must have exactly one positive label; "
            "caption and image embeddings would be misaligned"
        )

    X = torch.from_numpy(caption_embd).float()
    # Each caption's target is its corresponding image embedding.
    y = torch.from_numpy(image_embd).float()
    label = torch.from_numpy(label).bool()

    print(f"Train data: {len(X)} captions, {len(image_embd)} images")
    return X, y, label

In [3]:
# Read the training archive and convert it straight into aligned tensors.
# The raw dict is never bound to a name, so nothing has to be deleted by hand.
# (On Colab the archive lives at '/content/data/train/train.npz' instead.)
X, y, labels = prepare_train_data(load_data('data/train/train/train.npz'))
gc.collect()
print(f"Data shapes - X: {X.shape}, y: {y.shape}, labels: {labels.shape}")

(125000,)
Train data: 125000 captions, 125000 images
Data shapes - X: torch.Size([125000, 1024]), y: torch.Size([125000, 1536]), labels: torch.Size([125000, 25000])


In [4]:
def _compact_label_indices(label_subset):
    """Return, for each positive entry, its column index among the columns
    that have at least one positive entry in ``label_subset``."""
    present = label_subset.sum(dim=0) > 0
    return torch.nonzero(label_subset[:, present])[:, 1]


def preprocess_data(X, y, label, train_split=0.9, seed=None):
    """Randomly split the samples into train/val and standardize both sides.

    The scalers are fitted on the training rows only and then applied to the
    validation rows.

    Args:
        X: caption-embedding tensor, one row per sample.
        y: image-embedding tensor, row-aligned with ``X``.
        label: bool tensor, row-aligned with ``X``, marking the image column
            that belongs to each sample.
        train_split: fraction of the rows assigned to the training split.
        seed: optional int. When given, the permutation is drawn from a
            dedicated generator seeded with it, making the split
            reproducible. When ``None``, the global torch RNG is used.

    Returns:
        Tuple ``(X_train, X_val, y_train, y_val, X_train_scaled,
        X_val_scaled, y_train_scaled, y_val_scaled, labels_train_ind,
        labels_val_ind, sc_x, sc_y)``. The ``labels_*_ind`` tensors hold
        column positions counted among the images that occur in that split,
        not columns of the full ``label`` matrix.

    Raises:
        ValueError: if ``train_split`` leaves the train or validation side
            empty.
    """
    n_samples = len(X)
    n_train = int(train_split * n_samples)
    if not 0 < n_train < n_samples:
        raise ValueError(
            f"train_split={train_split} leaves an empty train or validation "
            f"split for {n_samples} samples"
        )

    if seed is None:
        perm = torch.randperm(n_samples)
    else:
        perm = torch.randperm(n_samples, generator=torch.Generator().manual_seed(seed))
    train_indices = perm[:n_train]
    val_indices = perm[n_train:]

    # NOTE(review): the split is drawn over sample rows, so several captions
    # of one image can fall on both sides of the split -- confirm that this
    # overlap is acceptable for validation.
    X_train, X_val = X[train_indices], X[val_indices]
    y_train, y_val = y[train_indices], y[val_indices]

    # Fit on the training rows only, then reuse the statistics for validation.
    sc_x = StandardScaler()
    X_train_scaled = torch.from_numpy(sc_x.fit_transform(X_train.numpy())).float()
    X_val_scaled = torch.from_numpy(sc_x.transform(X_val.numpy())).float()

    sc_y = StandardScaler()
    y_train_scaled = torch.from_numpy(sc_y.fit_transform(y_train.numpy())).float()
    y_val_scaled = torch.from_numpy(sc_y.transform(y_val.numpy())).float()

    # Each label subset is a temporary that lives only for the helper call,
    # so the train and val copies of the label matrix never coexist.
    labels_train_ind = _compact_label_indices(label[train_indices])
    labels_val_ind = _compact_label_indices(label[val_indices])

    print(" Preprocessed data:")
    print(f"  Train: X={X_train_scaled.shape}, y={y_train_scaled.shape}")
    print(f"  Val:   X={X_val_scaled.shape}, y={y_val_scaled.shape}")
    print(f"  Train: labels={labels_train_ind.shape}")
    print(f"  Val:   labels={labels_val_ind.shape}")

    return X_train, X_val, y_train, y_val, X_train_scaled, X_val_scaled, y_train_scaled, y_val_scaled, labels_train_ind, labels_val_ind, sc_x, sc_y

# Seed the global torch RNG so the random train/val permutation drawn inside
# preprocess_data is the same on every Restart & Run All; the scalers and
# tensors saved below then always describe the same split.
SPLIT_SEED = 42
torch.manual_seed(SPLIT_SEED)

(
    X_train, X_val, y_train, y_val,
    X_train_scaled, X_val_scaled, y_train_scaled, y_val_scaled,
    labels_train, labels_val,
    sc_x, sc_y,
) = preprocess_data(X, y, labels)

 Preprocessed data:
  Train: X=torch.Size([112500, 1024]), y=torch.Size([112500, 1536])
  Val:   X=torch.Size([12500, 1024]), y=torch.Size([12500, 1536])
  Train: labels=torch.Size([112500])
  Val:   labels=torch.Size([12500])


In [5]:
# Persist the two fitted scalers, one pickle file each, so the same
# standardization can be re-applied outside this notebook.
Path('scaler_x_15_11_25.pkl').write_bytes(pickle.dumps(sc_x))
Path('scaler_y_15_11_25.pkl').write_bytes(pickle.dumps(sc_y))

In [6]:
# Write the raw and standardized splits plus the per-split label indices
# into a single checkpoint file.
torch.save(
    dict(
        X_train=X_train,
        X_val=X_val,
        y_train=y_train,
        y_val=y_val,
        X_train_scaled=X_train_scaled,
        X_val_scaled=X_val_scaled,
        y_train_scaled=y_train_scaled,
        y_val_scaled=y_val_scaled,
        labels_train=labels_train,
        labels_val=labels_val,
    ),
    'data/sandros_data_15_11_25.pt',
)

In [None]:
# Restore the saved splits from disk; lets a fresh kernel skip the
# preprocessing cells above.
data = torch.load('data/sandros_data_15_11_25.pt')

# Caption embeddings (raw and standardized).
X_train, X_train_scaled = data['X_train'], data['X_train_scaled']
X_val, X_val_scaled = data['X_val'], data['X_val_scaled']

# Image embeddings (raw and standardized).
y_train, y_train_scaled = data['y_train'], data['y_train_scaled']
y_val, y_val_scaled = data['y_val'], data['y_val_scaled']

# Per-split label indices.
labels_train, labels_val = data['labels_train'], data['labels_val']