In [1]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score


Mounted at /content/drive


In [2]:
BASE_PATH = "/content/drive/MyDrive/IIT_Roorkee_Project/data/"

X_train = pd.read_csv(BASE_PATH + "X_train.csv")
X_val   = pd.read_csv(BASE_PATH + "X_val.csv")
y_train = pd.read_csv(BASE_PATH + "y_train.csv").values.ravel()
y_val   = pd.read_csv(BASE_PATH + "y_val.csv").values.ravel()


In [3]:
image_embeddings = np.load(BASE_PATH + "image_embeddings.npy")
image_ids = pd.read_csv(BASE_PATH + "image_ids.csv")

image_ids['id'] = image_ids['image_id'].str.replace('.jpg','').astype(int)
id2emb = dict(zip(image_ids['id'], image_embeddings))


In [4]:
img_train = np.vstack([id2emb[i] for i in X_train['id']])
img_val   = np.vstack([id2emb[i] for i in X_val['id']])

# Drop id from tabular
X_train = X_train.drop(columns=['id'])
X_val   = X_val.drop(columns=['id'])


In [5]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)


In [6]:
class MultiDataset(Dataset):
    def __init__(self, xt, xi, y):
        self.xt = torch.tensor(xt, dtype=torch.float32)
        self.xi = torch.tensor(xi, dtype=torch.float32)
        self.y  = torch.tensor(y, dtype=torch.float32).view(-1,1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.xt[idx], self.xi[idx], self.y[idx]


In [7]:
train_ds = MultiDataset(X_train_scaled, img_train, y_train)
val_ds   = MultiDataset(X_val_scaled, img_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)


In [8]:
class ShallowMultimodal(nn.Module):
    def __init__(self, tab_dim):
        super().__init__()

        # Weak tabular branch (same as shallow MLP)
        self.tab = nn.Sequential(
            nn.Linear(tab_dim, 32),
            nn.ReLU()
        )

        # Image branch (stronger)
        self.img = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3)
        )

        # Fusion head
        self.out = nn.Sequential(
            nn.Linear(32 + 128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, xt, xi):
        t = self.tab(xt)
        i = self.img(xi)
        return self.out(torch.cat([t, i], dim=1))


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = ShallowMultimodal(X_train_scaled.shape[1]).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

EPOCHS = 40

for epoch in range(EPOCHS):
    model.train()
    loss_sum = 0

    for xt, xi, yb in train_loader:
        xt, xi, yb = xt.to(device), xi.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(xt, xi)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()

    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS}, Train MSE: {loss_sum/len(train_loader):.4f}")


Epoch 10/40, Train MSE: 0.2176
Epoch 20/40, Train MSE: 0.0535
Epoch 30/40, Train MSE: 0.0446
Epoch 40/40, Train MSE: 0.0365


In [10]:
model.eval()
val_preds, val_true = [], []

with torch.no_grad():
    for xt, xi, yb in val_loader:
        xt, xi = xt.to(device), xi.to(device)
        preds = model(xt, xi).cpu().numpy()

        val_preds.extend(preds)
        val_true.extend(yb.numpy())

val_preds = np.array(val_preds).ravel()
val_true  = np.array(val_true).ravel()

rmse = np.sqrt(mean_squared_error(val_true, val_preds))
r2   = r2_score(val_true, val_preds)

print(f"Shallow Multimodal RMSE: {rmse:.4f}")
print(f"Shallow Multimodal R²  : {r2:.4f}")


Shallow Multimodal RMSE: 0.2309
Shallow Multimodal R²  : 0.8068
