I trained all the models on Kaggle and hence the file paths are according to them. If running locally, replace them with the commented one beside them (they are fixed in beginning of the code only).<br> Also change for the test dataset accordingly

In [1]:
# Importing all the required libraries
import os
import numpy as np
from scipy.sparse import issparse
from PIL import Image
import tqdm
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
data_path = '/kaggle/input/tabular-data/' # '../data/processed/'
image_dir = '/kaggle/input/imagery-train/' # '../data/images/train/images/'
save_model_dir = '/kaggle/working/' # '/model_output/'

# LOADING DATA

print("Loading data...")

if os.path.exists(f'{data_path}X_train.npy'):
    X_train = np.load(f'{data_path}X_train.npy', allow_pickle=True)
    y_train = np.load(f'{data_path}y_train.npy', allow_pickle=True)
    train_ids = np.load(f'{data_path}train_ids.npy', allow_pickle=True)
    X_val = np.load(f'{data_path}X_val.npy', allow_pickle=True)
    y_val = np.load(f'{data_path}y_val.npy', allow_pickle=True)
    val_ids = np.load(f'{data_path}val_ids.npy', allow_pickle=True)

    # Handle Sparse/Wrapped Inputs
    if X_train.shape == (): X_train = X_train.item()
    if X_val.shape == (): X_val = X_val.item()
    if issparse(X_train): X_train = X_train.toarray().astype(np.float32)
    else: X_train = np.array(X_train, dtype=np.float32)
    if issparse(X_val): X_val = X_val.toarray().astype(np.float32)
    else: X_val = np.array(X_val, dtype=np.float32)

    y_train = y_train.astype(np.float32)
    y_val = y_val.astype(np.float32)
    print(f"Data Loaded. Train: {X_train.shape}, Val: {X_val.shape}")
else:
    raise FileNotFoundError("Please run the preprocessing script to generate .npy files first.")

Loading data...
Data Loaded. Train: (12888, 64), Val: (3222, 64)


In [3]:
# 1. Train XGBoost (The "Base" Model)
print("Training XGBoost Baseline...")


xgb_reg = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    early_stopping_rounds=50,
    random_state=42,
    n_jobs=-1
)

xgb_reg.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

xgb_train_preds = xgb_reg.predict(X_train)
xgb_val_preds = xgb_reg.predict(X_val)


train_residuals = y_train - xgb_train_preds
val_residuals = y_val - xgb_val_preds


baseline_rmse = np.sqrt(mean_squared_error(y_val, xgb_val_preds))
baseline_r2 = r2_score(y_val, xgb_val_preds)
print(f"XGBoost Baseline RMSE: {baseline_rmse:.4f}")
print(f"XGBoost Baseline R2: {baseline_r2:.4f}")

Training XGBoost Baseline...
XGBoost Baseline RMSE: 0.1847
XGBoost Baseline R2: 0.8743


In [4]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import Dataset, DataLoader

class MultimodalDataset(Dataset):
    def __init__(self, X_tabular, y_labels, property_ids, image_dir, transform=None):
        self.X_tabular = torch.FloatTensor(X_tabular)
        self.y_labels = torch.FloatTensor(y_labels)
        self.property_ids = property_ids
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.property_ids)

    def __getitem__(self, idx):
        tabular_features = self.X_tabular[idx]
        label = self.y_labels[idx]

        property_id = self.property_ids[idx]
        image_path = os.path.join(self.image_dir, f"{property_id}.png")

        try:
            image = Image.open(image_path).convert('RGB')
        except:
            image = Image.new('RGB', (224, 224), color='black') # Fallback

        if self.transform:
            image = self.transform(image)

        return {
            'image': image,
            'tabular': tabular_features,
            'label': label,
            'id': property_id
        }

class ResidualMultimodalModel(nn.Module):
    """
    Residual Multimodal Model:
    price = base_tabular_prediction + image_residual(tabular, image)
    """

    def __init__(self, tabular_dim):
        super().__init__()

        # Vision Backbone (Frozen)
        self.backbone = models.resnet50(
            weights=models.ResNet50_Weights.DEFAULT
        )
        img_dim = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()

        # Freeze ENTIRE CNN
        for p in self.backbone.parameters():
            p.requires_grad = False

        # Tabular Projection (Small)
        self.tabular_proj = nn.Sequential(
            nn.Linear(tabular_dim, 64),
            nn.ReLU()
        )

        # Residual Head (HEAVILY REGULARIZED)
        self.residual_head = nn.Sequential(
            nn.Linear(img_dim + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 1)
        )

    def forward(self, image, tabular):
        with torch.no_grad():
            img_feats = self.backbone(image)  # frozen

        tab_feats = self.tabular_proj(tabular)

        x = torch.cat([img_feats, tab_feats], dim=1)
        residual = self.residual_head(x)

        return residual.squeeze(-1)

In [6]:
# Standard Transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset_res = MultimodalDataset(X_train, train_residuals, train_ids, image_dir, transform)
val_dataset_res = MultimodalDataset(X_val, val_residuals, val_ids, image_dir, transform)

train_loader_res = DataLoader(train_dataset_res, batch_size=32, shuffle=True, num_workers=2)
val_loader_res = DataLoader(val_dataset_res, batch_size=32, shuffle=False, num_workers=2)


# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResidualMultimodalModel(tabular_dim=X_train.shape[1]).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=1e-2
)

# Training Loop
epochs = 15
best_rmse = float("inf")

for epoch in range(epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm.tqdm(train_loader_res, desc=f"Epoch {epoch+1}", leave=False):
        images = batch['image'].to(device)
        tabular = batch['tabular'].to(device)
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        preds = model(images, tabular)
        loss = criterion(preds, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # VALIDATION
    model.eval()
    cnn_residual_preds = []

    with torch.no_grad():
        for batch in val_loader_res:
            images = batch['image'].to(device)
            tabular = batch['tabular'].to(device)
            preds = model(images, tabular)
            cnn_residual_preds.extend(preds.cpu().numpy())

    cnn_residual_preds = np.array(cnn_residual_preds)

    # COMBINE WITH TABULAR BASE
    alpha = 0.5
    final_preds = xgb_val_preds + alpha * cnn_residual_preds

    rmse = np.sqrt(mean_squared_error(y_val, final_preds))
    r2   = r2_score(y_val, final_preds)

    print(
        f"Epoch {epoch+1} | "
        f"Train Residual Loss: {train_loss/len(train_loader_res):.6f} | "
        f"Combined RMSE: {rmse:.4f} | "
        f"R²: {r2:.4f}"
    )

    if rmse < best_rmse:
        best_rmse = rmse
        torch.save(model.state_dict(), f"{save_model_dir}best_residual_multimodal.pth")
        print(f"  Saved new best model (RMSE: {best_rmse:.4f})")

                                                          

Epoch 1 | Train Residual Loss: 0.015399 | Combined RMSE: 0.1842 | R²: 0.8750
  Saved new best model (RMSE: 0.1842)


                                                          

Epoch 2 | Train Residual Loss: 0.014765 | Combined RMSE: 0.1838 | R²: 0.8754
  Saved new best model (RMSE: 0.1838)


                                                          

Epoch 3 | Train Residual Loss: 0.014570 | Combined RMSE: 0.1837 | R²: 0.8756
  Saved new best model (RMSE: 0.1837)


                                                          

Epoch 4 | Train Residual Loss: 0.014299 | Combined RMSE: 0.1837 | R²: 0.8756


                                                          

Epoch 5 | Train Residual Loss: 0.014167 | Combined RMSE: 0.1834 | R²: 0.8760
  Saved new best model (RMSE: 0.1834)


                                                          

Epoch 6 | Train Residual Loss: 0.013771 | Combined RMSE: 0.1834 | R²: 0.8760


                                                          

Epoch 7 | Train Residual Loss: 0.013530 | Combined RMSE: 0.1836 | R²: 0.8758


                                                          

Epoch 8 | Train Residual Loss: 0.013047 | Combined RMSE: 0.1830 | R²: 0.8766
  Saved new best model (RMSE: 0.1830)


                                                          

Epoch 9 | Train Residual Loss: 0.012594 | Combined RMSE: 0.1827 | R²: 0.8770
  Saved new best model (RMSE: 0.1827)


                                                           

Epoch 10 | Train Residual Loss: 0.012296 | Combined RMSE: 0.1835 | R²: 0.8759


                                                           

Epoch 11 | Train Residual Loss: 0.011847 | Combined RMSE: 0.1830 | R²: 0.8765


                                                           

Epoch 12 | Train Residual Loss: 0.011390 | Combined RMSE: 0.1827 | R²: 0.8769


                                                           

Epoch 13 | Train Residual Loss: 0.011168 | Combined RMSE: 0.1825 | R²: 0.8772
  Saved new best model (RMSE: 0.1825)


                                                           

Epoch 14 | Train Residual Loss: 0.010631 | Combined RMSE: 0.1823 | R²: 0.8775
  Saved new best model (RMSE: 0.1823)


                                                           

Epoch 15 | Train Residual Loss: 0.010401 | Combined RMSE: 0.1830 | R²: 0.8765


Apparantely, I could not figure out the visualising part for this model and hence only training part is there