### Phase 4 — Multimodal Fusion Modeling & Final Evaluation

#### Objective
Evaluate and compare multiple multimodal fusion strategies for property price prediction by combining tabular features and satellite image representations. This notebook systematically benchmarks different fusion approaches and selects the final model based on empirical performance.

#### Fusion Strategies Evaluated
- Tabular-only baseline (XGBoost)
- Late fusion (neural network)
- Early fusion (neural network)
- Two-stage hybrid fusion (tree-based)
- Residual fusion
- Stacked generalization (meta-learning)

#### Key Steps
- Load tabular features and image embeddings
- Train and evaluate each fusion strategy
- Compare models using RMSE and R² metrics
- Select the final model architecture based on validation performance

#### Output
- Performance comparison table
- Justification for the chosen final multimodal model


In [None]:
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

TABULAR_PATH = "../data/processed/train_clean.csv"
EMB_PATH = "../data/processed/image_embeddings.npy"
EMB_ID_PATH = "../data/processed/image_ids.npy"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 64
EPOCHS = 40
LR = 1e-3

SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [16]:
df = pd.read_csv(TABULAR_PATH)

img_emb = np.load(EMB_PATH)
img_ids = np.load(EMB_ID_PATH)

emb_df = pd.DataFrame(img_emb)
emb_df["id"] = img_ids

df = df.merge(emb_df, on="id", how="inner")
print("Aligned shape:", df.shape)

Aligned shape: (16406, 529)


In [4]:
# Training XGBoost on image embeddings only to check their quality
X_img = df.iloc[:, -img_emb.shape[1]:].values
y_img = df["log_price"].values

xgb_img = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb_img.fit(X_img, y_img)

preds = xgb_img.predict(X_img)

rmse = root_mean_squared_error(
    y_img,
    preds,
)

rmse

0.17622979762522115

Satellite imagery alone provides good predictive signal (RMSE ≈ 0.18), indicating that environmental context contains meaningful information about property value.

In [17]:
TABULAR_FEATURES = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'size_quality', 'living_density_ratio']

X_tab = df[TABULAR_FEATURES].values
X_img = df.iloc[:, -img_emb.shape[1]:].values
y = df["log_price"].values

In [18]:
X_tab_train, X_tab_val, X_img_train, X_img_val, y_train, y_val = train_test_split(
    X_tab, X_img, y, test_size=0.2, random_state=42
)

In [19]:
tab_scaler = StandardScaler()
X_tab_train = tab_scaler.fit_transform(X_tab_train)
X_tab_val = tab_scaler.transform(X_tab_val)

img_scaler = StandardScaler()
X_img_train = img_scaler.fit_transform(X_img_train)
X_img_val = img_scaler.transform(X_img_val)

pca = PCA(n_components=32, random_state=42)
X_img_train_nn = pca.fit_transform(X_img_train)
X_img_val_nn = pca.transform(X_img_val)

IMG_DIM = X_img_train_nn.shape[1]

#### Tabular Baseline (XGBoost)

In [8]:
tab_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

tab_model.fit(X_tab_train, y_train)
tab_preds = tab_model.predict(X_tab_val)

rmse_tab = root_mean_squared_error(y_val, tab_preds)
r2_tab = r2_score(y_val, tab_preds)

rmse_tab, r2_tab

(0.17074308510789515, 0.8949758774393711)

#### Late Fusion Neural Network

In [20]:
class LateFusionModel(nn.Module):
    def __init__(self, tab_dim, img_dim):
        super().__init__()
        self.tab = nn.Sequential(
            nn.Linear(tab_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32)
        )
        self.img = nn.Sequential(
            nn.Linear(img_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 32)
        )
        self.reg = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, xt, xi):
        return self.reg(torch.cat([self.tab(xt), self.img(xi)], dim=1)).squeeze()

In [None]:
dataset = TensorDataset(
    torch.tensor(X_tab_train, dtype=torch.float32),
    torch.tensor(X_img_train_nn, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

loader = DataLoader(dataset, batch_size=64, shuffle=True)

model = LateFusionModel(X_tab_train.shape[1], IMG_DIM).to(DEVICE) 
opt = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4) 
loss_fn = nn.MSELoss()

In [22]:
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    for xt, xi, y in loader:
        xt, xi, y = xt.to(DEVICE), xi.to(DEVICE), y.to(DEVICE)

        opt.zero_grad()
        preds = model(xt, xi)
        loss = loss_fn(preds, y)
        loss.backward()
        opt.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1} | Train Loss: {epoch_loss/len(loader):.4f}")

model.eval()
with torch.no_grad():
    nn_preds = model(
        torch.tensor(X_tab_val, dtype=torch.float32).to(DEVICE),
        torch.tensor(X_img_val_nn, dtype=torch.float32).to(DEVICE)
    ).cpu().numpy()

rmse_late = root_mean_squared_error(y_val, nn_preds)
r2_late = r2_score(y_val, nn_preds)

rmse_late, r2_late

Epoch 1 | Train Loss: 20.2788
Epoch 2 | Train Loss: 1.7654
Epoch 3 | Train Loss: 0.8981
Epoch 4 | Train Loss: 0.4102
Epoch 5 | Train Loss: 0.1951
Epoch 6 | Train Loss: 0.1169
Epoch 7 | Train Loss: 0.0900
Epoch 8 | Train Loss: 0.0760
Epoch 9 | Train Loss: 0.0686
Epoch 10 | Train Loss: 0.0608
Epoch 11 | Train Loss: 0.0571
Epoch 12 | Train Loss: 0.0558
Epoch 13 | Train Loss: 0.0529
Epoch 14 | Train Loss: 0.0511
Epoch 15 | Train Loss: 0.0481
Epoch 16 | Train Loss: 0.0483
Epoch 17 | Train Loss: 0.0466
Epoch 18 | Train Loss: 0.0435
Epoch 19 | Train Loss: 0.0447
Epoch 20 | Train Loss: 0.0464
Epoch 21 | Train Loss: 0.0426
Epoch 22 | Train Loss: 0.0410
Epoch 23 | Train Loss: 0.0467
Epoch 24 | Train Loss: 0.0409
Epoch 25 | Train Loss: 0.0424
Epoch 26 | Train Loss: 0.0442
Epoch 27 | Train Loss: 0.0418
Epoch 28 | Train Loss: 0.0482
Epoch 29 | Train Loss: 0.0452
Epoch 30 | Train Loss: 0.0401
Epoch 31 | Train Loss: 0.0408
Epoch 32 | Train Loss: 0.0383
Epoch 33 | Train Loss: 0.0363
Epoch 34 | Train L

(0.21436719254998202, 0.8344537094599237)

#### Early Fusion Neural Network

In [23]:
class EarlyFusionModel(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze()

In [24]:
X_train_early = np.hstack([X_tab_train, X_img_train_nn])
X_val_early = np.hstack([X_tab_val, X_img_val_nn])

train_dataset = TensorDataset(
    torch.tensor(X_train_early, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32)
)

train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True
)

model_early = EarlyFusionModel(X_train_early.shape[1]).to(DEVICE)
optimizer = torch.optim.Adam(model_early.parameters(), lr=LR)

In [25]:
for epoch in range(EPOCHS):
    model_early.train()
    epoch_loss = 0.0

    for xb, yb in train_loader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)

        optimizer.zero_grad()
        preds = model_early(xb)
        loss = loss_fn(preds, yb)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {epoch_loss / len(train_loader):.4f}")

rmse_early = root_mean_squared_error(
    y_val,
    model_early(torch.tensor(X_val_early, dtype=torch.float32).to(DEVICE)).cpu().detach().numpy(),
)

r2_early = r2_score(
    y_val,
    model_early(torch.tensor(X_val_early, dtype=torch.float32).to(DEVICE)).cpu().detach().numpy()
)

rmse_early, r2_early

Epoch 1/40 | Train Loss: 20.9034
Epoch 2/40 | Train Loss: 2.8816
Epoch 3/40 | Train Loss: 2.1501
Epoch 4/40 | Train Loss: 1.6333
Epoch 5/40 | Train Loss: 1.2337
Epoch 6/40 | Train Loss: 0.9274
Epoch 7/40 | Train Loss: 0.6753
Epoch 8/40 | Train Loss: 0.4608
Epoch 9/40 | Train Loss: 0.3236
Epoch 10/40 | Train Loss: 0.2369
Epoch 11/40 | Train Loss: 0.1725
Epoch 12/40 | Train Loss: 0.1426
Epoch 13/40 | Train Loss: 0.1209
Epoch 14/40 | Train Loss: 0.1097
Epoch 15/40 | Train Loss: 0.1028
Epoch 16/40 | Train Loss: 0.0971
Epoch 17/40 | Train Loss: 0.0879
Epoch 18/40 | Train Loss: 0.0810
Epoch 19/40 | Train Loss: 0.0777
Epoch 20/40 | Train Loss: 0.0789
Epoch 21/40 | Train Loss: 0.0732
Epoch 22/40 | Train Loss: 0.0724
Epoch 23/40 | Train Loss: 0.0729
Epoch 24/40 | Train Loss: 0.0720
Epoch 25/40 | Train Loss: 0.0718
Epoch 26/40 | Train Loss: 0.0659
Epoch 27/40 | Train Loss: 0.0647
Epoch 28/40 | Train Loss: 0.0656
Epoch 29/40 | Train Loss: 0.0716
Epoch 30/40 | Train Loss: 0.0615
Epoch 31/40 | Trai

(0.29251761839223694, 0.6917473130232774)

#### Two-Stage Hybrid (Tree-Based)

In [26]:
X_hybrid_train = np.hstack([X_tab_train, X_img_train])
X_hybrid_val = np.hstack([X_tab_val, X_img_val])

hybrid_model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

hybrid_model.fit(X_hybrid_train, y_train)
hybrid_preds = hybrid_model.predict(X_hybrid_val)

rmse_hybrid = root_mean_squared_error(y_val, hybrid_preds)
r2_hybrid = r2_score(y_val, hybrid_preds)

rmse_hybrid, r2_hybrid

(0.16584030156480697, 0.9009206905878419)

#### Residual Fusion

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_residuals = np.zeros_like(y_train)

for tr_idx, va_idx in kf.split(X_tab_train):
    X_tr, X_va = X_tab_train[tr_idx], X_tab_train[va_idx]
    y_tr, y_va = y_train[tr_idx], y_train[va_idx]

    tab_cv = XGBRegressor(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    tab_cv.fit(X_tr, y_tr)
    oof_residuals[va_idx] = y_va - tab_cv.predict(X_va)

In [28]:
residual_train = oof_residuals 
residual_val = y_val - tab_preds

res_model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

res_model.fit(X_img_train, residual_train)
res_preds= res_model.predict(X_img_val)

final_preds = tab_preds + res_preds
rmse_res = root_mean_squared_error(y_val, final_preds)
r2_res = r2_score(y_val, final_preds)
rmse_res, r2_res

(0.15535057527169696, 0.9130582165147851)

#### Stacked Generalization (Meta-Learning)

In [31]:
img_only_model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

img_only_model.fit(X_img_train, y_train)

img_preds = img_only_model.predict(X_img_val)

In [32]:
meta_X_train = np.column_stack([
    tab_model.predict(X_tab_train),
    img_only_model.predict(X_img_train)
])

meta_X_val = np.column_stack([
    tab_preds,
    img_preds
])

meta_model = LinearRegression()
meta_model.fit(meta_X_train, y_train)

stacked_preds = meta_model.predict(meta_X_val)

rmse_stacked = root_mean_squared_error(y_val, stacked_preds)
r2_stacked = r2_score(y_val, stacked_preds)

rmse_stacked, r2_stacked

(0.23407638692273325, 0.8026132375191068)

#### Comparison

In [33]:
results = pd.DataFrame({
    "Model": [
        "Tabular (XGB)",
        "Late Fusion (NN)",
        "Early Fusion (NN)",
        "Hybrid Fusion (XGB)",
        "Residual Fusion",
        "Stacked Generalization"
    ],
    "RMSE": [
        rmse_tab,
        rmse_late,
        rmse_early,
        rmse_hybrid,
        rmse_res,
        rmse_stacked
    ],
    "R2": [
        r2_tab,
        r2_late,
        r2_early,
        r2_hybrid,
        r2_res,
        r2_stacked
    ]
})

results.sort_values("RMSE")

Unnamed: 0,Model,RMSE,R2
4,Residual Fusion,0.155351,0.913058
3,Hybrid Fusion (XGB),0.16584,0.900921
0,Tabular (XGB),0.170743,0.894976
1,Late Fusion (NN),0.214367,0.834454
5,Stacked Generalization,0.234076,0.802613
2,Early Fusion (NN),0.292518,0.691747
