### Phase 6 — Final Inference & Prediction File Generation

#### Objective
Run the finalized multimodal pipeline on the held-out test dataset and generate the final price predictions for submission. This notebook strictly performs **inference only**, using previously trained models.

#### Key Steps
- Load trained tabular and residual image models
- Fetch and preprocess satellite images for the test set
- Generate tabular predictions and image-based residual corrections
- Combine predictions into final price estimates
- Export results in the required CSV format

#### Output
- `outputs/predictions.csv` containing:
  - `id`
  - `predicted_price`

In [24]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.models import resnet18, ResNet18_Weights

from xgboost import XGBRegressor, Booster
from joblib import load

# Run on the GPU when torch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Inputs: the cleaned test table and the directory holding the test images.
TEST_DATA_PATH = "../data/processed/test_clean.csv"
IMAGE_DIR = "../data/images/test"

# Previously trained artifacts loaded by the cells below.
TABULAR_MODEL_PATH = "../models/tabular_xgb.json"
TABULAR_SCALER_PATH = "../models/tabular_scaler.joblib"
RESIDUAL_IMAGE_MODEL_PATH = "../models/residual_cnn.pt"

# Directory the predictions CSV is written to.
OUTPUT_DIR = "../outputs"

In [25]:
# Read the cleaned test table; the cells below take features and ids from it.
test_df = pd.read_csv(TEST_DATA_PATH)
print("Test samples:", test_df.shape[0])
test_df.head()

Test samples: 5404


Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,lat,long,sqft_living15,sqft_lot15,size_quality,living_density_ratio
0,2591820310,4,2.25,2070,8893,2.0,0,0,4,8,47.4388,-122.162,2390,7700,16560,0.865747
1,7974200820,5,3.0,2900,6730,1.0,0,0,5,8,47.6784,-122.285,2370,6283,23200,1.223113
2,7701450110,4,2.5,3770,10893,2.0,0,2,3,11,47.5646,-122.129,3710,9685,41470,1.015899
3,9522300010,3,3.5,4560,14608,2.0,0,2,3,12,47.6995,-122.228,4050,14226,54720,1.125648
4,9510861140,3,2.5,2550,5376,2.0,0,0,3,9,47.6647,-122.083,2250,4050,22950,1.13283


In [26]:
# Columns selected from test_df and passed, in this order, to the scaler.
# NOTE(review): presumably this must match the column order used when the
# scaler and tabular model were fitted — confirm against the training notebook.
TABULAR_FEATURES = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'size_quality', 'living_density_ratio']

# joblib's load() unpickles the file, so only load scaler artifacts that were
# produced by this project.
scaler = load(TABULAR_SCALER_PATH)

# `.values` hands the scaler a plain array (no column names).
X_tab_test = scaler.transform(
    test_df[TABULAR_FEATURES].values
)

# Load the saved model into a raw Booster first.
booster = Booster()
booster.load_model(TABULAR_MODEL_PATH)

# Attach the booster to an XGBRegressor by assigning its private attributes so
# that the sklearn-style predict() can be used.
# NOTE(review): this depends on xgboost internals (`_Booster`,
# `_estimator_type`); presumably a workaround — confirm whether the public
# `XGBRegressor.load_model` works with the installed xgboost version.
tab_model = XGBRegressor()
tab_model._Booster = booster
tab_model._estimator_type = "regressor"

# One prediction per test row. A later cell adds the image residuals to these
# values and exponentiates the sum, so they are presumably log-prices — confirm.
tab_preds = tab_model.predict(X_tab_test)
print("Tabular predictions shape:", tab_preds.shape)

Tabular predictions shape: (5404,)


In [27]:
def image_exists(pid):
    """Return True when the image path for property id ``pid`` exists.

    The id is converted with ``int()`` and looked up inside ``IMAGE_DIR`` under
    the name ``<id>.0.png``.
    """
    filename = f"{int(pid)}.0.png"
    candidate = os.path.join(IMAGE_DIR, filename)
    return os.path.exists(candidate)

In [28]:
# Per-channel statistics handed to Normalize.
# NOTE(review): these equal the commonly used ImageNet statistics; presumably
# they match the preprocessing used when the residual CNN was trained — confirm.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Resize to 224x224, convert to a tensor, then normalize each channel.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])

In [29]:
class ResidualImageModel(nn.Module):
    """Backbone feature extractor followed by a single linear regression head.

    Args:
        backbone: Module that maps an input batch to feature vectors of width
            ``feature_dim``.
        feature_dim: Width of the backbone output fed to the linear head.
            Defaults to 512, the value the head was previously hard-coded to,
            so existing checkpoints load unchanged.
    """

    def __init__(self, backbone, feature_dim=512):
        super().__init__()
        self.backbone = backbone
        self.regressor = nn.Linear(feature_dim, 1)

    def forward(self, x):
        """Return one scalar prediction per sample, with shape ``(batch,)``."""
        feat = self.backbone(x)
        out = self.regressor(feat)
        # Drop only the trailing unit dimension. A bare squeeze() would also
        # remove the batch dimension when the batch holds a single sample,
        # returning a 0-d tensor instead of shape (1,).
        return out.squeeze(-1)

In [30]:
# Build the architecture only. The checkpoint below is loaded with the default
# strict key matching, which overwrites every parameter and buffer of the
# backbone, so fetching pretrained weights here would be wasted work and would
# make this inference notebook depend on network access.
backbone = resnet18(weights=None)
# Replace the classification layer so the backbone returns its pooled features.
backbone.fc = nn.Identity()

residual_model = ResidualImageModel(backbone).to(DEVICE)
# map_location lets a checkpoint saved on another device load onto DEVICE.
checkpoint = torch.load(RESIDUAL_IMAGE_MODEL_PATH, map_location=DEVICE)
residual_model.load_state_dict(checkpoint["model_state_dict"])
# Switch to inference mode; the trailing semicolon keeps the notebook from
# printing the full module repr.
residual_model.eval();

ResidualImageModel(
  (backbone): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=Tru

In [32]:
# Image-based residual correction for every test row. Rows without an image
# keep the initial 0.0, i.e. no correction.
residual_preds = np.zeros(len(test_df))

# Walk the id column by position rather than with iterrows(): iterrows() yields
# index labels (only equal to positions for a default RangeIndex) and upcasts
# each row to a single dtype, which would turn integer ids into floats.
for pos, pid in enumerate(tqdm(test_df["id"].to_numpy(), total=len(test_df))):
    img_path = os.path.join(IMAGE_DIR, f"{int(pid)}.0.png")

    if not os.path.exists(img_path):
        continue

    # convert() returns a new in-memory image, so the file can be closed
    # as soon as the conversion is done.
    with Image.open(img_path) as img_file:
        image = img_file.convert("RGB")

    # Add a batch dimension of 1 and move the tensor to the model's device.
    x = image_transform(image).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        residual_preds[pos] = residual_model(x).item()

  0%|          | 0/5404 [00:00<?, ?it/s]

In [33]:
# Count the test ids for which no image path exists.
missing = sum(1 for pid in test_df["id"] if not image_exists(pid))

print("Missing images:", missing, "/", len(test_df))

Missing images: 4 / 5404


In [34]:
# Summary statistics of the per-row residuals; rows without an image
# contribute their 0.0 placeholder.
residual_summary = pd.Series(residual_preds).describe()
print("Residual predictions stats:")
residual_summary

Residual predictions stats:


count    5404.000000
mean       -0.092196
std         0.064457
min        -0.565514
25%        -0.133106
50%        -0.090469
75%        -0.049234
max         0.205594
dtype: float64

In [35]:
# Add the image residual to the tabular prediction element-wise, then
# exponentiate the sum to obtain the final price estimate.
final_log_preds = np.add(tab_preds, residual_preds)
final_price_preds = np.exp(final_log_preds)

In [36]:
# Two-column submission frame: the test ids alongside their predicted prices.
submission = test_df[["id"]].assign(predicted_price=final_price_preds)

submission.head()

Unnamed: 0,id,predicted_price
0,2591820310,331091.523653
1,7974200820,474078.762154
2,7701450110,691090.387186
3,9522300010,757857.106799
4,9510861140,431889.224651


In [37]:
# Create the output directory if needed so the export does not fail when the
# notebook is run from a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

submission_path = os.path.join(OUTPUT_DIR, "predictions.csv")
# index=False keeps the file to the two submission columns.
submission.to_csv(submission_path, index=False)

submission_path

'../outputs\\predictions.csv'

In [38]:
# Sanity checks on the submission frame: one row per test sample and every
# price a finite, strictly positive number. The finiteness check catches
# overflow in np.exp, which yields inf and would pass the other two checks.
assert len(submission) == len(test_df), "row count differs from the test set"
assert not submission["predicted_price"].isna().any(), "NaN in predicted_price"
assert np.isfinite(submission["predicted_price"]).all(), "non-finite predicted_price"
assert (submission["predicted_price"] > 0).all(), "non-positive predicted_price"