# Model Training Pipeline
## Satellite Imagery-Based Property Valuation

This notebook covers:
1. Loading the Preprocessed Data
2. Tabular Baseline Model
3. Multimodal Fusion Model
4. Model Comparison
5. Test Set Predictions
6. Grad-CAM Explainability
7. Results Summary

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import pickle

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score, root_mean_squared_error

import torch
import torch.nn as nn
from torchvision import models
from PIL import Image
from tqdm import tqdm

# Locations of the preprocessed inputs and of everything this notebook writes.
PROCESSED_DATA_DIR = Path("data") / "processed"
OUTPUT_DIR = Path("outputs")

# Create the output folder up front so later cells can write into it directly.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

## 1. Load Preprocessed Data

In [None]:
# Open the preprocessed training archive that holds the tabular splits.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code; only load an archive produced by this project.
data = np.load(PROCESSED_DATA_DIR / "train_processed.npz", allow_pickle=True)

# Feature matrices are used as stored; targets are flattened to 1-D vectors.
X_train, X_val = data["X_train"], data["X_val"]
y_train, y_val = (data[key].reshape(-1) for key in ("y_train", "y_val"))

print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Tabular features: {X_train.shape[1]}")

In [None]:
# Read the saved image-embedding matrices for the train and validation splits.
emb_train, emb_val = (
    np.load(PROCESSED_DATA_DIR / fname)
    for fname in ("img_emb_train.npy", "img_emb_val.npy")
)

print(f"Train embeddings: {emb_train.shape}")
print(f"Val embeddings: {emb_val.shape}")

In [None]:
# Restore the pickled preprocessor bundle; its "target_scaler" entry is what
# later cells use to inverse-transform model outputs back to prices.
# NOTE(review): unpickling can execute arbitrary code; only load a
# preprocessor.pkl that was produced by this project's own preprocessing step.
preprocessor = pickle.loads((PROCESSED_DATA_DIR / "preprocessor.pkl").read_bytes())

target_scaler = preprocessor["target_scaler"]
print("Preprocessor loaded.")

## 2. Tabular Baseline Model

In [None]:
print("Training Tabular Baseline Model...")
print("=" * 50)

# Gradient-boosted trees fitted on the tabular features alone.
# random_state is fixed so repeated runs build the same model.
baseline_params = dict(
    learning_rate=0.05,
    max_depth=8,
    max_iter=500,
    random_state=42,
)
baseline_model = HistGradientBoostingRegressor(**baseline_params).fit(X_train, y_train)
print("Training complete.")

In [None]:
# Score the baseline on the validation split.
# The model predicts in the scaled target space, and the scaler is given a
# column vector, hence the (-1, 1) reshape before inverse-transforming.
pred_val_scaled = baseline_model.predict(X_val).reshape(-1, 1)
pred_val_price = target_scaler.inverse_transform(pred_val_scaled).reshape(-1)
y_val_price = target_scaler.inverse_transform(y_val.reshape(-1, 1)).reshape(-1)

# Both metrics are computed on the inverse-transformed values
baseline_rmse, baseline_r2 = (
    metric(y_val_price, pred_val_price)
    for metric in (root_mean_squared_error, r2_score)
)

print("\nTABULAR BASELINE RESULTS")
print("=" * 50)
print(f"RMSE: ${baseline_rmse:,.2f}")
print(f"R² Score: {baseline_r2:.4f}")

In [None]:
# Persist the baseline's validation metrics as JSON.
# float() converts the metric values to plain Python floats for json.dumps.
baseline_metrics = dict(
    rmse=float(baseline_rmse),
    r2=float(baseline_r2),
    model="HistGradientBoostingRegressor",
)
baseline_metrics_json = json.dumps(baseline_metrics, indent=2)
(OUTPUT_DIR / "tabular_baseline_metrics.json").write_text(baseline_metrics_json)

# Store true vs. predicted validation prices side by side for later inspection
baseline_val_preds = pd.DataFrame({"y_true": y_val_price, "y_pred": pred_val_price})
baseline_val_preds.to_csv(OUTPUT_DIR / "tabular_baseline_val_preds.csv", index=False)

print("Saved baseline metrics and predictions.")

## 3. Multimodal Fusion Model

In [None]:
print("Training Multimodal Fusion Model...")
print("=" * 50)

# Feature-level fusion: append each row's image embedding to its tabular
# features (column-wise concatenation), separately for each split.
X_train_fusion, X_val_fusion = (
    np.concatenate(parts, axis=1)
    for parts in ((X_train, emb_train), (X_val, emb_val))
)

print(f"Fusion feature dimension: {X_train_fusion.shape[1]}")
print(f"  - Tabular: {X_train.shape[1]}")
print(f"  - Image embeddings: {emb_train.shape[1]}")

In [None]:
# Same regressor family as the baseline, fitted on the fused feature matrix.
# Only max_iter differs from the baseline settings (800 instead of 500).
fusion_params = dict(
    learning_rate=0.05,
    max_depth=8,
    max_iter=800,
    random_state=42,
)
fusion_model = HistGradientBoostingRegressor(**fusion_params).fit(X_train_fusion, y_train)
print("Training complete.")

In [None]:
# Score the fusion model on the validation split. Predictions come out in the
# scaled target space and are inverse-transformed before computing metrics.
pred_val_fusion_scaled = fusion_model.predict(X_val_fusion).reshape(-1, 1)
pred_val_fusion_price = target_scaler.inverse_transform(pred_val_fusion_scaled).reshape(-1)

# y_val_price is the inverse-transformed validation target computed during
# the baseline evaluation
fusion_rmse, fusion_r2 = (
    metric(y_val_price, pred_val_fusion_price)
    for metric in (root_mean_squared_error, r2_score)
)

print("\nMULTIMODAL FUSION RESULTS")
print("=" * 50)
print(f"RMSE: ${fusion_rmse:,.2f}")
print(f"R² Score: {fusion_r2:.4f}")

In [None]:
# Persist the fusion model's validation metrics as JSON.
# float() converts the metric values to plain Python floats for json.dumps.
fusion_metrics = dict(
    rmse=float(fusion_rmse),
    r2=float(fusion_r2),
    model="HistGradientBoostingRegressor(tab+imgemb)",
)
fusion_metrics_json = json.dumps(fusion_metrics, indent=2)
(OUTPUT_DIR / "fusion_metrics.json").write_text(fusion_metrics_json)

# Store true vs. predicted validation prices side by side for later inspection
fusion_val_preds = pd.DataFrame({"y_true": y_val_price, "y_pred": pred_val_fusion_price})
fusion_val_preds.to_csv(OUTPUT_DIR / "fusion_val_preds.csv", index=False)

print("Saved fusion metrics and predictions.")

## 4. Model Comparison

In [None]:
# Side-by-side comparison of the two models on the validation split
print("\n" + "=" * 70)
print("MODEL COMPARISON")
print("=" * 70)

# Metrics are pre-formatted as strings so the printed table keeps a fixed precision
comparison = pd.DataFrame({
    "Model": ["Tabular Baseline", "Multimodal Fusion"],
    "Modalities": ["Tabular only", "Tabular + Satellite"],
    "RMSE ($)": [f"{baseline_rmse:,.2f}", f"{fusion_rmse:,.2f}"],
    "R² Score": [f"{baseline_r2:.4f}", f"{fusion_r2:.4f}"]
})

print(comparison.to_string(index=False))

# Improvement of fusion over baseline: positive values mean fusion is better
rmse_improvement = baseline_rmse - fusion_rmse
rmse_pct = (rmse_improvement / baseline_rmse) * 100
r2_improvement = fusion_r2 - baseline_r2

print("\nImprovement with satellite imagery:")
print(f"  RMSE reduction: ${rmse_improvement:,.2f} ({rmse_pct:.1f}%)")
# The "+" format flag emits the correct sign itself, so a regression prints
# "-0.0012" rather than the malformed "+-0.0012" a hard-coded "+" would give
print(f"  R² increase: {r2_improvement:+.4f}")

In [None]:
# Visualization: predicted vs actual validation prices for both models
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(y_val_price, pred_val_price, alpha=0.5, label="Tabular only", s=10)
ax.scatter(y_val_price, pred_val_fusion_price, alpha=0.5, label="Tabular + ResNet18", s=10)

# Perfect-prediction line. Its extent covers every plotted series (true prices
# and both models' predictions) so the reference is never shorter than the
# points it is compared with; it still starts at 0 unless a value is negative.
plotted_series = (y_val_price, pred_val_price, pred_val_fusion_price)
min_val = min(0, *(series.min() for series in plotted_series))
max_val = max(series.max() for series in plotted_series)
ax.plot([min_val, max_val], [min_val, max_val], 'k--', label="Ideal (y=x)")

ax.set_xlabel("Actual Price ($)")
ax.set_ylabel("Predicted Price ($)")
ax.set_title("Predicted vs Actual Property Prices")
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / "val_pred_scatter.png", dpi=150)
plt.show()

print(f"Saved: {OUTPUT_DIR / 'val_pred_scatter.png'}")

In [None]:
# Residuals of the fusion model on the validation split (actual - predicted),
# plotted against the prediction.
residuals = y_val_price - pred_val_fusion_price

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(pred_val_fusion_price, residuals, s=10, alpha=0.5)
# Zero line: points above it are under-predictions, points below are over-predictions
ax.axhline(y=0, linestyle='--', color='r')
ax.set(
    xlabel="Predicted Price ($)",
    ylabel="Residual ($)",
    title="Residual Plot - Fusion Model",
)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(OUTPUT_DIR / "residuals_fusion.png", dpi=150)
plt.show()

## 5. Generate Test Predictions

In [None]:
# Read the three saved test-set arrays: tabular features, image embeddings and ids
X_test_tab, emb_test, test_ids = (
    np.load(PROCESSED_DATA_DIR / fname)
    for fname in ("X_test_tab.npy", "img_emb_test.npy", "test_ids.npy")
)

print(f"Test samples: {X_test_tab.shape[0]}")
print(f"Test tabular features: {X_test_tab.shape[1]}")
print(f"Test embeddings: {emb_test.shape}")

In [None]:
# Fuse the test features the same way as for training: tabular columns first,
# image-embedding columns appended after them.
X_test_fusion = np.concatenate((X_test_tab, emb_test), axis=1)

# Predict in the scaled target space, then inverse-transform with the target scaler
pred_test_scaled = fusion_model.predict(X_test_fusion).reshape(-1, 1)
pred_test_price = target_scaler.inverse_transform(pred_test_scaled).reshape(-1)

print(f"Predictions generated: {len(pred_test_price)}")
print(f"Price range: ${pred_test_price.min():,.2f} - ${pred_test_price.max():,.2f}")

In [None]:
# Pair each test id with its predicted price and write the table to disk
submission = pd.DataFrame({"id": test_ids, "predicted_price": pred_test_price})

submission_path = OUTPUT_DIR / "submission.csv"
submission.to_csv(submission_path, index=False)
print(f"Saved: {OUTPUT_DIR / 'submission.csv'}")

# Last expression of the cell: rendered as a preview of the first rows
submission.head(10)

## 6. Grad-CAM Explainability

In [None]:
import torch.nn.functional as F
import cv2

class GradCAM:
    """Gradient-weighted Class Activation Mapping for CNN explainability.

    Hooks ``target_layer`` to capture its forward activations and the gradient
    arriving at its output during backpropagation, then combines the two into
    a heatmap normalised to the range [0, 1].

    Args:
        model: Network to explain. It is switched to eval mode on every
            ``generate`` call.
        target_layer: Module inside ``model`` whose output feature map is
            visualised. Its output must be 4-D, because the gradients are
            averaged over dims 2 and 3.
    """

    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None

        # Register hooks and keep their handles so remove_hooks() can detach
        # them. register_full_backward_hook is used in place of the deprecated
        # register_backward_hook.
        self._hook_handles = [
            target_layer.register_forward_hook(self._save_activation),
            target_layer.register_full_backward_hook(self._save_gradient),
        ]

    def _save_activation(self, module, input, output):
        """Forward hook: store a detached copy of the target layer's output."""
        self.activations = output.detach()

    def _save_gradient(self, module, grad_input, grad_output):
        """Backward hook: store the gradient w.r.t. the target layer's output."""
        self.gradients = grad_output[0].detach()

    def remove_hooks(self):
        """Detach the hooks registered on the target layer (safe to call twice)."""
        for handle in self._hook_handles:
            handle.remove()
        self._hook_handles = []

    def generate(self, input_tensor):
        """Generate a Grad-CAM heatmap for ``input_tensor``.

        The backward pass starts from the sum of all model outputs, so the
        map reflects every output unit at once rather than a single one.

        Args:
            input_tensor: Batch fed to the model. It is not modified; the
                computation runs on a detached view of it.

        Returns:
            NumPy array with values in [0, 1] and singleton dimensions
            squeezed out (a 2-D map for a single-image batch).

        Raises:
            RuntimeError: If the hooks captured nothing, e.g. because the
                target layer was not used in the forward pass or the hooks
                were already removed.
        """
        self.model.eval()

        # Use a detached view that requires grad instead of flipping the flag
        # on the caller's tensor: the caller's tensor stays untouched and
        # tensors that are not autograd leaves are accepted as well.
        model_input = input_tensor.detach().requires_grad_(True)

        # Drop captures from any previous call so stale data is never reused
        self.activations = None
        self.gradients = None

        # enable_grad keeps this working when called inside a no_grad block
        with torch.enable_grad():
            # Forward pass
            output = self.model(model_input)

            # Backward pass
            self.model.zero_grad()
            output.sum().backward()

        if self.activations is None or self.gradients is None:
            raise RuntimeError(
                "Grad-CAM hooks captured no activations/gradients; check that "
                "target_layer is part of the model's forward pass."
            )

        # Channel weights: gradient averaged over the two trailing (spatial) dims
        weights = self.gradients.mean(dim=(2, 3), keepdim=True)

        # Weighted sum of the activation channels, keeping positive evidence only
        cam = (weights * self.activations).sum(dim=1, keepdim=True)
        cam = F.relu(cam)

        # Normalize to [0, 1]; the epsilon avoids division by zero for an all-zero map
        cam = cam - cam.min()
        cam = cam / (cam.max() + 1e-8)

        return cam.squeeze().cpu().numpy()

print("GradCAM class defined.")

In [None]:
def visualize_gradcam(image_path, cam, save_path=None, image_size=224):
    """Overlay a Grad-CAM heatmap on the original image.

    Draws three panels side by side: the image, the heatmap, and a 40/60
    heatmap/image blend.

    Args:
        image_path: Path of the image file to load.
        cam: 2-D heatmap with values expected in [0, 1]; values outside that
            range are clipped.
        save_path: If given, the figure is written there and closed;
            otherwise it is shown interactively.
        image_size: Side length in pixels that both the image and the heatmap
            are resized to (default 224).

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    target_size = (image_size, image_size)

    # cv2.imread signals failure by returning None instead of raising, so
    # check explicitly rather than failing later inside cvtColor
    img = cv2.imread(str(image_path))
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, target_size)

    # Resize CAM to match image; clip so the uint8 conversion below cannot wrap
    cam_resized = cv2.resize(np.asarray(cam, dtype=np.float32), target_size)
    cam_resized = np.clip(cam_resized, 0.0, 1.0)

    # Create heatmap (applyColorMap yields BGR, so convert to RGB for matplotlib)
    heatmap = cv2.applyColorMap(np.uint8(255 * cam_resized), cv2.COLORMAP_JET)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

    # Overlay: 40% heatmap, 60% image
    overlay = (heatmap * 0.4 + img * 0.6).astype(np.uint8)

    # Plot
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    axes[0].imshow(img)
    axes[0].set_title("Original Satellite Image")
    axes[0].axis("off")

    axes[1].imshow(cam_resized, cmap="jet")
    axes[1].set_title("Grad-CAM Heatmap")
    axes[1].axis("off")

    axes[2].imshow(overlay)
    axes[2].set_title("Overlay")
    axes[2].axis("off")

    fig.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        # Close this specific figure so repeated calls in a loop do not pile up
        plt.close(fig)
    else:
        plt.show()

print("Visualization function defined.")

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Default pretrained ResNet18 weights and the preprocessing transforms bundled with them
weights = models.ResNet18_Weights.DEFAULT
tfms = weights.transforms()

# Build the network, move it to the chosen device and put it in inference mode
resnet = models.resnet18(weights=weights).to(device).eval()

# Grad-CAM is attached to the second conv of the last residual block in layer4.
# NOTE(review): this is the stock pretrained ResNet18 with its classification
# head, and GradCAM.generate backpropagates the sum of its outputs, so the
# heatmaps describe that network rather than the price regressor. Confirm this
# is the intended explanation target.
target_layer = resnet.layer4[-1].conv2
gradcam = GradCAM(resnet, target_layer)

print(f"Model loaded on: {device}")

In [None]:
# Generate Grad-CAM overlays for the cheapest and most expensive validation properties.
# Entries of images_val are used as image file paths below.
# NOTE(review): assumes images_val rows line up with y_val and the fusion predictions; confirm.
images_val = data["images_val"]

# Inverse-transformed validation targets, and the row order that sorts them ascending
val_prices = target_scaler.inverse_transform(y_val.reshape(-1, 1)).reshape(-1)
sorted_idx = np.argsort(val_prices)

# Select 5 cheapest and 5 most expensive. With fewer than 10 validation rows
# the two slices overlap, so dict.fromkeys drops the repeats while keeping order.
sample_indices = list(dict.fromkeys(list(sorted_idx[:5]) + list(sorted_idx[-5:])))

gradcam_dir = OUTPUT_DIR / "gradcam"
gradcam_dir.mkdir(parents=True, exist_ok=True)

for i, idx in enumerate(sample_indices):
    img_path = images_val[idx]
    true_price = val_prices[idx]
    pred_price = pred_val_fusion_price[idx]

    # Load and preprocess image; the context manager closes the file handle as
    # soon as the RGB copy has been made instead of leaving it to the GC
    with Image.open(img_path) as raw_img:
        img = raw_img.convert("RGB")
    img_tensor = tfms(img).unsqueeze(0).to(device)

    # Generate Grad-CAM
    cam = gradcam.generate(img_tensor)

    # Save visualization; the file name records rank, true price and predicted price
    save_path = gradcam_dir / f"{i:02d}_true_{true_price:.0f}_pred_{pred_price:.0f}.png"
    visualize_gradcam(img_path, cam, save_path)

    print(f"Saved: {save_path.name}")

print(f"\nGrad-CAM visualizations saved to: {gradcam_dir}")

## 7. Save Results Summary

In [None]:
# Build the markdown results table from the metrics computed earlier.
# The R² delta uses the "+" format flag so a negative change renders as
# "-0.0012" instead of the malformed "+-0.0012".
results_md = f"""## Model Comparison (Validation Set)

| Model | Modalities | Fusion Type | RMSE (↓) | R² (↑) |
| ----- | ---------- | ----------- | -------: | -----: |
| Tabular Baseline (HistGradientBoostingRegressor) | Tabular only | — | {baseline_rmse:,.2f} | {baseline_r2:.4f} |
| Fusion Regressor (HGBR on [tabular + ResNet18 embeddings]) | Tabular + Satellite | Intermediate (feature-level concat) | {fusion_rmse:,.2f} | {fusion_r2:.4f} |

### Improvement
- RMSE Reduction: ${rmse_improvement:,.2f} ({rmse_pct:.1f}%)
- R² Increase: {r2_improvement:+.4f}

### Notes
- Satellite images converted to 512-d embeddings using pretrained ResNet18
- Intermediate fusion: concatenate feature vectors before final regressor
"""

# The table contains non-ASCII characters (arrows, superscript two, em dash);
# write it as UTF-8 explicitly so the platform's default encoding cannot make
# write_text fail or garble the file.
(OUTPUT_DIR / "results_table.md").write_text(results_md, encoding="utf-8")
print("Results saved to results_table.md")
print(results_md)

## Summary

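**Training complete.** The figures below were recorded from an earlier run; the values computed by this notebook are written to `outputs/results_table.md`.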

- ✅ Tabular Baseline: RMSE = $134,809, R² = 0.8552
- ✅ Multimodal Fusion: RMSE = $132,247, R² = 0.8606
- ✅ Improvement: 1.9% RMSE reduction
- ✅ Grad-CAM visualizations generated
- ✅ Test predictions saved to submission.csv