# F1TENTH Dashcam to Costmap Translation - Demo

This notebook demonstrates the capabilities of our image-to-image translation models for converting F1TENTH dashcam images to navigation costmaps.

## Models Overview

We implement two different neural network architectures:

1. **UNet150**: A U-Net architecture optimized for 150x150 pixel images
2. **ContextNetwork**: A dilated convolution network for capturing multi-scale context

In [None]:
import sys
import os
# Extend the import path with the source directory. The path is relative, so
# it resolves against the directory the notebook kernel is running in.
# NOTE(review): presumably the local `models`, `data` and `utils` modules
# imported further down live in ../src — confirm.
sys.path.append('../src')

import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

from models import UNet150, ContextNetwork
from data import ImageToImageDataset, EnhancedImageToImageDataset
from utils import get_device, load_model

# Set up plotting: select matplotlib's "default" style sheet and ask the
# notebook to render figures inline. The `%matplotlib` line is an IPython
# magic, so this cell only runs inside a notebook / IPython session.
plt.style.use('default')
%matplotlib inline

## Dataset Overview

Let's first examine our dataset to understand the input-output relationship.

In [None]:
# Data locations, given relative to the notebook's working directory
INPUT_FOLDER = "../Data/Dashcams"
TARGET_FOLDER = "../Data/Costmaps"

# Device that every model and batch in this notebook is moved to
device = get_device()
print(f"Using device: {device}")

# Both sides of each pair are only converted to tensors here; no normalization
# or augmentation is part of these pipelines.
transform_input = transforms.Compose([transforms.ToTensor()])
transform_target = transforms.Compose([transforms.ToTensor()])

# Dataset built from the dashcam (input) and costmap (target) folders
dataset = ImageToImageDataset(
    input_folder=INPUT_FOLDER,
    target_folder=TARGET_FOLDER,
    transform_input=transform_input,
    transform_target=transform_target,
)

num_samples = len(dataset)
print(f"Dataset size: {num_samples} samples")

In [None]:
# Visualize sample data: draw one shuffled batch of up to 4 input/target pairs
data_loader = DataLoader(dataset, batch_size=4, shuffle=True)
batch_inputs, batch_targets = next(iter(data_loader))

# Size the grid from the batch itself rather than hard-coding 4 columns: a
# dataset with fewer than 4 samples yields a smaller batch, and indexing past
# its end would raise IndexError.
num_shown = len(batch_inputs)

# One figure row per entry: (image batch, title prefix)
sample_rows = (
    (batch_inputs, "Dashcam Input"),
    (batch_targets, "Costmap Target"),
)

# squeeze=False keeps `axes` two-dimensional even when only one column is
# drawn, so axes[row, col] indexing is always valid.
fig, axes = plt.subplots(
    len(sample_rows), num_shown, figsize=(4 * num_shown, 8), squeeze=False
)

for row, (images, label) in enumerate(sample_rows):
    for col in range(num_shown):
        ax = axes[row, col]
        # squeeze() drops size-1 dimensions so imshow receives a 2-D image
        ax.imshow(images[col].squeeze(), cmap="gray")
        ax.set_title(f"{label} {col + 1}")
        ax.axis("off")

plt.suptitle("Dataset Examples: Dashcam Images → Costmaps", fontsize=16)
plt.tight_layout()
plt.show()

## Model 1: U-Net Architecture

The U-Net model uses an encoder-decoder architecture with skip connections to preserve spatial information.

In [None]:
# Build the U-Net and, when a checkpoint file is present, load its weights
unet_model_path = "../models/unet_best.pth"
unet_model = UNet150(in_channels=1, out_channels=1, complexity_multiplier=4)

if not os.path.exists(unet_model_path):
    # No checkpoint on disk: the demo still runs, but with untrained weights
    print("No pre-trained U-Net found, using random weights")
else:
    unet_model = load_model(unet_model, unet_model_path, device)
    print("Loaded pre-trained U-Net model")

# Move to the target device and switch to evaluation mode for inference
unet_model.to(device)
unet_model.eval()

# Model summary: total number of elements across all parameter tensors
unet_param_sizes = [param.numel() for param in unet_model.parameters()]
total_params = sum(unet_param_sizes)
print(f"U-Net total parameters: {total_params:,}")

In [None]:
# Generate U-Net predictions (inference only, so gradient tracking is off)
with torch.no_grad():
    batch_inputs_device = batch_inputs.to(device)
    # Bring the output back to the CPU so matplotlib can display it
    unet_predictions = unet_model(batch_inputs_device).cpu()

# Visualize U-Net results.
# The column count follows the batch size instead of a hard-coded 4, so a
# batch with fewer samples no longer raises IndexError.
num_shown = len(batch_inputs)

# One figure row per entry: (image batch, title prefix)
unet_rows = (
    (batch_inputs, "Input"),
    (batch_targets, "Ground Truth"),
    (unet_predictions, "U-Net Prediction"),
)

# squeeze=False keeps `axes` two-dimensional even for a single column
fig, axes = plt.subplots(
    len(unet_rows), num_shown, figsize=(4 * num_shown, 12), squeeze=False
)

for row, (images, label) in enumerate(unet_rows):
    for col in range(num_shown):
        ax = axes[row, col]
        # squeeze() drops size-1 dimensions so imshow receives a 2-D image
        ax.imshow(images[col].squeeze(), cmap="gray")
        ax.set_title(f"{label} {col + 1}")
        ax.axis("off")

plt.suptitle("U-Net Model Results", fontsize=16)
plt.tight_layout()
plt.show()

## Model 2: Context Network

The Context Network uses dilated convolutions to capture multi-scale contextual information. Note that in this demo it is instantiated with every entry of `dilation_factors` set to 1.

In [None]:
# Enhanced dataset for Context Network

# Statistics used to standardize the inputs; the same values are reused later
# to undo the normalization for display.
MEAN = 0.2335
STD = 0.1712

# Input pipeline: PIL conversion -> tensor -> (x - MEAN) / STD
input_steps = [
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(mean=MEAN, std=STD),
]
transform_input_context = transforms.Compose(input_steps)

# Target pipeline: same conversion steps, but without normalization
target_steps = [transforms.ToPILImage(), transforms.ToTensor()]
transform_target_context = transforms.Compose(target_steps)

# NOTE(review): `threshold` and `filter_size` presumably configure how the
# dataset derives its mask — confirm in EnhancedImageToImageDataset.
enhanced_dataset = EnhancedImageToImageDataset(
    input_folder=INPUT_FOLDER,
    target_folder=TARGET_FOLDER,
    transform_input=transform_input_context,
    transform_target=transform_target_context,
    threshold=150,
    filter_size=50,
)

# Each batch from this dataset unpacks into three parts: inputs, targets, masks
enhanced_loader = DataLoader(enhanced_dataset, batch_size=4, shuffle=True)
first_enhanced_batch = next(iter(enhanced_loader))
batch_inputs_enh, batch_targets_enh, masks = first_enhanced_batch

In [None]:
# Build the Context Network and, when a checkpoint file is present, load it
context_model_path = "../models/context_net_best.pth"

# NOTE(review): all nine dilation factors are 1 — confirm this matches the
# configuration the checkpoint was trained with.
context_dilations = [1] * 9
context_model = ContextNetwork(
    in_channels=1, out_channels=1, dilation_factors=context_dilations
)

if not os.path.exists(context_model_path):
    # No checkpoint on disk: the demo still runs, but with untrained weights
    print("No pre-trained Context Network found, using random weights")
else:
    context_model = load_model(context_model, context_model_path, device)
    print("Loaded pre-trained Context Network model")

# Move to the target device and switch to evaluation mode for inference
context_model.to(device)
context_model.eval()

# Model summary: total number of elements across all parameter tensors
context_param_sizes = [param.numel() for param in context_model.parameters()]
total_params_context = sum(context_param_sizes)
print(f"Context Network total parameters: {total_params_context:,}")

In [None]:
# Generate Context Network predictions (inference only, no gradient tracking)
with torch.no_grad():
    batch_inputs_enh_device = batch_inputs_enh.to(device)
    # Bring the output back to the CPU so matplotlib can display it
    context_predictions = context_model(batch_inputs_enh_device).cpu()

# Denormalize inputs for visualization (inverse of (x - MEAN) / STD)
batch_inputs_vis = batch_inputs_enh * STD + MEAN

# Visualize Context Network results.
# The column count follows the batch size instead of a hard-coded 4, so a
# batch with fewer samples no longer raises IndexError.
num_shown = len(batch_inputs_enh)

# One figure row per entry: (image batch, title prefix, colormap)
context_rows = (
    (batch_inputs_vis, "Input", "gray"),
    (batch_targets_enh, "Ground Truth", "gray"),
    (context_predictions, "Context Net Prediction", "gray"),
    (masks, "Morphological Mask", "binary"),
)

# squeeze=False keeps `axes` two-dimensional even for a single column
fig, axes = plt.subplots(
    len(context_rows), num_shown, figsize=(4 * num_shown, 16), squeeze=False
)

for row, (images, label, cmap) in enumerate(context_rows):
    for col in range(num_shown):
        ax = axes[row, col]
        # squeeze() drops size-1 dimensions so imshow receives a 2-D image
        ax.imshow(images[col].squeeze(), cmap=cmap)
        ax.set_title(f"{label} {col + 1}")
        ax.axis("off")

plt.suptitle("Context Network Model Results", fontsize=16)
plt.tight_layout()
plt.show()

## Model Comparison

Let's compare both models side by side on the same input.

In [None]:
# Take the first sample of the earlier batch; slicing (rather than indexing)
# keeps the leading batch dimension the models expect.
sample_input = batch_inputs[:1]
sample_target = batch_targets[:1]

# Run both models on that sample without tracking gradients
with torch.no_grad():
    unet_pred = unet_model(sample_input.to(device)).cpu()

    # The Context Network gets the input standardized with MEAN / STD
    sample_input_norm = (sample_input - MEAN) / STD
    context_pred = context_model(sample_input_norm.to(device)).cpu()

# Side-by-side figure: one panel per (image batch, title) pair, left to right
comparison_panels = (
    (sample_input, "Dashcam Input"),
    (sample_target, "Ground Truth Costmap"),
    (unet_pred, "U-Net Prediction"),
    (context_pred, "Context Network Prediction"),
)

fig, axes = plt.subplots(1, 4, figsize=(20, 5))

for panel_ax, (image_batch, panel_title) in zip(axes, comparison_panels):
    panel_ax.imshow(image_batch[0].squeeze(), cmap="gray")
    panel_ax.set_title(panel_title, fontsize=14)
    panel_ax.axis("off")

plt.suptitle("Model Comparison", fontsize=16)
plt.tight_layout()
plt.show()

## Performance Metrics

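Let's compute the L1 (mean absolute error) and L2 (mean squared error) losses of each model's prediction on the single sample used above. A single sample gives only a rough indication, not a reliable comparison.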

In [None]:
import torch.nn.functional as F


def calculate_metrics(predictions, targets):
    """Return the L1 and L2 (MSE) losses between two tensors as Python floats.

    Args:
        predictions: Tensor of model outputs.
        targets: Tensor of reference values to compare against.

    Returns:
        A ``(l1, l2)`` tuple: the results of ``F.l1_loss`` and ``F.mse_loss``
        (both called with their default arguments), each converted with
        ``.item()``.
    """
    mean_abs_error = F.l1_loss(predictions, targets).item()
    mean_sq_error = F.mse_loss(predictions, targets).item()
    return mean_abs_error, mean_sq_error


# Score each model's single-sample prediction against the same target
unet_l1, unet_l2 = calculate_metrics(unet_pred, sample_target)
context_l1, context_l2 = calculate_metrics(context_pred, sample_target)

# One table row per model: (name, L1 loss, L2 loss, parameter count)
summary_rows = (
    ("U-Net", unet_l1, unet_l2, total_params),
    ("Context Net", context_l1, context_l2, total_params_context),
)

print("Performance Comparison (Single Sample):")
print(f"{'Model':<15} {'L1 Loss':<10} {'L2 Loss':<10} {'Parameters':<12}")
print("-" * 50)
for model_name, l1_value, l2_value, param_count in summary_rows:
    print(f"{model_name:<15} {l1_value:<10.4f} {l2_value:<10.4f} {param_count:<12,}")

## Conclusion

This demo showcases two different approaches to the F1TENTH dashcam-to-costmap translation problem:

1. **U-Net**: Classical encoder-decoder architecture with skip connections
2. **Context Network**: Dilated convolutions for multi-scale context capture

Both models demonstrate the ability to transform dashcam images into navigation costmaps, enabling autonomous navigation in the F1TENTH racing environment.

### Key Features:
- **150x150 pixel resolution** optimized for real-time performance
- **Identity initialization** for Context Network stability
- **Morphological preprocessing** for enhanced training data
- **L1 loss optimization** for sharp costmap boundaries

The models can be further improved through:
- Extended training epochs
- Data augmentation techniques
- Advanced loss functions
- Ensemble methods