In [28]:
import torch
import torch.nn as nn

import os
from argparse import ArgumentParser
from glob import glob

import cv2
import numpy as np

from fiery.models.encoder import Encoder
from fiery.models.temporal_model import TemporalModelIdentity, TemporalModel
from fiery.models.distributions import DistributionModule
from fiery.models.future_prediction import FuturePrediction
from fiery.models.decoder import Decoder
from fiery.utils.network import pack_sequence_dim, unpack_sequence_dim, set_bn_momentum
from fiery.utils.geometry import cumulative_warp_features, calculate_birds_eye_view_parameters, VoxelsSumming
import visualise

from fiery.trainer import TrainingModule
from fiery.utils.network import NormalizeInverse
from fiery.utils.instance import predict_instance_segmentation_and_trajectories
from fiery.utils.visualisation import plot_instance_map, generate_instance_colours, make_contour, convert_figure_numpy

## Model I/O

**image**: torch.Tensor float (T, N, 3, H, W) - normalised camera images, with T the sequence length and N the number of cameras.

**intrinsics**: torch.Tensor float (T, N, 3, 3) - intrinsics containing resizing and cropping parameters.

**extrinsics**: torch.Tensor float  (T, N, 4, 4) - 6 DoF pose from world coordinates to camera coordinates.

**future_egomotion**: torch.Tensor float (T, 6) - 6 DoF egomotion from t to t+1.

In [29]:
# Keep the whole walkthrough on the CPU.
device = torch.device('cpu')

# Restore the module from the checkpoint file (strict=True is forwarded to
# load_from_checkpoint) and move it onto `device` in one chained expression.
trainer = TrainingModule.load_from_checkpoint('fiery.ckpt', strict=True).to(device)

# Last expression of the cell: switch to eval mode and display the module repr.
trainer.eval()

Loaded pretrained weights for efficientnet-b4


                not been set for this class (IntersectionOverUnion). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchmetrics.utilities import check_forward_full_state_property`
                that can be used to check if the `full_state_update=True` (old and potential slower behaviour,
                default for now) or if `full_state_update=False` can be used safely.
                
                not been set for this class (PanopticMetric). The property determines if `update` by
                default needs access to the full metric state. If this is not the case, significant speedups can be
                achieved and we recommend setting this to `False`.
                We provide an checking function
                `from torchme

TrainingModule(
  (model): Fiery(
    (encoder): Encoder(
      (backbone): EfficientNet(
        (_conv_stem): Conv2dStaticSamePadding(
          3, 48, kernel_size=(3, 3), stride=(2, 2), bias=False
          (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
        )
        (_bn0): BatchNorm2d(48, eps=0.001, momentum=0.05, affine=True, track_running_stats=True)
        (_blocks): ModuleList(
          (0): MBConvBlock(
            (_depthwise_conv): Conv2dStaticSamePadding(
              48, 48, kernel_size=(3, 3), stride=[1, 1], groups=48, bias=False
              (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
            )
            (_bn1): BatchNorm2d(48, eps=0.001, momentum=0.05, affine=True, track_running_stats=True)
            (_se_reduce): Conv2dStaticSamePadding(
              48, 12, kernel_size=(1, 1), stride=(1, 1)
              (static_padding): Identity()
            )
            (_se_expand): Conv2dStaticSamePadding(
              12, 48, k

In [30]:
# Download and extract the example input data via the helper in `visualise`.
visualise.download_example_data()

EXAMPLE_DATA_PATH = 'example_data/example_1.npz'
data = np.load(EXAMPLE_DATA_PATH)

# Turn each stored array into a tensor on `device`. Note that the archive key
# is the singular 'future_egomotion' while the notebook variable is plural.
image, intrinsics, extrinsics, future_egomotions = (
    torch.from_numpy(data[key]).to(device)
    for key in ('image', 'intrinsics', 'extrinsics', 'future_egomotion')
)

# Report the shape of every input, one per line.
print(
    f"The Image shape is '{image.shape}",
    f"The Intrinsics shape is '{intrinsics.shape}",
    f"The Extrinsics shape is '{extrinsics.shape}",
    f"The Future Egomotions shape is '{future_egomotions.shape}",
    sep='\n',
)

The Image shape is 'torch.Size([1, 3, 6, 3, 224, 480])
The Intrinsics shape is 'torch.Size([1, 3, 6, 3, 3])
The Extrinsics shape is 'torch.Size([1, 3, 6, 4, 4])
The Future Egomotions shape is 'torch.Size([1, 3, 6])


## Frustum Creation

First, a frustum is created. A frustum is the part of a pyramid that lies between two parallel planes, so it has a flat top and base and four trapezoidal sides. Here the frustum is defined on a grid in the image plane and has three dimensions: left-right, top-bottom, and depth.

The function first defines the height and width of the image plane, as well as the downsampled versions of these values. It then creates a depth grid by creating a 1D tensor of depth values between the bounds specified in the configuration (D_bound), and reshapes this tensor into a 3D tensor with dimensions (n_depth_slices, downsampled_h, downsampled_w).

Next, the function creates x and y grids that are also 3D tensors with dimensions (n_depth_slices, downsampled_h, downsampled_w). These grids contain the x and y coordinates of each point in the image plane, respectively.

Finally, the function stacks these three grids along the last dimension to create a frustum tensor with dimensions (n_depth_slices, downsampled_h, downsampled_w, 3). This tensor contains the x, y, and depth coordinates of each point in the frustum. The frustum tensor is then wrapped in a PyTorch nn.Parameter and returned. 

**_note_**: I noticed, however, that the final tensor has the same x & y coordinates at every depth, so it describes a rectangular box and not a frustum shape. I believe the shape is transformed into a frustum downstream in the 'get_geometry' method.

In [31]:
image_dim = (224, 480)
encoder_downsample = 8
D_bound = [2.0, 50.0, 1.0]


def create_frustum(image_size=None, downsample=None, depth_bounds=None):
    """Build the grid of (x, y, depth) sample points in the image plane.

    Every argument is optional. An omitted argument falls back to the
    notebook-level ``image_dim``, ``encoder_downsample`` or ``D_bound``, read at
    call time, so a bare ``create_frustum()`` behaves exactly as before.

    Args:
        image_size: ``(height, width)`` of the input image in pixels.
        downsample: integer factor; height and width are floor-divided by it to
            obtain the grid resolution.
        depth_bounds: ``(start, stop, step)`` forwarded to ``torch.arange``
            (``stop`` is exclusive).

    Returns:
        A non-trainable ``nn.Parameter`` of shape
        ``(n_depth_slices, height // downsample, width // downsample, 3)`` whose
        last axis holds ``(x, y, depth)``: left-right, top-bottom, depth.
        x and y are identical for every depth slice; no depth scaling is
        applied here.
    """
    # Resolve the configuration, preferring explicit arguments over globals.
    h, w = image_dim if image_size is None else image_size
    factor = encoder_downsample if downsample is None else downsample
    bounds = D_bound if depth_bounds is None else depth_bounds

    # Create grid in image plane
    downsampled_h, downsampled_w = h // factor, w // factor

    # Depth grid: one value per slice, broadcast over the whole image grid.
    depth_grid = torch.arange(*bounds, dtype=torch.float)
    n_depth_slices = depth_grid.shape[0]
    grid_shape = (n_depth_slices, downsampled_h, downsampled_w)
    depth_grid = depth_grid.view(-1, 1, 1).expand(grid_shape)

    # x and y grids: evenly spaced pixel coordinates covering the full image,
    # from 0 to w - 1 (resp. h - 1) inclusive.
    x_grid = torch.linspace(0, w - 1, downsampled_w, dtype=torch.float)
    x_grid = x_grid.view(1, 1, downsampled_w).expand(grid_shape)
    y_grid = torch.linspace(0, h - 1, downsampled_h, dtype=torch.float)
    y_grid = y_grid.view(1, downsampled_h, 1).expand(grid_shape)

    # Dimension (n_depth_slices, downsampled_h, downsampled_w, 3)
    # containing data points in the image: left-right, top-bottom, depth
    frustum = torch.stack((x_grid, y_grid, depth_grid), -1)
    return nn.Parameter(frustum, requires_grad=False)

In [32]:
# Depth grid creation: (start, stop, step) for the depth slices.
D_bound = [2.0, 50.0, 1.0]

# Resolution of the grid after dividing the image size by the downsample factor.
h, w = image_dim
downsampled_h = h // encoder_downsample
downsampled_w = w // encoder_downsample

# One depth value per slice, broadcast (without copying) over the image grid.
depth_grid = torch.arange(*D_bound, dtype=torch.float)[:, None, None].expand(
    -1, downsampled_h, downsampled_w
)
n_depth_slices = depth_grid.size(0)

print(f"The depth grid shape is'{depth_grid.shape}")
print(depth_grid)

The depth grid shape is'torch.Size([48, 28, 60])
tensor([[[ 2.,  2.,  2.,  ...,  2.,  2.,  2.],
         [ 2.,  2.,  2.,  ...,  2.,  2.,  2.],
         [ 2.,  2.,  2.,  ...,  2.,  2.,  2.],
         ...,
         [ 2.,  2.,  2.,  ...,  2.,  2.,  2.],
         [ 2.,  2.,  2.,  ...,  2.,  2.,  2.],
         [ 2.,  2.,  2.,  ...,  2.,  2.,  2.]],

        [[ 3.,  3.,  3.,  ...,  3.,  3.,  3.],
         [ 3.,  3.,  3.,  ...,  3.,  3.,  3.],
         [ 3.,  3.,  3.,  ...,  3.,  3.,  3.],
         ...,
         [ 3.,  3.,  3.,  ...,  3.,  3.,  3.],
         [ 3.,  3.,  3.,  ...,  3.,  3.,  3.],
         [ 3.,  3.,  3.,  ...,  3.,  3.,  3.]],

        [[ 4.,  4.,  4.,  ...,  4.,  4.,  4.],
         [ 4.,  4.,  4.,  ...,  4.,  4.,  4.],
         [ 4.,  4.,  4.,  ...,  4.,  4.,  4.],
         ...,
         [ 4.,  4.,  4.,  ...,  4.,  4.,  4.],
         [ 4.,  4.,  4.,  ...,  4.,  4.,  4.],
         [ 4.,  4.,  4.,  ...,  4.,  4.,  4.]],

        ...,

        [[47., 47., 47.,  ..., 47., 47., 47

In [33]:
# x and y grids: evenly spaced pixel coordinates spanning the full image,
# broadcast to the same (n_depth_slices, downsampled_h, downsampled_w) shape
# as the depth grid.
x_grid = torch.linspace(0, w - 1, downsampled_w, dtype=torch.float)[None, None, :].expand(
    n_depth_slices, downsampled_h, -1
)
y_grid = torch.linspace(0, h - 1, downsampled_h, dtype=torch.float)[None, :, None].expand(
    n_depth_slices, -1, downsampled_w
)

print(f"The x grid grid shape is {x_grid.shape}")
print(f"The y grid shape is {y_grid.shape}")

The x grid grid shape is torch.Size([48, 28, 60])
The y grid shape is torch.Size([48, 28, 60])


In [34]:
# Stack the three grids on a new trailing axis, giving a tensor of dimension
# (n_depth_slices, downsampled_h, downsampled_w, 3) that holds x, y, depth.
# The x and y values are identical for every depth slice at this stage, so the
# grid is a rectangular box rather than a frustum; I believe the frustum shape
# is created later on through the depth scaling.
frustum = torch.stack([x_grid, y_grid, depth_grid], dim=-1)
print(f"The y frustum shape is {frustum.shape}")

# To inspect the values, uncomment one of:
#   print(frustum)
#   print(frustum[1, 27])

The y frustum shape is torch.Size([48, 28, 60, 3])


## Forward Step 1: Lifting and Projecting Images to BEV

The method that handles this operation is 'calculate_birds_eye_view_features', which works as follows:
1. Packs the sequence dimensions with the batch size to process the images in a time-agnostic manner.
2. Apply Intrinsic & Extrinsic Transformations to the Frustums to transform them to the ego frame
3. Pass the images through the encoder to extract 2D features coupled with depth probabilities
4. Project the images to BEV using the transformed frustums & images' features
   using the 'splat' method of lift-splat-shoot paper.
5. Unpack the sequence dimensions

In [35]:
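# Left commented out so that running the notebook does not crash the computer; the steps of this call are reproduced one by one in the cells below.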
#trainer.model.calculate_birds_eye_view_features(image, intrinsics, extrinsics)

### Pack sequence dimensions

In [36]:
# The first step in this function is to pack the sequence dimension together
# with the batch dimension into one consolidated leading dimension.

# Record the individual sizes of the unpacked image tensor.
b, s, n, c, h, w = image.shape

# Reshape all three inputs the same way.
# NOTE(review): `intrinsics` and `extrinsics` are rebound to their packed
# versions, so re-running this cell without first re-running the data-loading
# cell would presumably pack them a second time.
x, intrinsics, extrinsics = map(pack_sequence_dim, (image, intrinsics, extrinsics))

print(extrinsics.shape)

torch.Size([3, 6, 4, 4])


### Transform from Camera Frustums to Ego Frame

In [37]:
# The camera intrinsics and extrinsics are used to convert the images to the ego vehicle's reference frame
# in the 'get_geometry' method as follows:

def get_geometry(self, intrinsics, extrinsics):
    """Calculate the (x, y, z) 3D position of the features.

    Args:
        self: object providing a ``frustum`` attribute of shape (D, H, W, 3)
            holding (x, y, depth). If it has no such attribute (for example
            when ``None`` is passed), the notebook-level
            ``trainer.model.frustum`` is used instead.
        intrinsics: tensor viewable as (B, N, 3, 3); it is inverted, so every
            matrix must be invertible.
        extrinsics: tensor of shape (B, N, 4, 4); the top-left 3x3 block is used
            as the rotation and the first three rows of the last column as the
            translation.

    Returns:
        Tensor of shape (B, N, D, H, W, 3) with the transformed frustum points.
    """
    rotation, translation = extrinsics[..., :3, :3], extrinsics[..., :3, 3]
    B, N, _ = translation.shape

    # Prefer the frustum carried by `self`; fall back to the notebook's model so
    # existing calls that pass a placeholder for `self` keep working.
    frustum = getattr(self, 'frustum', None)
    if frustum is None:
        frustum = trainer.model.frustum

    # Add batch, camera dimension, and a dummy dimension at the end
    points = frustum.unsqueeze(0).unsqueeze(0).unsqueeze(-1)

    # Camera to ego reference frame
    # Scale x and y by the depth of their slice, keeping depth as the third
    # component.
    xy, depth = points[:, :, :, :, :, :2], points[:, :, :, :, :, 2:3]
    points = torch.cat((xy * depth, depth), 5)
    # Undo the intrinsics, then rotate: one 3x3 matrix per (batch, camera).
    combined_transformation = rotation.matmul(torch.inverse(intrinsics))
    points = combined_transformation.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
    points += translation.view(B, N, 1, 1, 1, 3)

    # The 3 dimensions in the ego reference frame are: (forward, sides, height)
    return points



In [38]:
# Let's dive into the function some more.

# Split each extrinsics matrix into its top-left 3x3 block (rotation) and the
# first three rows of its last column (translation).
rotation = extrinsics[..., :3, :3]
translation = extrinsics[..., :3, 3]
B, N, _ = translation.shape

# Add batch, camera dimension, and a dummy dimension at the end
points = trainer.model.frustum[None, None, ..., None]

print(
    f'The rotation shape is: {rotation.shape}',
    f'The translation shape is: {translation.shape}',
    f'The points shape is: {points.shape}',
    sep='\n',
)

The rotation shape is: torch.Size([3, 6, 3, 3])
The translation shape is: torch.Size([3, 6, 3])
The points shape is: torch.Size([1, 1, 48, 28, 60, 3, 1])


In [39]:
# Camera to ego reference frame

# Start again from the model's frustum with batch, camera and trailing dummy
# dimensions added.
points = trainer.model.frustum[None, None, ..., None]

# Separate the x-y coordinates from the depth along the coordinate axis.
x_y = points[..., :2, :]
depth = points[..., 2:3, :]

# Multiply the x and y points by the depth and concatenate the depth back on.
# This could be the actual creation of the frustum, since the x & y points
# now grow in proportion to the depth.
points = torch.cat([x_y * depth, depth], dim=5)
print(f'The frustum shape is {points.shape}')

The frustum shape is torch.Size([1, 1, 48, 28, 60, 3, 1])


In [40]:
# Here we transform the frustum from the respective cameras to the ego frame of reference.
# The final 3 dimensions in the ego reference frame are: (forward, sides, height)

# Undo the intrinsics, then rotate: one 3x3 matrix per (batch, camera).
combined_transformation = rotation @ intrinsics.inverse()

# Broadcast each matrix over every frustum point and drop the dummy last axis.
points = (combined_transformation.view(B, N, 1, 1, 1, 3, 3) @ points).squeeze(-1)

# Shift by each camera's translation.
points += translation.view(B, N, 1, 1, 1, 3)
print(f'The transformed frustums are returned transformed in the ego frame as: {points.shape}')

The transformed frustums are returned transformed in the ego frame as: torch.Size([3, 6, 48, 28, 60, 3])


### Encode Images

In [41]:
# Replace the packed images held in `x` with the output of the model's
# `encoder_forward`.
# NOTE(review): `x` is rebound to its own transformation, so re-running this
# cell feeds the previous output back into the encoder; re-run the packing
# cell first to restore the images.
x = trainer.model.encoder_forward(x)

### Project to BEV

This step follows the method 'trainer.model.projection_to_birds_eye_view(x, points)', which is primarily adapted from lift-splat-shoot at [this](https://github.com/nv-tlabs/lift-splat-shoot/blob/master/src/models.py#L200) point.



In [None]:
#some thing