This notebook is based entirely on the CenterNet baseline notebook by Ruslan Baynazarov (https://www.kaggle.com/hocop1/centernet-baseline). I am only trying to understand Ruslan's code here.

* The CenterNet paper ("Objects as Points"): https://arxiv.org/pdf/1904.07850.pdf

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
from tqdm import tqdm#_notebook as tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
import os
from scipy.optimize import minimize
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from torchvision import transforms, utils

In [None]:
PATH = '../input/pku-autonomous-driving/'
os.listdir(PATH)

In [None]:
import pandas as pd
#sample_submission = pd.read_csv("../input/pku-autonomous-driving/sample_submission.csv")
train = pd.read_csv("../input/pku-autonomous-driving/train.csv")

In [None]:
# 3x3 camera matrix used to project 3D points (x, y, z) to pixel coordinates.
# Layout: [[fx, 0, cx], [0, fy, cy], [0, 0, 1]] -- the same fx, fy, cx, cy
# values appear as the defaults of convert_3d_to_2d.
camera_matrix = np.array([[2304.5479, 0,  1686.2379],
                          [0, 2305.8757, 1354.9849],
                          [0, 0, 1]], dtype=np.float32)
# Inverse of the camera matrix (maps homogeneous pixel coordinates back to camera rays).
camera_matrix_inv = np.linalg.inv(camera_matrix)


In [None]:
def imread(path, fast_mode=False):
    '''
    Read an image from disk with OpenCV.
    Input:
        path: path of the image file
        fast_mode: if True, return exactly what cv2.imread returned
    Output:
        the image array; for 3-dimensional arrays (unless fast_mode is set)
        the last axis is reversed and the result is copied into a new array.
        Whatever cv2.imread returned (including None) is passed through
        unchanged in every other case.
    '''
    image = cv2.imread(path)
    if fast_mode or image is None or len(image.shape) != 3:
        return image
    # Reverse the channel axis (cv2.imread's channel order is the opposite of
    # what plt.imshow expects) and materialise the view as a real array.
    return np.array(image[:, :, ::-1])


In [None]:
img = imread(PATH + 'train_images/ID_8a6e65317' + '.jpg')

IMG_SHAPE = img.shape
IMG_SHAPE

In [None]:
plt.figure(figsize=(15,8))
plt.imshow(img)

## Extract pose data from prediction string:

In [None]:
def str2coords(s, names=('id', 'yaw', 'pitch', 'roll', 'x', 'y', 'z')):
    '''
    Input:
        s: PredictionString (e.g. from train dataframe): whitespace-separated
           numbers, 7 per car
        names: sequence of keys assigned, in order, to the 7 numbers of each
               car (if shorter than 7, the remaining numbers are dropped)
    Output:
        list of dicts with keys from `names`; values are floats, except
        'id' which is converted to int
    Raises:
        ValueError if the number of tokens in `s` is not a multiple of 7
    '''
    coords = []
    for fields in np.array(s.split()).reshape([-1, 7]):
        car = dict(zip(names, fields.astype('float')))
        if 'id' in car:
            car['id'] = int(car['id'])
        coords.append(car)
    return coords

In [None]:
train["PredictionString"][0]

In [None]:
inp = train['PredictionString'][0]
print('Example input:\n', inp)
print()
print('Output:\n', str2coords(inp))

# 2D Visualization

In [None]:
def get_img_coords(s):
    '''
    Project the 3D car positions of a PredictionString onto the image.
    Input is a PredictionString (e.g. from train dataframe)
    Output is two arrays:
        xs: x coordinates in the image (horizontal axis, i.e. column)
        ys: y coordinates in the image (vertical axis, i.e. row)
    Both arrays are empty when the string contains no cars.
    '''
    coords = str2coords(s)
    # One row per car; reshape keeps the (n, 3) shape even when n == 0 so
    # that the matrix product below is still well defined.
    P = np.array([[c['x'], c['y'], c['z']] for c in coords], dtype=float).reshape(-1, 3).T
    img_p = np.dot(camera_matrix, P).T
    # Perspective division: the third column is z, the distance from the camera.
    img_p[:, 0] /= img_p[:, 2]
    img_p[:, 1] /= img_p[:, 2]
    return img_p[:, 0], img_p[:, 1]

# Overlay the projected car positions of training row 2217 on its image.
plt.figure(figsize=(14,14))
plt.imshow(imread(PATH + 'train_images/' + train['ImageId'][2217] + '.jpg'))
# get_img_coords returns (xs, ys), which scatter takes as horizontal/vertical positions.
plt.scatter(*get_img_coords(train['PredictionString'][2217]), color='yellow', s=100);

In [None]:
get_img_coords(train['PredictionString'][2217])

# 3D Visualization

This section apparently uses code from here: https://www.kaggle.com/zstusnoopy/visualize-the-location-and-3d-bounding-box-of-car

In [None]:
from math import sin, cos

def euler_to_Rot(yaw, pitch, roll):
    '''
    Convert Euler angles (in radians) to a 3x3 rotation matrix.
    The result is Y * P * R, where Y rotates by `yaw` about the y axis,
    P rotates by `pitch` about the x axis and R rotates by `roll` about
    the z axis.
    '''
    cy, sy = cos(yaw), sin(yaw)
    cp, sp = cos(pitch), sin(pitch)
    cr, sr = cos(roll), sin(roll)
    rot_y = np.array([[cy, 0, sy],
                      [0, 1, 0],
                      [-sy, 0, cy]])
    rot_x = np.array([[1, 0, 0],
                      [0, cp, -sp],
                      [0, sp, cp]])
    rot_z = np.array([[cr, -sr, 0],
                      [sr, cr, 0],
                      [0, 0, 1]])
    return rot_y.dot(rot_x.dot(rot_z))

In [None]:
def draw_line(image, points):
    '''
    Draw the closed quadrilateral points[0] -> points[1] -> points[2] ->
    points[3] -> points[0] on `image` (in place) and return the image.
    Only the first two components of each point are used as pixel
    coordinates.
    '''
    color = (255, 0, 0)
    thickness = 16
    # Same four edges, in the same order, as the original explicit calls.
    for start, end in ((0, 3), (0, 1), (1, 2), (2, 3)):
        cv2.line(image, tuple(points[start][:2]), tuple(points[end][:2]), color, thickness)
    return image


def draw_points(image, points):
    '''
    Draw a filled circle with color (0, 255, 0) on `image` (in place) for
    every point and return the image.
    Input:
        image: array OpenCV can draw on
        points: iterable of (x, y, z) triples; x, y are pixel coordinates
                and the circle radius is int(1000 / z), so points with a
                larger z are drawn smaller
    Points whose z is not positive are skipped: z == 0 would break the
    radius computation and a negative z would yield a negative radius.
    '''
    for (p_x, p_y, p_z) in points:
        if p_z <= 0:
            continue
        # Built-in ints keep OpenCV's argument parsing independent of the
        # numpy integer type the coordinates arrive in.
        cv2.circle(image, (int(p_x), int(p_y)), int(1000 / p_z), (0, 255, 0), -1)
    return image

In [None]:
def visualize(img, coords):
    '''
    Draw, for every car in `coords`, a fixed-size rectangle and its centre
    point on a copy of `img`.
    Input:
        img: image array (left untouched; drawing happens on a copy)
        coords: list of dicts with keys 'x', 'y', 'z', 'yaw', 'pitch', 'roll'
                (e.g. the output of str2coords)
    Output:
        the annotated copy of the image
    Needs euler_to_Rot, draw_line, draw_points and camera_matrix from the
    previous cells.
    '''
    # Fixed half-extents used for every car.
    x_l = 1.02
    y_l = 0.80
    z_l = 2.31
    # Homogeneous points in the car's own frame, one per column: four corners
    # of a rectangle at y = -y_l, followed by the car's origin.
    corners = np.array([[x_l, -y_l, -z_l, 1],
                        [x_l, -y_l, z_l, 1],
                        [-x_l, -y_l, z_l, 1],
                        [-x_l, -y_l, -z_l, 1],
                        [0, 0, 0, 1]]).T

    canvas = img.copy()
    for car in coords:
        # The labelled pitch is fed to euler_to_Rot as yaw and vice versa,
        # all three with the sign flipped.
        yaw, pitch, roll = -car['pitch'], -car['yaw'], -car['roll']
        # 4x4 pose: transposed rotation plus translation to the car position.
        pose = np.eye(4)
        pose[:3, :3] = euler_to_Rot(yaw, pitch, roll).T
        pose[:3, 3] = np.array([car['x'], car['y'], car['z']])
        # Project to the image and apply the perspective division.
        projected = np.dot(camera_matrix, np.dot(pose[:3, :], corners)).T
        projected[:, 0] /= projected[:, 2]
        projected[:, 1] /= projected[:, 2]
        pixels = projected.astype(int)
        # draw_line uses the first four rows; the last row is the centre.
        canvas = draw_line(canvas, pixels)
        canvas = draw_points(canvas, pixels[-1:])

    return canvas


In [None]:
# Show the first n_rows training images side by side:
# left = raw image, right = the same image annotated by visualize().
n_rows = 6

for idx in range(n_rows):
    fig, axes = plt.subplots(1, 2, figsize=(20,20))
    img = imread(PATH + 'train_images/' + train['ImageId'].iloc[idx] + '.jpg')
    axes[0].imshow(img)
    # visualize() draws on a copy, so `img` shown on the left stays unannotated.
    img_vis = visualize(img, str2coords(train['PredictionString'].iloc[idx]))
    axes[1].imshow(img_vis)
    plt.show()

In [None]:
idx = 0
#fig, axes = plt.subplots(1, 2, figsize=(20,20))


In [None]:
PATH + 'train_images/' + train['ImageId'].iloc[idx] + '.jpg'

In [None]:
train['PredictionString'][idx]

In [None]:
train['ImageId'].iloc[idx]

In [None]:
str2coords(train['PredictionString'].iloc[idx])

As far as I understand, the function `visualize` uses the same fixed size for every car's box (the constants `x_l`, `y_l` and `z_l`) rather than a size that depends on the car.

# 3D visualization

In [None]:
IMG_WIDTH = 1024  # width that preprocess_image resizes to
IMG_HEIGHT = IMG_WIDTH // 16 * 5  # height that preprocess_image resizes to (= 320)
MODEL_SCALE = 8  # the mask/regr grids are IMG_HEIGHT // MODEL_SCALE by IMG_WIDTH // MODEL_SCALE (40 x 128)

def preprocess_image(img, flip=False):
    '''
    Turn a raw image into the (IMG_HEIGHT, IMG_WIDTH) float32 input image.
    Steps:
        1. keep only the lower half of the rows
        2. pad the left and right side with a strip one sixth of the image
           width wide, filled with the mean of each row
        3. resize to (IMG_WIDTH, IMG_HEIGHT)
        4. optionally mirror horizontally (flip=True)
        5. divide by 255 and cast to float32
    '''
    lower_half = img[img.shape[0] // 2:]
    # Per-row mean (axis 1 = along the width), cast back to the image dtype.
    row_mean = lower_half.mean(1, keepdims=True).astype(lower_half.dtype)
    side_pad = (np.ones_like(lower_half) * row_mean)[:, :lower_half.shape[1] // 6]
    padded = np.concatenate([side_pad, lower_half, side_pad], 1)
    resized = cv2.resize(padded, (IMG_WIDTH, IMG_HEIGHT))
    if flip:
        resized = resized[:,::-1]
    return (resized / 255).astype('float32')


In [None]:
img0 = imread(PATH + 'train_images/' + train['ImageId'][22] + '.jpg')
img = preprocess_image(img0)

In [None]:
img0.shape

In [None]:
img.shape

In [None]:
plt.imshow(img0)

In [None]:
plt.imshow(img)

# Image Preprocessing

In [None]:
def rotate(x, angle):
    '''
    Add `angle` to `x` (both in radians) and wrap the sum into [-pi, pi).
    A sum of exactly pi is therefore returned as -pi.
    '''
    total = x + angle
    # Number of whole turns to remove so that the result lands in [-pi, pi).
    wraps = (total + np.pi) // (2 * np.pi)
    return total - wraps * 2 * np.pi



def _regr_preprocess(regr_dict, flip=False):
    '''
    Convert one car's label dict into regression targets (in place) and
    return it.
    Input:
        regr_dict: dict with keys 'id', 'yaw', 'pitch', 'roll', 'x', 'y', 'z'
        flip: if True, negate x, pitch and roll first
    Output:
        the same dict with x, y, z divided by 100, roll shifted by pi (and
        wrapped by rotate), 'pitch' replaced by 'pitch_sin' and 'pitch_cos',
        and 'id' removed -- 7 values in total.
    '''
    if flip:
        regr_dict.update({key: -regr_dict[key] for key in ('x', 'pitch', 'roll')})
    # Scale the position down by a constant factor of 100.
    regr_dict.update({axis: regr_dict[axis] / 100 for axis in ('x', 'y', 'z')})
    regr_dict['roll'] = rotate(regr_dict['roll'], np.pi)
    # Encode pitch by its sine and cosine instead of the raw angle.
    pitch = regr_dict.pop('pitch')
    regr_dict['pitch_sin'] = sin(pitch)
    regr_dict['pitch_cos'] = cos(pitch)
    regr_dict.pop('id')
    return regr_dict



def get_mask_and_regr(img, labels, flip=False):
    '''
    Build the training targets for one image.
    Input:
        img: the raw image; only its shape is used (to map pixel positions
             to grid cells), never its content
        labels: one PredictionString (a single string) from the train data
        flip: if True, targets are produced for the horizontally mirrored image
    Output:
        mask: float32 array (IMG_HEIGHT // MODEL_SCALE, IMG_WIDTH // MODEL_SCALE),
              1 in every cell that contains a car centre, 0 elsewhere
        regr: float32 array of the same grid size with 7 values per cell
              (zeros where there is no car)
    '''
    grid_h = IMG_HEIGHT // MODEL_SCALE
    grid_w = IMG_WIDTH // MODEL_SCALE
    mask = np.zeros([grid_h, grid_w], dtype='float32')
    regr = np.zeros([grid_h, grid_w, 7], dtype='float32')
    cars = str2coords(labels)  # list of dicts, one per car
    img_xs, img_ys = get_img_coords(labels)  # pixel positions of the car centres
    for px, py, car in zip(img_xs, img_ys, cars):
        # Row index: undo the "keep lower half" crop, then scale to the grid.
        row = (py - img.shape[0] // 2) * IMG_HEIGHT / (img.shape[0] // 2) / MODEL_SCALE
        row = np.round(row).astype('int')
        # Column index: account for the side padding, then scale to the grid.
        col = (px + img.shape[1] // 6) * IMG_WIDTH / (img.shape[1] * 4/3) / MODEL_SCALE
        col = np.round(col).astype('int')
        if not (0 <= row < grid_h and 0 <= col < grid_w):
            continue  # centre falls outside the grid
        mask[row, col] = 1
        car = _regr_preprocess(car, flip)
        # Values are stored in alphabetical key order: pitch_cos, pitch_sin,
        # roll, x, y, yaw, z. extract_coords sorts the same names to decode them.
        regr[row, col] = [car[key] for key in sorted(car)]
    if flip:
        mask = np.array(mask[:,::-1])
        regr = np.array(regr[:,::-1])
    return mask, regr

In [None]:
# clear_duplicates treats two detections whose (x, y, z) positions are closer
# than this Euclidean distance as the same car (same units as the x, y, z labels).
DISTANCE_THRESH_CLEAR = 2

def convert_3d_to_2d(x, y, z, fx=2304.5479, fy=2305.8757, cx=1686.2379, cy=1354.9849):
    '''
    Project a 3D point (x, y, z) to image coordinates using the focal
    lengths (fx, fy) and offsets (cx, cy).
    Returns the pair (x * fx / z + cx, y * fy / z + cy).
    Adapted from https://www.kaggle.com/theshockwaverider/eda-visualization-baseline
    '''
    u = x * fx / z + cx
    v = y * fy / z + cy
    return u, v

def clear_duplicates(coords):
    '''
    Remove near-duplicate detections.
    Input:
        coords: list of dicts with keys 'x', 'y', 'z' and 'confidence'
    Output:
        list of the dicts that survive
    A detection is suppressed (its 'confidence' is overwritten with -1, in
    place) when another detection closer than DISTANCE_THRESH_CLEAR currently
    has a higher confidence. Detections are processed in list order, so an
    already suppressed detection no longer suppresses others. Only detections
    with confidence > 0 are returned.
    '''
    positions = [np.array([c['x'], c['y'], c['z']]) for c in coords]
    for candidate, cand_xyz in zip(coords, positions):
        for other, other_xyz in zip(coords, positions):
            gap = np.sqrt(((cand_xyz - other_xyz)**2).sum())
            if gap < DISTANCE_THRESH_CLEAR and candidate['confidence'] < other['confidence']:
                candidate['confidence'] = -1
    return [c for c in coords if c['confidence'] > 0]


def optimize_xy(r, c, x0, y0, z0, flipped=False):
    '''
    Adjust a predicted 3D position so that its projection falls onto the
    grid cell (r, c) in which the car was detected.
    Input:
        r, c: row and column of the detection in the output grid
        x0, y0, z0: regressed 3D position, used as the starting point
        flipped: only used by the disabled slope term below, so it currently
                 has no effect on the result
    Output:
        (x, y, z) found by scipy's `minimize` with the Powell method
    '''
    def distance_fn(xyz):
        x, y, z = xyz
        xx = -x if flipped else x  # only needed by the disabled slope term
        # NOTE: the baseline used (xzy_slope.predict([[xx,z]])[0] - y)**2 here;
        # it was replaced by zero in this notebook (xzy_slope is not defined here).
        slope_err = 0.0
        # Project to pixel coordinates, then swap so that x is the row direction.
        x, y = convert_3d_to_2d(x, y, z)
        y, x = x, y
        # Same pixel -> grid-cell mapping as in get_mask_and_regr (without rounding).
        x = (x - IMG_SHAPE[0] // 2) * IMG_HEIGHT / (IMG_SHAPE[0] // 2) / MODEL_SCALE
        y = (y + IMG_SHAPE[1] // 6) * IMG_WIDTH / (IMG_SHAPE[1] * 4 / 3) / MODEL_SCALE
        # Squared grid distance, floored at 0.2; with slope_err == 0 the second
        # term is the constant 0.4.
        return max(0.2, (x-r)**2 + (y-c)**2) + max(0.4, slope_err)
    
    res = minimize(distance_fn, [x0, y0, z0], method='Powell')
    x_new, y_new, z_new = res.x
    return x_new, y_new, z_new

def _regr_back(regr_dict):
    '''
    Undo _regr_preprocess on one detection (in place) and return it.
    Input:
        regr_dict: dict with keys 'x', 'y', 'z', 'roll', 'pitch_sin',
                   'pitch_cos' (other keys are left untouched)
    Output:
        the same dict with x, y, z multiplied by 100, roll shifted back by
        -pi (and wrapped by rotate), and a new key 'pitch' holding the angle
        whose sine/cosine are proportional to pitch_sin/pitch_cos.
        'pitch_sin' and 'pitch_cos' stay in the dict.
    '''
    for name in ['x', 'y', 'z']:
        regr_dict[name] = regr_dict[name] * 100
    regr_dict['roll'] = rotate(regr_dict['roll'], -np.pi)

    # arctan2 recovers the angle in every quadrant without normalising the
    # (sin, cos) pair first. Unlike arccos(cos) * sign(sin) it returns pi
    # (not 0) for sin == 0, cos < 0, and 0 (not NaN) when both are zero.
    regr_dict['pitch'] = np.arctan2(regr_dict['pitch_sin'], regr_dict['pitch_cos'])
    return regr_dict

def extract_coords(prediction, flipped=False):
    '''
    Decode a prediction into a list of car dicts.
    Input:
        prediction: array of shape (8, rows, cols) -- channel 0 holds the
                    detection logits, channels 1..7 the regression values
        flipped: forwarded to optimize_xy
    Output:
        list of dicts with the regressed values plus 'pitch' and
        'confidence', after duplicate removal by clear_duplicates
    '''
    logits = prediction[0]        # (rows, cols) raw scores, not yet probabilities
    regr_output = prediction[1:]  # (7, rows, cols)
    # Same alphabetical order in which get_mask_and_regr stored the values.
    col_names = sorted(['x', 'y', 'z', 'yaw', 'pitch_sin', 'pitch_cos', 'roll'])
    detections = []
    # A cell counts as a detection when its logit is positive
    # (i.e. its sigmoid is above 0.5).
    for r, c in np.argwhere(logits > 0):
        car = _regr_back(dict(zip(col_names, regr_output[:, r, c])))
        car['confidence'] = 1 / (1 + np.exp(-logits[r, c]))  # sigmoid of the logit
        # Refine the position so that it projects back onto cell (r, c).
        car['x'], car['y'], car['z'] = optimize_xy(r, c, car['x'], car['y'], car['z'], flipped)
        detections.append(car)
    return clear_duplicates(detections)

In [None]:
# Round trip: encode the ground-truth labels of one image as (mask, regr)
# and decode them again with extract_coords, as if they were a model output.
ax_i = 0  # ax_i == 1 would request the horizontally flipped variant
img0 = imread(PATH + 'train_images/' + train['ImageId'].iloc[idx] + '.jpg')
#img = preprocess_image(img0, ax_i==1)
mask, regr = get_mask_and_regr(img0, train['PredictionString'][idx], ax_i==1)
# mask has shape (IMG_HEIGHT // MODEL_SCALE, IMG_WIDTH // MODEL_SCALE) = (40, 128),
# i.e. the preprocessed image size divided by MODEL_SCALE.
# regr has one more axis of length 7 for the regressed values: (40, 128, 7).
regr = np.rollaxis(regr, 2, 0)  # move axis 2 to the front: shape is now (7, 40, 128)
# extract_coords expects the regression values in this channel-first layout.
mrconc = np.concatenate([mask[None], regr], 0)  # mask[None] adds a leading axis: (1, 40, 128) + (7, 40, 128) -> (8, 40, 128)
coords = extract_coords(mrconc, ax_i==1)  # mrconc plays the role of the prediction; the 0/1 mask is used as the logits

In [None]:
mask[None].shape

In [None]:
mask.shape

In [None]:
mrconc.shape

In [None]:
mrconc = np.concatenate([mask[None], regr], 0)
#mask[None].shape

In [None]:
# This function is only for understanding purposes: it is `rotate` in degrees.
def rotate_degrees(x, angle):
    '''
    Add `angle` to `x` (both in degrees) and wrap the sum into [-180, 180).
    A sum of exactly 180 is therefore returned as -180.
    '''
    total = x + angle
    # Number of whole turns to remove so that the result lands in [-180, 180).
    wraps = (total + 180.0) // (2 * 180.0)
    return total - wraps * 2 * 180.0

In [None]:
rotate_degrees(50,180) # 50 + 180 = 230 is wrapped into [-180, 180), giving -130; a sum of exactly 180 becomes -180.

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20,20))