One of the things worth considering for this competition is whether convolutions are really reasonable operations to use for self driving cars. We can test with various different toy problems to see if the models are able to capture the information we think they should, like the distance between cars.

This analysis was triggered by trying to train models at 128x128 and then fine-tune on 300x300 and finding that the performance took a significant hit right away when switching resolutions.

Goals: 
Done
* Can a CNN learn to measure distance between two points?
* Does a pretrained model already have some of this information?
* Is it resilient to new resolutions?
* What do the intermediate layers look like out of the CNN solving this problem?
* Does pooling put a limit on the performance of the model?


in progress:
* Can a CNN learn to compute area of a rectangle?

* Can a CNN simultaneously locate objects to measure between?
* Is it resilient to additional objects at validation time?

* Can a CNN count?


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

Simple model definition: a 1x1 convolution followed by global max and min pooling, then a stack of fully connected layers. The global pooling makes the model agnostic to resolution, so in theory it can be applied to images of any resolution beyond a certain threshold.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models.resnet import resnet18

class Net(nn.Module):
    """Small distance regressor: 1x1 conv, global max/min pooling, then an MLP.

    The 1x1 convolution mixes the 3 input channels into 2 feature maps. Each
    map is reduced to its global maximum and minimum over all pixels, giving
    4 numbers per sample regardless of the input resolution. Those 4 numbers
    are passed through 8 fully connected layers that produce one value per
    sample.
    """

    def __init__(self):
        super().__init__()
        # 1x1 convolution: 3 input channels -> 2 feature maps.
        self.preconv1 = nn.Conv2d(3, 2, 1)
        # 4 inputs = (max, min) for each of the 2 feature maps.
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc4 = nn.Linear(128, 128)
        self.fc5 = nn.Linear(128, 128)
        self.fc6 = nn.Linear(128, 128)
        self.fc7 = nn.Linear(128, 128)
        self.fc8 = nn.Linear(128, 1)

    def forward(self, x):
        """Return one prediction per sample, always with shape ``(batch,)``.

        Args:
            x: batched image tensor with 3 channels; it is cast to float32.
        """
        x = x.to(torch.float32)
        x = self.preconv1(x)
        # Flatten the spatial dimensions: (batch, channels, pixels).
        x = x.view(x.shape[0], x.shape[1], -1)
        x_max = x.max(dim=2)[0]
        # max(-x) is -min(x); the sign is flipped back on the rescale below.
        x_min = (-x).max(dim=2)[0]

        # Shift and rescale the pooled values.
        # NOTE(review): 128 presumably matches the 128x128 training
        # resolution -- confirm before changing.
        x_max = (x_max - 1) * 128
        x_min = (-x_min + 1) * 128

        x = torch.cat((x_max, x_min), dim=-1)
        hidden_layers = (
            self.fc1, self.fc2, self.fc3, self.fc4,
            self.fc5, self.fc6, self.fc7,
        )
        for fc in hidden_layers:
            x = F.elu(fc(x))
        x = self.fc8(x)

        # Drop only the trailing feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single sample.
        return x.squeeze(-1)

In [None]:
# Instantiate the custom network and move it to the GPU (requires CUDA).
net = Net()
net = net.to("cuda")

Resnet model we will explore in parallel. Similar principles. We can look at pretrained and randomly initialized and see what information is captured in the pretrained weights

In [None]:
class resnet_model(nn.Module):
    """ResNet-18 backbone whose classifier is replaced by a 1-output head.

    Args:
        pretrained: forwarded to ``resnet18``. Defaults to True; pass False
            for a randomly initialised backbone.
    """

    def __init__(self, pretrained=True):
        super().__init__()
        self.backbone = resnet18(pretrained=pretrained)
        # Replace the classification layer with a single regression output,
        # reading the feature width from the layer being replaced.
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, 1)

    def forward(self, x):
        """Return one prediction per sample with shape ``(batch,)``."""
        x = self.backbone(x)
        # Drop only the trailing feature axis so that a single-sample batch
        # keeps its batch dimension.
        return x.squeeze(-1)


# Build the ResNet-based regressor with its default arguments and move it to the GPU.
resnet = resnet_model().to("cuda")

In [None]:
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
import numpy as np

By creating a simple PyTorch dataset we can generate images of an arbitrary size and plot two random pixels on them. The model will then try to learn the Euclidean distance between these points.

In [None]:
# Blank channels-first array (3 channels, 128x128), the same layout the dataset below produces.
sample = np.zeros((3, 128, 128))


In [None]:
class measure_dataset(Dataset):
    """Synthetic dataset: images with two marked pixels, labelled by distance.

    Every item is generated randomly on access, so ``index`` is ignored and
    the same index returns a different sample each time.

    Args:
        size: ``(height, width)`` of the generated images.
        data_size: value reported by ``len()``, i.e. samples per epoch.
    """

    def __init__(self, size = (128, 128), data_size = 10000):
        self.size = size
        self.data_size = data_size

    def __len__(self):
        return self.data_size

    def __getitem__(self, index):
        """Return ``(sample, distance)`` for a freshly generated image.

        ``sample`` is a float array of shape ``(3, height, width)``:
          * channel 0 is zero except for +1 at the first point and -1 at the
            second (if both points coincide only the -1 remains);
          * channel 1 holds the column index divided by ``width + 1``;
          * channel 2 holds the row index divided by ``height + 1``.
        ``distance`` is the Euclidean distance in pixels between the points.
        """
        height, width = self.size
        sample = np.zeros((3, height, width))
        # Coordinate channels: x varies along columns, y along rows. Using
        # the matching extent for each axis keeps non-square sizes valid.
        sample[1, :, :] = np.arange(0, width)[None, :] / (width + 1)
        sample[2, :, :] = np.arange(0, height)[:, None] / (height + 1)

        # The draw order (x, x, y, y) is kept so that square sizes consume
        # the random stream exactly as before.
        first_x = np.random.randint(0, width)
        second_x = np.random.randint(0, width)
        first_y = np.random.randint(0, height)
        second_y = np.random.randint(0, height)

        # Arrays are indexed [row, column], i.e. [y, x].
        sample[0, first_y, first_x] = 1
        sample[0, second_y, second_x] = -1
        return sample, np.sqrt((first_x - second_x)**2 + (first_y - second_y)**2)
        
        

In [None]:
# Default dataset settings (128x128 images, 10000 samples per epoch).
trn_dataset = measure_dataset()
# drop_last=True discards the final partial batch so every batch has 512 samples.
trn_dataloader = DataLoader(trn_dataset, batch_size=512, drop_last = True)

In [None]:
# Pull a single batch (inputs x, labels y) from the loader for inspection.
x, y = next(iter(trn_dataloader))

In [None]:
# Inspect the shape of the sampled input batch.
x.shape

This is what the training samples and labels will look like

In [None]:
# Channels-last scratch image used to visualise products of input channels.
dual_channel = np.zeros((128, 128, 3))

In [None]:
# Multiply the point channel (0) of the first sample by each coordinate
# channel (1 and 2); the result can be non-zero only at the two marked pixels.
dual_channel[:, :, 0] = x[0, 0] * x[0, 1]
dual_channel[:, :, 1] = x[0, 0] * x[0, 2]

In [None]:
# Distinct values in the first product channel.
np.unique(dual_channel[:, :, 0])

In [None]:
# Distinct values in the second product channel.
np.unique(dual_channel[:, :, 1])

In [None]:
# Display the first product channel as an image.
plt.imshow(dual_channel[:, :, 0])

In [None]:
# Reorder the first sample from channels-first to channels-last for imshow,
# then print its label (the distance between the two points).
plt.imshow(x[0].permute(1, 2, 0))
print(y[0])

In [None]:
# Label of the first sample in the batch.
y[0]

In [None]:
# Display channel 2 (one of the two coordinate channels) of the first sample.
plt.imshow(x[0, 2])

We will now create a simple training loop that tries to train the models on this generated data

In [None]:
def train_model(trn_dataloader, model, epochs = 10, lr = 0.0001):
    """Train ``model`` with Adam on an L1 loss and return the per-batch losses.

    Args:
        trn_dataloader: iterable yielding ``(inputs, labels)`` batches.
        model: module to train. Batches are moved to the device of its
            parameters, so the model must already be on the target device.
        epochs: number of passes over ``trn_dataloader``.
        lr: Adam learning rate.

    Returns:
        List with one 0-d numpy array (the batch loss) per optimisation step.
    """
    criterion = nn.L1Loss()
    model.train()
    # Follow the model's own device instead of assuming a GPU is present.
    device = next(model.parameters()).device
    optimizer = optim.Adam(model.parameters(), lr=lr)
    running_loss = []
    for epoch in range(epochs):  # loop over the dataset multiple times
        for inputs, labels in trn_dataloader:
            inputs = inputs.to(device=device, dtype=torch.float32)
            labels = labels.to(device=device, dtype=torch.float32)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss.append(loss.detach().cpu().numpy())
        # Mean over the most recent 100 batches; this window can span more
        # than one epoch when an epoch has fewer than 100 batches.
        print("epoch", str(epoch) + ":",  np.mean(np.array(running_loss[-100:]).reshape(-1,)))
    return running_loss

We will start with the custom model we defined

In [None]:
# Hand-set the 1x1 convolution: output channel 0 = input channel 0 + channel 1,
# output channel 1 = input channel 0 + channel 2, with no bias.
with torch.no_grad():
    fixed_kernel = torch.tensor([[1.0, 1.0, 0.0],
                                 [1.0, 0.0, 1.0]])
    net.preconv1.weight.copy_(fixed_kernel.view(2, 3, 1, 1))
    net.preconv1.bias.zero_()

In [None]:
# Freeze the hand-set 1x1 convolution so training leaves it unchanged.
# requires_grad_ is a method: assigning False to it only replaces the
# attribute with a bool and leaves the parameters trainable.
net.preconv1.requires_grad_(False)

In [None]:
# Inspect the 1x1 convolution weights.
net.preconv1.weight.data

In [None]:
# Inspect the 1x1 convolution bias.
net.preconv1.bias.data

In [None]:
# Display channel 0 (the two marked points) of the first sample.
plt.imshow(x[0, 0])

In [None]:
# Smoke-test the network on a single-sample batch (slice keeps the batch axis).
net(x[0:1].to("cuda"))

In [None]:
# Train the custom network for 100 epochs; keeps the per-batch loss history.
running_loss = train_model(trn_dataloader, net, 100)

Next we will train our resnet18 pretrained model and see how it compares

In [None]:
# Train the ResNet model with the same loader and epoch count (overwrites running_loss).
running_loss = train_model(trn_dataloader, resnet, 100)

We can see right away that the pretrained resnet converges way faster and to a lower minimum than the custom convnet. We can test after if it is the pretrained weights that are special here or the better crafted NN architecture

Now we can validate the models. This isn't true validation since the data is randomly generated and can have overlaps with the training set, but I assume these will be fairly rare given the low probability of hitting the same two pixels out of 128x128 images. 

In [None]:
def validate_model(trn_dataloader, model):
    """Evaluate ``model`` and return per-sample L1 losses and labels.

    Args:
        trn_dataloader: iterable yielding ``(inputs, labels)`` batches.
        model: module to evaluate. Batches are moved to the device of its
            parameters.

    Returns:
        Tuple ``(losses, labels)`` of 1-D numpy arrays with one entry per
        evaluated sample, in iteration order.
    """
    val_criterion = nn.L1Loss(reduction = 'none')
    # Follow the model's own device instead of assuming a GPU is present.
    device = next(model.parameters()).device
    running_loss = []
    running_labels = []
    model.eval()
    with torch.no_grad():
        for inputs, labels in trn_dataloader:
            inputs = inputs.to(device=device, dtype=torch.float32)
            labels = labels.to(device=device, dtype=torch.float32)
            preds = model(inputs)
            loss = val_criterion(preds, labels)
            # Flatten each batch so 0-d results (single-sample batches) and
            # batches of different sizes can be joined below.
            running_loss.append(loss.reshape(-1).cpu().numpy())
            running_labels.append(labels.reshape(-1).cpu().numpy())
    if not running_loss:
        # Nothing was evaluated: np.concatenate rejects an empty list.
        return np.array([]), np.array([])
    return np.concatenate(running_loss), np.concatenate(running_labels)

The validate function will return the loss of each sample as well as the label for each sample. In this way we can plot the loss against the labels and see if the model is particularly worse at larger distances between points or any other pattern like that. 

In [None]:
# Per-sample loss of the custom network, plotted against the true distance.
running_loss, running_labels = validate_model(trn_dataloader, net)
plt.scatter(running_labels, running_loss)

This is quite an interesting loss vs label pattern. My understanding here is that the model quickly converges to the average case, on a 128x128 image the average euclidean distance between any two random points must be around 75. It probably converges to that point and then begins improving at the other labels predictions, but we can see it is still doing a poor job

In [None]:
# Per-sample loss of the ResNet model, plotted against the true distance.
running_loss, running_labels = validate_model(trn_dataloader, resnet)
plt.scatter(running_labels, running_loss)

We can see that the resnet pattern looks significantly different. For starters all loss values are below ~4.0 and there is not the V shape present from the other model. 

One thing to consider is the effect that the pooling layers have on our model. In theory max pooling is reducing the resolution of our image, but maybe the convolutions can reformat the data to still capture accurate positions regardless. For example putting a bit in a different channel to denote if it originated from the  top-left pixel or top-right pixel in a pooling layer.

Now I will train the simple model a bit further and then we can inspect the activations from the various layers to see what it is learning

In [None]:
# Continue training the custom network for another 100 epochs.
running_loss = train_model(trn_dataloader, net, 100)

In [None]:
# Re-check the shape of the batch sampled earlier.
x.shape

In [None]:
# Cast the sampled batch to float32 and move it to the GPU; this rebinds the global x.
x = x.to(torch.float32).to('cuda')

Now we can look at the activations of a specific sample across the layers of our simple CNN we trained and see what our model is learning

In [None]:
# Chain the sub-modules of `net` in registration order, feeding each one the
# output of the previous successful call, and collect the activations.
layers = []
with torch.no_grad():
    for i, module in enumerate(net.modules()):
        if i == 0:
            # modules() yields the network itself first; skip it.
            continue
        print(module)
        try:
            layers.append(module(layers[-1] if layers else x))
        except Exception as err:
            # Best effort: a module that cannot consume the previous
            # activation is skipped, but the reason is reported rather than
            # silently swallowed.
            print("failed layer:", err)

Viewing across the layers we can see how information is propagating through to the final predictions. I will just look at the first sample from a batch of data and trace it through to the final dense layer of the network. As showing many images all at once is difficult, I will limit the visualizations to a maximum of 32 channels per layer. Hopefully we can still find something amongst the incomplete set of images.

In [None]:
from mpl_toolkits.axes_grid1 import ImageGrid

# Show up to 32 channels of the first sample of every collected activation,
# laid out 8 images per row.
n_cols = 8
with torch.no_grad():
    for i, layer in enumerate(layers):
        channels = layer[0][:32].cpu().detach().numpy()
        print("layer" + str(i))
        if channels.ndim != 3 or len(channels) == 0:
            # Only (channels, height, width) activations can be drawn as images.
            print(channels.shape)
            continue
        n_rows = int(np.ceil(len(channels) / n_cols))
        # Size the figure from the grid shape (2 inches per cell) so it does
        # not grow to an unusable size as the channel count increases.
        fig = plt.figure(figsize=(n_cols * 2, n_rows * 2))
        grid = ImageGrid(fig, 111,  # similar to subplot(111)
                         nrows_ncols=(n_rows, n_cols),  # rows x 8 grid of axes
                         axes_pad=0.1,  # pad between axes in inch.
                         )

        for ax, im in zip(grid, channels):
            # Iterating over the grid returns the Axes.
            ax.imshow(im)
        # Shape of a single channel image for this layer.
        print(channels.shape[1:])
        plt.show()

From this we can see... not a whole lot. I am not sure what to infer from these images. It looks like the pixels are just being blurred. I don't entirely know what I expected to see here, but maybe something that looked more like: 

| ![frame 1](https://i.imgur.com/MmZWBWW.png) 	| ![frame 2](https://i.imgur.com/5IgnXRn.png) 	| ![frame 3](https://i.imgur.com/yW3DlMk.png) 	| ![frame 4](https://i.imgur.com/WizGDZz.png) 	| ![frame 5](https://i.imgur.com/mVhb7ME.png) 	| ![frame 6](https://i.imgur.com/u8y0FBH.png) 	| ![frame 7](https://i.imgur.com/WvyaRim.png) 	|    
|-	|-	|-	|-	|-	|-	|-	|

In theory then the model could sum up the pixels and infer distance with this information. It's unclear to me how the problem is solved in the activations we can visualize, but it seems to have learned sufficiently for the task of measuring distance

One hypothesis I have is that the model isn't trying to find the line between the two points, but is rather inferring x, y coordinates by measuring distance to the borders of the image. It would be interesting to see if the model has learned generalizable features for measuring distance or if it is specific to the resolution it was trained on. In practice I have seen models that work perfectly fine on alternate resolutions detecting cat vs dog and other tasks like that, so I would assume there is still some possibility of performance. I would think beyond a certain resolution we would have problems though, because the model has never learned to make predictions so high.

We can start with 256 x 256 and see what happens with our two different models

In [None]:
# Same synthetic task at 256x256, with a smaller batch size of 64.
big_trn_dataset = measure_dataset((256, 256))
big_trn_dataloader = DataLoader(big_trn_dataset, batch_size=64, drop_last = True)

In [None]:
# Custom network on 256x256 images: per-sample loss against true distance.
running_loss, running_labels = validate_model(big_trn_dataloader, net)
plt.scatter(running_labels, running_loss)

In [None]:
# ResNet model on 256x256 images: per-sample loss against true distance.
running_loss, running_labels = validate_model(big_trn_dataloader, resnet)
plt.scatter(running_labels, running_loss)

Interestingly we get a bit of a snail pattern on the custom CNN. Beyond 128 we start seeing a linear growth between label and loss. Kind of expected given that our model likely only learned to predict within the possibilities of distances for 128x128 images. 

With the resnet model we see linear growth almost the whole time. Very puzzling. Maybe we can inspect predictions more closely and understand what is causing that.

In [None]:
def validate_model(trn_dataloader, model):
    """Evaluate ``model`` and return per-sample losses, labels and predictions.

    Args:
        trn_dataloader: iterable yielding ``(inputs, labels)`` batches.
        model: module to evaluate. Batches are moved to the device of its
            parameters.

    Returns:
        Tuple ``(losses, labels, predictions)`` of 1-D numpy arrays with one
        entry per evaluated sample, in iteration order.
    """
    val_criterion = nn.L1Loss(reduction = 'none')
    # Follow the model's own device instead of assuming a GPU is present.
    device = next(model.parameters()).device
    running_loss = []
    running_labels = []
    predictions = []
    model.eval()
    with torch.no_grad():
        for inputs, labels in trn_dataloader:
            inputs = inputs.to(device=device, dtype=torch.float32)
            labels = labels.to(device=device, dtype=torch.float32)
            preds = model(inputs)
            loss = val_criterion(preds, labels)
            # Flatten each batch so 0-d results (single-sample batches) and
            # batches of different sizes can be joined below.
            running_loss.append(loss.reshape(-1).cpu().numpy())
            running_labels.append(labels.reshape(-1).cpu().numpy())
            predictions.append(preds.reshape(-1).cpu().numpy())
    if not running_loss:
        # Nothing was evaluated: np.concatenate rejects an empty list.
        return np.array([]), np.array([]), np.array([])
    return (
        np.concatenate(running_loss),
        np.concatenate(running_labels),
        np.concatenate(predictions),
    )

In [None]:
# Summary statistics and histogram of the ResNet predictions on 256x256 images.
_, _, predictions = validate_model(big_trn_dataloader, resnet)
predictions_df = pd.DataFrame(predictions)
print(predictions_df.describe())
predictions_df.hist(bins = 20)
plt.title("distribution of predictions of resnet")

In [None]:
# Summary statistics and histogram of the custom network's predictions on 256x256 images.
_, _, predictions = validate_model(big_trn_dataloader, net)
predictions_df = pd.DataFrame(predictions)
print(predictions_df.describe())
predictions_df.hist(bins = 20)
plt.title("distribution of predictions of custom CNN")

Interestingly the models have very different modes of failure. The resnet seems to have predictions centered around the totally wrong region, while the custom CNN is skewing its predictions to the top of the range it is capable of but cannot predict beyond that range.

Now on the flip side we can try predicting on smaller images. Maybe performance will hold up when we aren't trying to extrapolate beyond a known range.

In [None]:
# Evaluate the ResNet model on smaller 110x110 images.
sml_trn_dataset = measure_dataset((110, 110))
sml_trn_dataloader = DataLoader(sml_trn_dataset, batch_size=64, drop_last = True)
running_loss, running_labels, predictions = validate_model(sml_trn_dataloader, resnet)
# Per-sample loss against true distance.
plt.scatter(running_labels, running_loss)

# Summary statistics and histogram of the predictions.
predictions_df = pd.DataFrame(predictions)
print(predictions_df.describe())
predictions_df.hist(bins = 20)
plt.title("distribution of predictions of resnet")

In [None]:
# Evaluate the custom network on the same 110x110 loader.
running_loss, running_labels, predictions = validate_model(sml_trn_dataloader, net)
# Per-sample loss against true distance.
plt.scatter(running_labels, running_loss)

# Summary statistics and histogram of the predictions.
predictions_df = pd.DataFrame(predictions)
print(predictions_df.describe())
predictions_df.hist(bins = 20)
plt.title("distribution of predictions of custom CNN")

Interestingly, on the slightly smaller images the resnet does significantly better and the custom CNN seems to have decayed rapidly. It would be interesting to see an ablation done over varying resolutions to see what range of resolutions are acceptable for the models.

to be continued...

In [None]:
# Resolution sweep for the ResNet model: square images from 64 up to 254 px
# in steps of 10, recording the mean per-sample loss at each size.
losses = []
for i in range(64, 256, 10):
    sml_trn_dataset = measure_dataset((i, i))
    sml_trn_dataloader = DataLoader(sml_trn_dataset, batch_size=64, drop_last = True)
    running_loss, running_labels, predictions = validate_model(sml_trn_dataloader, resnet)
    # Per-sample loss against true distance at this resolution.
    plt.scatter(running_labels, running_loss)

    predictions_df = pd.DataFrame(predictions)
    print("image size:", str(i))
    print(predictions_df.describe())
    predictions_df.hist(bins = 20)
    plt.title("distribution of predictions of resnet")
    plt.show()
    losses.append(np.mean(running_loss))
    
# Mean loss as a function of image size.
plt.plot(list(range(64, 256, 10)), losses)

In [None]:
# Resolution sweep for the custom network; this one starts at 110 px rather
# than 64, in steps of 10 up to 250 px.
losses = []
for i in range(110, 256, 10):
    sml_trn_dataset = measure_dataset((i, i))
    sml_trn_dataloader = DataLoader(sml_trn_dataset, batch_size=64, drop_last = True)
    running_loss, running_labels, predictions = validate_model(sml_trn_dataloader, net)
    # Per-sample loss against true distance at this resolution.
    plt.scatter(running_labels, running_loss)

    predictions_df = pd.DataFrame(predictions)
    print("image size:", str(i))
    print(predictions_df.describe())
    predictions_df.hist(bins = 20)
    plt.title("distribution of predictions of custom NN")
    plt.show()
    # Mean per-sample loss at this resolution.
    losses.append(np.mean(running_loss))
    


In [None]:
# Mean loss of the custom network as a function of image size; the range must match the sweep above.
plt.plot(list(range(110, 256, 10)), losses)