# Tip-of-the-Tongue: Doodle-Image Retrieval Engine

__Group 1:__ 
- Ai Bo (TODO)
- New Jun Jie (TODO)
- Rishabh Anand (A0220603Y)

---

__Tip of the tongue__ refers to the situation when we have a vague idea of an object in our memory but simply cannot name it. Most often than not, we feel retrieval of the object's name is imminent. However, we can definitely draw out a doodle of this object when asked to. The objective of this CS4243 project is to investigate the design of learning algorithms for retrieving a collection of real-world images from these manually drawn doodles. 

As this module is on Computer Vision, our project focuses on the dataset collection and preprocessing, as well as model selection, training, and testing _only_. One can easily package the models into a search engine that takes in a doodle and returns the top-k matching images.

As a taster, here are some interesting results:
<!-- ADD RESULTS -->

In [57]:
import numpy as np
import torch, cv2
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, Sampler
from torchvision import transforms
from torchinfo import summary

## Training and Testing Dataset

The dataset consists of an amalgam of over 1 million doodles web-scraped from the following sources:

- Google Quick, Draw!
- Sketchy

It also features real-life images web-scraped from:

- Google Images 
-

Our final dataset is an unpaired 

In [58]:
def combined_dataset(datasets, size):
    combined_dataset = {}
    for name, dataset in datasets.items():
        for class_name, class_data in dataset.items():
            if class_name not in combined_dataset:
                combined_dataset[class_name] = []
            # resize data so they can be stacked
            resized = []
            for data in class_data:
                resized.append(cv2.resize(data, (size, size), interpolation=cv2.INTER_AREA))
            resized = np.stack(resized, axis=0)
            combined_dataset[class_name].append(resized)
    for class_name, lst_datasets in combined_dataset.items():
        combined_dataset[class_name] = np.concatenate(lst_datasets, axis=0)
    return combined_dataset


class ImageDataset(Dataset):
    DATASET_DIR = {True: 'dataset/dataset_train.npy', False: 'dataset/dataset_test.npy'}

    def __init__(self, doodles_list, real_list, doodle_size, real_size, train: bool):
        super(ImageDataset, self).__init__()

        dataset = np.load(self.DATASET_DIR[train], allow_pickle=True)[()]

        doodle_datasets = {name: data for name, data in dataset.items() if name in doodles_list}
        real_datasets = {name: data for name, data in dataset.items() if name in real_list}
        self.doodle_dict = combined_dataset(doodle_datasets, doodle_size)
        self.real_dict = combined_dataset(real_datasets, real_size)

        # sanity check
        assert set(self.doodle_dict.keys()) == set(self.real_dict.keys()), \
            f'doodle and real images label classes do not match'

        # process classes
        label_idx = {}
        for key in self.doodle_dict.keys():
            if key not in label_idx:
                label_idx[key] = len(label_idx)
        self.label_idx = label_idx

        # parse data and labels
        self.doodle_data, self.doodle_label = self._return_x_y_pairs(self.doodle_dict, label_idx)
        self.real_data, self.real_label = self._return_x_y_pairs(self.real_dict, label_idx)

        # data preprocessing
        self.doodle_preprocess = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(doodle_size),
            transforms.ToTensor(),
            transforms.Normalize((self.doodle_data/255).mean(), (self.doodle_data/255).std())   # IMPORTANT / 255
        ])

        self.real_preprocess = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(real_size),
            transforms.ToTensor(),
            transforms.Normalize((self.real_data/255).mean(axis=(0, 1, 2)), (self.real_data/255).std(axis=(0, 1, 2)))
        ])

        print(f'Train = {train}. Doodle list: {doodles_list}, \n real list: {real_list}. \n classes: {label_idx.keys()} \n'
              f'Doodle data size {len(self.doodle_data)}, real data size {len(self.real_data)}, '
              f'ratio {len(self.doodle_data)/len(self.real_data)}')

    def _return_x_y_pairs(self, data_dict, category_mapping):
        xs, ys = [], []
        for key in data_dict.keys():
            data = data_dict[key]
            labels = [category_mapping[key]] * len(data)
            xs.append(data)
            ys.extend(labels)
        return np.concatenate(xs, axis=0), np.array(ys)

    def __getitem__(self, idx):
        # naive sampling scheme - sample with replacement
        # sample label first so that doodle and real data belong to the same category
        label = random.choice(list(self.label_idx.keys()))
        doodle_data = self.doodle_preprocess(random.choice(self.doodle_dict[label]))
        real_data = self.real_preprocess(random.choice(self.real_dict[label]))
        numer_label = self.label_idx[label]
        return doodle_data, numer_label, real_data, numer_label

    def __len__(self):
        return max(len(self.doodle_data), len(self.real_data))     # could be arbitrary number

In [59]:
doodles = ['sketchy_doodle', 'tuberlin', 'google_doodles']
reals = ['sketchy_real', 'google_real', 'cifar']

doodle_size = 64
real_size = 64

train_set = ImageDataset(doodles, reals, doodle_size, real_size, train=True)
val_set = ImageDataset(doodles, reals, doodle_size, real_size, train=False)

print(len(train_set[0]))

Train = True. Doodle list: ['sketchy_doodle', 'tuberlin', 'google_doodles'], 
 real list: ['sketchy_real', 'google_real', 'cifar']. 
 classes: dict_keys(['airplane', 'car', 'cat', 'dog', 'frog', 'horse', 'truck', 'bird', 'ship']) 
Doodle data size 7022, real data size 46364, ratio 0.15145371408851696
Train = False. Doodle list: ['sketchy_doodle', 'tuberlin', 'google_doodles'], 
 real list: ['sketchy_real', 'google_real', 'cifar']. 
 classes: dict_keys(['airplane', 'car', 'cat', 'dog', 'frog', 'horse', 'truck', 'bird', 'ship']) 
Doodle data size 1764, real data size 9341, ratio 0.18884487742211756


NameError: name 'random' is not defined

In [None]:
print (len(train_set))

## Models and Approaches

1. Version 1: Multilayer Perceptron Classification
2. Version 2: Convolutional Neural Network Classification
3. Version 3: Convolutional Neural Network with Contrastive Loss
4. Version 4: Convolutional Neural Network with multiple Contrastive Losses
5. Version 5: ConvNeXt<sup>1</sup> with multiple Contrastive Losses

---

<sup>1</sup> Liu, Z., Mao, H., Wu, C. Y., Feichtenhofer, C., Darrell, T., & Xie, S. (2022). A ConvNet for the 2020s. arXiv preprint arXiv:2201.03545.

## Version 1: Multilayer Perceptron Classification

The final architecture and pipeline look like so:

In [3]:
class MLP(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, dropout=0.2):
        super(ExampleMLP, self).__init__()
        self.l1 = nn.Linear(in_dim, hid_dim)
        self.l2 = nn.Linear(hid_dim, hid_dim)
        self.l3 = nn.Linear(hid_dim, hid_dim)
        self.l4 = nn.Linear(hid_dim, out_dim)
        self.relu = nn.LeakyReLU(negative_slope=0.2)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, return_feats=False):
        x = x.flatten(1) # img to vector
        x = self.relu(self.l1(x))
        x = self.dropout(x)
        x = self.relu(self.l2(x))
        x = self.l3(x)
        feat = x
        x = self.relu(x)
        x = self.dropout(x)
        x = self.l4(x)

        if return_feats:
            return x, feat

        return x

## Version 2: Convolutional Neural Network Classification

The final pipeline and architecture look like so:

In [49]:
class ConvBlock(nn.Module):
    def __init__(self, inchannels, outchannels, kernel, stride, padding=0, bias=True):
        super().__init__()        
        self.block = nn.Sequential(
                        nn.Conv2d(
                            inchannels, 
                            outchannels, 
                            kernel_size=kernel, 
                            stride=stride, 
                            padding=padding, 
                            bias=bias
                        ),
                        nn.BatchNorm2d(outchannels),
                        nn.ReLU(inplace=True)
                    )
        
    def forward(self, x):
        return self.block(x)

In [54]:
class ConvNet(nn.Module):
    CHANNELS = [64, 128, 192, 256, 512]
    POOL = (1, 1)

    def __init__(self, in_c, num_classes, dropout=0.2):
        super().__init__()
        layer1 = ConvBlock(in_c, self.CHANNELS[0], kernel=3, stride=2, padding=1, bias=True)
        layer2 = ConvBlock(self.CHANNELS[0], self.CHANNELS[1], kernel=3, stride=2, padding=1, bias=True)
        layer3 = ConvBlock(self.CHANNELS[1], self.CHANNELS[2], kernel=3, stride=2, padding=1, bias=True)
        layer4 = ConvBlock(self.CHANNELS[2], self.CHANNELS[3], kernel=3, stride=2, padding=1, bias=True)
        layer5 = ConvBlock(self.CHANNELS[3], self.CHANNELS[4], kernel=3, stride=2, padding=1, bias=True)
        pool = nn.AdaptiveAvgPool2d(self.POOL)
        self.layers = nn.Sequential(layer1, layer2, layer3, layer4, layer5, pool)

        self.dropout = nn.Dropout(p=dropout)
        self.nn = nn.Sequential(
                    nn.Linear(self.POOL[0] * self.POOL[1] * self.CHANNELS[4], 64),
                    nn.Linear(64, num_classes)
                )

    def forward(self, x, return_feats=False):
        feats = self.layers(x)
        feats = feats.flatten(1)
        x = self.nn(self.dropout(feats))

        if return_feats:
            return x, feats

        return x

In [56]:
x = torch.rand(100, 3, 64, 64)
net = ConvNet(3, 9)
y = net(x)
print (summary(net))

Layer (type:depth-idx)                   Param #
ConvNet                                  --
├─Sequential: 1-1                        --
│    └─ConvBlock: 2-1                    --
│    │    └─Sequential: 3-1              1,920
│    └─ConvBlock: 2-2                    --
│    │    └─Sequential: 3-2              74,112
│    └─ConvBlock: 2-3                    --
│    │    └─Sequential: 3-3              221,760
│    └─ConvBlock: 2-4                    --
│    │    └─Sequential: 3-4              443,136
│    └─ConvBlock: 2-5                    --
│    │    └─Sequential: 3-5              1,181,184
│    └─AdaptiveAvgPool2d: 2-6            --
├─Dropout: 1-2                           --
├─Sequential: 1-3                        --
│    └─Linear: 2-7                       32,832
│    └─Linear: 2-8                       585
Total params: 1,955,529
Trainable params: 1,955,529
Non-trainable params: 0


## Version 3: Convolutional Neural Network with Contrastive Loss

#### Contrastive Loss
We follow the Contrastive Loss from SimCLR<sup>2</sup>:

$$
l_{i, j} = -\log \frac{\text{exp}(\text{sim}(z_i, z_j)/\tau)}{\sum_{2N}^{k=1} \mathbb{1}_{k \neq i} \text{ exp}(\text{sim}(z_i, z_k)/\tau)}
$$

The total loss is the arithmetic mean of the losses for all positive pairs in a batch:

$$
L = \frac{1}{2N} \sum^{N}_{k=1} [l(2k-1, 2k) + l(2k, 2k-1)]
$$

The architecture and pipeline look like so:


---

#### References
<sup>2</sup> Chen, T., Kornblith, S., Norouzi, M., and Hinton, G. A simple framework for contrastive learning of visual representations. In International conference on machine learning, pp. 1597–1607. PMLR, 2020.

In [None]:
class ConvBlock(nn.Module):
    def __init__(self, inchannels, outchannels, kernel, stride, padding=0, bias=True):
        super().__init__()        
        self.block = nn.Sequential(
                        nn.Conv2d(
                            inchannels, 
                            outchannels, 
                            kernel_size=kernel, 
                            stride=stride, 
                            padding=padding, 
                            bias=bias
                        ),
                        nn.BatchNorm2d(outchannels),
                        nn.ReLU(inplace=True)
                    )
        
    def forward(self, x):
        return self.block(x)

In [None]:
class ConvNet(nn.Module):
    CHANNELS = [64, 128, 192, 256, 512]
    POOL = (1, 1)

    def __init__(self, in_c, num_classes, dropout=0.2):
        super().__init__()
        layer1 = ConvBlock(in_c, self.CHANNELS[0], kernel=3, stride=2, padding=1, bias=True)
        layer2 = ConvBlock(self.CHANNELS[0], self.CHANNELS[1], kernel=3, stride=2, padding=1, bias=True)
        layer3 = ConvBlock(self.CHANNELS[1], self.CHANNELS[2], kernel=3, stride=2, padding=1, bias=True)
        layer4 = ConvBlock(self.CHANNELS[2], self.CHANNELS[3], kernel=3, stride=2, padding=1, bias=True)
        layer5 = ConvBlock(self.CHANNELS[3], self.CHANNELS[4], kernel=3, stride=2, padding=1, bias=True)
        pool = nn.AdaptiveAvgPool2d(self.POOL)
        self.layers = nn.Sequential(layer1, layer2, layer3, layer4, layer5, pool)

        self.dropout = nn.Dropout(p=dropout)
        self.nn = nn.Sequential(
                    nn.Linear(self.POOL[0] * self.POOL[1] * self.CHANNELS[4], 64),
                    nn.Linear(64, num_classes)
                )

    def forward(self, x, return_feats=False):
        feats = self.layers(x)
        feats = feats.flatten(1)
        x = self.nn(self.dropout(feats))

        if return_feats:
            return x, feats

        return x

In [2]:
def compute_sim_matrix(feats):
    """
    Takes in a batch of features of size (bs, feat_len).
    """
    sim_matrix = F.cosine_similarity(feats.unsqueeze(2).expand(-1, -1, feats.size(0)),
                                     feats.unsqueeze(2).expand(-1, -1, feats.size(0)).transpose(0, 2),
                                     dim=1)

    return sim_matrix


def compute_target_matrix(labels):
    """
    Takes in a label vector of size (bs)
    """
    label_matrix = labels.unsqueeze(-1).expand((labels.shape[0], labels.shape[0]))
    trans_label_matrix = torch.transpose(label_matrix, 0, 1)
    target_matrix = (label_matrix == trans_label_matrix).type(torch.float)

    return target_matrix


def contrastive_loss(pred_sim_matrix, target_matrix, temperature):
    return F.kl_div(F.softmax(pred_sim_matrix / temperature).log(), F.softmax(target_matrix / temperature),
                    reduction="batchmean", log_target=False)


def compute_contrastive_loss_from_feats(feats, labels, temperature):
    sim_matrix = compute_sim_matrix(feats)
    target_matrix = compute_target_matrix(labels)
    loss = contrastive_loss(sim_matrix, target_matrix, temperature)
    return loss

## Version 4: Convolutional Neural Network with multiple Contrastive Losses

We add two more losses to the Contrastive Loss from Version 3.

#### Loss 2

#### Loss 3

The final architecture and pipeline look like so:

## (BONUS) Version 5: ConvNeXt with multiple Contrastive Losses

<!-- TODO: Talk about ConvNeXt -->

Finally, we train ConvNeXt with the three losses used in Version 4. ConvNeXt (CVPR, 2022) is a "modernised" ConvNet that hopes of competing head-on with Transformers and their success. We believe using ConvNeXt is a timely choice given its recent success at the CVPR 2022 conference and how simple

ConvNeXt is an improvement over the standard ConvNet that brings together innovations from the Transformer<sup>3</sup> and ResNet<sup>4</sup> – the primary work horses in Computer Vision today. Here are the list of enhancements (and their inspiration source) we wish to showcase in this CS4243 project:

1. Block-based architecture design (Transformer & Resnet)
2. Residual Connections between start and end of block (Transformer & ResNet)
3. Wider receptive fields from (3,3) to (7,7) (ResNet)
4. Use of GELU activation instead of ReLU (Transformer)
5. Substituting BatchNorm with LayerNorm (Transformer)

The final architecture and pipeline look like so:



*__Note:__ Before ConvNeXt, we considered the Vision Transformer<sup>5</sup> (ViT; ICLR, 2021) for Version 5 but decided against the move because the leap from vanilla CNN to ViT would have been too huge to explain and justify; furthermore, there is no basis of comparison between them, causing our improvements to be null and void. It would be akin to comparing the efficacy of cutting a cake with a kitchen knife and a chainsaw. ConvNeXt provided the perfect middle ground!*

---

#### References
<sup>3</sup> Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., ... & Polosukhin, I. (2017). Attention is all you need. Advances in neural information processing systems, 30.

<sup>4</sup> He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770-778).

<sup>5</sup> Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., … Houlsby, N. (2020). An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. International Conference on Learning Representations, 2021.

In [4]:
class ConvNeXtBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.conv1 = nn.Conv2d(dim, dim, (7,7), padding=3, groups=dim)
        self.lin1 = nn.Linear(dim, 4 * dim)
        self.lin2 = nn.Linear(4 * dim, dim)
        self.ln = nn.LayerNorm(dim)
        self.gelu = nn.GELU()

    def forward(self, x):
        res_inp = x
        x = self.conv1(x)
        x = x.permute(0, 2, 3, 1) # NCHW -> NHWC
        x = self.ln(x)
        x = self.lin1(x)
        x = self.lin2(x)
        x = self.gelu(x)
        x = x.permute(0, 3, 1, 2) # NHWC -> NCHW
        out = x + res_inp

        return out

We believe __2__ ConvNeXt blocks are comparable in terms of model size (number of parameters) and depth to the vanilla ConvNet used in Version 3 and 4. 

In [45]:
class ConvNeXt(nn.Module):
    def __init__(self, in_channels, classes, block_dims=[192, 384]):
        super().__init__()
        self.blocks = nn.Sequential(
                        nn.Conv2d(in_channels, block_dims[0], kernel_size=2, stride=2),
                        ConvNeXtBlock(block_dims[0]),
                        nn.Conv2d(block_dims[0], block_dims[1], kernel_size=2, stride=2),
                        ConvNeXtBlock(block_dims[1]),
                    )
        self.block_dims = block_dims
        self.project = nn.Linear(block_dims[-1], classes)

    def forward(self, x, return_feats=False):
        feats = self.blocks(x)
        x = feats.view(-1, self.block_dims[-1], 16*16).mean(2)
        out = self.project(x)

        return out, feats if return_feats else out

In [48]:
x = torch.rand(100, 3, 64, 64)
net = ConvNeXt(3, 9)
y, _ = net(x)
print (summary(net))

Layer (type:depth-idx)                   Param #
ConvNeXt                                 --
├─Sequential: 1-1                        --
│    └─Conv2d: 2-1                       2,496
│    └─ConvNeXtBlock: 2-2                --
│    │    └─Conv2d: 3-1                  9,600
│    │    └─Linear: 3-2                  148,224
│    │    └─Linear: 3-3                  147,648
│    │    └─LayerNorm: 3-4               384
│    │    └─GELU: 3-5                    --
│    └─Conv2d: 2-3                       295,296
│    └─ConvNeXtBlock: 2-4                --
│    │    └─Conv2d: 3-6                  19,200
│    │    └─Linear: 3-7                  591,360
│    │    └─Linear: 3-8                  590,208
│    │    └─LayerNorm: 3-9               768
│    │    └─GELU: 3-10                   --
├─Linear: 1-2                            3,465
Total params: 1,808,649
Trainable params: 1,808,649
Non-trainable params: 0


## Metrics

While the quality of real-life images returned by the model for a given doodle is subjective, we use classification accuracy

## Results and Evaluation

## Analysis and Ablations

### t-SNE
We wish to analyse how the MLP generates embeddings for doodles and real images respectively. We use the $t$-Stochastic Neighbour Embeddings (t-SNE) method to visualise the output embeddings on the validation set in 2D space.

In [None]:
val_loader = DataLoader(val_set, batch_size=256, shuffle=False, drop_last=True)

doodle_model = DoodleMLP(doodle_size * doodle_size, 128, NUM_CLASSES, dropout=0.2)
real_model = RealMLP(real_size * real_size * 3, 512, NUM_CLASSES, dropout=0.2)
model1 = load_model_dic(doodle_model, "mlp_trained/14_model1.pt")
model2 = load_model_dic(real_model, "mlp_trained/14_model2.pt")

model1.eval(), model1.eval()
running_loss1 = 0.0
running_loss2 = 0.0

for i, (x1, y1, x2, y2) in enumerate(val_loader):
    pred1, feats1 = model1(x1, return_feats=True)
    pred2, feats2 = model2(x2, return_feats=True)
    running_loss1 += compute_accuracy(pred1, y1)
    running_loss2 += compute_accuracy(pred2, y2)
avg_loss1 = running_loss1 / (i + 1)
avg_loss2 = running_loss2 / (i + 1)
print(f"Doodle acc: {avg_loss1:.4f}, Real acc: {avg_loss2:.4f}")

xs1 = []
xs2 = []
fs1 = []
fs2 = []
ys1 = []
ys2 = []

with torch.no_grad():
    for i, (x1, y1, x2, y2) in enumerate(val_loader):
        pred1, feats1 = model1(x1, return_feats=True)
        pred2, feats2 = model2(x2, return_feats=True)
        xs1.append(x1)
        xs2.append(x2)
        fs1.append(feats1)
        fs2.append(feats2)
        ys1.append(y1)
        ys2.append(y2)
data1 = torch.cat(xs1).numpy()
data2 = torch.cat(xs2).numpy()
feats1 = torch.cat(fs1).numpy()
labels1 = torch.cat(ys1).numpy()
feats2 = torch.cat(fs2).numpy()
labels2 = torch.cat(ys2).numpy()

def plot_tsne(feats, labels, pc=0, size=1, alpha=1):
    if pc > 0:
        feats = PCA(n_components=pc).fit_transform(feats)
    c = TSNE(n_components=2).fit_transform(feats)
    CLASSES = ["airplane", "car", "bird", "cat", "dog", "frog", "horse", "ship", "truck"]
    class_to_idx = {k: i for i, k in enumerate(CLASSES)}
    colors = cm.rainbow(np.linspace(0, 1, len(CLASSES)))
    plt.title("TSNE of MLP embeddings")
    for clas, color in zip(CLASSES, colors):
        idx = np.where(labels == class_to_idx[clas])[0]
        plt.scatter(c[idx][:,0], c[idx][:,1], label=clas, s=size, alpha=alpha, color=color)
    plt.legend(loc='best', markerscale=2)
    plt.show()

### GradCAM on ConvNet and ConvNeXt

This project's major contributions lie in the CNNs trained above. As with MLPs and t-SNE, we complement all our convolutional models (Version 2, 3, 4, 5) with GradCAM, a method to visualise where the convolutional model is focusing on. It looks out for the most salient features in the image and predicts a class majorly influenced by said features. 

In [None]:
def get_CAM(feature_map, weight, class_idx):
    size_upsample = (32, 32)
    bz, nc, h, w = feature_map.shape

    before_dot = feature_map.reshape((nc, h*w))
    cam = weight[class_idx].unsqueeze(0) @ before_dot

    cam = cam.squeeze(0)
    cam = cam.reshape(h, w)
    cam = cam - torch.min(cam)
    cam = cam / torch.max(cam)
    cam = torch.clip(cam, 0, 1)
    
    img = transforms.Resize(size_upsample)(cam.unsqueeze(0))
    
    return img.detach().numpy(), cam

In [None]:
cifar10_classes = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]

def plot_cam(img, cam):
    ''' Visualization function '''
    img = img.permute(1, 2, 0)
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,7))
    ax1.imshow(img)
    ax1.set_title(f"Input image\nLabel: {cifar10_classes[y]}")

    ax2.imshow(cam.reshape(32, 32), cmap="jet")
    ax2.set_title("Raw CAM.")

    ax3.imshow(img)
    ax3.imshow(cam.reshape(32, 32), cmap="jet", alpha=0.2)
    ax3.set_title(f"Overlayed CAM.\nPrediction: {cifar10_classes[idx[0]]}")
    plt.show()

In [None]:
rand_idx = torch.randint(0, 10000, size=[1]) # pick a random index from the test set

x = val_[rand_idx][0] # test image
y = cifar_test[rand_idx][1] # associated test label

model.eval()
scores = model(x.unsqueeze(0)) # get the soft labels
probs = scores.data.squeeze()
probs, idx = probs.sort(0, True)

print('true class: ', cifar10_classes[y])
print('predicated class: ', cifar10_classes[idx[0]])

params = list(model.fc.parameters())
weight = params[0].data

feature_maps = model.conv(x.unsqueeze(0))
heatmap, _ = get_CAM(feature_maps, weight, idx[0])
    
plot_cam(x, heatmap)

## Test-driving the Similarity Engine