# Preparing

In [1]:
# Clone Relational Networks repo
!git clone https://github.com/kimhc6028/relational-networks.git

# Create dataset
!python ./relational-networks/sort_of_clevr_generator.py

Cloning into 'relational-networks'...
remote: Enumerating objects: 293, done.[K
remote: Total 293 (delta 0), reused 0 (delta 0), pack-reused 293[K
Receiving objects: 100% (293/293), 57.88 MiB | 26.64 MiB/s, done.
Resolving deltas: 100% (170/170), done.
building test datasets...
building train datasets...
saving datasets...
datasets saved at ./data/sort-of-clevr.pickle


In [2]:
import os
import pickle
import random
import time
import csv

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.cuda.amp import autocast as autocast
from torchsummary import summary

# Relation Networks
Relation Networks are artificial neural networks that perform relational reasoning. Similar to the way neurons are connected in the brain, neural networks use small programs to collaboratively discover patterns in data, with specific architectures for image processing, grammar analysis, or game learning. In this application, the new "relational networks" compares each pair of targets in a scene individually.

RN operates on **objects**, such as images or natural language. So in order to use RN to solve CLEVR tasks, we need to turn images and questions into **objects** that have specific form first.

## ConvInputModel
This part of RN(Relation Networks) acts as a transformer, which turns a RGB image into an object that can be processed by the backbone of RN.

The original CNN model uses 4 convolutional layers each with 24 kernels, ReLU non-linearities, and batch normalization. It turns an image in size of 3x75x75 into a feature map in size of 24x5x5. It is a pretty simple CNN. Considering that Sort-of-CLEVR is simplified version of CLEVR, this 4-layer CNN can already solve this problem to some extent.

In order to improve the performance by only change the CNN part, I refer to the block of Resnet. The schematic diagram of the block structure is as follows.

![Redidual Block](https://d2l.ai/_images/resnet-block.svg)

The output size of the modified CNN is 96x5x5.

In [4]:
class ResBlock(nn.Module):
    def __init__(self, inchannel, outchannel, stride=1):
        super(ResBlock, self).__init__()
        self.left = nn.Sequential(
            nn.Conv2d(inchannel, outchannel, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(outchannel),
            nn.ReLU(inplace=True),
            nn.Conv2d(outchannel, outchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(outchannel)
        )
        self.shortcut = nn.Sequential()
        if stride != 1 or inchannel != outchannel:
            self.shortcut = nn.Sequential(
                nn.Conv2d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(outchannel)
            )
            
    def forward(self, x):
        out = self.left(x)
        out = out + self.shortcut(x)
        out = F.relu(out)
        
        return out
        

class ConvInputModel(nn.Module):
    def __init__(self):
        super(ConvInputModel, self).__init__()
        self.inchannel = 32
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, self.inchannel, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(self.inchannel),
            nn.ReLU()
        )
        self.layer1 = self.make_layer(ResBlock, 32, 2, stride=1)
        self.layer2 = self.make_layer(ResBlock, 48, 2, stride=2)
        self.layer3 = self.make_layer(ResBlock, 64, 2, stride=2) 
        self.layer4 = self.make_layer(ResBlock, 96, 2, stride=2)
        self.pool = nn.AvgPool2d(2)
    
    def make_layer(self, block, channels, num_blocks, stride):
        strides = [stride] + [1] * (num_blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.inchannel, channels, stride))
            self.inchannel = channels
        return nn.Sequential(*layers)
        
    def forward(self, img):
        x = self.conv1(img)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.pool(x)
        return x


summary(ConvInputModel(), (3, 75, 75), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 75, 75]             864
       BatchNorm2d-2           [-1, 32, 75, 75]              64
              ReLU-3           [-1, 32, 75, 75]               0
            Conv2d-4           [-1, 32, 75, 75]           9,216
       BatchNorm2d-5           [-1, 32, 75, 75]              64
              ReLU-6           [-1, 32, 75, 75]               0
            Conv2d-7           [-1, 32, 75, 75]           9,216
       BatchNorm2d-8           [-1, 32, 75, 75]              64
          ResBlock-9           [-1, 32, 75, 75]               0
           Conv2d-10           [-1, 32, 75, 75]           9,216
      BatchNorm2d-11           [-1, 32, 75, 75]              64
             ReLU-12           [-1, 32, 75, 75]               0
           Conv2d-13           [-1, 32, 75, 75]           9,216
      BatchNorm2d-14           [-1, 32,

## FCOutputModel

This part is a simple Fully Connected Layer. It turns results of backbone of RN into answers.

In [None]:
class FCOutputModel(nn.Module):
    def __init__(self):
        super(FCOutputModel, self).__init__()

        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = self.fc2(x)
        x = F.relu(x)
        x = F.dropout(x)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

## Backbone

The core idea of this network is to treat the feature information proposed by CNN as objects in an image, and then different objects are combined two by two plus the LSTM output features of the problem, connected together to output a so-called relation feature through MLP, and then all the relations are added together to output the result through MLP.

In [None]:
class RN(nn.Module):
    def __init__(self, relation_type='ternary', batch_size=64, lr=0.0001, device='cpu'):
        super().__init__()

        self.conv = ConvInputModel()
        self.conv_size = (96, 5, 5)
        self.device = device
        self.relation_type = relation_type
        
        if self.relation_type == 'ternary':
            ##(number of filters per object+coordinate of object)*3+question vector
            self.g_fc1 = nn.Linear((self.conv_size[0] + 2) * 3 + 18, 256)
        else:
            ##(number of filters per object+coordinate of object)*2+question vector
            self.g_fc1 = nn.Linear((self.conv_size[0] + 2) * 2 + 18, 256)

        self.g_fc2 = nn.Linear(256, 256)
        self.g_fc3 = nn.Linear(256, 256)
        self.g_fc4 = nn.Linear(256, 256)

        self.f_fc1 = nn.Linear(256, 256)

        self.coord_oi = torch.FloatTensor(batch_size, 2).to(self.device)
        self.coord_oj = torch.FloatTensor(batch_size, 2).to(self.device)

        self.coord_oi = Variable(self.coord_oi)
        self.coord_oj = Variable(self.coord_oj)

        # prepare coord tensor
        def cvt_coord(i):
            return [(i / 5 - 2) / 2., (i % 5 - 2) / 2.]
        
        self.coord_tensor = torch.FloatTensor(batch_size, 25, 2).to(self.device)
        self.coord_tensor = Variable(self.coord_tensor)
        np_coord_tensor = np.zeros((batch_size, 25, 2))
        for i in range(25):
            np_coord_tensor[:, i, :] = np.array(cvt_coord(i))
        self.coord_tensor.data.copy_(torch.from_numpy(np_coord_tensor))

        self.fcout = FCOutputModel()

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, img, qst):
        x = self.conv(img)
        
        """g"""
        mb = x.size()[0]
        n_channels = x.size()[1]
        d = x.size()[2]
        x_flat = x.view(mb,n_channels,d*d).permute(0,2,1)
        
        # add coordinates
        x_flat = torch.cat([x_flat, self.coord_tensor], 2)

        if self.relation_type == 'ternary':
            # add question everywhere
            qst = torch.unsqueeze(qst, 1)
            qst = qst.repeat(1, 25, 1)
            qst = torch.unsqueeze(qst, 1)
            qst = torch.unsqueeze(qst, 1)

            # cast all triples against each other
            x_i = torch.unsqueeze(x_flat, 1)
            x_i = torch.unsqueeze(x_i, 3)
            x_i = x_i.repeat(1, 25, 1, 25, 1)
            
            x_j = torch.unsqueeze(x_flat, 2)
            x_j = torch.unsqueeze(x_j, 2)
            x_j = x_j.repeat(1, 1, 25, 25, 1)

            x_k = torch.unsqueeze(x_flat, 1)
            x_k = torch.unsqueeze(x_k, 1)
            x_k = torch.cat([x_k, qst], 4)
            x_k = x_k.repeat(1, 25, 25, 1, 1)

            # concatenate all together
            x_full = torch.cat([x_i, x_j, x_k], 4)

            # reshape for passing through network
            x_ = x_full.view(mb * (d * d) * (d * d) * (d * d), (self.conv_size[0] + 2) * 3 + 18)
        else:
            # add question everywhere
            qst = torch.unsqueeze(qst, 1)
            qst = qst.repeat(1, 25, 1)
            qst = torch.unsqueeze(qst, 2)

            # cast all pairs against each other
            x_i = torch.unsqueeze(x_flat, 1)
            x_i = x_i.repeat(1, 25, 1, 1)
            x_j = torch.unsqueeze(x_flat, 2)
            x_j = torch.cat([x_j, qst], 3)
            x_j = x_j.repeat(1, 1, 25, 1)
            
            # concatenate all together
            x_full = torch.cat([x_i,x_j],3)
        
            # reshape for passing through network
            x_ = x_full.view(mb * (d * d) * (d * d), (self.conv_size[0] + 2) * 2 + 18)
            
        x_ = self.g_fc1(x_)
        x_ = F.relu(x_)
        x_ = self.g_fc2(x_)
        x_ = F.relu(x_)
        x_ = self.g_fc3(x_)
        x_ = F.relu(x_)
        x_ = self.g_fc4(x_)
        x_ = F.relu(x_)
        
        # reshape again and sum
        if self.relation_type == 'ternary':
            x_g = x_.view(mb, (d * d) * (d * d) * (d * d), 256)
        else:
            x_g = x_.view(mb, (d * d) * (d * d), 256)

        x_g = x_g.sum(1).squeeze()
        
        """f"""
        x_f = self.f_fc1(x_g)
        x_f = F.relu(x_f)
        
        return self.fcout(x_f)

    def train_(self, input_img, input_qst, label):
        self.optimizer.zero_grad()
        if self.device.startswith('cuda'):
            with autocast():
                output = self(input_img, input_qst)
                loss = F.nll_loss(output, label)
        else:
            output = self(input_img, input_qst)
            loss = F.nll_loss(output, label)
        loss.backward()
        self.optimizer.step()
        pred = output.data.max(1)[1]
        correct = pred.eq(label.data).cpu().sum()
        accuracy = correct * 100. / len(label)
        return accuracy, loss
        
    def test_(self, input_img, input_qst, label):
        if self.device.startswith('cuda'):
            with autocast():
                output = self(input_img, input_qst)
                loss = F.nll_loss(output, label)
        else:
            output = self(input_img, input_qst)
            loss = F.nll_loss(output, label)
        pred = output.data.max(1)[1]
        correct = pred.eq(label.data).cpu().sum()
        accuracy = correct * 100. / len(label)
        return accuracy, loss

In a nutshell, the structure of Relation Networks is to first encode a question with LSTM, then superimpose the deep feature with two pairs of spatial cells, then follow by some FC layers and finally softmax classification to a certain answer word.

# Training and testing

In [None]:
bs = 32
epochs = 40
seed = 1
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print('Using device:', device)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

model = RN(batch_size=bs, device=device)

model_dirs = './model'
input_img = torch.FloatTensor(bs, 3, 75, 75)
input_qst = torch.FloatTensor(bs, 18)
label = torch.LongTensor(bs)

model.to(device)
input_img = input_img.to(device)
input_qst = input_qst.to(device)
label = label.to(device)

Using device: cuda:0


In [None]:
def tensor_data(data, i):
    img = torch.from_numpy(np.asarray(data[0][bs*i:bs*(i+1)]))
    qst = torch.from_numpy(np.asarray(data[1][bs*i:bs*(i+1)]))
    ans = torch.from_numpy(np.asarray(data[2][bs*i:bs*(i+1)]))

    input_img.data.resize_(img.size()).copy_(img)
    input_qst.data.resize_(qst.size()).copy_(qst)
    label.data.resize_(ans.size()).copy_(ans)


def cvt_data_axis(data):
    img = [e[0] for e in data]
    qst = [e[1] for e in data]
    ans = [e[2] for e in data]
    return (img,qst,ans)


def train(epoch, ternary, rel, norel):
    model.train()

    if not len(rel[0]) == len(norel[0]):
        print('Not equal length for relation dataset and non-relation dataset.')
        return
    
    random.shuffle(ternary)
    random.shuffle(rel)
    random.shuffle(norel)

    ternary = cvt_data_axis(ternary)
    rel = cvt_data_axis(rel)
    norel = cvt_data_axis(norel)

    acc_ternary = []
    acc_rels = []
    acc_norels = []

    l_ternary = []
    l_binary = []
    l_unary = []

    for batch_idx in range(len(rel[0]) // bs):
        tensor_data(ternary, batch_idx)
        accuracy_ternary, loss_ternary = model.train_(input_img, input_qst, label)
        acc_ternary.append(accuracy_ternary.item())
        l_ternary.append(loss_ternary.item())

        tensor_data(rel, batch_idx)
        accuracy_rel, loss_binary = model.train_(input_img, input_qst, label)
        acc_rels.append(accuracy_rel.item())
        l_binary.append(loss_binary.item())

        tensor_data(norel, batch_idx)
        accuracy_norel, loss_unary = model.train_(input_img, input_qst, label)
        acc_norels.append(accuracy_norel.item())
        l_unary.append(loss_unary.item())

        if batch_idx % 10 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)] '
                  'Ternary accuracy: {:.0f}% | Relations accuracy: {:.0f}% | Non-relations accuracy: {:.0f}%'.format(
                   epoch,
                   batch_idx * bs * 2,
                   len(rel[0]) * 2,
                   100. * batch_idx * bs / len(rel[0]),
                   accuracy_ternary,
                   accuracy_rel,
                   accuracy_norel))
        
    avg_acc_ternary = sum(acc_ternary) / len(acc_ternary)
    avg_acc_binary = sum(acc_rels) / len(acc_rels)
    avg_acc_unary = sum(acc_norels) / len(acc_norels)

    return avg_acc_ternary, avg_acc_binary, avg_acc_unary


def test(epoch, ternary, rel, norel):
    model.eval()

    if not len(rel[0]) == len(norel[0]):
        print('Not equal length for relation dataset and non-relation dataset.')
        return
    
    ternary = cvt_data_axis(ternary)
    rel = cvt_data_axis(rel)
    norel = cvt_data_axis(norel)

    accuracy_ternary = []
    accuracy_rels = []
    accuracy_norels = []

    loss_ternary = []
    loss_binary = []
    loss_unary = []

    for batch_idx in range(len(rel[0]) // bs):
        tensor_data(ternary, batch_idx)
        acc_ter, l_ter = model.test_(input_img, input_qst, label)
        accuracy_ternary.append(acc_ter.item())
        loss_ternary.append(l_ter.item())

        tensor_data(rel, batch_idx)
        acc_bin, l_bin = model.test_(input_img, input_qst, label)
        accuracy_rels.append(acc_bin.item())
        loss_binary.append(l_bin.item())

        tensor_data(norel, batch_idx)
        acc_un, l_un = model.test_(input_img, input_qst, label)
        accuracy_norels.append(acc_un.item())
        loss_unary.append(l_un.item())

    accuracy_ternary = sum(accuracy_ternary) / len(accuracy_ternary)
    accuracy_rel = sum(accuracy_rels) / len(accuracy_rels)
    accuracy_norel = sum(accuracy_norels) / len(accuracy_norels)
    print('\n Test set: Ternary accuracy: {:.0f}% Binary accuracy: {:.0f}% | Unary accuracy: {:.0f}%\n'.format(
        accuracy_ternary, accuracy_rel, accuracy_norel))

    return accuracy_ternary, accuracy_rel, accuracy_norel

    
def load_data():
    print('loading data...')
    dirs = './data'
    filename = os.path.join(dirs, 'sort-of-clevr.pickle')
    with open(filename, 'rb') as f:
        train_datasets, test_datasets = pickle.load(f)
    ternary_train = []
    ternary_test = []
    rel_train = []
    rel_test = []
    norel_train = []
    norel_test = []
    print('processing data...')

    for img, ternary, relations, norelations in train_datasets:
        img = np.swapaxes(img, 0, 2)
        for qst, ans in zip(ternary[0], ternary[1]):
            ternary_train.append((img,qst,ans))
        for qst,ans in zip(relations[0], relations[1]):
            rel_train.append((img,qst,ans))
        for qst,ans in zip(norelations[0], norelations[1]):
            norel_train.append((img,qst,ans))

    for img, ternary, relations, norelations in test_datasets:
        img = np.swapaxes(img, 0, 2)
        for qst, ans in zip(ternary[0], ternary[1]):
            ternary_test.append((img, qst, ans))
        for qst,ans in zip(relations[0], relations[1]):
            rel_test.append((img,qst,ans))
        for qst,ans in zip(norelations[0], norelations[1]):
            norel_test.append((img,qst,ans))
    
    return (ternary_train, ternary_test, rel_train, rel_test, norel_train, norel_test)

In [None]:
ternary_train, ternary_test, rel_train, rel_test, norel_train, norel_test = load_data()

try:
    os.makedirs(model_dirs)
except:
    pass

with open(f'./log.csv', 'w') as log_file:
    csv_writer = csv.writer(log_file, delimiter=',')
    csv_writer.writerow(['epoch', 'train_acc_ternary', 'train_acc_rel',
                     'train_acc_norel', 'train_acc_ternary', 'test_acc_rel', 'test_acc_norel'])

    print(f"Training RN model...")
    for epoch in range(1, epochs + 1):
        train_acc_ternary, train_acc_binary, train_acc_unary = train(
            epoch, ternary_train, rel_train, norel_train)
        torch.cuda.empty_cache()
        torch.cuda.empty_cache()
        time.sleep(2)

        test_acc_ternary, test_acc_binary, test_acc_unary = test(
            epoch, ternary_test, rel_test, norel_test)
        torch.cuda.empty_cache()
        torch.cuda.empty_cache()

        csv_writer.writerow([epoch, train_acc_ternary, train_acc_binary,
                            train_acc_unary, test_acc_ternary, test_acc_binary, test_acc_unary])

loading data...
processing data...
Training RN model...

 Test set: Ternary accuracy: 48% Binary accuracy: 43% | Unary accuracy: 51%


 Test set: Ternary accuracy: 52% Binary accuracy: 42% | Unary accuracy: 49%


 Test set: Ternary accuracy: 29% Binary accuracy: 32% | Unary accuracy: 47%


 Test set: Ternary accuracy: 53% Binary accuracy: 44% | Unary accuracy: 50%


 Test set: Ternary accuracy: 13% Binary accuracy: 10% | Unary accuracy: 0%


 Test set: Ternary accuracy: 54% Binary accuracy: 43% | Unary accuracy: 55%


 Test set: Ternary accuracy: 53% Binary accuracy: 42% | Unary accuracy: 60%


 Test set: Ternary accuracy: 54% Binary accuracy: 44% | Unary accuracy: 63%


 Test set: Ternary accuracy: 54% Binary accuracy: 66% | Unary accuracy: 71%


 Test set: Ternary accuracy: 52% Binary accuracy: 74% | Unary accuracy: 69%


 Test set: Ternary accuracy: 55% Binary accuracy: 75% | Unary accuracy: 70%


 Test set: Ternary accuracy: 51% Binary accuracy: 66% | Unary accuracy: 69%


 Test se

# Conclusion

By replacing the simple CNN network in the original model with a feature extraction network composed of blocks of Resnet, the accuracy of Relations improved to 91.6% and the accuracy of Non-relations improved to 99.1% after training 35 epochs.