In [1]:
import itertools
import json
import math
import os
import time

import jpeg4py
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torchvision
from albumentations import (Compose, CenterCrop, VerticalFlip, RandomSizedCrop,
                            HorizontalFlip, HueSaturationValue, ShiftScaleRotate,
                            Resize, RandomCrop, Normalize, Rotate)
from albumentations.pytorch import ToTensor
from PIL import Image
from sklearn.model_selection import train_test_split
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from torchvision import models
from tqdm import tqdm, trange

%matplotlib inline

# Run on the first CUDA device when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Let cuDNN benchmark convolution algorithms; the input size is fixed (h x w),
# so the choice made on the first batches stays valid.
torch.backends.cudnn.benchmark = True

print(device)

# used constants

num_epochs = 10    # number of passes over the training loader
batch_size = 128   # samples per DataLoader batch
h = 224            # image height after Resize
w = 224            # image width after Resize
thresh = 128       # a word must occur MORE than this many times to enter the vocabulary

cuda:0


In [2]:
# Image pipelines, keyed by split. All three splits currently share the same
# deterministic steps (resize -> normalize -> tensor); each split still gets
# its own Compose instance so they can diverge later (e.g. train-only
# augmentation) without touching the others.
transforms = {
    split: Compose([
        Resize(h, w),
        Normalize(),
        ToTensor()
    ])
    for split in ('train', 'val', 'test')
}

# Pretrained GoogLeNet used purely as a fixed image-feature extractor.
# The last two child modules are dropped (presumably dropout + the final fc
# classifier - confirm against the installed torchvision version) so the
# network ends at the pooled feature map.
# NOTE(review): wrapping the children in nn.Sequential bypasses GoogLeNet's
# own forward(), so any input pre-processing done there (e.g. transform_input)
# is not applied - confirm this matches what the pretrained weights expect.
cnn = models.googlenet(pretrained=True)
cnn = nn.Sequential(*list(cnn.children())[:-2])
cnn = cnn.to(device)
cnn.eval()

# The optimizer never sees these weights, so make that explicit: with
# requires_grad left on, every loss.backward() in training would still
# back-propagate through the whole backbone and accumulate unused gradients.
for _backbone_param in cnn.parameters():
    _backbone_param.requires_grad_(False)
del _backbone_param

summary(cnn, (3, h, w))

# Number of features per image produced by `cnn`; downstream code reshapes
# the backbone output to (-1, img_embed_size).
img_embed_size = 1024

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
       BasicConv2d-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
       BasicConv2d-7           [-1, 64, 56, 56]               0
            Conv2d-8          [-1, 192, 56, 56]         110,592
       BatchNorm2d-9          [-1, 192, 56, 56]             384
      BasicConv2d-10          [-1, 192, 56, 56]               0
        MaxPool2d-11          [-1, 192, 28, 28]               0
           Conv2d-12           [-1, 64, 28, 28]          12,288
      BatchNorm2d-13           [-1, 64, 28, 28]             128
      BasicConv2d-14           [-1, 64,

In [3]:
# annotations preprocessing
#
# Loads the training annotations and builds `ans_dict`, a mapping from each
# distinct 'multiple_choice_answer' string to a class index (indices follow
# the sorted order returned by np.unique). `n_ans` is the number of classes.

with open('VQAv2/train/annotations/v2_mscoco_train2014_annotations.json', 'r') as file:
    annotations = json.load(file)

# One answer string per annotation, de-duplicated and sorted in a single step.
ans_list = np.unique(
    [entry['multiple_choice_answer'] for entry in tqdm(annotations['annotations'])]
)
ans_dict = dict(zip(ans_list, range(len(ans_list))))

n_ans = len(ans_list)
print(n_ans)
del ans_list

100%|██████████| 443757/443757 [00:00<00:00, 2315197.08it/s]


22531


In [4]:
# questions preprocessing
#
# Encodes every training question as a binary bag-of-words row:
#   1. tokenize (lower-case, drop a trailing '?', split on single spaces);
#   2. count word occurrences over all questions;
#   3. keep words occurring more than `thresh` times, numbered in order of
#      first appearance;
#   4. fill a (n_questions, vocab_size) boolean matrix.
# Afterwards `questions` is that matrix and `q_embed_size` its width.

count = {}
bag_of_words = {}

with open('VQAv2/train/questions/v2_OpenEnded_mscoco_train2014_questions.json', 'r') as file:
    questions = json.load(file)


def _tokenize_question(text):
    """Lower-case `text`, strip one trailing '?' if there is one, split on ' '."""
    text = text.lower()
    # Only drop the final character when it really is the question mark;
    # an unconditional [:-1] would eat the last letter of unterminated questions.
    if text.endswith('?'):
        text = text[:-1]
    return text.split(' ')


questions = [_tokenize_question(q['question']) for q in tqdm(questions['questions'])]

# Pass 1: corpus-wide word frequencies.
for question in tqdm(questions):
    for word in question:
        count[word] = count.get(word, 0) + 1

# Pass 2: vocabulary = sufficiently frequent words, indexed by first appearance.
for question in tqdm(questions):
    for word in question:
        if count[word] > thresh and word not in bag_of_words:
            bag_of_words[word] = len(bag_of_words)

# Pass 3: write straight into one pre-allocated boolean matrix. Building a
# float64 vector per question and stacking them afterwards needs roughly
# 16x the memory of the final result (8 bytes per cell, held twice).
encoded = np.zeros((len(questions), len(bag_of_words)), dtype=bool)
for row, question in enumerate(tqdm(questions)):
    for word in question:
        col = bag_of_words.get(word)
        if col is not None:
            encoded[row, col] = True

questions = encoded
del encoded

q_embed_size = questions.shape[1]
print(q_embed_size)
# NOTE(review): the vocabulary is discarded here, so other splits cannot be
# encoded consistently later without re-running this cell - confirm intended.
del bag_of_words

100%|██████████| 443757/443757 [00:01<00:00, 422137.75it/s]
100%|██████████| 443757/443757 [00:00<00:00, 606148.91it/s]
100%|██████████| 443757/443757 [00:00<00:00, 787606.07it/s]
100%|██████████| 443757/443757 [00:02<00:00, 159213.96it/s]


1181


In [5]:
class VQA(Dataset):
    """Dataset of (image, question vector[, answer index]) samples.

    Sample ``idx`` pairs ``questions[idx]`` with
    ``annotations['annotations'][idx]``; this assumes both were produced in
    the same order - TODO confirm against the question/annotation files.

    Args:
        questions: indexable collection of encoded questions, one per sample
            (a 2-D array in this notebook).
        annotations: parsed annotation JSON. Each entry of its
            ``'annotations'`` list must provide ``'image_id'`` and, in
            training mode, ``'multiple_choice_answer'``.
        ans_dict: mapping from answer string to class index. Required when
            ``train`` is true, ignored otherwise.
        images_dir: where the JPEGs live. If it names an existing directory,
            a path separator is appended when missing; otherwise it is used
            verbatim as a filename prefix (the historical behaviour).
        transfrom: (sic - misspelled name kept for backward compatibility)
            either a dict with ``'train'`` / ``'val'`` pipelines or a single
            pipeline, called as ``pipeline(image=...)['image']``.
        train: when true, items are ``(image, question, answer)``; otherwise
            ``(image, question)``.

    Raises:
        ValueError: if ``train`` is true and ``ans_dict`` is ``None``.
    """

    def __init__(self, questions, annotations, ans_dict=None,
                 images_dir='VQA/train/train2014', transfrom=transforms, train=True):
        # Fail here rather than with an opaque TypeError inside a loader worker.
        if train and ans_dict is None:
            raise ValueError('ans_dict is required when train=True')

        self.questions = questions
        self.annotations = annotations
        self.images_dir = images_dir
        self.train = bool(train)
        # Always defined so the attribute can be inspected in either mode.
        self.ans_dict = ans_dict if self.train else None

        # Honour the pipeline(s) passed by the caller instead of always
        # reading the module-level `transforms`.
        split = 'train' if self.train else 'val'
        if isinstance(transfrom, dict):
            self.transform = transfrom[split]
        else:
            self.transform = transfrom

        # A real directory gets a trailing separator so that
        # "<dir>/<file>" is produced; anything else is kept as a raw prefix
        # so callers relying on plain concatenation are unaffected.
        if os.path.isdir(images_dir):
            self._image_prefix = os.path.join(images_dir, '')
        else:
            self._image_prefix = images_dir

    def __len__(self):
        """Number of samples, i.e. the number of encoded questions."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Load, decode and transform sample ``idx``."""
        annotation = self.annotations['annotations'][idx]

        # NOTE(review): the 'COCO_train2014_' filename stem is used in both
        # modes; a validation/test split would presumably need a different
        # stem - confirm before using train=False on other image sets.
        image_id = str(annotation['image_id'])
        image_path = (self._image_prefix +
                      'COCO_train2014_000000' +
                      image_id.zfill(6) + '.jpg')
        image = jpeg4py.JPEG(image_path).decode()

        image = self.transform(image=image)['image']
        # Explicit float32 conversion: works for boolean/integer rows without
        # depending on how the legacy FloatTensor constructor treats them.
        question = torch.from_numpy(np.asarray(self.questions[idx], dtype=np.float32))

        if self.train:
            answer = self.ans_dict[annotation['multiple_choice_answer']]
            return (image, question, answer)
        return (image, question)

In [6]:
# Training samples (default image location, training-mode transforms) and a
# shuffling loader that decodes images in 8 worker processes.
train_dataset = VQA(questions=questions, annotations=annotations, ans_dict=ans_dict)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
)

In [7]:
def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``.

    Minutes are the floor of ``s / 60``; the leftover seconds are truncated
    to an integer by the ``%d`` conversion.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a task started at `since`.

    Args:
        since: start timestamp, as returned by ``time.time()``.
        percent: fraction of the work already done (must be non-zero);
            the total duration is extrapolated linearly from it.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [8]:
# Linear classifier over the concatenation [image embedding | question vector].
# The input width is derived from img_embed_size rather than a literal 1024 so
# it cannot drift from the size used to reshape the backbone output.
model = nn.Sequential(nn.Linear(img_embed_size + q_embed_size, n_ans)).to(device)

# Loss over the n_ans answer classes (expects raw logits and class indices).
error = nn.CrossEntropyLoss()

learning_rate = 0.002

# Only the classifier's parameters are optimized.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Halve the learning rate every 2 scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

In [9]:
def train(epoch, print_every = 100):
    """Run one pass over ``train_loader`` and update ``model``.

    Uses the module-level ``model``, ``cnn``, ``optimizer``, ``error``,
    ``train_loader``, ``device``, ``img_embed_size`` and ``batch_size``.

    Args:
        epoch: index of the current epoch. Not used inside the function;
            kept so existing callers keep working.
        print_every: number of batches between progress lines. Each line
            reports the mean loss and accuracy over the batches since the
            previous line.

    Returns:
        None. Progress is reported via ``print``.
    """
    model.train()

    start = time.time()
    n_batches = len(train_loader)
    print_loss_total = 0
    correct = 0.0
    total = 0.0

    for i, (imgs, qs, anss) in enumerate(train_loader):
        # non_blocking lets the copy overlap with compute when the source
        # batch is in pinned memory; it is a plain copy otherwise.
        imgs = imgs.to(device, non_blocking=True)
        qs = qs.to(device, non_blocking=True)
        anss = anss.to(device, non_blocking=True)

        optimizer.zero_grad()

        # The backbone is a fixed feature extractor (it is not in the
        # optimizer), so skip autograd bookkeeping for it: no graph is kept
        # and backward() stops at the classifier input.
        with torch.no_grad():
            img_embeds = cnn(imgs).reshape((-1, img_embed_size))
        txt_embeds = qs
        inputs = torch.cat((img_embeds, txt_embeds), 1)

        outputs = model(inputs)

        loss = error(outputs, anss)

        loss.backward()

        optimizer.step()

        # Running accuracy for the current reporting window.
        _, predicted = torch.max(outputs.detach(), 1)
        total += anss.size(0)
        correct += (predicted == anss).sum().item()

        print_loss_total += loss.item()

        if (i + 1) % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            acc = correct / total
            # Reset the window so the next line reports fresh statistics.
            correct = 0.0
            total = 0.0
            print_loss_total = 0
            done = (i + 1) / n_batches
            print('%s (%d iters, %d%%) Loss: %.4f Accuracy: %.4f' % (timeSince(start, done),
                                        (i + 1) * batch_size, done * 100, print_loss_avg,
                                        acc))

In [10]:
# Main training loop: one call to train() per epoch, then one scheduler step
# so the learning-rate schedule advances once per epoch (after the optimizer
# updates of that epoch, not before).
for epoch in range(num_epochs): 
    train(epoch)
    lr_scheduler.step()
    # Evaluation is disabled: neither `evaluate` nor `test_loader` is defined
    # in the visible part of this notebook.
    # evaluate(test_loader)

0m 58s (- 33m 2s) (12800 iters, 2%) Loss: 5.9614 Accuracy: 0.1836
1m 50s (- 30m 13s) (25600 iters, 5%) Loss: 5.0624 Accuracy: 0.2048
2m 41s (- 28m 25s) (38400 iters, 8%) Loss: 4.8341 Accuracy: 0.2244
3m 32s (- 27m 11s) (51200 iters, 11%) Loss: 4.7321 Accuracy: 0.2416
4m 23s (- 26m 3s) (64000 iters, 14%) Loss: 4.5184 Accuracy: 0.2541
5m 14s (- 25m 2s) (76800 iters, 17%) Loss: 4.4664 Accuracy: 0.2640
6m 5s (- 24m 3s) (89600 iters, 20%) Loss: 4.4002 Accuracy: 0.2718
6m 55s (- 23m 6s) (102400 iters, 23%) Loss: 4.4164 Accuracy: 0.2740
7m 46s (- 22m 10s) (115200 iters, 25%) Loss: 4.2029 Accuracy: 0.2870
8m 37s (- 21m 16s) (128000 iters, 28%) Loss: 4.1844 Accuracy: 0.2933
9m 27s (- 20m 22s) (140800 iters, 31%) Loss: 4.2707 Accuracy: 0.2845
10m 20s (- 19m 33s) (153600 iters, 34%) Loss: 4.1617 Accuracy: 0.2882
11m 11s (- 18m 39s) (166400 iters, 37%) Loss: 4.1404 Accuracy: 0.2960
12m 2s (- 17m 46s) (179200 iters, 40%) Loss: 4.0927 Accuracy: 0.2922
12m 53s (- 16m 53s) (192000 iters, 43%) Loss: 4.

OSError: [Errno 12] Cannot allocate memory