# Get Data

In [1]:
import os

if not os.path.exists("./datasets"):
  !mkdir -p "/content/datasets/"
  !mkdir -p "/content/datasets/Images/"

  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/vqa/Annotations_Train_abstract_v002.zip
  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/vqa/Annotations_Val_abstract_v002.zip
  !unzip Annotations_Train_abstract_v002.zip -d ./datasets/Annotations
  !unzip Annotations_Val_abstract_v002.zip -d ./datasets/Annotations

  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/vqa/Questions_Train_abstract_v002.zip
  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/vqa/Questions_Val_abstract_v002.zip
  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/vqa/Questions_Test_abstract_v002.zip
  !unzip Questions_Train_abstract_v002.zip -d ./datasets/Questions
  !unzip Questions_Val_abstract_v002.zip -d ./datasets/Questions
  !unzip Questions_Test_abstract_v002.zip -d ./datasets/Questions

  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/scene_img/scene_img_abstract_v002_train2015.zip
  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/scene_img/scene_img_abstract_v002_val2015.zip
  !wget https://s3.amazonaws.com/cvmlp/vqa/abstract_v002/scene_img/scene_img_abstract_v002_test2015.zip
  !unzip scene_img_abstract_v002_train2015.zip -d ./datasets/Images/train
  !unzip scene_img_abstract_v002_val2015.zip -d ./datasets/Images/val
  !unzip scene_img_abstract_v002_test2015.zip -d ./datasets/Images/test
  !rm -rf *zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045000.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045001.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045002.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045003.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045004.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045005.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045006.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045007.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045008.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045009.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045010.png  
  inflating: ./datasets/Images/test/abstract_v002_test2015_000000045011.png  

# Data Augmentation

## Flip Images

In [18]:
import PIL
from PIL import Image
from tqdm.notebook import tqdm

og_img_dir = './datasets/Images/train'
FLIP_ID_OFFSET = 20000  # a flipped copy is saved under (original id + this offset)

# Select only the original images: the trailing 5 digits of the file name are
# read as the image id, and ids at or above the offset are copies written by a
# previous run of this cell. Skipping them makes the cell safe to re-run.
source_names = [
    name for name in os.listdir(og_img_dir)
    if name.endswith('.png') and int(name[-9:-4]) < FLIP_ID_OFFSET
]

for name in tqdm(source_names):
  # Same naming scheme as before: keep the prefix, replace the last 5 digits.
  flipped_name = f'{name[:-9]}{int(name[-9:-4])+FLIP_ID_OFFSET}.png'
  # Explicit paths instead of os.chdir, so a failure mid-loop cannot leave the
  # kernel in the wrong working directory; `with` closes each source file.
  with Image.open(os.path.join(og_img_dir, name)) as src:
    src.transpose(Image.FLIP_LEFT_RIGHT).save(os.path.join(og_img_dir, flipped_name))

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




## Annotations


In [19]:
import json

annotation_path = './datasets/Annotations/abstract_v002_train2015_annotations.json'
IMAGE_ID_OFFSET = 20000      # same shift the flip cell applies to image file ids
QUESTION_ID_OFFSET = 200000  # presumably above every original question id -- TODO confirm

# Load the whole document so top-level keys other than 'annotations' are
# written back unchanged instead of being dropped.
with open(annotation_path) as f:
  info = json.load(f)

x = info['annotations']

if any(a['image_id'] >= IMAGE_ID_OFFSET for a in x):
  # Entries for flipped images already exist: a second pass would duplicate
  # them again with ids that match no image on disk.
  print('Annotations already augmented; leaving', annotation_path, 'unchanged.')
else:
  x2 = []
  for a in tqdm(x):
      # Shallow copy is enough: only the two integer id fields are changed.
      a2 = a.copy()
      a2['image_id'] += IMAGE_ID_OFFSET
      a2['question_id'] += QUESTION_ID_OFFSET
      x2.append(a2)
  x += x2  # extends info['annotations'] in place

  with open(annotation_path, 'w') as d:
    json.dump(info, d)

HBox(children=(FloatProgress(value=0.0, max=60000.0), HTML(value='')))




## Questions

In [22]:
question_path = './datasets/Questions/OpenEnded_abstract_v002_train2015_questions.json'
IMAGE_ID_OFFSET = 20000      # same shift the flip cell applies to image file ids
QUESTION_ID_OFFSET = 200000  # presumably above every original question id -- TODO confirm

# Load the whole document so top-level keys other than 'questions' are
# written back unchanged instead of being dropped.
with open(question_path) as f:
    info = json.load(f)

x = info['questions']

if any(q['image_id'] >= IMAGE_ID_OFFSET for q in x):
    # Entries for flipped images already exist: a second pass would duplicate
    # them again with ids that match no image on disk.
    print('Questions already augmented; leaving', question_path, 'unchanged.')
else:
    x2 = []
    for q in tqdm(x):
        # Shallow copy is enough: only the two integer id fields are changed.
        q2 = q.copy()
        q2['image_id'] += IMAGE_ID_OFFSET
        q2['question_id'] += QUESTION_ID_OFFSET
        x2.append(q2)
    x += x2  # extends info['questions'] in place

    with open(question_path, 'w') as d:
        json.dump(info, d)

HBox(children=(FloatProgress(value=0.0, max=60000.0), HTML(value='')))




# Preprocess Data

In [23]:
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from tqdm.notebook import tqdm

!pip install vit-pytorch

if not os.path.exists("./data_loader.py"):
  !wget https://raw.githubusercontent.com/stran123/basic_vqa/master/data_loader.py

if not os.path.exists("./models.py"):
  !wget https://raw.githubusercontent.com/stran123/basic_vqa/master/models.py

if not os.path.exists("./utils"):
  !mkdir -p "/content/utils"
  !wget https://raw.githubusercontent.com/stran123/basic_vqa/master/utils/text_helper.py -O ./utils/text_helper.py
  !wget https://raw.githubusercontent.com/stran123/basic_vqa/master/utils/resize_images.py -O ./utils/resize_images.py
  !wget https://raw.githubusercontent.com/stran123/basic_vqa/master/utils/make_vacabs_for_questions_answers.py -O ./utils/make_vacabs_for_questions_answers.py
  !wget https://raw.githubusercontent.com/stran123/basic_vqa/master/utils/build_vqa_inputs.py -O ./utils/build_vqa_inputs.py

from data_loader import get_loader
from models import VqaModel



In [31]:
os.chdir("./utils")
if not os.path.exists("../datasets/Resized_Images"):
  !python resize_images.py --input_dir='../datasets/Images' --output_dir='../datasets/Resized_Images'  
  !python make_vacabs_for_questions_answers.py --input_dir='../datasets'
  !python build_vqa_inputs.py --input_dir='../datasets' --output_dir='../datasets'
os.chdir("../datasets/Resized_Images")
if os.path.exists("./test") and not os.path.exists("./test2015"):
  !mv test test2015
if os.path.exists("./train") and not os.path.exists("./train2015"):
  !mv train train2015
if os.path.exists("./val") and not os.path.exists("./val2015"):
  !mv val val2015
os.chdir("../..")

[1000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[2000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[3000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[4000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[5000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[6000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[7000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[8000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[9000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[10000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[11000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[12000/40000] Resized the images and saved into '../datasets/Resized_Images/train'.
[

# Model arguments

In [36]:
class Args:
  """Plain attribute container for the run configuration.

  Every constructor argument is stored unchanged on the instance under the
  same name; no validation is performed.
  """

  def __init__(
      self,
      input_dir='./datasets',
      output_dir='./output',
      max_qst_length=30,
      max_num_ans=10,
      embed_size=1024,
      word_embed_size=300,
      num_layers=2,
      hidden_size=512,
      learning_rate=1e-3,
      step_size=10,
      gamma=0.1,
      num_epochs=30,
      batch_size=128,
      num_workers=4,
      save_step=1,
      use_qst_encoder_type='lstm',
      use_img_encoder_type='vggnet',
      patch_size=28,
  ):
    # Data location and where results are written.
    self.input_dir, self.output_dir = input_dir, output_dir

    # Limits applied to questions and answers.
    self.max_qst_length, self.max_num_ans = max_qst_length, max_num_ans

    # Network dimensions.
    self.embed_size, self.word_embed_size = embed_size, word_embed_size
    self.num_layers, self.hidden_size = num_layers, hidden_size

    # Optimiser and StepLR schedule settings.
    self.learning_rate, self.step_size, self.gamma = learning_rate, step_size, gamma

    # Training-loop settings.
    self.num_epochs, self.batch_size = num_epochs, batch_size
    self.num_workers, self.save_step = num_workers, save_step

    # Encoder selection. Options noted by the author:
    #   question encoder: rnn, lstm
    #   image encoder:    vit, transformer, vggnet
    self.use_qst_encoder_type = use_qst_encoder_type
    self.use_img_encoder_type = use_img_encoder_type
    self.patch_size = patch_size

In [37]:
# Configuration for this run: RNN question encoder + ViT image encoder,
# a single epoch, outputs written under 'rnn-vit'.
args = Args(
    output_dir='rnn-vit',
    num_epochs=1,
    use_qst_encoder_type='rnn',
    use_img_encoder_type='vit',
    patch_size=28,
)

# Make model

In [38]:
# Build the data loaders from the preprocessed train/valid files.
loader_options = dict(
    input_dir=args.input_dir,
    input_vqa_train='train.npy',
    input_vqa_valid='valid.npy',
    max_qst_length=args.max_qst_length,
    max_num_ans=args.max_num_ans,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
)
data_loader = get_loader(**loader_options)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Vocabularies are taken from the training dataset.
train_dataset = data_loader['train'].dataset
qst_vocab, ans_vocab = train_dataset.qst_vocab, train_dataset.ans_vocab
qst_vocab_size, ans_vocab_size = qst_vocab.vocab_size, ans_vocab.vocab_size
ans_unk_idx = ans_vocab.unk2idx

# Instantiate the model from the run configuration and move it to the device.
model_options = dict(
    embed_size=args.embed_size,
    qst_vocab_size=qst_vocab_size,
    ans_vocab_size=ans_vocab_size,
    word_embed_size=args.word_embed_size,
    num_layers=args.num_layers,
    hidden_size=args.hidden_size,
    use_qst_encoder_type=args.use_qst_encoder_type,
    use_img_encoder_type=args.use_img_encoder_type,
    patch_size=args.patch_size,
)
model = VqaModel(**model_options).to(device)

  cpuset_checked))


qst_encoder rnn


# Train a model

In [39]:
# Make sure the output directory and its logs/ and models/ subfolders exist.
os.makedirs(args.output_dir, exist_ok=True)
for subfolder in ('/logs', '/models'):
    os.makedirs(args.output_dir + subfolder, exist_ok=True)

criterion = nn.CrossEntropyLoss()

# Every model parameter is optimised (the setting used for ViT).
# Alternative kept for reference -- optimise only these sub-modules:
#   model.img_encoder.fc, model.qst_encoder, model.fc1, model.fc2
params = model.parameters()

optimizer = optim.Adam(params, lr=args.learning_rate)
scheduler = lr_scheduler.StepLR(optimizer,
                                step_size=args.step_size,
                                gamma=args.gamma)

In [None]:
# Train/validate for args.num_epochs epochs, logging per-epoch loss/accuracy
# to <output_dir>/logs and saving checkpoints to <output_dir>/models.
for epoch in tqdm(range(args.num_epochs)):

    for phase in ['train', 'valid']:

        running_loss = 0.0
        running_corr_exp1 = 0
        running_corr_exp2 = 0
        # Fractional number of batches; used for the step display and as the
        # divisor for the epoch loss.
        batch_step_size = len(data_loader[phase].dataset) / args.batch_size

        if phase == 'train':
            model.train()
        else:
            model.eval()

        for batch_idx, batch_sample in enumerate(tqdm(data_loader[phase])):
            image = batch_sample['image'].to(device)
            question = batch_sample['question'].to(device)
            label = batch_sample['answer_label'].to(device)
            # not tensor, list.
            multi_choice = batch_sample['answer_multi_choice']

            optimizer.zero_grad()

            # Gradients are only tracked during the training phase.
            with torch.set_grad_enabled(phase == 'train'):

                # [batch_size, ans_vocab_size=1000]
                output = model(image, question)
                # Two separate argmax tensors: pred_exp2 is modified in place
                # below and must not alias pred_exp1.
                _, pred_exp1 = torch.max(output, 1)  # [batch_size]
                _, pred_exp2 = torch.max(output, 1)  # [batch_size]
                loss = criterion(output, label)

                if phase == 'train':
                    loss.backward()
                    optimizer.step()

            # Evaluation metric of 'multiple choice'
            # Exp1: our model prediction to '<unk>' IS accepted as the answer.
            # Exp2: our model prediction to '<unk>' is NOT accepted as the answer.
            pred_exp2[pred_exp2 == ans_unk_idx] = -9999
            running_loss += loss.item()
            # A prediction counts as correct when it equals any of the
            # multiple-choice answers for that sample.
            running_corr_exp1 += torch.stack([(ans == pred_exp1.cpu())
                                              for ans in multi_choice]).any(dim=0).sum()
            running_corr_exp2 += torch.stack([(ans == pred_exp2.cpu())
                                              for ans in multi_choice]).any(dim=0).sum()

            # Print the loss of every 100th mini-batch.
            if batch_idx % 100 == 0:
                print('| {} SET | Epoch [{:02d}/{:02d}], Step [{:04d}/{:04d}], Loss: {:.4f}'
                      .format(phase.upper(), epoch+1, args.num_epochs, batch_idx, int(batch_step_size), loss.item()))

        # Advance the LR schedule once per epoch, AFTER that epoch's
        # optimizer steps. Stepping the scheduler before any optimizer.step()
        # (as was done previously) skips the first value of the schedule.
        if phase == 'train':
            scheduler.step()

        # Print the average loss and accuracy in an epoch.
        epoch_loss = running_loss / batch_step_size
        epoch_acc_exp1 = running_corr_exp1.double(
        ) / len(data_loader[phase].dataset)      # multiple choice
        epoch_acc_exp2 = running_corr_exp2.double(
        ) / len(data_loader[phase].dataset)      # multiple choice

        print('| {} SET | Epoch [{:02d}/{:02d}], Loss: {:.4f}, Acc(Exp1): {:.4f}, Acc(Exp2): {:.4f} \n'
              .format(phase.upper(), epoch+1, args.num_epochs, epoch_loss, epoch_acc_exp1, epoch_acc_exp2))

        # Log the loss and accuracy in an epoch (tab-separated:
        # epoch, loss, acc_exp1, acc_exp2). Only the file-name template is
        # formatted, so braces in output_dir cannot break the path.
        log_name = 'logs/{}-log-epoch-{:02}.txt'.format(phase, epoch+1)
        with open(os.path.join(args.output_dir, log_name), 'w') as f:
            f.write(str(epoch+1) + '\t'
                    + str(epoch_loss) + '\t'
                    + str(epoch_acc_exp1.item()) + '\t'
                    + str(epoch_acc_exp2.item()))

    # Save the model check points.
    if (epoch+1) % args.save_step == 0:
        torch.save({'epoch': epoch+1, 'state_dict': model.state_dict()},
                    os.path.join(args.output_dir, 'models/model-epoch-{:02d}.ckpt'.format(epoch+1)))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

  cpuset_checked))


| TRAIN SET | Epoch [01/01], Step [0000/0937], Loss: 6.8954
| TRAIN SET | Epoch [01/01], Step [0100/0937], Loss: 4.4358
| TRAIN SET | Epoch [01/01], Step [0200/0937], Loss: 3.9090
| TRAIN SET | Epoch [01/01], Step [0300/0937], Loss: 4.0810
| TRAIN SET | Epoch [01/01], Step [0400/0937], Loss: 4.3042


# Plotter

In [None]:
import numpy as np
import csv
import matplotlib.pyplot as plt

# Plot per-epoch loss (left panel) and accuracy (right panel) for the
# train and valid phases, read back from the tab-separated epoch logs.
fig = plt.figure(figsize=(10,5))

for phase in ['train', 'valid']:

    epoch, loss, acc = [], [], []

    # One log file per epoch; its single row is: epoch, loss, acc_exp1, acc_exp2.
    for i in range(args.num_epochs):
        with open(f'./{args.output_dir}/logs/{phase}-log-epoch-{i+1:02d}.txt', 'r') as f:
            data = list(csv.reader(f, delimiter='\t'))

        row = data[0]
        epoch.append(float(row[0]))
        loss.append(float(row[1]))
        acc.append(float(row[3]))  # column 3 is the Exp2 accuracy

    # Train curves are red, everything else (valid) is blue.
    curve_color = 'red' if phase == 'train' else 'blue'

    # Left panel: loss.
    plt.subplot(1, 2, 1)
    plt.plot(epoch, loss, label = phase, color = curve_color)
    plt.xlabel('Epoch', fontsize = 20)
    plt.ylabel('Loss', fontsize = 20)

    # Right panel: accuracy.
    plt.subplot(1, 2, 2)
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.plot(epoch, acc, label = phase, color = curve_color)
    plt.xlabel('Epoch', fontsize = 20)
    plt.ylabel('Accuracy', fontsize = 20)

    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., prop={'size': 20})
    print(phase, acc)

fig.suptitle("ViT Image Encoder, RNN Question Encoder", fontsize=16)

# plt.savefig(f'./{args.output_dir}/png/train.png', dpi = fig.dpi)

# Test a model




## Load some weights

In [None]:
# Clone the stran123/6.869-final-project-lfs repository into the working directory.
# NOTE(review): presumably this repository holds the pretrained checkpoints; the
# load cell reads 'original-model-30.ckpt' from the working directory rather than
# from the clone folder -- confirm where the checkpoint actually ends up.
!git clone https://github.com/stran123/6.869-final-project-lfs

In [None]:
# Restore model weights from a saved checkpoint (a dict with a 'state_dict' entry).
model_path = 'original-model-30.ckpt'
# map_location remaps the saved tensors onto the active device, so a
# checkpoint written on a GPU still loads when the runtime has fallen back
# to the CPU.
checkpoint = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint['state_dict'])

In [None]:
from PIL import Image
# Open the first resized training image and render it inline.
image_path = './datasets/Resized_Images/'
sample_file = "train2015/abstract_v002_train2015_000000000000.png"
img = Image.open(f"{image_path}{sample_file}")
display(img)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Take the first batch of the validation loader. 'valid' is named explicitly:
# the previous code read `phase`, a variable left over from the training and
# plotting loops (where it ends as 'valid'), which raises NameError on a fresh
# kernel that only loads pretrained weights.
batch_sample = next(iter(data_loader['valid']))
image = batch_sample['image'].to(device)
question = batch_sample['question'].to(device)

# Keep only the first sample, preserving the leading batch dimension.
image = image[0].unsqueeze(0)
question = question[0].unsqueeze(0)

# Channel-first -> channel-last for plotting.
# NOTE(review): the *3 factor presumably rescales the tensor for display -- confirm.
displayable_img = image.cpu()[0].permute(1,2,0).numpy()*3
# NOTE(review): this displays the raw tensor; displayable_img is only plotted
# in the later response cell -- confirm this is the intended output.
display(image)

In [None]:
import spacy
# Tokenise a user-supplied question and encode it as a fixed-length tensor
# of question-vocabulary indices.
tokenizer = spacy.load('en_core_web_sm')

max_qst_length = 30

user_question = input("Ask a question about this image:\n")

# Lower-case, tokenise, and truncate to max_qst_length tokens: assigning a
# longer token list into the fixed-length array below raises ValueError.
parsed_user_question = [token.text for token in tokenizer(user_question.lower())][:max_qst_length]

question = np.array([qst_vocab.word2idx('<pad>')] * max_qst_length)  # padded with '<pad>' from 'qst_vocab'
question[:len(parsed_user_question)] = [qst_vocab.word2idx(w) for w in parsed_user_question]
# Add a leading batch dimension and move to the model's device.
question = torch.tensor(question).unsqueeze(0).to(device)

In [None]:
from collections import Counter
# Query the model 100 times with the same image/question and count how often
# each answer is predicted.
# NOTE(review): answers can only differ between passes if the model is
# stochastic at this point (e.g. left in train mode) -- confirm that is intended.
responses = Counter()

plt.imshow(displayable_img)
print(' '.join(qst_vocab.idx2word(i) for i in question[0]))

# Inference only: no_grad avoids building an autograd graph on each of the
# 100 forward passes; the predicted indices are unaffected.
with torch.no_grad():
  for _ in range(100):
    output = model(image,question)
    _, pred_exp1 = torch.max(output, 1)  # [batch_size]
    responses[' '.join(ans_vocab.idx2word(i) for i in pred_exp1)]+=1

# Sort answers by ascending count (the sort is stable, so ties keep first-seen
# order). `responses` stays a Counter instead of being rebound to a list.
sorted_responses = sorted(responses.items(), key=lambda item: item[1])
x = list(range(len(responses)))
labels, y = zip(*sorted_responses)

In [None]:
# Bar chart of how often each answer was predicted, one labelled bar per answer.
plt.bar(
    x,
    height=y,
    tick_label=labels,
)