# Imports

In [None]:
import pandas as pd
import numpy as np
import torch
import json
import pickle
import gensim
import gensim.downloader as api
import torchtext
from torch import nn, stack, tensor
from google.colab import drive

# Data

In [None]:
# Mount Google Drive at /content/drive; the dataset files read by the cells
# below all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the precomputed image features and the image -> questions mapping.
# The features are bound to `image_feature`, the name every later cell reads.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted source.
with open("/content/drive/My Drive/minivqaiust/image_features.pickle", 'rb') as f:
    image_feature = pickle.load(f)

with open("/content/drive/My Drive/minivqaiust/image_question.json", encoding="utf-8") as f:
    image_question_map = json.load(f)

In [None]:
# Flatten the image -> questions mapping into a lookup keyed by question id.
# For every entry of an image's question list, position 0 becomes the key and
# position 1 the question text; the owning image id is stored as a string.
questions = {
    entry[0]: {'question_text': entry[1], 'image_id': str(img_id)}
    for img_id, entries in image_question_map.items()
    for entry in entries
}

In [None]:
# Preprocess the training split.
df = pd.read_csv("/content/drive/My Drive/minivqaiust/train.csv")

# Resolve the question ids once; they drive both lookups below.
train_question_ids = df['question_id'].tolist()

# One label per row, shape (N,). Wrapping the column in a list would add a
# leading dimension of 1 and no longer line up with the N-row feature tensors.
train_label = torch.tensor(df['label'].to_numpy())
train_question = [questions[i]['question_text'] for i in train_question_ids]
# Convert each feature vector individually and stack them: this keeps the
# stored dtype and avoids the slow tensor-from-list-of-arrays path.
train_image_feature = torch.stack(
    [torch.as_tensor(image_feature[questions[i]['image_id']]) for i in train_question_ids]
)

In [None]:
# Preprocess the validation split.
df = pd.read_csv("/content/drive/My Drive/minivqaiust/val.csv")

# Resolve the question ids once; they drive both lookups below.
valid_question_ids = df['question_id'].tolist()

# One label per row, shape (N,). Wrapping the column in a list would add a
# leading dimension of 1 and no longer line up with the N-row feature tensors.
valid_label = torch.tensor(df['label'].to_numpy())
valid_question = [questions[i]['question_text'] for i in valid_question_ids]
# Convert each feature vector individually and stack them: this keeps the
# stored dtype and avoids the slow tensor-from-list-of-arrays path.
valid_image_feature = torch.stack(
    [torch.as_tensor(image_feature[questions[i]['image_id']]) for i in valid_question_ids]
)

In [None]:
# Preprocess the test split (no labels are read for this split).
# NOTE: `df` keeps pointing at the test CSV after this cell; the prediction
# cell reads its question ids from it.
df = pd.read_csv("/content/drive/My Drive/minivqaiust/test.csv")

# Resolve the question ids once; they drive both lookups below.
test_question_ids = df['question_id'].tolist()

test_question = [questions[i]['question_text'] for i in test_question_ids]
# Convert each feature vector individually and stack them: this keeps the
# stored dtype and avoids the slow tensor-from-list-of-arrays path.
test_image_feature = torch.stack(
    [torch.as_tensor(image_feature[questions[i]['image_id']]) for i in test_question_ids]
)

# Word Embedding

In [None]:
# Tokenizer: callable that splits a question string into tokens (used by
# encode() below).
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# Pretrained embedding model, fetched through gensim's downloader API.
# Judging by the model name the vectors are 300-dimensional; the download is
# large, so this cell can take a while on first run.
pre_model = api.load('word2vec-google-news-300')

In [None]:
# Embedding layer, initialised from the pretrained word2vec vectors.
# encode() shifts every known vocabulary index up by one and maps unknown
# tokens to 0, so row 0 is a zero vector (padding / out-of-vocabulary) and
# row i + 1 holds the pretrained vector of vocabulary entry i.
# `vectors` and `vector_size` are used instead of `vocab`, which gensim 4
# removed, so this matches the gensim 4 `get_index` call in encode().
_pretrained_vectors = torch.from_numpy(pre_model.vectors)
_padding_row = torch.zeros(1, pre_model.vector_size, dtype=_pretrained_vectors.dtype)
word_embedding = nn.Embedding.from_pretrained(
    torch.cat([_padding_row, _pretrained_vectors], dim=0),
    freeze=True,    # the lookup is only ever used under torch.no_grad()
    padding_idx=0,
)

In [None]:
def encode(input):
    """Turn a sentence into a list of embedding-row indices.

    Each token produced by ``tokenizer`` is looked up in ``pre_model``; the
    vocabulary index is shifted up by one, so a token the model does not know
    (lookup default ``-1``) ends up as index ``0``.
    """
    indices = []
    for token in tokenizer(input):
        vocab_index = pre_model.get_index(token, default=-1)
        indices.append(vocab_index + 1)
    return indices

def padify(input, length=None):
    """Encode a batch of sentences and right-pad them to a common length.

    Args:
        input: iterable of sentences (strings) to pass through ``encode``.
        length: target sequence length. When ``None`` (the default) the
            longest encoded sentence in ``input`` sets the length. Sentences
            longer than ``length`` are truncated.

    Returns:
        An int64 tensor of shape ``(len(input), length)``. Padding positions
        hold index ``0``, the same index ``encode`` gives unknown tokens.
    """
    # Force int64 so a sentence with no tokens (empty list) does not become a
    # float tensor that cannot be stacked with the others.
    encoded_input = [tensor(encode(x), dtype=torch.long) for x in input]
    if length is None:
        length = max((len(e) for e in encoded_input), default=0)
    if not encoded_input:
        # stack() rejects an empty list; return an empty batch instead.
        return torch.empty((0, length), dtype=torch.long)

    padded = []
    for e in encoded_input:
        e = e[:length]
        padded.append(
            nn.functional.pad(e, (0, length - len(e)), mode='constant', value=0)
        )
    return stack(padded)

In [None]:
# Word embedding on train data. Gradients are not tracked: the embedded
# questions are stored as fixed inputs for the data loader, which reads them
# under the name `train_question_embedding`.
with torch.no_grad():
    train_question_embedding = word_embedding(padify(train_question))

In [None]:
# Word embedding on validation data. Gradients are not tracked: the embedded
# questions are stored as fixed inputs for the data loader, which reads them
# under the name `valid_question_embedding`.
with torch.no_grad():
    valid_question_embedding = word_embedding(padify(valid_question))

In [None]:
# Embed the test questions; gradients are not tracked for this lookup.
# The result keeps the spelling `test_question_emedding` because that is the
# name the prediction cell reads.
with torch.no_grad():
    test_token_ids = padify(test_question)
    test_question_emedding = word_embedding(test_token_ids)

# Data loader


In [None]:
# Bundle the three training tensors so that indexing yields one
# (question embedding, image feature, label) triple per sample, then serve
# them in shuffled mini-batches of 64.
train_dataset = torch.utils.data.TensorDataset(
    train_question_embedding,
    train_image_feature,
    train_label,
)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
)

In [None]:
# Bundle the three validation tensors so that indexing yields one
# (question embedding, image feature, label) triple per sample.
valid_dataset = torch.utils.data.TensorDataset(valid_question_embedding, valid_image_feature, valid_label)
# Evaluation does not benefit from shuffling; a fixed order keeps the batches
# reproducible from one epoch to the next.
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=64, shuffle=False)

# Model

In [None]:
class VQA(nn.Module):
    """Classify a (question, image) pair from embedded text and image features.

    The question embeddings go through a single-layer LSTM, the per-step
    outputs are flattened, concatenated with the image features and passed to
    a small fully connected head.

    Args:
        embedding_dim: width of each word vector fed to the LSTM. Defaults to
            300 to match the ``word2vec-google-news-300`` embeddings this
            notebook produces (presumably 300-wide, per the model name).
        hidden_dim: LSTM hidden size.
        head_in_features: width of the fused vector entering the head. It must
            equal ``seq_len * hidden_dim + image_feature_width``.
            NOTE(review): the default (512) is carried over from the original
            hard-coded value; it cannot match such a concatenation when
            ``hidden_dim`` is 512, so pass the real width explicitly. Because
            the width depends on ``seq_len``, every split must be padded to
            the same length.
        head_hidden: width of the hidden layer of the head.
        num_classes: number of output classes.
    """

    def __init__(self, embedding_dim=300, hidden_dim=512, head_in_features=512,
                 head_hidden=128, num_classes=10):
        super().__init__()
        self.head_in_features = head_in_features
        # batch_first=True: inputs arrive as (batch, seq, embedding) from the
        # data loader. `dropout` is omitted because PyTorch only applies it
        # between stacked LSTM layers and this LSTM has a single layer.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
        self.linear = nn.Sequential(
            nn.BatchNorm1d(head_in_features),
            nn.Linear(head_in_features, head_hidden),
            nn.ReLU(),
            nn.BatchNorm1d(head_hidden),
            nn.Linear(head_hidden, num_classes),
            # NOTE(review): the trailing BatchNorm + ReLU clamp the class
            # scores to be non-negative; kept as in the original design.
            nn.BatchNorm1d(num_classes),
            nn.ReLU()
        )

    def forward(self, text, image):
        """Return unnormalised class scores of shape ``(batch, num_classes)``.

        Args:
            text: embedded questions, ``(batch, seq_len, embedding_dim)``.
            image: image features, ``(batch, image_feature_width)``.

        Raises:
            ValueError: if the fused feature width differs from
                ``head_in_features``.
        """
        sequence_output, _ = self.lstm(text)
        text_features = torch.flatten(sequence_output, start_dim=1)
        fused = torch.cat([text_features, image], dim=1)
        if fused.shape[1] != self.head_in_features:
            raise ValueError(
                f"fused feature width is {fused.shape[1]} "
                f"(text {text_features.shape[1]} + image {image.shape[1]}), "
                f"but the head expects {self.head_in_features}; "
                "construct VQA with a matching head_in_features"
            )
        # No softmax here: nn.CrossEntropyLoss applies log-softmax itself, and
        # argmax over raw scores gives the same predicted class.
        return self.linear(fused)

In [None]:
# Instantiate the network with its default constructor arguments.
model = VQA()

# Train and Validation

In [None]:
# Training objective and optimiser: cross-entropy over the class scores,
# Adam over all model parameters with weight decay.
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.001,
    weight_decay=0.0004,
)

In [None]:
def validation_loop(model, dataloader, loss_fn=None):
    """Evaluate ``model`` on ``dataloader`` without updating it.

    Args:
        model: module called as ``model(text, image)`` that returns class
            scores of shape ``(batch, num_classes)``.
        dataloader: yields ``(text, image, label)`` batches.
        loss_fn: criterion called as ``loss_fn(prediction, label)``. When
            ``None`` the module-level ``loss_function`` is used.

    Returns:
        ``(avg_loss, accuracy)`` as Python floats: the mean of the per-batch
        losses and the fraction of correctly classified samples. Both are
        ``0.0`` for an empty loader.
    """
    if loss_fn is None:
        loss_fn = loss_function

    num_batches = len(dataloader)
    num_samples = len(dataloader.dataset)
    if num_batches == 0 or num_samples == 0:
        return 0.0, 0.0

    total_loss = 0.0
    correct = 0
    # Evaluate in eval mode (BatchNorm uses running statistics) and restore the
    # previous mode afterwards so an enclosing training loop is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for text, image, y in dataloader:
                prediction = model(text, image)
                total_loss += loss_fn(prediction, y).item()
                correct += (prediction.argmax(dim=1) == y).sum().item()
    finally:
        model.train(was_training)

    return total_loss / num_batches, correct / num_samples


# Backward-compatible alias for the original, misspelled function name.
validatoin_loop = validation_loop


In [None]:
def train_loop(dataloader, model, loss_function, optimizer):
    """Train ``model`` for one epoch, then report training and validation metrics.

    Args:
        dataloader: yields ``(text, image, label)`` training batches.
        model: module called as ``model(text, image)`` that returns class
            scores of shape ``(batch, num_classes)``.
        loss_function: criterion called as ``loss_function(prediction, label)``.
        optimizer: optimizer over ``model``'s parameters.

    Prints the mean per-batch training loss and the training accuracy, followed
    by the metrics ``validation_loop`` returns for the module-level
    ``valid_dataloader``. Returns ``None``.
    """
    model.train()
    num_batches = len(dataloader)
    num_samples = len(dataloader.dataset)
    total_loss = 0.0
    correct = 0

    for text, image, label in dataloader:
        prediction = model(text, image)
        loss = loss_function(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (prediction.argmax(dim=1) == label).sum().item()

    # Average over the real number of batches rather than a hard-coded batch
    # size; max(..., 1) guards against an empty loader.
    avg_loss = total_loss / max(num_batches, 1)
    accuracy = correct / max(num_samples, 1)

    val_loss, val_acc = validation_loop(model, valid_dataloader)
    print(f"training / loss: {avg_loss:>7f} | accuracy: {accuracy}")
    print(f"val / loss: {val_loss:>7f} | accuracy: {val_acc}")

In [None]:
# Run 15 training epochs; each train_loop call prints that epoch's training
# and validation metrics under the header printed here.
for epochs in range(15):
    print(f"Epoch {epochs+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_function, optimizer)

# Prediction

In [None]:
# Score the test split in eval mode without tracking gradients.
# `test_image_feature` is already a tensor, so it is passed as-is.
model.eval()
with torch.no_grad():
    prediction = model(test_question_emedding, test_image_feature)

# Predicted class per question = index of the highest score in each row.
output = prediction.argmax(dim=1).cpu().numpy().astype('int64')
labeldict = {}
labeldict['question_id'] = list(df['question_id'])
# Collect the predicted class ids (not the raw score rows) as plain ints.
labeldict['label'] = [int(label) for label in output]

# NOTE(review): `labeldict` is built but not consumed in this cell; the
# print below shows the test CSV frame, not the predictions. Confirm intent.
print(df.head())

In [None]:
# Persist the trained weights to Drive. The target path is already absolute,
# so nothing is prepended to it.
torch.save(model.state_dict(), '/content/drive/My Drive/minivqaiust/minivqa-v1-weights.pth')