# Imports

In [None]:
import pandas as pd
import numpy as np
import torch
import json
import pickle
import gensim
import gensim.downloader as api
import torchtext
from torch import nn, stack, tensor
from google.colab import drive

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Data

In [None]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
with open("/content/drive/My Drive/minivqaiust/image_features.pickle", 'rb') as f:
    image_feature = pickle.load(f)

with open("/content/drive/My Drive/minivqaiust/image_question.json") as f:
      image_question_map = json.load(f)

In [None]:
questions={}

for image_id, question in image_question_map.items():
  for q in question:
    questions[q[0]] = {'question_text':q[1], 'image_id': str(image_id)}

In [None]:
#pre proccessing train data
df = pd.read_csv("/content/drive/My Drive/minivqaiust/train.csv")

train_label=torch.tensor(list(df['label']))
train_question=[questions[i]['question_text'] for i in list(df['question_id'])]
train_image_feature = [image_feature[questions[i]['image_id']] for i in list(df['question_id'])]

In [None]:
#pre proccessing validation data
df = pd.read_csv("/content/drive/My Drive/minivqaiust/val.csv")

valid_label=torch.tensor(list(df['label']))
valid_question=[questions[i]['question_text'] for i in list(df['question_id'])]
valid_image_feature = torch.tensor([image_feature[questions[i]['image_id']] for i in list(df['question_id'])])

  valid_image_feature = torch.tensor([image_feature[questions[i]['image_id']] for i in list(df['question_id'])])


In [None]:
#pre proccessing test data
df = pd.read_csv("/content/drive/My Drive/minivqaiust/test.csv")

test_question=[questions[i]['question_text'] for i in list(df['question_id'])]
test_image_feature = torch.tensor([image_feature[questions[i]['image_id']] for i in list(df['question_id'])])

# Word Embedding

In [None]:
# Tokenize
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

# pre embedding model
pre_model = api.load('word2vec-google-news-300')

In [None]:
# Embedding layer
word_embedding = nn.Embedding(len(pre_model.index_to_key) + 1, len(pre_model.get_vector('hi')))

In [None]:
def encode(input):
  return [pre_model.get_index(token, default=-1) + 1 for token in tokenizer(input)]

def padify(input):
  encoded_input = [encode(x) for x in input]
  return stack([nn.functional.pad(tensor(e),(0,15-len(e)),mode='constant',value=0) for e in encoded_input])

In [None]:
# word embedding on train data
with torch.no_grad():
  train_question_embedding = word_embedding(padify(train_question))

In [None]:
# word embedding on validation data
with torch.no_grad():
  valid_question_embedding = word_embedding(padify(valid_question))

In [None]:
# word embedding on test data
with torch.no_grad():
  test_question_embedding = word_embedding(padify(test_question))

# Data loader


In [None]:
train_dataset = torch.utils.data.TensorDataset(train_question_embedding, torch.tensor(train_image_feature), train_label)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
valid_dataset = torch.utils.data.TensorDataset(valid_question_embedding, valid_image_feature, valid_label)
valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=64, shuffle=True)

# Model

In [None]:
class VQA(nn.Module):
    def __init__(self):
        super(type(self), self).__init__()
        self.lstm = nn.LSTM(300, 512, num_layers=1).to(device)
        self.linear = nn.Sequential(
            nn.Linear(8192, 128),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 10),
            nn.Tanh()
        ).to(device)

    def forward(self, text, image):
        text_features= torch.flatten(self.lstm(text)[0], start_dim=1).to(device)
        input = torch.cat([text_features, image], dim=1).to(device)
        logits = nn.functional.softmax(self.linear(input), dim=1)
        return logits

In [None]:
model = VQA()

# Train and Validation

In [None]:
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0004)

In [None]:
def validation_loop(model, dataloader):
  size = len(dataloader.dataset)
  correct = 0
  avg_loss = 0
  for batch, (text, image, label) in enumerate(dataloader):
    prediction = model(text.to(device), image.to(device))
    loss = loss_function(prediction, label.to(device))

    output = [torch.argmax(p).item() for p in prediction]
    correct += (torch.FloatTensor(output) == label).float().sum()
    avg_loss += loss.item()

  accuracy = correct / len(dataloader.dataset)
  return avg_loss, accuracy


In [None]:
def train_loop(dataloader, model, loss_function, optimizer):
    size = len(dataloader.dataset)
    correct = 0
    avg_loss = 0
    for batch, (text, image, label) in enumerate(dataloader):
        prediction = model(text.to(device), image.to(device))
        loss = loss_function(prediction, label.to(device))
        avg_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        output = [torch.argmax(p).item() for p in prediction]
        correct += (torch.FloatTensor(output) == label).float().sum()

    avg_loss /= (len(dataloader.dataset) // 64 + 1)
    accuracy = correct / len(dataloader.dataset)

    val_loss, val_acc = validation_loop(model, valid_dataloader)
    print(f"training / loss: {avg_loss:>7f} | accuracy: {accuracy}")
    print(f"val / loss: {val_loss:>7f} | accuracy: {val_acc}")

In [None]:
del pre_model
del image_feature
del questions
del word_embedding


In [None]:
for epochs in range(15):
    print(f"Epoch {epochs+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_function, optimizer)

Epoch 1
-------------------------------
training / loss: 2.292975 | accuracy: 0.1576923131942749
val / loss: 4.555639 | accuracy: 0.2454545497894287
Epoch 2
-------------------------------
training / loss: 2.255680 | accuracy: 0.30128204822540283
val / loss: 4.472806 | accuracy: 0.4363636374473572
Epoch 3
-------------------------------
training / loss: 2.238444 | accuracy: 0.3807692229747772
val / loss: 4.452094 | accuracy: 0.3909091055393219
Epoch 4
-------------------------------
training / loss: 2.200831 | accuracy: 0.4769230782985687
val / loss: 4.385391 | accuracy: 0.5
Epoch 5
-------------------------------
training / loss: 2.162609 | accuracy: 0.5769230723381042
val / loss: 4.304508 | accuracy: 0.6000000238418579
Epoch 6
-------------------------------
training / loss: 2.134205 | accuracy: 0.6512820720672607
val / loss: 4.265432 | accuracy: 0.6000000238418579
Epoch 7
-------------------------------
training / loss: 2.120522 | accuracy: 0.6243589520454407
val / loss: 4.276535 | 

# Prediction

In [None]:
prediction = model(test_question_embedding.to(device), torch.tensor(test_image_feature).to(device))

output = np.array([torch.argmax(p).item() for p in prediction], dtype='int64')

test_csv = pd.read_csv("/content/drive/My Drive/minivqaiust/test.csv")

df = pd.DataFrame({
    'question_id': sorted(test_csv.index.values),
    'label': output
})
print(df.head())
df.to_csv('/content/drive/My Drive//minivqa-version1.csv', index=False)


   question_id  label
0            0      7
1            1      7
2            2      4
3            3      7
4            4      3


  prediction = model(test_question_embedding.to(device), torch.tensor(test_image_feature).to(device))


In [None]:
torch.save(model.state_dict(), '/content/drive/My Drive//minivqa-versio1-model.pth')