In [2]:
import random
import json
import torch
import torchtext
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
dataset_base_path = "../data/clevr/CLEVR_CoGenT_v1.0/questions/"
train = json.load(open(dataset_base_path + "CLEVR_trainA_questions.json", "r"))
validation = json.load(open(dataset_base_path + "CLEVR_valA_questions.json", "r"))

In [4]:
print(len(train["questions"]))
print(len(validation["questions"]))

699960
150000


In [5]:
train["questions"][random.randint(0, len(train["questions"]))]

{'question_index': 301889,
 'question_family_index': 58,
 'image_index': 30189,
 'question': 'The yellow block that is made of the same material as the large yellow thing is what size?',
 'answer': 'small',
 'image_filename': 'CLEVR_trainA_030189.png',
 'split': 'trainA',
 'program': [{'value_inputs': [], 'inputs': [], 'function': 'scene'},
  {'value_inputs': ['large'], 'inputs': [0], 'function': 'filter_size'},
  {'value_inputs': ['yellow'], 'inputs': [1], 'function': 'filter_color'},
  {'value_inputs': [], 'inputs': [2], 'function': 'unique'},
  {'value_inputs': [], 'inputs': [3], 'function': 'same_material'},
  {'value_inputs': ['yellow'], 'inputs': [4], 'function': 'filter_color'},
  {'value_inputs': ['cube'], 'inputs': [5], 'function': 'filter_shape'},
  {'value_inputs': [], 'inputs': [6], 'function': 'unique'},
  {'value_inputs': [], 'inputs': [7], 'function': 'query_size'}]}

In [6]:
vocab = {"<PAD>": 0}
train_questions = []
train_answers = []
val_questions = []
val_answers = []
MAX_LENGTH = 0


for q in train["questions"]:
    _q = q["question"].replace("?", "").replace(".", "").replace(",", "").split(" ")
    train_questions.append(_q)
    train_answers.append(q["answer"])
    if len(_q) > MAX_LENGTH:
        MAX_LENGTH = len(_q)
    # question
    for w in _q:
        if w in vocab:
            vocab[w] += 1
        else:
            vocab[w] = 1
    # answer
    for w in q["answer"].split(" "):
        if w in vocab:
            vocab[w] += 1
        else:
            vocab[w] = 1
    # inputs
    for p in q["program"]:
        for _iv in p["value_inputs"]:
            if _iv in vocab:
                vocab[_iv] += 1
            else:
                vocab[_iv] = 1
        # functions
        for w in p["function"].split("_"):
            if w in vocab:
                vocab[w] += 1
            else:
                vocab[w] = 1

for q in validation["questions"]:
    _q = q["question"].replace("?", "").replace(".", "").replace(",", "").split(" ")
    val_questions.append(_q)
    val_answers.append(q["answer"])
    if len(_q) > MAX_LENGTH:
        MAX_LENGTH = len(_q)
    # question
    for w in _q:
        if w in vocab:
            vocab[w] += 1
        else:
            vocab[w] = 1
    # answer
    for w in q["answer"].split(" "):
        if w in vocab:
            vocab[w] += 1
        else:
            vocab[w] = 1
    # inputs
    for p in q["program"]:
        for _iv in p["value_inputs"]:
            if _iv in vocab:
                vocab[_iv] += 1
            else:
                vocab[_iv] = 1
        # functions
        for w in p["function"].split("_"):
            if w in vocab:
                vocab[w] += 1
            else:
                vocab[w] = 1
vocab = list(set(vocab))
len(vocab)

119

In [7]:
x_train = []
y_train = []
x_val = []
y_val = []

# convert the input sentences to a tensor of indices
for q in train_questions:
    padded_q = q + ["<PAD>"] * (MAX_LENGTH - len(q))
    # MAX_LENGTH * len(vocab)
    x_train.append([vocab.index(w) for w in padded_q])

for q in val_questions:
    padded_q = q + ["<PAD>"] * (MAX_LENGTH - len(q))
    x_val.append([vocab.index(w) for w in padded_q])

# answers
for a in train_answers:
    y_train.append(vocab.index(a))
for a in val_answers:
    y_val.append(vocab.index(a))

In [8]:
print(MAX_LENGTH)
print(len(vocab))
print(x_train[0])

45
119
[92, 23, 109, 115, 0, 2, 27, 53, 84, 78, 25, 53, 96, 114, 32, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49]


In [9]:
# to torch tensors
x_train = torch.tensor(x_train)
y_train = torch.tensor(y_train)
x_val = torch.tensor(x_val)
y_val = torch.tensor(y_val)

In [10]:
print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

torch.Size([699960, 45])
torch.Size([699960])
torch.Size([150000, 45])
torch.Size([150000])


In [11]:
# Define the bi-directional GRU model
class BiGRU(nn.Module):
    def __init__(self, vocab_dim, embedding_dim, hidden_dim, output_dim):
        super(BiGRU, self).__init__()
        self.embedding = nn.Embedding(vocab_dim, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=1, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
    def forward(self, x):
        # x shape: (seq_len, batch_size)
        embedded = self.embedding(x)
        # embedded shape: (seq_len, batch_size, embedding_dim)
        output, hidden = self.gru(embedded)
        # output shape: (seq_len, batch_size, hidden_dim * 2)
        # hidden shape: (2, batch_size, hidden_dim)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        # hidden shape: (batch_size, hidden_dim * 2)
        out = self.fc(hidden)
        # out shape: (batch_size, output_dim)
        return out

In [12]:
print(len(set(train_answers)))

28


In [13]:
# Define the dimensions
vocab_dim = len(vocab)  # the size of the vocabulary
embedding_dim = 300  # the size of the embedding vector
hidden_dim = 128
output_dim = len(set(train_answers))

model = BiGRU(vocab_dim, embedding_dim, hidden_dim, output_dim)
print(model)

BiGRU(
  (embedding): Embedding(119, 300)
  (gru): GRU(300, 128, bidirectional=True)
  (fc): Linear(in_features=256, out_features=28, bias=True)
)


In [14]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [15]:
# to GPU 1
model = model.cuda(1)
loss_fn = loss_fn.cuda(1)
x_train = x_train.cuda(1)
y_train = y_train.cuda(1)
x_val = x_val.cuda(1)
y_val = y_val.cuda(1)

In [20]:
epochs = 10
batch_size = 45
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for i in range(0, len(x_train), batch_size):
        optimizer.zero_grad()
        x = x_train[i:i+batch_size]
        y = y_train[i:i+batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1} Loss: {epoch_loss/len(x_train)}')

RuntimeError: cuDNN error: CUDNN_STATUS_MAPPING_ERROR