In [19]:
import tensorflow as tf
import numpy as np 
import os

SEED = 1234

tf.random.set_seed(SEED)
np.random.seed(SEED)

working_directory = os.getcwd()

In [20]:
from google.colab import drive
drive.mount(os.path.join(working_directory,'gdrive'))


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [21]:
!unzip gdrive/MyDrive/anndl-2020-vqa.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: VQA_Dataset/Images/5390.png  
  inflating: VQA_Dataset/Images/5391.png  
  inflating: VQA_Dataset/Images/5392.png  
  inflating: VQA_Dataset/Images/5393.png  
  inflating: VQA_Dataset/Images/5394.png  
  inflating: VQA_Dataset/Images/5395.png  
  inflating: VQA_Dataset/Images/5396.png  
  inflating: VQA_Dataset/Images/5397.png  
  inflating: VQA_Dataset/Images/5398.png  
  inflating: VQA_Dataset/Images/5399.png  
  inflating: VQA_Dataset/Images/54.png  
  inflating: VQA_Dataset/Images/540.png  
  inflating: VQA_Dataset/Images/5400.png  
  inflating: VQA_Dataset/Images/5401.png  
  inflating: VQA_Dataset/Images/5402.png  
  inflating: VQA_Dataset/Images/5403.png  
  inflating: VQA_Dataset/Images/5404.png  
  inflating: VQA_Dataset/Images/5405.png  
  inflating: VQA_Dataset/Images/5406.png  
  inflating: VQA_Dataset/Images/5407.png  
  inflating: VQA_Dataset/Images/5408.png  
  inflating: VQA_Dataset/Images/540

In [22]:
!ls VQA_Dataset

Images	test_questions.json  train_questions_annotations.json


In [23]:
dataset_dir = os.path.join(working_directory,'VQA_Dataset')


In [24]:
# importing json as python dictionaries 

import json 

with open(os.path.join(dataset_dir,'train_questions_annotations.json')) as json_file:
  train_questions_annotations = json.load(json_file)

image_labels_ = os.listdir(os.path.join(dataset_dir,'Images'))
image_labels = []

for name in image_labels_:
  image_labels.append(name[:-4])



In [25]:
questions = {}
answers   = {}

count = 0

for key in train_questions_annotations:
  questions[key] = {}
  questions[key]["question"] = train_questions_annotations[key]["question"]
  questions[key]["image_id"] = train_questions_annotations[key]["image_id"]
  answers[key] = {}
  answers[key]["answer"] = train_questions_annotations[key]["answer"]
  answers[key]["image_id"] = train_questions_annotations[key]["image_id"]
  if (count < 3 ):
    print(f"question {key}: {questions[key]}")
    print(f"answer {key}: {answers[key]}")
  count += 1

question 117792: {'question': 'Who looks happier?', 'image_id': '11779'}
answer 117792: {'answer': 'man', 'image_id': '11779'}
question 117790: {'question': 'Where is the woman sitting?', 'image_id': '11779'}
answer 117790: {'answer': 'blanket', 'image_id': '11779'}
question 117791: {'question': 'Where is the man sitting?', 'image_id': '11779'}
answer 117791: {'answer': 'bench', 'image_id': '11779'}


In [26]:
labels_dict = {
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5': 5,
        'apple': 6,
        'baseball': 7,
        'bench': 8,
        'bike': 9,
        'bird': 10,
        'black': 11,
        'blanket': 12,
        'blue': 13,
        'bone': 14,
        'book': 15,
        'boy': 16,
        'brown': 17,
        'cat': 18,
        'chair': 19,
        'couch': 20,
        'dog': 21,
        'floor': 22,
        'food': 23,
        'football': 24,
        'girl': 25,
        'grass': 26,
        'gray': 27,
        'green': 28,
        'left': 29,
        'log': 30,
        'man': 31,
        'monkey bars': 32,
        'no': 33,
        'nothing': 34,
        'orange': 35,
        'pie': 36,
        'plant': 37,
        'playing': 38,
        'red': 39,
        'right': 40,
        'rug': 41,
        'sandbox': 42,
        'sitting': 43,
        'sleeping': 44,
        'soccer': 45,
        'squirrel': 46,
        'standing': 47,
        'stool': 48,
        'sunny': 49,
        'table': 50,
        'tree': 51,
        'watermelon': 52,
        'white': 53,
        'wine': 54,
        'woman': 55,
        'yellow': 56,
        'yes': 57
}

In [27]:
def process_answer(answer_string):
  return labels_dict[answer_string]

count = 0

for key in answers:
  answers[key]['answer'] = process_answer(answers[key]['answer'])
  if (count < 5):
    print(f"answer {key}: {answers[key]['answer']}")
  count += 1



answer 117792: 31
answer 117790: 12
answer 117791: 8
answer 55360: 57
answer 169490: 31


In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# preparing tokenizer for the questions
MAX_NUM_WORDS = 1e4

questions_set = set()

for key in questions:
  questions_set.add(questions[key]['question'] + '<eos>')

q_tokenizer = Tokenizer(num_words = MAX_NUM_WORDS, filters = '?')
q_tokenizer.fit_on_texts(questions_set)
q_wtoi = q_tokenizer.word_index

print(q_wtoi['who'])




49


In [29]:
def process_question(q_string):
  q_splitted = q_string.split()
  last = (q_splitted[-1].split('?'))[0]
  q_splitted[-1] = last
  
  result = []
  for wq in q_splitted:
    if(len(wq) > 0) :
      try:
        result.append(q_wtoi[wq.lower()])
      except: #occurs when there is another ?
        s_ = wq.split('?')
        result.append(q_wtoi[s_[0].lower()])

  result.append(q_wtoi['<eos>'])
  return result

for key in questions:
  questions[key]['question'] = process_question(questions[key]['question'])

In [30]:
from PIL import Image

images_dir = os.path.join(dataset_dir,'Images')
# image_labels contains the id of the images and all png's
image_paths = {}

for id in image_labels:
  image_paths[id] = os.path.join(images_dir,id+'.png')

# image paths is a dictionary which associates id of the image to its path, used by the custom generator



In [31]:
# pad questions in questions dict
max_question_length = 0

for key in questions:
  if (len(questions[key]['question'])>max_question_length):
    max_question_length = len(questions[key]['question'])
    
for key in questions:
  pad_num = max_question_length - len(questions[key]['question'])
  for i in range(0,pad_num):
    questions[key]['question'].append(0)





In [32]:
class DataGenerator(tf.keras.utils.Sequence):
    def __init__(self, max_question_length,questions_dict, answers_dict, batch_size, num_classes=None, shuffle=True):
        self.batch_size = batch_size
        self.id_list = [key for key in questions_dict]
        self.shuffle = shuffle
        self.questions_dict = questions_dict
        self.answers_dict = answers_dict
        self.on_epoch_end()

    def __len__(self):
        return len(self.id_list) // self.batch_size

    def __getitem__(self, index):
        index = self.index[index * self.batch_size:(index + 1) * self.batch_size]
        batch = [self.id_list[k] for k in index] # list of id_s for the batch 

        i = np.array(np.zeros(shape = [self.batch_size,224,224,3]))
        q = np.array(np.zeros(shape = [self.batch_size,max_question_length]))
        y = np.array(np.zeros(shape = [self.batch_size,58]))

        count = 0
        for id in batch:
          
          img_path = os.path.join(images_dir,self.questions_dict[id]['image_id']+'.png')
          img = Image.open(os.path.join(images_dir,img_path)).convert('RGB')
          img = img.resize([224,224])
          img_array = np.array(img)
          i[count,:,:,:] = img_array
          q[count,:] = self.questions_dict[id]['question']
          y[count,:] = tf.keras.utils.to_categorical(self.answers_dict[id]['answer'], num_classes=58, dtype='float32')
          count += 1
        
        X = [i,q]
        return X, y

    def on_epoch_end(self):
        self.index = np.arange(len(self.id_list)) #np.arange(2) = [0,1,2]
        if self.shuffle == True:
            np.random.shuffle(self.index)


In [33]:
target_shape = [224,224]

train_generator = DataGenerator(max_question_length = max_question_length,
                                questions_dict      = questions,
                                answers_dict        = answers,
                                batch_size          = 32,
                                num_classes         = None,
                                shuffle             = True)


# **Prepare the model**


In [34]:
num_classes = 58

# ------------------------ CNN ----------------------

vgg = tf.keras.applications.VGG16(include_top = False, input_shape = [224,224,3])
vgg_output = tf.keras.layers.GlobalAveragePooling2D()(vgg.output)
vgg_output = tf.keras.layers.Dense(units = '1024',activation = 'softmax')(vgg_output)

cnn = tf.keras.Model(inputs = vgg.input, outputs = vgg_output)
# ------------------------ RNN ----------------------
 


EMBEDDING_SIZE = 32

# ENCODER
# -------
# in keras out = layer(input)

encoder_input = tf.keras.Input(shape=[max_question_length])
encoder_embedding_layer = tf.keras.layers.Embedding(len(q_wtoi)+1, EMBEDDING_SIZE, input_length=max_question_length, mask_zero=True)
encoder_embedding_out = encoder_embedding_layer(encoder_input)

# I need 128 units because I have 4 words each embedded in 32 integers values (128 lstm cells)
encoder = tf.keras.layers.LSTM(units=128, return_state=True)

encoder_output, h, c = encoder(encoder_embedding_out)
encoder_output = tf.keras.layers.Dense(units = 1024, activation = 'softmax')(encoder_output)
encoder_states = [h, c]

rnn = tf.keras.Model(inputs = encoder_input, outputs = encoder_output)


# ---------------------Merging--------------------------------

x1 = cnn.output 
x2 = rnn.output 

merging_layer = tf.keras.layers.Multiply()([x1,x2])
classifier = tf.keras.layers.Dense(units=num_classes,activation='softmax')(merging_layer)

VQA_model = tf.keras.Model(inputs = [cnn.input,rnn.input], outputs = classifier)

VQA_model.summary()




Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 224, 224, 64) 1792        input_3[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 224, 224, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 112, 112, 64) 0           block1_conv2[0][0]               
____________________________________________________________________________________________

In [None]:




# Optimization params
# -------------------

# Loss
loss = tf.keras.losses.CategoricalCrossentropy()

# learning rate
lr = 1e-3
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# -------------------

# Validation metrics
# ------------------

metrics = ['accuracy']
# ------------------

# Compile Model
VQA_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

batch_size = 32
VQA_model.fit(
    train_generator, epochs=2, verbose=1, callbacks=None,
    steps_per_epoch = len(questions)/batch_size,
    class_weight=None, 
)


Epoch 1/2
 115/1838 [>.............................] - ETA: 22:00 - loss: 4.0274 - accuracy: 0.2473