In [1]:
import tensorflow as tf
import numpy as np 
import os


# One seed shared by every random number generator seeded in this notebook.
SEED = 1234

# Seed NumPy's and TensorFlow's global generators with the same value.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Directory the kernel was started in; every dataset/output path is built from it.
working_directory = os.getcwd()

In [2]:
from google.colab import drive

# Mount Google Drive at <working_directory>/gdrive; the dataset archive is
# read from that mount point.
gdrive_mount_point = os.path.join(working_directory, 'gdrive')
drive.mount(gdrive_mount_point)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
!unzip gdrive/MyDrive/anndl-2020-vqa.zip

Archive:  gdrive/MyDrive/anndl-2020-vqa.zip
replace VQA_Dataset/Images/0.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [4]:
# Sanity check: list the contents of the extracted dataset folder.
!ls VQA_Dataset

Images	RImages  test_questions.json  train_questions_annotations.json


In [5]:
from PIL import Image

dataset_dir = os.path.join(working_directory,'VQA_Dataset')

# Folder with the original images and folder receiving their 224x224 copies.
img_directory = os.path.join(working_directory,'VQA_Dataset/Images')
r_img_directory = os.path.join(working_directory,'VQA_Dataset/RImages')

img_names = os.listdir(img_directory)
print(img_names[:10])

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(r_img_directory, exist_ok=True)

for img_name in img_names:
  # The context manager closes the source file as soon as the pixels have been
  # converted, instead of leaving one open handle per image until it is
  # garbage-collected.
  with Image.open(os.path.join(img_directory, img_name)) as source_img:
    resized_img = source_img.convert('RGB').resize([224,224])
  # save() returns None, so its result is deliberately not stored.
  resized_img.save(os.path.join(r_img_directory, img_name))

['14872.png', '902.png', '7704.png', '5192.png', '8913.png', '9777.png', '386.png', '7985.png', '5280.png', '25344.png']


In [6]:
# importing json as python dictionaries 

import json 

# Parse the training annotations and the test questions into dictionaries.
train_json_path = os.path.join(dataset_dir,'train_questions_annotations.json')
with open(train_json_path) as train_file:
  train_questions_annotations = json.load(train_file)

test_json_path = os.path.join(dataset_dir,'test_questions.json')
with open(test_json_path) as test_file:
  test_questions = json.load(test_file)

# File names found in Images/, and the same names with their last four
# characters removed (the '.png' suffix for the files listed above).
image_labels_ = os.listdir(os.path.join(dataset_dir,'Images'))
image_labels = [file_name[:-4] for file_name in image_labels_]



In [7]:
# Split every annotation into a question record and an answer record; both keep
# the image id so each can be matched to its picture on its own.
questions = {
    key: {"question": entry["question"], "image_id": entry["image_id"]}
    for key, entry in train_questions_annotations.items()
}
answers = {
    key: {"answer": entry["answer"], "image_id": entry["image_id"]}
    for key, entry in train_questions_annotations.items()
}

# Preview the first three question/answer pairs.
for key in list(questions)[:3]:
  print(f"question {key}: {questions[key]}")
  print(f"answer {key}: {answers[key]}")

question 117792: {'question': 'Who looks happier?', 'image_id': '11779'}
answer 117792: {'answer': 'man', 'image_id': '11779'}
question 117790: {'question': 'Where is the woman sitting?', 'image_id': '11779'}
answer 117790: {'answer': 'blanket', 'image_id': '11779'}
question 117791: {'question': 'Where is the man sitting?', 'image_id': '11779'}
answer 117791: {'answer': 'bench', 'image_id': '11779'}


In [8]:
# Answer vocabulary in class-index order: the position of a label in this list
# is the integer class assigned to it.
_answer_vocabulary = [
    '0', '1', '2', '3', '4', '5',
    'apple', 'baseball', 'bench', 'bike', 'bird', 'black', 'blanket', 'blue',
    'bone', 'book', 'boy', 'brown', 'cat', 'chair', 'couch', 'dog', 'floor',
    'food', 'football', 'girl', 'grass', 'gray', 'green', 'left', 'log', 'man',
    'monkey bars', 'no', 'nothing', 'orange', 'pie', 'plant', 'playing', 'red',
    'right', 'rug', 'sandbox', 'sitting', 'sleeping', 'soccer', 'squirrel',
    'standing', 'stool', 'sunny', 'table', 'tree', 'watermelon', 'white',
    'wine', 'woman', 'yellow', 'yes',
]

# Maps each answer string to its class index (0..57).
labels_dict = {label: class_id for class_id, label in enumerate(_answer_vocabulary)}

In [9]:
def process_answer(answer_string):
  """Return the integer class index of an answer label.

  Args:
    answer_string: answer text; must be one of the keys of labels_dict.

  Returns:
    The class index stored in labels_dict for that label.

  Raises:
    KeyError: if the label is not in labels_dict (the message names the label).
  """
  try:
    return labels_dict[answer_string]
  except KeyError:
    raise KeyError(f"unknown answer label: {answer_string!r}") from None

# Encode from the raw annotations rather than from `answers` itself: the cell
# stays re-runnable, whereas re-encoding an already-encoded integer would
# raise KeyError.
for count, key in enumerate(answers):
  answers[key]['answer'] = process_answer(train_questions_annotations[key]['answer'])
  if (count < 5):
    print(f"answer {key}: {answers[key]['answer']}")



answer 117792: 31
answer 117790: 12
answer 117791: 8
answer 55360: 57
answer 169490: 31


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# preparing tokenizer for the questions
# Upper bound on the vocabulary size; num_words is an integer count, so use an
# int literal rather than the float 1e4.
MAX_NUM_WORDS = 10000

questions_set = set()

# The end-of-sentence marker is appended as its own word. Without the leading
# space a question that does not end in one of the filtered characters would
# produce a fused token such as "bench<eos>".
for key in questions:
  questions_set.add(questions[key]['question'] + ' <eos>')
for key in test_questions:
  questions_set.add(test_questions[key]['question'] + ' <eos>')

# '?-,.' are the same separators that process_question strips.
q_tokenizer = Tokenizer(num_words = MAX_NUM_WORDS, filters = '?-,.')
# Fit on a sorted list: iterating the set directly gives an order that can
# change between kernel restarts, which would change the word ids.
q_tokenizer.fit_on_texts(sorted(questions_set))
q_wtoi = q_tokenizer.word_index

print(q_wtoi['who'])

# Largest word id in the vocabulary. Kept in q_words only; the original code
# stored it in a variable called `max`, which replaced the builtin for every
# later cell.
q_words = 0
for word_id in q_wtoi.values():
  if word_id > q_words:
    q_words = word_id

print(q_words)

50
4670


In [11]:
def process_question(q_string, word_index=None):
  """Encode a question as a list of word ids terminated by the '<eos>' id.

  The characters '-', ',', '.' and '?' are replaced by spaces, the text is
  split on whitespace, and every lower-cased word is looked up in the
  vocabulary. Words missing from the vocabulary are reported with a printed
  message and left out of the result.

  Args:
    q_string: the question; converted with str() before processing.
    word_index: mapping from word to integer id. Defaults to the notebook's
      q_wtoi when omitted.

  Returns:
    List of integer ids, always ending with word_index['<eos>'].

  Raises:
    KeyError: if '<eos>' itself is missing from the vocabulary.
  """
  if word_index is None:
    word_index = q_wtoi

  cleaned = str(q_string)
  for separator in '-,.?':
    cleaned = cleaned.replace(separator, ' ')

  result = []
  for word in cleaned.split():
    token_id = word_index.get(word.lower())
    if token_id is None:
      # Only a missing word is tolerated here; any other error propagates
      # instead of being swallowed by a bare except.
      print(f"exception raised on {cleaned}")
    else:
      result.append(token_id)

  result.append(word_index['<eos>'])
  return result

# Encode from the raw annotations rather than from `questions` itself, so that
# re-running this cell never passes an already-encoded list back through
# process_question.
for key in questions:
  questions[key]['question'] = process_question(train_questions_annotations[key]['question'])

In [12]:
from PIL import Image

images_dir = os.path.join(dataset_dir,'Images')

# Dictionary associating every image id in image_labels with the path of
# '<id>.png' inside images_dir.
image_paths = {
    image_id: os.path.join(images_dir, image_id + '.png')
    for image_id in image_labels
}



In [13]:
# pad questions in questions dict
# Length of the longest encoded question (0 when there are no questions).
question_lengths = sorted(len(entry['question']) for entry in questions.values())
max_question_length = question_lengths[-1] if question_lengths else 0

# Right-pad every shorter question with zeros, in place, up to that length.
for entry in questions.values():
  missing = max_question_length - len(entry['question'])
  entry['question'].extend([0] * missing)





In [14]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence yielding ([images, questions], one-hot answers) batches.

    Images are opened from the module-level r_img_directory as
    '<image_id>.png' and copied into a (batch, 224, 224, 3) float32 array.
    NOTE(review): pixel values are passed through exactly as stored in the
    file; no network-specific preprocessing is applied here - confirm that is
    intended.
    """

    def __init__(self, max_question_length,questions_dict, answers_dict, batch_size, num_classes=None, shuffle=True):
        """Store the data references and build the first epoch ordering.

        Args:
            max_question_length: number of token ids per question row.
            questions_dict: id -> {'question': list of ids, 'image_id': str}.
            answers_dict: id -> {'answer': class index, ...}.
            batch_size: number of samples per batch.
            num_classes: width of the one-hot target; None selects 58.
            shuffle: reshuffle the sample order at the end of every epoch.
        """
        super().__init__()
        self.batch_size = batch_size
        # Previously these two arguments were accepted but ignored in favour of
        # a global variable and a hard-coded 58.
        self.max_question_length = max_question_length
        self.num_classes = 58 if num_classes is None else num_classes
        self.id_list = [key for key in questions_dict]
        self.shuffle = shuffle
        self.questions_dict = questions_dict
        self.answers_dict = answers_dict
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.id_list) // self.batch_size

    def __getitem__(self, index):
        """Return batch number `index` as ([images, questions], one_hot_answers)."""
        positions = self.index[index * self.batch_size:(index + 1) * self.batch_size]
        batch_ids = [self.id_list[k] for k in positions] # list of id_s for the batch

        images = np.zeros(shape = [self.batch_size,224,224,3], dtype = np.float32)
        tokens = np.zeros(shape = [self.batch_size,self.max_question_length], dtype = np.float32)
        targets = np.zeros(shape = [self.batch_size,self.num_classes], dtype = np.float32)

        for row, sample_id in enumerate(batch_ids):
            img_path = os.path.join(r_img_directory,self.questions_dict[sample_id]['image_id']+'.png')
            # Close the file right after copying the pixels; one batch would
            # otherwise leave batch_size handles open.
            with Image.open(img_path) as img:
                images[row,:,:,:] = np.array(img)
            tokens[row,:] = self.questions_dict[sample_id]['question']
            # One-hot encode the answer class.
            targets[row, self.answers_dict[sample_id]['answer']] = 1.0

        X = [images,tokens]
        return X, targets

    def on_epoch_end(self):
        """Rebuild the sample ordering, shuffled when self.shuffle is set."""
        self.index = np.arange(len(self.id_list)) # e.g. np.arange(3) = [0,1,2]
        if self.shuffle == True:
            np.random.shuffle(self.index)


In [15]:
target_shape = [224,224]

batch_size = 32

# splitting validation and training set
# Shuffle the question ids, then cut the list at split_percentage.
keys = list(questions)
np.random.shuffle(keys)

split_percentage = 0.92
split_index = int(split_percentage*len(keys))

training_keys = keys[0:split_index]
validation_keys = keys[split_index:]

# Per-subset copies of the question and answer records.
training_questions = {
    key: {'question': questions[key]['question'], 'image_id': questions[key]['image_id']}
    for key in training_keys
}
training_answers = {
    key: {'answer': answers[key]['answer'], 'image_id': answers[key]['image_id']}
    for key in training_keys
}

validation_questions = {
    key: {'question': questions[key]['question'], 'image_id': questions[key]['image_id']}
    for key in validation_keys
}
validation_answers = {
    key: {'answer': answers[key]['answer'], 'image_id': answers[key]['image_id']}
    for key in validation_keys
}

# One batch generator per subset, both reshuffled every epoch.
train_generator = DataGenerator(max_question_length,
                                training_questions,
                                training_answers,
                                batch_size,
                                num_classes = None,
                                shuffle     = True)

validation_generator = DataGenerator(max_question_length,
                                     validation_questions,
                                     validation_answers,
                                     batch_size,
                                     num_classes = None,
                                     shuffle     = True)


# **Prepare the model**


In [16]:

def create_model(percentage,out_units,n,dr,lr,embedding):
  """Build and compile the two-branch VQA classifier.

  An Xception backbone encodes the image and an Embedding + LSTM encodes the
  question. Both encodings are projected to `out_units`, multiplied
  element-wise and classified by two dense layers into 58 answer classes.

  Args:
    percentage: fraction (0..1) of the backbone layers, counted from the
      input, that are frozen.
    out_units: size of the image and question projections that get multiplied.
    n: units in each of the two hidden classifier layers.
    dr: dropout rate applied after each hidden classifier layer.
    lr: learning rate of the Adam optimizer.
    embedding: dimension of the word embedding.

  Returns:
    A compiled tf.keras.Model taking [image batch, question batch] and
    producing softmax scores over the 58 classes.
  """
  num_classes = 58

  # ------------------------ CNN ----------------------

  backbone = tf.keras.applications.Xception(include_top = False, input_shape = [224,224,3])
  frozen_layer_count = int(len(backbone.layers)*percentage)
  for layer in backbone.layers[:frozen_layer_count]:
    layer.trainable = False

  image_features = tf.keras.layers.GlobalAveragePooling2D()(backbone.output)
  image_features = tf.keras.layers.Dense(units = out_units,activation = 'relu')(image_features)

  cnn = tf.keras.Model(inputs = backbone.input, outputs = image_features)

  # ------------------------ RNN ----------------------

  question_input = tf.keras.Input(shape=[max_question_length])
  # Index 0 is the padding value, hence the +1 on the vocabulary size and
  # mask_zero=True. The sequence length is already fixed by question_input,
  # so the deprecated input_length argument is not passed.
  embedded_question = tf.keras.layers.Embedding(len(q_wtoi)+1, embedding, mask_zero=True)(question_input)

  # Only the LSTM output is used; the hidden/cell states are not requested.
  question_features = tf.keras.layers.LSTM(units=128)(embedded_question)
  question_features = tf.keras.layers.Dense(units = out_units, activation = 'relu')(question_features)

  rnn = tf.keras.Model(inputs = question_input, outputs = question_features)

  # ---------------------Merging--------------------------------

  merged = tf.keras.layers.Multiply()([cnn.output,rnn.output])
  merged = tf.keras.layers.Flatten()(merged)
  merged = tf.keras.layers.BatchNormalization()(merged)
  classifier = tf.keras.layers.Dense(units=n,activation='relu')(merged)
  classifier = tf.keras.layers.Dropout(rate = dr)(classifier)
  classifier = tf.keras.layers.Dense(units=n,activation='relu')(classifier)
  classifier = tf.keras.layers.Dropout(rate = dr)(classifier)
  classifier = tf.keras.layers.Dense(units=num_classes,activation='softmax')(classifier)
  VQA_model = tf.keras.Model(inputs = [cnn.input,rnn.input], outputs = classifier)

  # Targets are one-hot vectors, hence categorical cross-entropy.
  loss = tf.keras.losses.CategoricalCrossentropy()
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
  metrics = ['accuracy']

  # Compile Model
  VQA_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

  return VQA_model




In [17]:


# batch_size is only used below to derive steps_per_epoch, so it has to match
# the batch size the generators were actually built with; a different value
# makes every epoch cover the wrong number of batches.
batch_size = train_generator.batch_size


callbacks = []

checkpoint_dir = os.path.join(working_directory,'callbacks')
os.makedirs(checkpoint_dir, exist_ok=True)

# filepath must name a file, not the directory itself; the '.weights.h5'
# suffix is the form accepted for weights-only checkpoints.
callbacks.append(tf.keras.callbacks.ModelCheckpoint(
    filepath = os.path.join(checkpoint_dir, 'best.weights.h5'), monitor='val_loss', verbose=0, save_best_only=True,
    save_weights_only=True, mode='auto', save_freq='epoch'))


# Stop once validation accuracy has not improved for 5 epochs and roll back to
# the best weights seen.
callbacks.append(tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=5, verbose=0,
    mode='auto', baseline=None, restore_best_weights=True
))


## Models to train ##

In [18]:
# Hyperparameter candidates. Keys:
#   learning_rate - Adam learning rate
#   percentage    - fraction of backbone layers frozen
#   dropout_rate  - dropout after each hidden classifier layer
#   out_units     - size of the image/question projections
#   n_units       - units per hidden classifier layer
#   embedding     - word embedding dimension
model_1 = {
    "learning_rate" : 0.5e-3,
    "percentage": 0.7,
    "dropout_rate": 0.1,
    "out_units": 512,
    "n_units": 256,
    "embedding": 32
}
model_2 = {
    "learning_rate" : 0.5e-3,
    "percentage": 0.7,
    "dropout_rate": 0.2,
    "out_units": 512,
    "n_units": 512,
    "embedding": 64
}
model_3 = {
    "learning_rate" : 0.5e-3,
    "percentage": 0.,
    "dropout_rate": 0.3,
    "out_units": 512,
    "n_units": 512,
    "embedding": 64
}
model_4 = {
    "learning_rate" : 0.5e-3,
    "percentage": 0.7,
    "dropout_rate": 0.3,
    "out_units": 1024,
    "n_units": 512,
    "embedding": 128
}
model_5 = {
    "learning_rate" : 1e-4,
    "percentage": 0.7,
    "dropout_rate": 0.15,
    "out_units": 512,
    "n_units": 256,
    "embedding": 32
}
model_6 = {
    "learning_rate" : 1e-4,
    "percentage": 0.7,
    "dropout_rate": 0.2,
    "out_units": 512,
    "n_units": 256,
    "embedding": 32
}
model_7 = {
    "learning_rate" : 1e-4,
    "percentage": 0.,
    "dropout_rate": 0.25,
    "out_units": 512,
    "n_units": 512,
    "embedding": 32
}
model_8 = {
    "learning_rate" : 1e-4,
    "percentage": 0.,
    "dropout_rate": 0.3,
    "out_units": 1024,
    "n_units": 512,
    "embedding": 64
}
model_9 = {
    "learning_rate" : 1e-5,
    "percentage": 0.,
    "dropout_rate": 0.25,
    "out_units": 1024,
    "n_units": 512,
    "embedding": 32
}
# NOTE: this cell used to define model_10 twice. The first definition was
# identical except for "embedding": 64 and was overwritten before it could be
# used, so only the effective definition is kept. Give the embedding-64 variant
# its own name and append it below if it should be trained as well.
model_10 = {
    "learning_rate" : 1e-5,
    "percentage": 0.,
    "dropout_rate": 0.3,
    "out_units": 1024,
    "n_units": 1024,
    "embedding": 128
}

# Candidates in training order; the position in this list is the model number
# minus one.
models = [
    model_1, model_2, model_3, model_4, model_5,
    model_6, model_7, model_8, model_9, model_10,
]


In [None]:
import matplotlib.pyplot as plt

# Training history of every candidate, keyed "model_<k>" with k counted from 1
# in the order of `models`.
histories = {}

for count, m in enumerate(models, start=1):
  # Drop the state left behind by the previous candidate so that models do not
  # pile up in memory over the search.
  tf.keras.backend.clear_session()

  model = create_model(percentage = m["percentage"],out_units = m["out_units"],n = m["n_units"],dr = m["dropout_rate"],lr = m["learning_rate"],embedding = m["embedding"])

  # One epoch = every batch the generator provides. Deriving the step count
  # from the separate batch_size variable was wrong whenever that variable
  # differed from the generator's own batch size.
  history = model.fit(train_generator, validation_data = validation_generator,
  epochs=20, verbose=0, callbacks=None,
  steps_per_epoch = len(train_generator))
  histories["model_"+str(count)] = history.history
  print(f"Model with learning rate: {m['learning_rate']},output units: {m['out_units']}, percentage freezed: {m['percentage']}, units in classifier: {m['n_units']}, drop rate: {m['dropout_rate']},embedding :{m['embedding']}")
  print(f"Maximum validation accuracy: {np.max(history.history['val_accuracy'])}, Min validation loss: {np.min(history.history['val_loss'])}")



## **Model selection after hold-out validation** ##
## 10 models trained with different combinations of reasonable hyperparameters ##

In [None]:
batch_size = 32


callbacks = []

checkpoint_dir = os.path.join(working_directory,'callbacks')
os.makedirs(checkpoint_dir, exist_ok=True)

# filepath must name a file, not the directory itself; the '.weights.h5'
# suffix is the form accepted for weights-only checkpoints.
callbacks.append(tf.keras.callbacks.ModelCheckpoint(
    filepath = os.path.join(checkpoint_dir, 'best.weights.h5'), monitor='val_loss', verbose=0, save_best_only=True,
    save_weights_only=True, mode='auto', save_freq='epoch'))


# Stop once validation accuracy has not improved for 5 epochs and roll back to
# the best weights seen.
callbacks.append(tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=5, verbose=0,
    mode='auto', baseline=None, restore_best_weights=True
))



## Select the best model and train with early stopping with high patience ##

In [None]:
target_shape = [224,224]

batch_size = 32

# splitting validation and training set
keys = [key for key in questions]
np.random.shuffle(keys)

# 1 = every annotated question goes into the training subset.
split_percentage = 1
split_index = int(split_percentage*len(keys))

training_keys = keys[0:split_index]
validation_keys = keys[split_index:]

training_questions = {}
training_answers = {}

validation_questions = {}
validation_answers = {}

for key in training_keys:
  training_questions[key] = {}
  training_questions[key]['question'] = questions[key]['question']
  training_questions[key]['image_id'] = questions[key]['image_id']
  training_answers[key] = {}
  training_answers[key]['answer'] = answers[key]['answer']
  training_answers[key]['image_id'] = answers[key]['image_id']

for key in validation_keys:
  validation_questions[key] = {}
  validation_questions[key]['question'] = questions[key]['question']
  validation_questions[key]['image_id'] = questions[key]['image_id']
  validation_answers[key] = {}
  validation_answers[key]['answer'] = answers[key]['answer']
  validation_answers[key]['image_id'] = answers[key]['image_id']


train_generator = DataGenerator(max_question_length = max_question_length,
                                questions_dict      = training_questions,
                                answers_dict        = training_answers,
                                batch_size          = batch_size,
                                num_classes         = None,
                                shuffle             = True)

# Replace the validation generator as well. The one left over from the earlier
# split holds questions that are now part of the training subset, so metrics
# computed on it would no longer measure held-out performance. With an empty
# validation subset there is nothing to validate on, hence None.
if validation_questions:
  validation_generator = DataGenerator(max_question_length = max_question_length,
                                       questions_dict      = validation_questions,
                                       answers_dict        = validation_answers,
                                       batch_size          = batch_size,
                                       num_classes         = None,
                                       shuffle             = True)
else:
  validation_generator = None



In [None]:
# lowest loss model:
# histories maps "model_<k>" (k counted from 1) to that run's history dict,
# whose "val_loss" entry holds one value per epoch.
best = None
lowest_val_loss = float('inf')
for run_name, run_history in histories.items():
  run_best_loss = float(np.min(run_history["val_loss"]))
  if run_best_loss < lowest_val_loss:
    lowest_val_loss = run_best_loss
    # Convert the 1-based model number in the key to a 0-based list position.
    best = int(run_name.rsplit('_', 1)[1]) - 1

if best is None:
  raise ValueError("no usable validation loss in histories: run the hyperparameter search first")

best_config = models[best]
VQA_model = create_model(percentage = best_config["percentage"],out_units= best_config["out_units"],n= best_config["n_units"],dr= best_config["dropout_rate"],lr= best_config["learning_rate"],embedding= best_config["embedding"])

# Validate only when a validation subset exists for the current split; a
# generator left over from an earlier split would overlap the training data.
final_validation_data = validation_generator if validation_questions else None

history = VQA_model.fit(train_generator, validation_data = final_validation_data,
  epochs=35, verbose=1, callbacks=None,
  steps_per_epoch = len(train_generator))


# **Preparing the test evaluation**

In [None]:


# Encode and pad the test questions in place.
for entry in test_questions.values():
  tokens = entry['question']
  # Encode only raw text: on a re-run the entry already holds a list of ids,
  # and sending that through process_question again would corrupt it.
  if isinstance(tokens, str):
    tokens = process_question(tokens)
  # max_question_length was measured on the training questions only; cut longer
  # test questions so every row fits the fixed-width input, then pad with 0.
  tokens = list(tokens[:max_question_length])
  tokens.extend([0] * (max_question_length - len(tokens)))
  entry['question'] = tokens





In [None]:
# Preview a few encoded test questions instead of dumping the whole dictionary.
for key in list(test_questions)[:3]:
  print(f"{key}: {test_questions[key]}")

In [None]:
          
# Stack every test sample into two arrays: one row of pixels and one row of
# token ids per question, in the iteration order of test_questions.
test_len = len(test_questions)

test_images = np.zeros(shape = [test_len,224,224,3], dtype = np.float32)
test_tokens = np.zeros(shape = [test_len,max_question_length], dtype = np.float32)

for count, key in enumerate(test_questions):
  img_path = os.path.join(images_dir,test_questions[key]['image_id']+'.png')
  # Close each file once its pixels are copied; the loop would otherwise leave
  # one open handle per test image.
  with Image.open(img_path) as img:
    img_array = np.array(img.convert('RGB').resize([224,224]))
  test_images[count,:,:,:] = img_array
  test_tokens[count,:] = test_questions[key]['question']

# Same [images, questions] layout the model takes as input.
test_dataset = [test_images,test_tokens]


In [None]:
# Shape of the stacked test image array (first element of test_dataset).
print(test_dataset[0].shape)

In [None]:

# Run the trained model on the [images, questions] test inputs.
predictions = VQA_model.predict(x = test_dataset)

In [None]:
# Index of the highest-scoring class for every prediction row, as plain ints.
pred = [int(class_id) for class_id in np.argmax(predictions, axis=1)]

# Pair each test question id with the prediction at the same position;
# test_questions is iterated in the order used to build the test inputs.
eval_dict = {key: pred[position] for position, key in enumerate(test_questions)}


In [None]:
# Show the size and a few entries instead of dumping every prediction.
print(f"{len(eval_dict)} predictions, first entries: {list(eval_dict.items())[:5]}")

In [None]:
# Folder inside the mounted Drive that receives the submission file.
eval_dir = os.path.join(working_directory, 'gdrive', 'MyDrive')

def create_csv(eval_dict, results_dir, csv_fname='EVAL_h3_21'):
    """Write the predictions as an 'Id,Category' CSV file.

    Args:
        eval_dict: mapping from question id to predicted class; keys and
            values are written with str(), so non-string ids are accepted.
        results_dir: existing directory in which the file is created.
        csv_fname: name of the output file; defaults to the name this notebook
            has always used.

    Returns:
        None. The file <results_dir>/<csv_fname> is created or overwritten.
    """
    with open(os.path.join(results_dir, csv_fname), 'w') as f:

        f.write('Id,Category\n')

        for key, value in eval_dict.items():
            f.write(f"{key},{value}\n")

# Write the submission file for the predictions into the Drive folder.
create_csv(eval_dict, eval_dir)