In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Bare expressions further down therefore produce visible output.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Import libraries and fix seed to make experiments reproducible
import os
import random

import numpy as np
import tensorflow as tf

SEED = 1234
# Seed every random number generator the notebook draws from: Python's
# `random` (used for the train/validation split), NumPy (used by the image
# augmentation) and TensorFlow (weight initialisation, dropout).
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Directory the notebook was started from
cwd = os.getcwd()

In [None]:
# Mount Google Drive under /content/drive/ so the dataset archive can be read
# and experiment outputs can be written to it
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Extract the VQA dataset archive stored on Drive into the current working directory
!unzip '/content/drive/MyDrive/Challenge3/anndl-2020-vqa.zip'

In [None]:
# List the extracted dataset folder to check that the images and the two json files are there
!ls '/content/VQA_Dataset/'

Images	test_questions.json  train_questions_annotations.json


In [None]:
# Working folder on Google Drive; the csv file with the predictions is written here
env = '/content/drive/My Drive/Challenge3'

# **Import Data from the json file**

In [None]:
# Load data from json file
import json 

# Read the training annotations; the context manager closes the file handle
# as soon as the json has been parsed
with open('/content/VQA_Dataset/train_questions_annotations.json') as f:
  _data = json.load(f)

# Unpack the {data_id: {'question', 'answer', 'image_id'}} mapping into four
# parallel lists: position i of every list refers to the same sample
questions = []
answers = []
image_ids = []
data_ids = []
for data_id, entry in _data.items():
  data_ids.append(data_id)
  questions.append(entry['question'])
  answers.append(entry['answer'])
  image_ids.append(entry['image_id'])

print('Number of sentences:', len(questions))

Number of sentences: 58832


In [None]:
# Create a random validation_list, which is a list of indices, to split data in training and validation

import math, random

list_of_index = list(range(len(questions)))
val_rate = 0.1
val_size = math.ceil(len(questions) * val_rate)

# Draw the validation indices from a dedicated generator seeded with SEED.
# The split is then identical on every run and every re-execution of this
# cell, regardless of how much of the global `random` state was consumed.
validation_list = random.Random(SEED).sample(list_of_index, val_size)

In [None]:
# Split data for training and validation using the list just created

# Set copy of the validation indices: membership tests are O(1) instead of a
# linear scan of `validation_list` for each of the ~59k samples
validation_index_set = set(validation_list)

# Indices of each subset, kept in the original (ascending) order
validation_indices = [i for i in list_of_index if i in validation_index_set]
train_indices = [i for i in list_of_index if i not in validation_index_set]

train_data_ids = [data_ids[i] for i in train_indices]
train_questions = [questions[i] for i in train_indices]
train_answers = [answers[i] for i in train_indices]
train_image_ids = [image_ids[i] for i in train_indices]

validation_data_ids = [data_ids[i] for i in validation_indices]
validation_questions = [questions[i] for i in validation_indices]
validation_answers = [answers[i] for i in validation_indices]
validation_image_ids = [image_ids[i] for i in validation_indices]

# **Data Pre-Processing and Dataset Creation**

In [None]:
# TOKENIZATION

# Convert words to integers
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the question texts, then replace every question
# with the list of integer indices of its words
questions_tokenizer = Tokenizer()
questions_tokenizer.fit_on_texts(questions)
questions_tokenized = questions_tokenizer.texts_to_sequences(questions)

# word -> integer index mapping learned by the tokenizer
questions_wtoi = questions_tokenizer.word_index
print('Total words used in questions:', len(questions_wtoi))

# Longest tokenized question; used later as the fixed sequence length
max_questions_length = max(map(len, questions_tokenized))
print('Max questions sentence length:', max_questions_length)

Total words used in questions: 4640
Max questions sentence length: 21


In [None]:
# Padding sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad every tokenized question to the same length
questions_encoder_inputs = pad_sequences(questions_tokenized, maxlen=max_questions_length)
print("Questions encoder inputs shape:", questions_encoder_inputs.shape)

# Set copy of the validation indices: O(1) membership tests instead of a
# linear scan of `validation_list` for every sample
padded_validation_index_set = set(validation_list)

# Split the padded questions with the same indices used for the other lists,
# preserving the original sample order inside each subset
questions_encoder_inputs_train = [
    questions_encoder_inputs[count]
    for count in list_of_index
    if count not in padded_validation_index_set
]
questions_encoder_inputs_validation = [
    questions_encoder_inputs[count]
    for count in list_of_index
    if count in padded_validation_index_set
]

Questions encoder inputs shape: (58832, 21)


In [None]:
# Labels Dictionary

# Answer string -> class index. The indices are the categories written to the
# prediction csv, so they must not be changed.
labels_dict = {
        '0': 0,
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        '5': 5,
        'apple': 6,
        'baseball': 7,
        'bench': 8,
        'bike': 9,
        'bird': 10,
        'black': 11,
        'blanket': 12,
        'blue': 13,
        'bone': 14,
        'book': 15,
        'boy': 16,
        'brown': 17,
        'cat': 18,
        'chair': 19,
        'couch': 20,
        'dog': 21,
        'floor': 22,
        'food': 23,
        'football': 24,
        'girl': 25,
        'grass': 26,
        'gray': 27,
        'green': 28,
        'left': 29,
        'log': 30,
        'man': 31,
        'monkey bars': 32,
        'no': 33,
        'nothing': 34,
        'orange': 35,
        'pie': 36,
        'plant': 37,
        'playing': 38,
        'red': 39,
        'right': 40,
        'rug': 41,
        'sandbox': 42,
        'sitting': 43,
        'sleeping': 44,
        'soccer': 45,
        'squirrel': 46,
        'standing': 47,
        'stool': 48,
        'sunny': 49,
        'table': 50,
        'tree': 51,
        'watermelon': 52,
        'white': 53,
        'wine': 54,
        'woman': 55,
        'yellow': 56,
        'yes': 57
}

# The one-hot depth and the softmax width both rely on the class indices being
# exactly 0..N-1; fail early if an entry is ever added with a wrong index
assert sorted(labels_dict.values()) == list(range(len(labels_dict)))

# Derived from the dictionary so the two can never get out of sync
num_classes = len(labels_dict)

In [None]:
# ImageDataGenerator
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Size every image is resized to before entering the network
img_h = 256
img_w = 256

apply_data_augmentation = False

# Random geometric transformations, passed to the generator only when data
# augmentation is switched on
augmentation_kwargs = {}
if apply_data_augmentation:
    augmentation_kwargs = dict(rotation_range=10,
                               width_shift_range=10,
                               height_shift_range=10,
                               zoom_range=0.3,
                               horizontal_flip=True,
                               vertical_flip=True)

# Training ImageDataGenerator object; with no augmentation options it only
# carries the fill mode
img_data_gen = ImageDataGenerator(fill_mode='reflect', **augmentation_kwargs)

In [None]:
from PIL import Image

class CustomDataset(tf.keras.utils.Sequence):
  """Per-sample VQA dataset: item `index` is ((image, question), answer).

  Arguments:
    which_subset: 'training' enables the random image transformation; any
      other value disables it.
    filenames: indexable collection of image paths, one per sample.
    img_generator: ImageDataGenerator used to transform training images, or
      None to skip the transformation.
    encoder_input: indexable collection of tokenized and padded questions,
      aligned with `filenames`.
    preprocessing_function: callable applied to the image array before it is
      returned, or None.
    output: indexable collection of encoded answers, aligned with `filenames`.
  """

  def __init__(self, which_subset, filenames, img_generator, encoder_input, preprocessing_function, output): 
    # Let the Keras base class initialise itself (newer Keras versions warn
    # when this call is missing)
    super().__init__()

    self.which_subset = which_subset
    self.subset_filenames = filenames
    self.img_generator = img_generator
    self.encoder_input = encoder_input
    self.preprocessing_function = preprocessing_function
    self.output = output

  def __len__(self):
    # Number of samples (items are single samples, batching happens later)
    return len(self.subset_filenames)

  def __getitem__(self, index):
    """Load sample `index` and return ((image_array, question_array), answer_array)."""

    # Read the image; the context manager releases the file handle once the
    # converted and resized copy has been created
    curr_filename = self.subset_filenames[index]
    with Image.open(curr_filename) as raw_img:
      # PIL's resize expects the size as (width, height)
      img = raw_img.convert('RGB').resize((img_w, img_h))

    # Converting in numpy arrays
    img_arr = np.array(img)
    sentence = np.array(self.encoder_input[index])
    answer = np.array(self.output[index])

    if self.which_subset == 'training' and self.img_generator is not None:
      # No explicit seed here: passing the same seed on every call re-seeds
      # NumPy each time and yields the very same "random" transformation for
      # every image. The draw uses NumPy's global generator, which is seeded
      # once at the top of the notebook.
      img_t = self.img_generator.get_random_transform(img_arr.shape)
      img_arr = self.img_generator.apply_transform(img_arr, img_t)

    if self.preprocessing_function is not None:
      img_arr = self.preprocessing_function(img_arr)

    # return couples image-question and answers
    return (img_arr, sentence), answer

In [None]:
# Creating images filenames (for training and validation) which will be used to create the CustomDataset

# Folder holding the dataset images; every image id maps to '<id>.png'
images_dir = '/content/VQA_Dataset/Images/'

# Set copy of the validation indices: O(1) membership tests instead of a
# linear scan of `validation_list` for every sample
filenames_validation_index_set = set(validation_list)

# Same index-based split (and same ordering) used for questions and answers
filenames_train = np.array([
    images_dir + image_ids[i] + '.png'
    for i in list_of_index
    if i not in filenames_validation_index_set
])
filenames_validation = np.array([
    images_dir + image_ids[i] + '.png'
    for i in list_of_index
    if i in filenames_validation_index_set
])

In [None]:
# One-hot encode the answers of both subsets: each answer string is first
# mapped to its class index through labels_dict

output_answers_train = tf.one_hot(
    [labels_dict[answer] for answer in train_answers],
    depth=num_classes)

output_answers_validation = tf.one_hot(
    [labels_dict[answer] for answer in validation_answers],
    depth=num_classes)

In [None]:
# Creating the custom datasets using homonym class

from tensorflow.keras.applications.vgg16 import preprocess_input 

# Training samples: images go through the ImageDataGenerator transformation
# and then through the VGG16 preprocessing
dataset_train = CustomDataset(
    which_subset='training',
    filenames=filenames_train,
    img_generator=img_data_gen,
    encoder_input=questions_encoder_inputs_train,
    preprocessing_function=preprocess_input,
    output=output_answers_train,
)

# Validation samples: no generator, only the VGG16 preprocessing
dataset_valid = CustomDataset(
    which_subset='validation',
    filenames=filenames_validation,
    img_generator=None,
    encoder_input=questions_encoder_inputs_validation,
    preprocessing_function=preprocess_input,
    output=output_answers_validation,
)

In [None]:
# Finally, create the tf.data datasets used for training and validation

batch_size = 64

# Number of samples kept in memory by the shuffle buffer. Each sample holds a
# float32 image of img_h x img_w x 3, so the buffer is kept small on purpose.
shuffle_buffer_size = 256

# Element structure shared by both datasets: ((image, question), answer)
dataset_output_types = ((tf.float32, tf.int32), tf.int32)
dataset_output_shapes = (([img_h, img_w, 3], [max_questions_length]), [num_classes])

train_dataset = tf.data.Dataset.from_generator(lambda: dataset_train,
                                               output_types=dataset_output_types,
                                               output_shapes=dataset_output_shapes)

# shuffle() has to be called and its result kept: the bare attribute access
# `train_dataset.shuffle` used before left the sample order untouched.
# Shuffling single samples (before batch) and before repeat() gives a new
# order at every pass over the data.
train_dataset = train_dataset.shuffle(buffer_size=shuffle_buffer_size, seed=SEED)

train_dataset = train_dataset.batch(batch_size=batch_size)

train_dataset = train_dataset.repeat()

valid_dataset = tf.data.Dataset.from_generator(lambda: dataset_valid,
                                               output_types=dataset_output_types,
                                               output_shapes=dataset_output_shapes)

# Validation data is not shuffled: the order has no effect on the metrics

valid_dataset = valid_dataset.batch(batch_size=batch_size)

valid_dataset = valid_dataset.repeat()

<bound method DatasetV2.shuffle of <FlatMapDataset shapes: (((256, 256, 3), (21,)), (58,)), types: ((tf.float32, tf.int32), tf.int32)>>

<bound method DatasetV2.shuffle of <FlatMapDataset shapes: (((256, 256, 3), (21,)), (58,)), types: ((tf.float32, tf.int32), tf.int32)>>

# **Model**

CNN

In [None]:
# Image features extraction with VGG16

# VGG16 with ImageNet weights; include_top=False leaves out the fully
# connected classifier layers, so only the convolutional part is used
vgg = tf.keras.applications.VGG16(
    include_top=False, 
    weights="imagenet",
    input_shape=(img_h, img_w, 3)
)

# True: only the first `freeze_until` layers of VGG16 are frozen.
# False: the whole VGG16 network is frozen.
finetuning = False

if finetuning:
    freeze_until = 15
    
    for layer in vgg.layers[:freeze_until]:
        layer.trainable = False
else:
    vgg.trainable = False

# Image branch: VGG16 features are flattened and projected to a 128-unit
# vector, the same size as the LSTM output it is multiplied with later
CNN = tf.keras.Sequential()
CNN.add(vgg)
CNN.add(tf.keras.layers.Flatten())
CNN.add(tf.keras.layers.Dropout(0.5))
CNN.add(tf.keras.layers.Dense(units=128, activation='relu'))

# Symbolic image input and output of the image branch, reused to assemble
# the final model
input_CNN = tf.keras.Input(shape=(img_h, img_w, 3))
out_CNN = CNN(input_CNN)

vgg.summary()
CNN.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 256, 256, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 256, 256, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 256, 256, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 128, 128, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 128, 128, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 128, 128, 128)    

RNN

In [None]:
# Encode questions with LSTM

# Size of the vector each word index is embedded into
EMBEDDING_SIZE = 64

# One padded question: max_questions_length word indices
encoder_input = tf.keras.Input(shape=[max_questions_length])
# Vocabulary size is len(questions_wtoi)+1: the extra entry is for index 0,
# which is not in the tokenizer's word index and, with mask_zero=True, is
# treated as padding to be masked out
encoder_embedding_layer = tf.keras.layers.Embedding(len(questions_wtoi)+1, EMBEDDING_SIZE, input_length=max_questions_length, mask_zero=True)
encoder_embedding_out = encoder_embedding_layer(encoder_input)
# The LSTM returns a single 128-unit vector per question
encoder = tf.keras.layers.LSTM(units=128)
encoder_output = encoder(encoder_embedding_out)

# Question branch wrapped as a model; out_encoder is its output on the same
# symbolic input and is what gets merged with the image branch
encoder_model = tf.keras.Model(encoder_input, encoder_output)
out_encoder=encoder_model(encoder_input)

encoder_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 21)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 21, 64)            297024    
_________________________________________________________________
lstm (LSTM)                  (None, 128)               98816     
Total params: 395,840
Trainable params: 395,840
Non-trainable params: 0
_________________________________________________________________


 Merging CNN and RNN and finalizing the model


In [None]:
# Merge: element-wise product of the image-branch output (out_CNN) and the
# question-branch output (out_encoder)

merge_model = tf.keras.layers.multiply(inputs=[out_CNN, out_encoder])

In [None]:
# Adding dense layer and the final softmax after the merge

merge_model = tf.keras.layers.Dense(units=64, activation='tanh')(merge_model)
merge_model = tf.keras.layers.Dropout(rate=0.5)(merge_model)

# One softmax output per answer class
out_merge_model = tf.keras.layers.Dense(units=num_classes, activation='softmax')(merge_model)

# Full model: takes [image, padded question] and returns the class probabilities
VQA_model = tf.keras.Model(inputs=[input_CNN, encoder_input], outputs=out_merge_model)

# **Compile and Fit the model**

In [None]:
# Optimization params
# -------------------

# Loss: categorical cross-entropy, matching the one-hot encoded answers
loss = tf.keras.losses.CategoricalCrossentropy()
# Learning rate
lr = 1e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# -------------------

# Validation metrics
# ------------------
# Given as a list, which is the form documented for `compile`; newer Keras
# versions reject a bare string here
metrics = ['accuracy']
# ------------------

# Compile Model
VQA_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [None]:
import os
from datetime import datetime

cwd = '/content/drive/My Drive/Challenge3/'

# Folder layout: <cwd>/VQA_experiments/<exp_name>_<timestamp>/ckpts
exps_dir = os.path.join(cwd, 'VQA_experiments')

now = datetime.now().strftime('%b%d_%H-%M-%S')

exp_name = 'VGG16_LSTM'

exp_dir = os.path.join(exps_dir, exp_name + '_' + str(now))

callbacks = []

# Model checkpoint
# ----------------
ckpt_dir = os.path.join(exp_dir, 'ckpts')
# Creates exps_dir and exp_dir as well (they are parents of ckpt_dir);
# exist_ok=True makes the call safe when a folder is already there
os.makedirs(ckpt_dir, exist_ok=True)

# Weights are saved at the end of every epoch as cp_<epoch>.ckpt
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(ckpt_dir, 'cp_{epoch:02d}.ckpt'),
                                                   save_weights_only=True)  # False to save the model directly
callbacks.append(ckpt_callback)

# Early Stopping: stop when val_loss has not improved for 5 epochs

early_stop = True
if early_stop:
    es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    callbacks.append(es_callback)

In [None]:
# Fit the model
# Both datasets repeat indefinitely, so the number of batches per epoch and
# per validation pass is given explicitly (full batches only)

VQA_model.fit(x=train_dataset,
          epochs=50,  # upper bound: the early stopping callback can end training sooner
          steps_per_epoch=len(dataset_train) // batch_size ,
          validation_data= valid_dataset,
          validation_steps=len(dataset_valid) // batch_size,
          callbacks=callbacks)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


<tensorflow.python.keras.callbacks.History at 0x7f4db2dbfa58>

# **Generate csv file for predictions**

In [None]:
# Load the checkpoint chosen as the best one (epoch 10 of the training above)
# before generating the predictions.
# NOTE(review): the path points to one specific timestamped run folder and has
# to be updated by hand after every new training run.

VQA_model.load_weights('/content/drive/MyDrive/Challenge3/VQA_experiments/VGG16_LSTM_Jan26_13-44-13/ckpts/cp_10.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4dbd5693c8>

In [None]:
# Import necessary libraries

import os

from datetime import datetime

from PIL import Image

In [None]:
# Given function for saving the csv file, once the experiment is complete

def create_csv(results, results_dir='./'):
    """Write the predictions to a timestamped csv file.

    The file is named 'results_<timestamp>.csv', is created inside
    `results_dir` and contains the header 'Id,Category' followed by one
    'key,value' line per entry of `results`.

    Arguments:
      results: dict mapping each sample id (string) to its predicted category.
      results_dir: folder the csv file is written to.
    """
    timestamp = datetime.now().strftime('%b%d_%H-%M-%S')
    csv_path = os.path.join(results_dir, 'results_' + timestamp + '.csv')

    with open(csv_path, 'w') as csv_file:
        csv_file.write('Id,Category\n')
        csv_file.writelines(
            key + ',' + str(value) + '\n' for key, value in results.items()
        )

In [None]:
# Import data from Test_questions.json

import json 

# Read the test questions; the context manager closes the file handle as soon
# as the json has been parsed
with open('/content/VQA_Dataset/test_questions.json') as f:
  _data = json.load(f)

# Parallel lists: position i of every list refers to the same test sample.
# Image ids are stored with the '.png' extension already appended.
questions_test = []
image_ids_test = []
data_ids_test = []
for data_id, entry in _data.items():
  data_ids_test.append(data_id)
  questions_test.append(entry['question'])
  image_ids_test.append(entry['image_id'] + '.png')

# Report the size of the test set (not of the training set)
print('Number of sentences:', len(questions_test))

Number of sentences: 58832


In [None]:
# Creation of the csv file

from tensorflow.keras.applications.vgg16 import preprocess_input 
# data id -> predicted class index, in the format expected by create_csv
results = {}

for i in range(len(questions_test)):

  # Open image, convert to RGB and resize; the context manager closes the
  # file even if something fails. PIL's resize expects (width, height).
  image_path = os.path.join('/content/VQA_Dataset/Images', image_ids_test[i])
  with Image.open(image_path) as raw_img:
    img = raw_img.convert('RGB').resize((img_w, img_h))

  # Create a batch of one image and preprocess with vgg preprocessing function
  img_arr = np.expand_dims(np.array(img), 0)
  img_arr = preprocess_input(img_arr)

  # Tokenize and pad the question exactly as done for the training questions
  input_tokenized = questions_tokenizer.texts_to_sequences([questions_test[i]])
  input_tokenized = pad_sequences(input_tokenized, maxlen = max_questions_length)
  quest_arr = np.array(input_tokenized)

  # Input for the model (named so that the builtin `input` is not shadowed)
  model_input = (img_arr, quest_arr)

  # Predict and store the index of the most probable class
  softmax = VQA_model.predict(model_input, verbose=0)
  prediction = np.argmax(softmax, axis=1)[0]
  results[data_ids_test[i]] = int(prediction)

# Finally, create the csv file with the predictions using the function declared before
create_csv(results, env)