In [8]:
import tensorflow as tf
import pandas as pd
import numpy as np
from PIL import Image
import os
from tqdm import tqdm
from transformers import ViTImageProcessor, ViTModel
import shutil
import pickle


## Basic Architecture

The architecture of the system takes an image and a question as inputs and outputs (via a classifier) an answer. This is designed as:

1. a CNN to encode the image into an embedding vector representation (size: 768)
2. a LTSM encoder to encode the question into an embedding vector representation (size: 384)
3. the two vectors above to create the input into the classifier (total input size: 1152) with a one-hot encoded vector of the unique class labels from the training dataset as the outputs (size: 2521)

The image encoding is done via a pretrained model from HuggingFace : google/vit-base-patch16-224-in21k - (https://huggingface.co/google/vit-base-patch16-224-in21k)

The question encoding is done in one pass via the pretrained sentence-transformers/all-MiniLM-L6-v2 model - (https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)

The final classification layers are:
* Input layer : input size (1152,)
* Dense layer : 1024 nodes
* Dense layer : 1024 nodes
* Output layer : 2521 nodes, with softmax activation


In [9]:
DATA_DIRECTORY = "./data"

# the CSV training data - preprocessed from before via the preprocess_annotations.py
TRAINING_DATA = os.path.join(DATA_DIRECTORY, "combined_training.csv")
VALIDATION_DATA = os.path.join(DATA_DIRECTORY, "combined_validation.csv")

# our images directories
TRAINING_IMAGES = os.path.join(DATA_DIRECTORY, "scene_img_abstract_v002_train2015")
VALIDATION_IMAGES = os.path.join(DATA_DIRECTORY, "scene_img_abstract_v002_val2015")

# saving our TF datasets so we don't have to rebuild everytime
TRAINING_ENCODED_DATASET = os.path.join(DATA_DIRECTORY, "training_encoded_dataset")
VALIDATION_ENCODED_DATASET = os.path.join(DATA_DIRECTORY, "validation_encoded_dataset")

TRAINING_PREENCODED_QUESTIONS = os.path.join(DATA_DIRECTORY, "training_encoded_questions.pkl")
VALIDATION_PREENCODED_QUESTIONS = os.path.join(DATA_DIRECTORY, "validation_encoded_questions.pkl")

TRAINING_PREENCODED_IMAGES = os.path.join(DATA_DIRECTORY, "training_encoded_images.pkl")
VALIDATION_PREENCODED_IMAGES = os.path.join(DATA_DIRECTORY, "validation_encoded_images.pkl")

In [10]:
# https://huggingface.co/jinaai/jina-embeddings-v3
from transformers import AutoModel
embeddings_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)

# this is the specific embeddings size for the all-MiniLM-L6-v2 model
embeddings_vector_size = 768

# https://huggingface.co/docs/transformers/en/model_doc/vit
# https://huggingface.co/google/vit-base-patch16-224-in21k
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

pooling_layer_size = 768



In [11]:
# helper functions

def encode_image(path_to_image):
    image = Image.open(path_to_image).convert("RGB").resize((224, 224))
    inputs = image_processor(images=image, return_tensors="pt", do_rescale=True)
    outputs = image_model(**inputs)
    encoded_image = outputs.pooler_output.detach().numpy()[0]
    return encoded_image.tolist()

def encode_question(question_text):
    encoded = embeddings_model.encode(question_text.lower(), truncate_dim=embeddings_vector_size)
    return encoded.tolist()

def pairwise_multiplication(image, question):
    ret = []
    for i in range(len(image)):
        ret.append(image[i] * question[i])
    return ret

def get_encoded_image_question(path_to_image, question_text, pre_encoded_images = None, pre_encoded_questions = None):
    if pre_encoded_images:
        encoded_image = pre_encoded_images[path_to_image]
    else:
        encoded_image = encode_image(path_to_image)
        
    if pre_encoded_questions:
        encoded_question = pre_encoded_questions[question_text]
    else:
        encoded_question = encode_question(question_text)
    return pairwise_multiplication(encoded_image, encoded_question)


In [12]:
def encode_questions(array_of_questions):
    ret = {}
    for question in tqdm(array_of_questions):
        ret[question] = encode_question(question)
    return ret

def encode_images(array_of_images):
    ret = {}
    for image in tqdm(array_of_images):
        ret[image] = encode_image(image)
    return ret

In [13]:
# load our training dataset

cnn_training_df = pd.read_csv(TRAINING_DATA)
cnn_training_df.drop(cnn_training_df.columns.difference(['image_filename', 'question', 'multiple_choice_answer']), axis=1, inplace=True)
cnn_training_df.drop_duplicates(inplace=True)

# also extract all the unique answers from it
unique_answers = [x.lower() for x in cnn_training_df.multiple_choice_answer.unique().tolist()]
unique_answers_len = len(unique_answers)

unique_answers_map = {}
for index, answer in enumerate(unique_answers):
    unique_answers_map[answer] = index

print(f'Total unique answers: {unique_answers_len}')

# helper function to create a 1-hot encoded vector of an answer
def one_hot_answer_encode(answer):
    ret = [0] * unique_answers_len
    lower_answer = answer.lower()
    if lower_answer in unique_answers_map:
        answer_index = unique_answers_map[lower_answer]
        ret[answer_index] = 1
        return ret
    else:
        return None


Total unique answers: 2521


In [14]:
# %%script false --no-raise-error
# comment out the line above to rebuild the training dataset

if os.path.exists(TRAINING_PREENCODED_QUESTIONS):
    print('loading saved pre-encoding questions')
    with open(TRAINING_PREENCODED_QUESTIONS, 'rb') as infile:
        pre_encoded_questions = pickle.load(infile)
else:
    print('pre-encoding questions')
    unique_questions = cnn_training_df.question.unique().tolist()
    pre_encoded_questions = encode_questions(unique_questions)
    with open(TRAINING_PREENCODED_QUESTIONS, 'wb') as outfile:
        pickle.dump(pre_encoded_questions, outfile)

if os.path.exists(TRAINING_PREENCODED_IMAGES):
    print('loading saved images')
    with open(TRAINING_PREENCODED_IMAGES, 'rb') as infile:
        pre_encoded_images = pickle.load(infile)
else:
    print('pre-encoding images')
    unique_images = cnn_training_df.image_filename.unique().tolist()
    unique_images = [os.path.join(TRAINING_IMAGES, x) for x in unique_images]
    pre_encoded_images = encode_images(unique_images)
    with open(TRAINING_PREENCODED_IMAGES, 'wb') as outfile:
        pickle.dump(pre_encoded_images, outfile)

# build the training dataset
print('building training data...')
data_embeddings = []
data_outputs = []
for index, row in tqdm(cnn_training_df.iterrows(), total=cnn_training_df.shape[0]):
    this_image_filename = row["image_filename"]
    this_question = row["question"]
    this_answer = row["multiple_choice_answer"]

    one_hot_answer = one_hot_answer_encode(this_answer)
    if not one_hot_answer:
        continue

    embedding_input = get_encoded_image_question(
        os.path.join(TRAINING_IMAGES, this_image_filename),
        this_question,
        pre_encoded_images=pre_encoded_images,
        pre_encoded_questions=pre_encoded_questions
    )

    data_embeddings.append(embedding_input)
    data_outputs.append(one_hot_answer)

    # if len(data_outputs) == 20:
    #     break

cnn_training_dataset = tf.data.Dataset.from_tensor_slices(([data_embeddings], [data_outputs]))
print("saving dataset")
if os.path.exists(TRAINING_ENCODED_DATASET):
    shutil.rmtree(TRAINING_ENCODED_DATASET)

os.makedirs(TRAINING_ENCODED_DATASET)

cnn_training_dataset.save(TRAINING_ENCODED_DATASET)


loading saved pre-encoding questions
loading saved images
building training data...


100%|██████████| 59983/59983 [00:05<00:00, 10411.40it/s]


saving dataset


In [15]:
# load our validation dataset
cnn_validation_df = pd.read_csv(VALIDATION_DATA)
cnn_validation_df.drop(cnn_validation_df.columns.difference(['image_filename', 'question', 'multiple_choice_answer']), axis=1, inplace=True)
cnn_validation_df.drop_duplicates(inplace=True)

In [16]:
# %%script false --no-raise-error
# comment out the line above to rebuild the validation dataset

if os.path.exists(VALIDATION_PREENCODED_QUESTIONS):
    print('loading saved pre-encoding questions')
    with open(VALIDATION_PREENCODED_QUESTIONS, 'rb') as infile:
        pre_encoded_questions = pickle.load(infile)
else:
    print('pre-encoding questions')
    unique_questions = cnn_validation_df.question.unique().tolist()
    pre_encoded_questions = encode_questions(unique_questions)
    with open(VALIDATION_PREENCODED_QUESTIONS, 'wb') as outfile:
        pickle.dump(pre_encoded_questions, outfile)

if os.path.exists(VALIDATION_PREENCODED_IMAGES):
    print('loading saved images')
    with open(VALIDATION_PREENCODED_IMAGES, 'rb') as infile:
        pre_encoded_images = pickle.load(infile)
else:
    print('pre-encoding images')
    unique_images = cnn_validation_df.image_filename.unique().tolist()
    unique_images = [os.path.join(VALIDATION_IMAGES, x) for x in unique_images]
    pre_encoded_images = encode_images(unique_images)
    with open(VALIDATION_PREENCODED_IMAGES, 'wb') as outfile:
        pickle.dump(pre_encoded_images, outfile)

data_embeddings = []
data_outputs = []
for index, row in tqdm(cnn_validation_df.iterrows(), total=cnn_validation_df.shape[0]):
    this_image_filename = row["image_filename"]
    this_question = row["question"]
    this_answer = row["multiple_choice_answer"]

    one_hot_answer = one_hot_answer_encode(this_answer)
    if not one_hot_answer:
        continue

    embedding_input = get_encoded_image_question(
        os.path.join(VALIDATION_IMAGES, this_image_filename),
        this_question,
        pre_encoded_images=pre_encoded_images,
        pre_encoded_questions=pre_encoded_questions
    )

    data_embeddings.append(embedding_input)
    data_outputs.append(one_hot_answer)

    # if len(data_outputs) == 20:
    #     break

cnn_validation_dataset = tf.data.Dataset.from_tensor_slices(([data_embeddings], [data_outputs]))
print("saving dataset")
if os.path.exists(VALIDATION_ENCODED_DATASET):
    shutil.rmtree(VALIDATION_ENCODED_DATASET)

os.makedirs(VALIDATION_ENCODED_DATASET)

cnn_validation_dataset.save(VALIDATION_ENCODED_DATASET)


pre-encoding questions


100%|██████████| 19199/19199 [40:04<00:00,  7.98it/s]


pre-encoding images


100%|██████████| 10000/10000 [18:17<00:00,  9.11it/s]
100%|██████████| 29990/29990 [00:02<00:00, 13885.20it/s]


saving dataset


In [17]:
# reload our datasets
training_dataset = tf.data.Dataset.load(TRAINING_ENCODED_DATASET)
validation_dataset = tf.data.Dataset.load(VALIDATION_ENCODED_DATASET)

In [31]:
# model creation

from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Sequential

input_vector_size = embeddings_vector_size # 768
output_vector_size = unique_answers_len # 2521 as of this last run

qa_model = Sequential()

# our combined vector is the input
qa_model.add(Input(shape=(input_vector_size,)))

# two densely connected layers
qa_model.add(Dense(1024,activation='relu'))
qa_model.add(Dense(1024,activation='relu'))

# and our output layer
qa_model.add(Dense(unique_answers_len, activation="softmax"))

qa_model.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(),
    loss='categorical_crossentropy',
    metrics=["acc", "AUC"]
)
qa_model.build(input_shape=(input_vector_size,))

display(qa_model.summary())


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 1024)              787456    
                                                                 
 dense_6 (Dense)             (None, 1024)              1049600   
                                                                 
 dense_7 (Dense)             (None, 2521)              2584025   
                                                                 
Total params: 4421081 (16.87 MB)
Trainable params: 4421081 (16.87 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


None

In [32]:
# model training

fit_history = qa_model.fit(
    x = training_dataset,
    batch_size=64,
    epochs = 20,
    validation_data=validation_dataset,
    verbose=1
)

display(fit_history.history)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


{'loss': [7.830138683319092,
  7.80070686340332,
  7.759055137634277,
  7.693273544311523,
  7.591084003448486,
  7.439083576202393,
  7.222714900970459,
  6.927210807800293,
  6.5402750968933105,
  6.0583295822143555,
  5.499658107757568,
  4.928905963897705,
  4.480183124542236,
  4.325539588928223,
  4.442935943603516,
  4.555269241333008,
  4.601552963256836,
  4.5925211906433105,
  4.536729335784912,
  4.460849285125732],
 'acc': [0.0004667989269364625,
  0.2385842651128769,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.23856759071350098,
  0.1690979152917862,
  0.1690979152917862,
  0.1690979152917862],
 'auc': [0.055495940148830414,
  0.055495940148830414,
  0.055495940148830414,
  0.055495940148830414,


In [34]:
def find_max_value_index(predictions):
    max_index = 0
    max_value = -99999999.99
    for index, value in enumerate(predictions):
        if value > max_value:
            max_index = index
            max_value = value
    return max_index, max_value

def answer_question(image_filepath, question_text):
    embedding_input = get_encoded_image_question(image_filepath, question_text)
    embedding_input = [embedding_input]
    ds = tf.data.Dataset.from_tensor_slices([embedding_input])
    predicted = qa_model.predict(ds)
    predicted_as_list = predicted.tolist()[0]
    return find_max_value_index(predicted_as_list)

def test_qa(image_filepath, question_text):
    print(f'Question: {question_text} (image: {image_filepath})')
    answer_index, answer_prob_value = answer_question(image_filepath, question_text)
    answer = unique_answers[answer_index]
    print(f'The answer found is: {answer} (output_value: {answer_prob_value})')

def get_image_path(filename):
    return os.path.join(VALIDATION_IMAGES, filename)

# testing
testing_data = [
    { 'question': 'Is the dog asleep?', "image": get_image_path('abstract_v002_val2015_000000027578.png') }
]

for test in testing_data:
    test_qa(test['image'], test['question'])



Question: Is the dog asleep? (image: ./data/scene_img_abstract_v002_val2015/abstract_v002_val2015_000000027578.png)
The answer found is: no (output_value: 0.26438599824905396)
