## Visual Question Answering (VQA) Dataset



In [7]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
import numpy as np
import tensorflow as tf

In [8]:
# Load pre-trained ResNet50 without the classification layer
resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')

def extract_image_features(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)

    features = resnet.predict(img_array)
    return features  # Shape: (1, 2048)

In [46]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model

# Assuming a vocabulary size of 10,000 and question length of 20 words
vocab_size = 10000
max_length = 21
embedding_dim = 300

# Text input
text_input = Input(shape=(max_length,), name='text_input')
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)(text_input)
lstm_layer = LSTM(256)(embedding_layer)  # Encodes the question into a fixed-size vector




In [10]:
from tensorflow.keras.layers import Concatenate

# Image input
image_input = Input(shape=(2048,), name='image_input')  # The features extracted by ResNet

# Combine image and text features
combined = Concatenate()([image_input, lstm_layer])
dense_1 = Dense(512, activation='relu')(combined)
output = Dense(4, activation='softmax')(dense_1)  # Assuming there are 4 possible answer choices

# Build the model
vqa_model = Model(inputs=[image_input, text_input], outputs=output)
vqa_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


# Preprocess the data

## Notes on the data

- There are 60k open-ended questions and 60k multiple-choice questions in the training data.
- Multiple-choice questions are the same as open-ended questions, but with 4 answer choices.
- We don't need to do open-ended questions according to assignment 
- Captions data is just filenames, urls to images and height and width
- Annotations data has answers!

In [5]:
import json

# Load training questions, annotations, and captions
with open('VQA-data/VQA-train/MultipleChoice_abstract_v002_train2015_questions.json', 'r') as f:
    questions_data = json.load(f)['questions']

with open('VQA-data/VQA-train/captions_abstract_v002_train2015.json', 'r') as f:
    captions_data = json.load(f)['images']

with open('VQA-data/VQA-train/abstract_v002_train2015_annotations.json', 'r') as f:
    annotations_data = json.load(f)['annotations']

# Load test questions
with open('VQA-data/VQA-test/MultipleChoice_abstract_v002_test2015_questions.json', 'r') as f:
    questions_test_data = json.load(f)['questions']


In [38]:
print(len(questions_data))
print(len(questions_test_data))

print(captions_data[0])
print(questions_test_data[0])
print(annotations_data[0])
print(annotations_data[1])


60000
60000
{'url': 'http://visualqa.org/data/abstract_v002/scene_img/img/0.png', 'file_name': 'abstract_v002_train2015_000000000000.png', 'image_id': 0, 'width': 700, 'height': 400}
{'image_id': 39456, 'question': 'What color are the chairs?', 'multiple_choices': ['red', '4', '3', 'plates', 'brown', 'yellow', 'green', 'standing in water', 'no', 'white', 'yes', 'diagonally', '2', '1', 'blue', 'dog', 'because', 'sit on floor'], 'question_id': 394560}
{'question_type': 'who', 'multiple_choice_answer': 'man', 'answers': [{'answer': 'old person', 'answer_confidence': 'maybe', 'answer_id': 1}, {'answer': 'man', 'answer_confidence': 'maybe', 'answer_id': 2}, {'answer': 'man', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'man', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'old man', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'man', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'man', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'man', 'a

In [6]:
# Find the longest answer in the test data
max_answer_length = 0
longest_answer = None
for q in questions_test_data:
    for a in q['multiple_choices']:
        word_count = len(a.split())
        if word_count > max_answer_length:
            max_answer_length = word_count
            longest_answer = a

print(f"Longest answer: \"{longest_answer}\" with {max_answer_length} words")


Longest answer: "because super bowl is on and everyone needs to be able to see tv" with 14 words


In [45]:
# Find the longest question in the test data

# Initialize variables to track the longest question
max_length = 0
longest_question = None

# Loop through each question in the dataset
for q in questions_test_data:
    # Split the question into words
    word_count = len(q['question'].split())

    # Update the longest question if the current one has more words
    if word_count > max_length:
        max_length = word_count
        longest_question = q['question']

# Print the longest question and its word count
print(f"Longest question: \"{longest_question}\" with {max_length} words")




Longest question: "What color do you get when you mix the color of the "D" block with the color of the "C" block?" with 21 words


In [47]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Open-ended questions
open_ended_questions = [q['question'] for q in multiple_choice_questions_data]
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(open_ended_questions)

open_ended_question_sequences = tokenizer.texts_to_sequences(open_ended_questions)
padded_open_ended_questions = pad_sequences(open_ended_question_sequences, maxlen=max_length, padding='post')

# Multiple-choice questions
multiple_choice_questions = [q['question'] for q in multiple_choice_questions_data]
tokenizer.fit_on_texts(multiple_choice_questions)

multiple_choice_question_sequences = tokenizer.texts_to_sequences(multiple_choice_questions)
padded_multiple_choice_questions = pad_sequences(multiple_choice_question_sequences, maxlen=max_length, padding='post')

# Multiple-choice answers
multiple_choice_answers = [q['multiple_choices'] for q in multiple_choice_questions_data]
tokenizer.fit_on_texts(multiple_choice_questions)

multiple_choice_question_sequences = tokenizer.texts_to_sequences(multiple_choice_questions)
padded_multiple_choice_questions = pad_sequences(multiple_choice_question_sequences, maxlen=max_length, padding='post')


print(multiple_choice_questions[0])
print(multiple_choice_question_sequences[0])
print(padded_multiple_choice_questions[0])

Who looks happier?
[42, 715, 2222]
[  42  715 2222    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]


In [2]:
# This script demonstrates how to train a pretrained ResNet model on the Abstract Scenes dataset
# to answer multiple-choice questions using the Visual Question Answering (VQA) dataset.

import os
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
import PIL
from PIL import Image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

## Notes
- Only considering the correct answer for now (not the answers with "answer_confidence" maybe)

In [5]:
# This script demonstrates how to train a pretrained ResNet model on the Abstract Scenes dataset
# to answer multiple-choice questions using the Visual Question Answering (VQA) dataset.

import os
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
import PIL
from PIL import Image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

# Step 1: Load Dataset Files
# Load the training questions and annotations files
with open('VQA-data/VQA-train/MultipleChoice_abstract_v002_train2015_questions.json', 'r') as f:
    questions_data = json.load(f)["questions"]

with open('VQA-data/VQA-train/abstract_v002_train2015_annotations.json', 'r') as f:
    annotations_data = json.load(f)["annotations"]



# Step 2: Filter Multiple-Choice Questions and Match with Answers
annotations_dict = {a['question_id']: a['multiple_choice_answer'] for a in annotations_data}
filtered_data = []
for question in questions_data:
    question_id = question.get('question_id')
    if question_id in annotations_dict:
        question['answer'] = annotations_dict[question_id]
        # Keep all multiple choices
        question['multiple_choices'] = question['multiple_choices'] 
        filtered_data.append(question)



# Step 3: Extract Image Features Using ResNet50
# Load the pre-trained ResNet model without the classification layer
resnet = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Extract training image features
train_image_directory = '/Users/liamkopp/Downloads/scene_img_abstract_v002_train2015'
train_image_features_dict = {}

processed_image_ids = set([q['image_id'] for q in filtered_data])
saved_train_features_path = 'train_image_features.npy'

if os.path.exists(saved_train_features_path):
    train_image_features_dict = np.load(saved_train_features_path, allow_pickle=True).item()
    print("Saved Training Images Found!")
else:
    for img_name in tqdm(os.listdir(train_image_directory), desc='Processing Training Images', leave=False, miniters=100):
        img_id = img_name.split('.')[0].split('_')[-1].lstrip('0')
        if img_name == 'abstract_v002_train2015_000000000000.png':
            img_id = '0'
        img_path = os.path.join(train_image_directory, img_name)
        img = image.load_img(img_path, target_size=(224, 224))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)

        features = resnet.predict(img_array, verbose=0)
        train_image_features_dict[img_id] = features.flatten()

    np.save(saved_train_features_path, train_image_features_dict)



# Step 4: Tokenize and Pad Questions and Choices
# Extract questions and choices
questions = [q['question'] for q in filtered_data]
choices = [choice for q in filtered_data for choice in q['multiple_choices']]

# Initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(questions + choices)

# Tokenize and pad questions
max_question_length = 21
question_sequences = tokenizer.texts_to_sequences(questions)
padded_questions = pad_sequences(question_sequences, maxlen=max_question_length, padding='post')

# Tokenize and pad choices
max_choice_length = 14
choices_sequences = [tokenizer.texts_to_sequences(q['multiple_choices']) for q in filtered_data]
padded_choices = [pad_sequences(seq, maxlen=max_choice_length, padding='post') for seq in choices_sequences]



# Step 5: Prepare Inputs and Labels for Training
questions_input = []
choices_input = []
image_features = []
labels = []

print('\nStarting training data preparation...')
# Prepare training data
for i, q in enumerate(filtered_data):
  question_id = q.get('question_id')
  if question_id in annotations_dict:
    q['answer'] = annotations_dict[question_id]
    img_id = str(q['image_id'])
    if img_id in train_image_features_dict:  
      for j in range(len(q['multiple_choices'])):
        questions_input.append(padded_questions[min(i, len(padded_questions) - 1)])
        image_features.append(train_image_features_dict[img_id])
        choices_input.append(padded_choices[min(i, len(padded_choices) - 1)][j])
        # Create a label for each choice (1 for correct, 0 for incorrect)
        correct_index = q['multiple_choices'].index(q['answer'])
        labels.append(1 if j == correct_index else 0)
    else:
      print(f'No match for training image_id: {img_id}')
      print(f'Available keys (first 5): {list(train_image_features_dict.keys())[:5]}')
      print(f'Available keys: {list(train_image_features_dict.keys())[:5]}')

# Convert training data to numpy arrays
questions_input = np.array(questions_input).reshape(-1, max_question_length)
choices_input = np.array(choices_input).reshape(-1, max_choice_length)
image_features = np.array(image_features).reshape(-1, 2048)
labels = np.array(labels).reshape(-1, 1)

# Print shapes to debug
print('Training Data Shapes:')
print(f'Questions Input: {questions_input.shape}')
print(f'Choices Input: {choices_input.shape}')
print(f'Image Features: {image_features.shape}')
print(f'Labels: {labels.shape}')
print()



# Step 6: Define the Model Architecture
# Image input
image_input = Input(shape=(2048,), name='image_input')

# Question input
question_input = Input(shape=(max_question_length,), name='question_input')
question_embedding = Embedding(input_dim=10000, output_dim=300)(question_input)
question_lstm = LSTM(128)(question_embedding)

# Choice input
choice_input = Input(shape=(max_choice_length,), name='choice_input')
choice_embedding = Embedding(input_dim=10000, output_dim=300)(choice_input)
choice_lstm = LSTM(128)(choice_embedding)

# Concatenate image, question, and choice features
concat = Concatenate()([image_input, question_lstm, choice_lstm])
dense_1 = Dense(128, activation='relu')(concat)
output = Dense(1, activation='sigmoid')(dense_1)

# Define and compile the model
vqa_model = Model(inputs=[image_input, question_input, choice_input], outputs=output)
vqa_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 7: Train the Model
# Check if all input arrays have the same length
# assert len(image_features) == len(questions_input) == len(choices_input) == len(labels), "Training data arrays have mismatched lengths."
# assert len(val_image_features) == len(val_questions_input) == len(val_choices_input) == len(val_labels), "Validation data arrays have mismatched lengths."

# # Adjust batch size if necessary
batch_size = min(32, len(image_features))

vqa_model.fit(
    [image_features, questions_input, choices_input],
    labels,
    validation_split=0.2,
    epochs=1,
    batch_size=batch_size
)

# Step 8: Save the Model
vqa_model.save('vqa_resnet_model.keras')

Saved Training Images Found!

Starting training data preparation...
No match for training image_id: 0
Available keys (first 5): ['4361', '14172', '2710', '12503', '18990']
Available keys: ['4361', '14172', '2710', '12503', '18990']
No match for training image_id: 0
Available keys (first 5): ['4361', '14172', '2710', '12503', '18990']
Available keys: ['4361', '14172', '2710', '12503', '18990']
No match for training image_id: 0
Available keys (first 5): ['4361', '14172', '2710', '12503', '18990']
Available keys: ['4361', '14172', '2710', '12503', '18990']
Training Data Shapes:
Questions Input: (1079946, 21)
Choices Input: (1079946, 14)
Image Features: (1079946, 2048)
Labels: (1079946, 1)




[1m26999/26999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1687s[0m 62ms/step - accuracy: 0.9445 - loss: 0.2195 - val_accuracy: 0.9444 - val_loss: 0.2149


## Testing/Prediction

In [7]:
# Step 9: Load the Model for Testing
vqa_model = tf.keras.models.load_model('vqa_resnet_model.keras')

# Step 10: Load Test Dataset Files
# Load the validation questions and annotations files
with open('VQA-data/VQA-validate/MultipleChoice_abstract_v002_val2015_questions.json', 'r') as f:
    val_questions_data = json.load(f)["questions"]

with open('VQA-data/VQA-validate/abstract_v002_val2015_annotations.json', 'r') as f:
    val_annotations_data = json.load(f)["annotations"]

# Filter the validation data
# All question ids with correct answer {275780: 'yes', 275781: 'tv',...}
val_annotations_dict = {a['question_id']: a['multiple_choice_answer'] for a in val_annotations_data}
# All questions with correct answer and id
# 'image_id': 27578, 'question': 'Is the dog asleep?', 'multiple_choices': ['white', 'picnicking', 'yes', 'hot', 'dog and girl', 'blue', 'both', 'no', 'red', '2', 'marshmallow', 'brown', 'bone', 'twins', 'food', '1', '4', '3'], 'question_id': 275780, 'answer': 'yes'}
val_filtered_data = []
for question in val_questions_data:
    question_id = question.get('question_id')
    if question_id in val_annotations_dict:
        question['answer'] = val_annotations_dict[question_id]
        # Keep all multiple choices
        question['multiple_choices'] = question['multiple_choices'] 
        val_filtered_data.append(question)


# Extract validation image features
val_image_directory = '/Users/liamkopp/Downloads/scene_img_abstract_v002_val2015'
val_image_features_dict = {}

val_image_ids = set([q['image_id'] for q in val_questions_data])
saved_val_features_path = 'val_image_features.npy'

if os.path.exists(saved_val_features_path):   
    val_image_features_dict = np.load(saved_val_features_path, allow_pickle=True).item()
    print("Saved Validation Features Loaded!") 
else:
    for img_name in tqdm(os.listdir(val_image_directory), desc='Processing Validation Images', leave=False, miniters=100):
        img_id = img_name.split('.')[0].split('_')[-1].lstrip('0')
        if img_id == '0' or int(img_id) in val_image_ids:
            img_path = os.path.join(val_image_directory, img_name)
            img = image.load_img(img_path, target_size=(224, 224))
            img_array = image.img_to_array(img)
            img_array = np.expand_dims(img_array, axis=0)
            img_array = preprocess_input(img_array)

            features = resnet.predict(img_array, verbose=0)
            val_image_features_dict[img_id] = features.flatten()

    np.save(saved_val_features_path, val_image_features_dict)


# Step 11: Prepare Test Data Inputs

# Tokenize and Pad Questions and Choices
val_questions = [q['question'] for q in val_filtered_data]
val_choices = [choice for q in val_filtered_data for choice in q['multiple_choices']]

# Initialize and fit the tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(val_questions + val_choices)

# Tokenize and pad questions
max_question_length = 21
val_question_sequences = tokenizer.texts_to_sequences(val_questions)
val_padded_questions = pad_sequences(val_question_sequences, maxlen=max_question_length, padding='post')

# Tokenize and pad choices
max_choice_length = 14
val_choices_sequences = [tokenizer.texts_to_sequences(q['multiple_choices']) for q in val_filtered_data]
val_padded_choices = [pad_sequences(seq, maxlen=max_choice_length, padding='post') for seq in val_choices_sequences]


# Prepare validation data
val_questions_input = []
val_choices_input = []
val_image_features = []
val_labels = []

question_id_list = []
print('\nStarting validation data preparation...')
for i, q in enumerate(val_filtered_data):
  question_id = q.get('question_id')
  question_id_list.append(question_id)
  if question_id in val_annotations_dict:
    q['answer'] = val_annotations_dict[question_id]
    img_id = str(q['image_id'])
    if img_id in val_image_features_dict:  
      for j in range(len(q['multiple_choices'])):
        val_questions_input.append(val_padded_questions[min(i, len(val_padded_questions) - 1)])
        val_image_features.append(val_image_features_dict[img_id])
        val_choices_input.append(val_padded_choices[min(i, len(val_padded_choices) - 1)][j])
        # Create a label for each choice (1 for correct, 0 for incorrect)
        correct_index = q['multiple_choices'].index(q['answer'])
        val_labels.append(1 if j == correct_index else 0)
    else:
      print(f'No match for validation image_id: {img_id}')
      print(f'Available keys (first 5): {list(val_image_features_dict.keys())[:5]}')
      print(f'Available keys: {list(val_image_features_dict.keys())[:5]}')

# print(len(val_image_features_dict))  
# print(len(val_image_features))
# print(len(val_choices_input))    
    
# Convert validation data to numpy arrays
val_questions_input = np.array(val_questions_input).reshape(-1, max_question_length)
val_choices_input = np.array(val_choices_input).reshape(-1, max_choice_length)
val_image_features = np.array(val_image_features).reshape(-1, 2048)
val_labels = np.array(val_labels).reshape(-1, 1)

# Print shapes to debug
print('\nValidation Data Shapes:')
print(f'Questions Input: {val_questions_input.shape}')
print(f'Choices Input: {val_choices_input.shape}')
print(f'Image Features: {val_image_features.shape}')
print(f'Labels: {val_labels.shape}')
print()

# Step 12: Evaluate the Model on Test Data
# test_loss, test_accuracy = vqa_model.evaluate(
#     [val_image_features, val_questions_input, val_choices_input],
#     val_labels,
#     batch_size=32
# )

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Step 13: Make Predictions on the Test Data
predictions = vqa_model.predict([val_image_features, val_questions_input, val_choices_input])

# Example usage: Show the predicted answer for a specific question
for i in range(0, len(predictions), len(val_filtered_data[i // len(val_filtered_data[i]['multiple_choices'])]['multiple_choices'])):
    question_idx = i // len(val_filtered_data[i // len(val_filtered_data[i]['multiple_choices'])]['multiple_choices'])
    question = val_filtered_data[question_idx]['question']
    choices = val_filtered_data[question_idx]['multiple_choices']
    actual_answer = val_filtered_data[question_idx]['answer']

    predicted_idx = np.argmax(predictions[i:i+len(choices)])
    predicted_answer = choices[predicted_idx]

    print(f"Question: {question}")
    print(f"Predicted Answer: {predicted_answer}")
    print(f"Actual Answer: {actual_answer}")





Saved Validation Features Loaded!

Starting validation data preparation...

Validation Data Shapes:
Questions Input: (540000, 21)
Choices Input: (540000, 14)
Image Features: (540000, 2048)
Labels: (540000, 1)

Test Accuracy: 94.44%
[1m16875/16875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 14ms/step
Question: Is the dog asleep?
Predicted Answer: white
Actual Answer: yes
Question: What is the man looking at?
Predicted Answer: monkey bars
Actual Answer: tv
Question: Is the man sitting on the armrest?
Predicted Answer: 2
Actual Answer: yes
Question: Did she bake the pie?
Predicted Answer: golden girls
Actual Answer: yes
Question: What is resting on the table besides the pie?
Predicted Answer: brown
Actual Answer: wine glass
Question: Is the old lady eating a pie?
Predicted Answer: 45
Actual Answer: no
Question: Is the baby asleep?
Predicted Answer: nest
Actual Answer: no
Question: What is on the table?
Predicted Answer: brown
Actual Answer: tv
Question: Is the woman wearing p

IndexError: list index out of range