**This script contains steps for data loading, cleaning, filtering data for K class labels, resizing images, image feature extraction using ResNet50 and text feature extraction using BERT, tokenization, label encoding, and saving features in numpy files.**

## 1. Data Preparation & Preprocessing

In [None]:
import os
import json
import zipfile
import requests
from PIL import Image
from io import BytesIO
from google.colab import files, drive

In [None]:
from google.colab import drive
# Mount Google Drive; the dataset paths defined below all live under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


### 1.1 Dataset Setup


In [None]:
import os
import json
import pandas as pd

# Define dataset paths
# Everything is read from one project folder on the mounted Google Drive,
# with a separate image folder for each split (train / val / test).
DATASET_DIR = "/content/drive/MyDrive/DS8013 Deep Learning/Project"
IMAGE_DIR_TRAIN = os.path.join(DATASET_DIR, "scene_img_abstract_v002_train2015")
IMAGE_DIR_VAL = os.path.join(DATASET_DIR, "scene_img_abstract_v002_val2015")
IMAGE_DIR_TEST = os.path.join(DATASET_DIR, "scene_img_abstract_v002_test2015")

# JSON file paths
# Multiple-choice question files: one per split.
QUESTIONS_TRAIN_PATH = os.path.join(DATASET_DIR, "Questions_Train_abstract_v002/MultipleChoice_abstract_v002_train2015_questions.json")
QUESTIONS_VAL_PATH = os.path.join(DATASET_DIR, "Questions_Val_abstract_v002/MultipleChoice_abstract_v002_val2015_questions.json")
QUESTIONS_TEST_PATH = os.path.join(DATASET_DIR, "Questions_Test_abstract_v002/MultipleChoice_abstract_v002_test2015_questions.json")

# Annotation (answer) files: only train and val paths are defined here;
# no annotation path is set up for the test split.
ANSWERS_TRAIN_PATH = os.path.join(DATASET_DIR, "Annotations_Train_abstract_v002/abstract_v002_train2015_annotations.json")
ANSWERS_VAL_PATH = os.path.join(DATASET_DIR, "Annotations_Val_abstract_v002/abstract_v002_val2015_annotations.json")

# Function to load JSON
def load_json(json_path):
    """Read a JSON file and return the decoded Python object.

    Parameters:
        json_path (str): Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, so non-ASCII text loads the same way on every machine.
    with open(json_path, "r", encoding="utf-8") as file:
        return json.load(file)

# Read the question files for all three splits, then the annotation files
# for the two splits that have an annotation path (train and val).
questions_train, questions_val, questions_test = (
    load_json(path)
    for path in (QUESTIONS_TRAIN_PATH, QUESTIONS_VAL_PATH, QUESTIONS_TEST_PATH)
)

answers_train, answers_val = (
    load_json(path) for path in (ANSWERS_TRAIN_PATH, ANSWERS_VAL_PATH)
)


In [None]:
def questions_to_dataframe(questions_json):
    """Flatten the ``"questions"`` list of a question JSON into a DataFrame.

    Parameters:
        questions_json (dict): Decoded question file; its ``"questions"`` entry
            is a list of dicts.

    Returns:
        DataFrame: One row per question with the columns ``image_id``,
        ``question_id``, ``question`` and ``multiple_choices`` (other keys of
        the entries are dropped).
    """
    wanted_keys = ("image_id", "question_id", "question", "multiple_choices")
    rows = [
        {key: entry[key] for key in wanted_keys}
        for entry in questions_json["questions"]
    ]
    return pd.DataFrame(rows)

# Build one question table per split (train, val, test).
df_questions_train = questions_to_dataframe(questions_train)
df_questions_val = questions_to_dataframe(questions_val)
df_questions_test = questions_to_dataframe(questions_test)

# Check the structure
df_questions_train.head()


Unnamed: 0,image_id,question_id,question,multiple_choices
0,11779,117792,Who looks happier?,"[alive, 1, woman, purple, 2, yes, white, boy, ..."
1,11779,117790,Where is the woman sitting?,"[3, no, blue, red, 1, slide, monkey bars, jump..."
2,11779,117791,Where is the man sitting?,"[away, yes, blue, 1, 2, mouse, couch, no, yell..."
3,5536,55360,Is this man hungry?,"[water, yellow, 4, running, blue, pouring, out..."
4,5536,55361,What kind of drink is that?,"[wine, girl would fall, soda, white, yes, coke..."


In [None]:
def annotations_to_dataframe(annotations_json):
    """Flatten the ``"annotations"`` list of an annotation JSON into a DataFrame.

    Parameters:
        annotations_json (dict): Decoded annotation file; its ``"annotations"``
            entry is a list of dicts.

    Returns:
        DataFrame: One row per annotation with the columns ``image_id``,
        ``question_id``, ``multiple_choice_answer``, ``answers`` (the list of
        every ``"answer"`` string given for the question), ``answer_type`` and
        ``question_type``.
    """

    def _as_row(entry):
        # Keep only the answer text of each individual answer record.
        return {
            "image_id": entry["image_id"],
            "question_id": entry["question_id"],
            "multiple_choice_answer": entry["multiple_choice_answer"],
            "answers": [candidate["answer"] for candidate in entry["answers"]],
            "answer_type": entry["answer_type"],
            "question_type": entry["question_type"],
        }

    return pd.DataFrame([_as_row(entry) for entry in annotations_json["annotations"]])

# Build the annotation tables; only train and val annotations were loaded.
df_answers_train = annotations_to_dataframe(answers_train)
df_answers_val = annotations_to_dataframe(answers_val)

# Check the structure
df_answers_train.head()


Unnamed: 0,image_id,question_id,multiple_choice_answer,answers,answer_type,question_type
0,11779,117792,man,"[old person, man, man, man, old man, man, man,...",other,who
1,11779,117790,blanket,"[on blanket, blanket, on blanket, blanket, pic...",other,where is the
2,11779,117791,bench,"[on bench, bench, on bench, bench, bench, on b...",other,where is the
3,5536,55360,yes,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",yes/no,is this
4,5536,55361,soda,"[water, soda, wine, soft, soda, soda, soda, wa...",other,what kind of


In [None]:
# Attach each question to its annotation by matching on (image_id, question_id).
# No `how=` is given, so pandas' default inner join applies: a question without
# a matching annotation (or vice versa) is dropped from the merged frame.
df_train = df_questions_train.merge(df_answers_train, on=["image_id", "question_id"])
df_val = df_questions_val.merge(df_answers_val, on=["image_id", "question_id"])

# Print merged train data
df_train.head()


Unnamed: 0,image_id,question_id,question,multiple_choices,multiple_choice_answer,answers,answer_type,question_type
0,11779,117792,Who looks happier?,"[alive, 1, woman, purple, 2, yes, white, boy, ...",man,"[old person, man, man, man, old man, man, man,...",other,who
1,11779,117790,Where is the woman sitting?,"[3, no, blue, red, 1, slide, monkey bars, jump...",blanket,"[on blanket, blanket, on blanket, blanket, pic...",other,where is the
2,11779,117791,Where is the man sitting?,"[away, yes, blue, 1, 2, mouse, couch, no, yell...",bench,"[on bench, bench, on bench, bench, bench, on b...",other,where is the
3,5536,55360,Is this man hungry?,"[water, yellow, 4, running, blue, pouring, out...",yes,"[yes, yes, yes, yes, yes, yes, yes, yes, yes, ...",yes/no,is this
4,5536,55361,What kind of drink is that?,"[wine, girl would fall, soda, white, yes, coke...",soda,"[water, soda, wine, soft, soda, soda, soda, wa...",other,what kind of


In [None]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

### 1.2 Text Cleaning & Filtering

In [None]:
import contractions
import re

def clean_text(text):
    """Normalise a question/answer string; any non-string value is returned as-is.

    Steps, in order: expand contractions, lowercase, turn '-', ',' and ':'
    into spaces, drop periods that are not followed by a digit, strip every
    remaining character outside letters/digits/period/space, collapse runs of
    spaces, and trim the ends.
    """
    if not isinstance(text, str):
        return text

    cleaned = contractions.fix(text).lower()

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        ('[-,:]', ' '),               # selected punctuation becomes a space
        # NOTE(review): the leading (?!\d) is evaluated at the '.' itself and so
        # always succeeds; in effect a '.' is removed unless a digit follows it.
        (r'(?!\d)\.(?!\d)', ''),
        ('[^A-Za-z0-9. ]+', ''),      # drop all other punctuation / symbols
        (' +', ' '),                  # collapse repeated spaces
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Apply cleaning to questions and answers
# NOTE(review): only `question` and `multiple_choice_answer` are cleaned. The
# `answers` list column is left raw, yet the top-answer list used for filtering
# later is counted from that raw column -- confirm that comparing cleaned
# `multiple_choice_answer` values against raw answer strings is intended.
df_train['question'] = df_train['question'].apply(clean_text)
df_train['multiple_choice_answer'] = df_train['multiple_choice_answer'].apply(clean_text)

df_val['question'] = df_val['question'].apply(clean_text)
df_val['multiple_choice_answer'] = df_val['multiple_choice_answer'].apply(clean_text)

# The test frame was built from the question file only, so it has just the question text to clean.
df_questions_test['question'] = df_questions_test['question'].apply(clean_text)


In [None]:
# Preview the cleaned test questions.
df_questions_test.head()

Unnamed: 0,image_id,question_id,question,multiple_choices
0,39456,394560,what color are the chairs,"[red, 4, 3, plates, brown, yellow, green, stan..."
1,39456,394561,is the man asleep,"[3, blue, 4, no, anger, white, 2, red, yes, on..."
2,39456,394562,what is on the table,"[white, on sidewalk, salt and pepper, 3, yes, ..."
3,47922,479220,how many bushes are in the background,"[blue, 1, 3, 40, dog, 10, 2, sun rays, red, ye..."
4,47922,479221,what are they playing,"[yes, soccer, on man's head, frisbee, golf, mo..."


In [None]:
# Restrict the label space to the most frequent answers so the classifier
# has a manageable number of classes. This cell only gathers the statistics;
# the actual filtering happens further down.

# Flatten the per-question answer lists into a single list of answer strings.
all_answers = [
    answer
    for answer_list in df_train['answers']
    for answer in answer_list
]

# Tally how often each distinct answer occurs and show the 20 largest counts.
answer_counts = pd.Series(all_answers).value_counts()
print("Answer Counts:\n", answer_counts.head(20))


Answer Counts:
 yes        137644
no         105104
2           34772
1           19452
red         13599
3           13445
white        7552
4            6270
blue         5849
yellow       5270
brown        5006
dog          4957
0            4873
cat          4215
5            3546
green        3025
black        2854
sitting      2765
gray         2536
wine         2420
Name: count, dtype: int64


In [None]:
# Calculate the total number of data points (total number of answers in the dataset)
num_data_points = len(all_answers)

# Calculate the frequency of each answer
# (recomputed here so this cell does not depend on the previous cell's `answer_counts`)
answer_counts = pd.Series(all_answers).value_counts()

# Normalize the frequencies to percentages (coverage per answer)
answer_percentage = (answer_counts / num_data_points) * 100

# Calculate the cumulative sum of percentages to see how much data each set of answers covers
cumulative_coverage = answer_percentage.cumsum()

# Display cumulative coverage to understand the coverage percentages
print("Cumulative Coverage (Percentage):\n", cumulative_coverage.head(20))  # Check top 20 values

# Keep the most frequent answers whose cumulative coverage is at most 85%.
# Because of the `<=`, the selected set covers up to (not at least) 85% of all answers.
top_answers_coverage = cumulative_coverage[cumulative_coverage <= 85]

# Get the actual top answers corresponding to 85% coverage
top_answers = top_answers_coverage.index

# Output the results
print(f"Number of top answers covering 85% of data: {len(top_answers)}")
print(f"Top answers covering 85% of data:\n{top_answers}")


Cumulative Coverage (Percentage):
 yes        22.940667
no         40.458000
2          46.253333
1          49.495333
red        51.761833
3          54.002667
white      55.261333
4          56.306333
blue       57.281167
yellow     58.159500
brown      58.993833
dog        59.820000
0          60.632167
cat        61.334667
5          61.925667
green      62.429833
black      62.905500
sitting    63.366333
gray       63.789000
wine       64.192333
Name: count, dtype: float64
Number of top answers covering 85% of data: 181
Top answers covering 85% of data:
Index(['yes', 'no', '2', '1', 'red', '3', 'white', '4', 'blue', 'yellow',
       ...
       'none', 'tea set', 'window', 'old', 'mantle', 'water', 'lady', 'nest',
       'see saw', 'bottle'],
      dtype='object', length=181)


In [None]:
# Apply filtering to training dataset
# Keep only the rows whose (cleaned) multiple_choice_answer is one of the top answers.
df_train_filtered = df_train[df_train['multiple_choice_answer'].isin(top_answers)]
print(df_train_filtered.shape)

# Apply filtering to validation dataset
# The same train-derived answer list is reused, so val shares the train label space.
df_val_filtered = df_val[df_val['multiple_choice_answer'].isin(top_answers)]
print(df_val_filtered.shape)


(53389, 8)
(26769, 8)


In [None]:
!pip install tensorflow



In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
import numpy as np
import pandas as pd

## 2. Feature Engineering

**Image features extracted using ResNet50**


In [None]:
# Extract image features using a pretrained ResNet50 model from PyTorch.
# ResNet50 is a deep convolutional neural network known for its strong performance in image recognition.
# The extracted features will serve as the visual input for our VQA model.

import os
import torch
import torchvision.transforms as transforms
from torchvision import models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
from tqdm import tqdm

# Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ResNet50 (pretrained on ImageNet)
resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
resnet.fc = torch.nn.Identity()  # Remove classification layer (output is 2048-D feature vector)
resnet = resnet.to(device)
resnet.eval()  # Inference mode: the network is used purely as a fixed feature extractor

# Define image transformations (resize, normalize)
# Every image is resized to 224x224, converted to a tensor and normalised
# channel-wise with the mean/std constants given below.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Function to find image file
def find_image_file(directory, image_id):
    """Return the path of the PNG belonging to ``image_id`` in ``directory``.

    The file name used to be hard-coded to the *train* split
    ("abstract_v002_train2015_<id>.png"), so a lookup in a val/test folder
    whose files are named after their own split could never succeed and the
    caller silently fell back to an all-zero image. The split prefix is now
    taken from the folder name, with the train pattern and a wildcard search
    kept as fallbacks.

    Parameters:
        directory (str): Folder that holds the images of one split.
        image_id (int or str): Image id; it is zero-padded to 12 digits.

    Returns:
        str or None: Path of the image file, or None when no file matches.
    """
    import glob  # local import so this cell does not depend on another import cell

    id_part = f"{int(image_id):012d}.png"

    # The train folder is "scene_img_<prefix>" and its files "<prefix>_<id>.png";
    # presumably val/test follow the same scheme (the wildcard below covers
    # the case where they do not).
    folder_name = os.path.basename(os.path.normpath(directory))
    prefixes = []
    if folder_name.startswith("scene_img_"):
        prefixes.append(folder_name[len("scene_img_"):])
    if "abstract_v002_train2015" not in prefixes:
        prefixes.append("abstract_v002_train2015")  # original hard-coded pattern

    for prefix in prefixes:
        candidate = os.path.join(directory, f"{prefix}_{id_part}")
        if os.path.exists(candidate):
            return candidate

    # Last resort: any file in the folder that ends with "_<12-digit id>.png".
    matches = sorted(glob.glob(os.path.join(glob.escape(directory), f"*_{id_part}")))
    return matches[0] if matches else None

# Custom Dataset Class
class ImageDataset(Dataset):
    """Dataset yielding ``(image, image_id)`` pairs for a sequence of image ids.

    Parameters:
        image_ids: Indexable sequence of image ids.
        directory (str): Folder searched for each image via ``find_image_file``.
        transform (callable, optional): Applied to the loaded RGB PIL image.
    """

    def __init__(self, image_ids, directory, transform=None):
        self.image_ids, self.directory, self.transform = image_ids, directory, transform

    def __len__(self):
        """Number of image ids in the dataset."""
        return len(self.image_ids)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image for position ``idx`` and its id."""
        image_id = self.image_ids[idx]
        img_path = find_image_file(self.directory, image_id)

        if not img_path:
            # No file for this id: hand back an all-zero 3x224x224 placeholder.
            return torch.zeros((3, 224, 224)), image_id

        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, image_id

# Function to extract features using DataLoader
def extract_features(image_ids, directory, batch_size=64):
    """Run every image through the module-level ``resnet`` and collect the outputs.

    Parameters:
        image_ids: Sequence of image ids to process.
        directory (str): Folder containing the images.
        batch_size (int): Number of images per forward pass.

    Returns:
        dict: Maps each image id (as yielded by the batch id tensor) to the
        NumPy feature vector computed for that image. Entries are inserted in
        loader order, which follows ``image_ids`` because shuffling is off.
    """
    loader = DataLoader(
        ImageDataset(image_ids, directory, transform=transform),
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
    )

    features_dict = {}

    # Gradients are not needed for pure feature extraction.
    with torch.no_grad():
        for batch_images, batch_ids in tqdm(loader, desc="Extracting Features"):
            batch_features = resnet(batch_images.to(device)).cpu().numpy()
            # Pair every id of the batch with its feature row.
            features_dict.update(zip(batch_ids.numpy(), batch_features))

    return features_dict

# Convert image IDs to integer
# Several questions share an image, so features are extracted once per unique id.
train_image_ids = df_train_filtered['image_id'].astype(int).unique()
val_image_ids = df_val_filtered['image_id'].astype(int).unique()
test_image_ids = df_questions_test['image_id'].astype(int).unique()

# Extract features for the train, val and test sets
train_features = extract_features(train_image_ids, IMAGE_DIR_TRAIN, batch_size=64)
val_features = extract_features(val_image_ids, IMAGE_DIR_VAL, batch_size=64)
test_features = extract_features(test_image_ids, IMAGE_DIR_TEST, batch_size=64)

# Convert to numpy arrays
# Only the dict values are kept: row i belongs to the i-th id of the matching
# *_image_ids array above. The ids themselves are NOT stored in the arrays.
train_feature_array = np.array(list(train_features.values()))
val_feature_array = np.array(list(val_features.values()))
test_feature_array = np.array(list(test_features.values()))

print(f"Train feature shape: {train_feature_array.shape}")
print(f"Val feature shape: {val_feature_array.shape}")
print(f"Test feature shape: {test_feature_array.shape}")

# Save features as numpy files
# Relative paths: the files land in the current working directory.
np.save("train_features.npy", train_feature_array)
np.save("val_features.npy", val_feature_array)
np.save("test_features.npy", test_feature_array)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 170MB/s]
Extracting Features: 100%|██████████| 312/312 [44:46<00:00,  8.61s/it]
Extracting Features: 100%|██████████| 157/157 [00:30<00:00,  5.10it/s]


Train feature shape: (19961, 2048)
Val feature shape: (9986, 2048)


**Text embeddings generated using BERT**

In [None]:
# Generate text embeddings for questions using a pretrained BERT model from Hugging Face Transformers.
# BERT is a powerful transformer-based model that captures contextual information in text.
# These embeddings will represent the textual input for our VQA model.

from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from tqdm import tqdm

def encode_text_with_bert(df, text_column, batch_size=32):
    """
    Encodes text using BERT in smaller batches to prevent memory crashes.

    Each text is represented by the mean of its token embeddings. Padding
    positions are excluded from that mean via the attention mask, so the
    embedding of a text does not depend on which other texts share its batch.
    The model runs on the GPU when one is available.

    Parameters:
        df (DataFrame): The input dataframe
        text_column (str): Column containing text
        batch_size (int): Number of samples per batch

    Returns:
        np.ndarray: BERT embeddings of shape (num_samples, 768); an empty
        (0, 768) array when ``df`` has no rows.
    """

    texts = df[text_column].tolist()
    if not texts:
        # np.vstack([]) would raise; return an empty result of the documented width.
        return np.empty((0, 768), dtype=np.float32)

    # Load BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(run_device)
    model.eval()  # Set model to evaluation mode

    embeddings_list = []

    # One tqdm step per batch, i.e. ceil(len(texts) / batch_size) steps in total.
    for start in tqdm(range(0, len(texts), batch_size), desc="Encoding Questions with BERT"):
        batch_texts = texts[start:start + batch_size]

        # Tokenize the batch (padded to the longest text, truncated at 22 tokens)
        encoded_inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=22)
        encoded_inputs = {name: tensor.to(run_device) for name, tensor in encoded_inputs.items()}

        # Get embeddings
        with torch.no_grad():
            outputs = model(**encoded_inputs)
            hidden = outputs.last_hidden_state
            # Mask-aware mean pooling: zero out padding positions and divide by
            # the number of real tokens instead of the padded sequence length.
            mask = encoded_inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = (hidden * mask).sum(dim=1) / token_counts

        embeddings_list.append(batch_embeddings.cpu().numpy())  # Move to CPU before converting to numpy

    return np.vstack(embeddings_list)  # Stack all batches

# Encode the question text of each split. Every call loads the tokenizer and
# model again, because loading happens inside encode_text_with_bert.

# For filtered train set
bert_embeddings_train = encode_text_with_bert(df_train_filtered, 'question')
print("BERT Embeddings Shape for filtered train set:", bert_embeddings_train.shape)  # Should be (num_samples, 768)

# For filtered Validation set
bert_embeddings_val = encode_text_with_bert(df_val_filtered, 'question')
print("BERT Embeddings Shape for filtered validation set:", bert_embeddings_val.shape)  # Should be (num_samples, 768)

# For test set
# The test questions are not filtered by answer, so every test question is encoded.
bert_embeddings_test = encode_text_with_bert(df_questions_test, 'question')
print("BERT Embeddings Shape for test set:", bert_embeddings_test.shape)  # Should be (num_samples, 768)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Encoding Questions with BERT: 100%|██████████| 1669/1669 [33:26<00:00,  1.20s/it]


BERT Embeddings Shape: (53389, 768)


Encoding Questions with BERT: 100%|██████████| 837/837 [15:58<00:00,  1.14s/it]

BERT Embeddings Shape: (26769, 768)





In [None]:
# Save embeddings
# NOTE(review): these are written to absolute /content/ paths, while later cells
# load them by bare file name -- that presumably relies on the working
# directory being /content; confirm before running elsewhere.
np.save("/content/bert_embeddings_train.npy", bert_embeddings_train)
np.save("/content/bert_embeddings_val.npy", bert_embeddings_val)
np.save("/content/bert_embeddings_test.npy", bert_embeddings_test)

**Answer Encoding**

In [None]:
# Encode the answer options into a numerical format suitable for training.
# Label encoding converts string answers to numerical labels, and one-hot encoding creates binary vectors.
# This representation is necessary for training classification models.

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Step 1: Label Encode Answers
label_encoder = LabelEncoder()

# Fit on all unique answers in both train and validation sets
# (so that transforming the validation answers cannot hit an unseen label)
all_answers = np.concatenate((df_train_filtered['multiple_choice_answer'].values,
                              df_val_filtered['multiple_choice_answer'].values))
label_encoder.fit(all_answers)

# Transform train & val answers
train_integer_encoded = label_encoder.transform(df_train_filtered['multiple_choice_answer'].values)
val_integer_encoded = label_encoder.transform(df_val_filtered['multiple_choice_answer'].values)

# Step 2: One-Hot Encoding
# Ensure OneHotEncoder knows all possible labels
# The categories are fixed to 0..num_classes-1, so the one-hot width is always
# num_classes even if some class never appears in the training labels.
num_classes = len(label_encoder.classes_)
onehot_encoder = OneHotEncoder(sparse_output=False, categories=[np.arange(num_classes)])

# Reshape before encoding
# OneHotEncoder expects a 2-D (n_samples, 1) column, not a flat vector.
train_integer_encoded = train_integer_encoded.reshape(-1, 1)
val_integer_encoded = val_integer_encoded.reshape(-1, 1)

# Fit-transform train and transform validation
train_one_hot = onehot_encoder.fit_transform(train_integer_encoded)
val_one_hot = onehot_encoder.transform(val_integer_encoded)

# Step 3: Save Encoded Answers
np.save("train_answers.npy", train_one_hot)
np.save("val_answers.npy", val_one_hot)

# Save label mapping for decoding predictions later
# Position i in this array is the answer string of class index i.
np.save("answer_classes.npy", label_encoder.classes_)

# Print output shapes
print(f"Number of unique answers: {num_classes}")  # Should be 181
print(f"Train answers shape: {train_one_hot.shape}")  # (num_train_samples, 181)
print(f"Val answers shape: {val_one_hot.shape}")  # (num_val_samples, 181)


Number of unique answers: 181
Train answers shape: (53389, 181)
Val answers shape: (26769, 181)


### 2.1 Load all the features saved as NumPy arrays

In [None]:
# Load the pre-extracted image features (the question embeddings are loaded in later cells).
# This retrieves the saved feature arrays from the previous steps.
# Loading these features allows us to proceed with model training without re-extracting them.
# Note: this rebinds train_features / val_features from the dicts returned by
# extract_features to plain arrays with one row per unique image (not per question).

train_features = np.load("train_features.npy")  # (num_unique_train_images, 2048)
val_features = np.load("val_features.npy")  # (num_unique_val_images, 2048)

**Map the image features to each question row so that the image and question arrays are aligned row by row**

In [None]:
# Map image features to each question row based on image ID.
# This ensures that each question in the filtered DataFrames is associated with its corresponding image features.
#
# train_features / val_features hold ONE row per unique image, in the order of
# `image_id.astype(int).unique()` that was used when the features were
# extracted. The lookup keys must be rebuilt the same way: zipping the arrays
# with the per-question `image_id` column (which repeats every id once per
# question) would attach features to the wrong images.

def _build_feature_lookup(df, feature_array):
    """Return ``{image_id: feature_row}`` for the unique image ids of ``df``.

    Raises:
        ValueError: If the number of unique ids differs from the number of
            feature rows, i.e. the array was not extracted from this frame.
    """
    unique_ids = df['image_id'].astype(int).unique().tolist()
    if len(unique_ids) != len(feature_array):
        raise ValueError(
            f"Expected {len(unique_ids)} feature rows (one per unique image_id), "
            f"got {len(feature_array)}"
        )
    return dict(zip(unique_ids, feature_array))

#Load features from filtered image IDs
train_image_features = _build_feature_lookup(df_train_filtered, train_features)  # Create a dictionary for train features
val_image_features = _build_feature_lookup(df_val_filtered, val_features)  # Create a dictionary for val features

# Function to get the image feature for each question row
def get_image_features_for_questions(df, image_features_dict):
    """Look up the image feature vector for every row of ``df``.

    Parameters:
        df (DataFrame): Frame with an ``image_id`` column.
        image_features_dict (dict): Maps image id to its feature vector.

    Returns:
        np.ndarray: One feature vector per row of ``df``, in row order; ids
        missing from the dict get a 2048-long zero vector.
    """
    per_row_features = [
        image_features_dict.get(image_id, np.zeros(2048))  # zero vector when the id is unknown
        for image_id in df['image_id']
    ]
    return np.array(per_row_features)

# Get the image features for the filtered train and val question rows
# The results have one row per question, repeating an image's features for
# every question asked about that image.
train_image_features_for_questions = get_image_features_for_questions(df_train_filtered, train_image_features)
val_image_features_for_questions = get_image_features_for_questions(df_val_filtered, val_image_features)

# Print out the shapes to confirm they match the number of rows
print(f"Train Image Features Shape: {train_image_features_for_questions.shape}")
print(f"Val Image Features Shape: {val_image_features_for_questions.shape}")

Train Image Features Shape: (53389, 2048)
Val Image Features Shape: (26769, 2048)


In [None]:
# Reload the saved train question embeddings and one-hot answers (bare file
# names, so they are resolved against the current working directory).
train_question_embeddings = np.load("bert_embeddings_train.npy")  # (num_train_samples, 768)
train_answers = np.load("train_answers.npy")

In [None]:
# Reload the saved validation question embeddings and one-hot answers.
val_question_embeddings = np.load("bert_embeddings_val.npy")  # (num_val_samples, 768)
val_answers = np.load("val_answers.npy")  # (num_val_samples, 181)