# CS 5330: Final Project
### VQA fine-tuning of BLIP on the VizWiz dataset
- *Model link*: https://huggingface.co/Salesforce/blip-vqa-base
- *Dataset link*: https://www.kaggle.com/datasets/lhanhsin/vizwiz

Preprocessing 

In [None]:
import numpy as np 
import pandas as pd 

In [None]:
import kagglehub

# Download the dataset through kagglehub; the value it returns is kept in
# `path` and printed below.
dataset_handle = "lhanhsin/vizwiz"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# IPython shell escape: run nvidia-smi and show its output in the cell.
! nvidia-smi

In [None]:
# NOTE(review): sends SIGKILL to a hard-coded PID (presumably a stale GPU process
# from an earlier session — confirm). Re-running this cell unchanged may target
# an unrelated process; look the PID up again before using it.
!kill -9 50631  

In [None]:
import json
import os

# Derive every location from the directory the kagglehub download cell stored
# in `path`, rather than pinning one user's home directory and dataset version.
# os.path.join(path, '') guarantees the trailing separator that the string
# concatenations elsewhere in this notebook rely on; it is idempotent, so
# re-running this cell is safe.
path = os.path.join(path, '')
train_annotations_path = os.path.join(path, "Annotations", "Annotations", "train.json")
val_annotations_path = os.path.join(path, "Annotations", "Annotations", "val.json")
image_dir = "train/train"

In [None]:
def _load_annotations(json_path):
    """Read one annotation JSON file and return the parsed object."""
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Parsed annotations are kept in `train` / `val`; the DataFrames are built
# directly from them.
train = _load_annotations(train_annotations_path)
train_df = pd.DataFrame(train)

val = _load_annotations(val_annotations_path)
val_df = pd.DataFrame(val)

In [None]:
def get_most_confident_answer(answers):
    """Pick the answer text of the entry with the largest 'answer_confidence'.

    Args:
        answers: Expected to be a list of dicts carrying 'answer' and
            'answer_confidence' keys.

    Returns:
        The 'answer' of the entry whose 'answer_confidence' compares highest
        (the first such entry when several tie), or "" when `answers` is not
        a list or is empty.
    """
    if not isinstance(answers, list) or len(answers) == 0:
        return ""

    def confidence_of(entry):
        return entry['answer_confidence']

    best_entry = max(answers, key=confidence_of)
    return best_entry['answer']

# Add a column holding, for each training row, the answer picked by
# get_most_confident_answer from that row's 'answers' entry.
train_df['most_confident_answer'] = train_df['answers'].apply(get_most_confident_answer)

In [None]:
# Same derived column for the validation rows.
val_df['most_confident_answer'] = val_df['answers'].apply(get_most_confident_answer)

In [None]:
# Keep only the rows whose 'answerable' flag equals 1.
# Fix: the filtered training frame used to be assigned to a misspelled name
# (`trian_df`), so `train_df` itself was never filtered.
train_df = train_df[train_df['answerable'] == 1]
val_df = val_df[val_df['answerable'] == 1]

# Full-range slices: no rows are dropped here (narrow the bounds to subsample).
train_df = train_df[:]
val_df = val_df[:]

# Retain just the columns that VQADataset reads.
val_df = val_df[['image', 'question', 'most_confident_answer']]
train_df = train_df[['image', 'question', 'most_confident_answer']]

In [None]:
import os
import requests
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import load_dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle

# Load the model and its processor from the same checkpoint name.
checkpoint = "Salesforce/blip-vqa-base"
model = BlipForQuestionAnswering.from_pretrained(checkpoint)
processor = BlipProcessor.from_pretrained(checkpoint)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

torch.cuda.empty_cache()
torch.manual_seed(42)

In [None]:
import torch
from PIL import Image
from torchvision import transforms
class VQADataset(torch.utils.data.Dataset):
    """VQA dataset backed by a pandas DataFrame of image/question/answer rows."""

    def __init__(self, df, processor, prefix, image_size=128):
        """
        Args:
            df (pd.DataFrame): DataFrame containing 'question',
                'most_confident_answer', and 'image' columns.
            processor: Hugging Face processor for tokenization and feature extraction.
            prefix: String prepended to each row's 'image' value to build the
                image file path.
            image_size: Side length of the square resize applied to each image
                before it is handed to the processor.
        """
        self.df = df
        self.processor = processor
        self.prefix = prefix
        # Resize only, and keep the image as a PIL image. Converting to a
        # [0, 1] float tensor here (ToTensor) would feed already-rescaled
        # pixels into the processor's own image pipeline (see
        # BlipImageProcessor `do_rescale`), and would not match
        # visualize_vqa_result, which passes a PIL image at inference time.
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
        ])

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the processor encoding for row `idx`, with the tokenized
        answer stored under the 'labels' key."""
        row = self.df.iloc[idx]
        question = row['question']
        answer = row['most_confident_answer']
        image_path = self.prefix + row['image']

        # Open inside a context manager so the file handle is released right
        # away; convert() returns a loaded image that outlives the handle.
        with Image.open(image_path) as img:
            image = self.transform(img.convert("RGB"))

        # Process the image and text
        encoding = self.processor(
            image, question,
            padding="max_length", max_length=8, truncation=True,
            return_tensors="pt",
        )

        # Process the answer. Explicit padding + truncation keeps every label
        # tensor at exactly max_length so default batching can stack them
        # (replaces the deprecated `pad_to_max_length` flag).
        labels = self.processor.tokenizer.encode(
            answer, max_length=8, padding="max_length", truncation=True,
            return_tensors='pt'
        )
        encoding["labels"] = labels

        # Remove only the leading batch dimension added by return_tensors="pt";
        # a bare squeeze() would also drop any other size-1 dimension.
        for k, v in list(encoding.items()):
            encoding[k] = v.squeeze(0)

        return encoding


In [None]:

import os

# Build the image prefixes with os.path.join so they are well-formed whether
# or not `path` already ends with a separator (the f-string form produced
# '//' because `path` is defined with a trailing slash). The final '' keeps
# the trailing separator that VQADataset's `prefix + filename` needs.
train_dataset = VQADataset(df=train_df,
                          processor=processor,
                          prefix=os.path.join(path, 'train', 'train', ''))
valid_dataset = VQADataset(df=val_df,
                          processor=processor,
                          prefix=os.path.join(path, 'val', 'val', ''))



In [None]:
# Fine-tune `model` on train_dataset, evaluate on valid_dataset after each
# epoch, and save the model and processor to Model/blip-finetuned-vizwiz.
batch_size = 8
# Shuffle the training data so batches are not drawn in DataFrame order;
# validation order is left fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# `verbose` is deprecated on LR schedulers; omitting it keeps the silent default.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1)

num_epochs = 1
info = []  # one (train loss, eval loss, learning rate) tuple per epoch
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    # tqdm wraps the loader directly; it takes the total from len(loader).
    for batch in tqdm(train_dataloader, desc='Training batch: ...'):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_masked = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            attention_mask=attention_masked,
                            labels=labels)

        loss = outputs.loss
        epoch_loss += loss.item()

        # Scaled backward pass / optimizer step for mixed-precision training.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    # No gradients are needed for validation; without no_grad() every forward
    # pass would still record an autograd graph and hold extra memory.
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc='Validating batch: ...'):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_masked = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                attention_mask=attention_masked,
                                labels=labels)

            eval_loss += outputs.loss.item()

    # Per-batch mean losses and the learning rate used during this epoch.
    train_loss = epoch_loss / len(train_dataloader)
    valid_loss = eval_loss / len(valid_dataloader)
    current_lr = optimizer.param_groups[0]["lr"]
    info.append((train_loss, valid_loss, current_lr))
    print("Epoch: {} - Training loss: {} - Eval Loss: {} - LR: {}".format(epoch+1, train_loss, valid_loss, current_lr))
    scheduler.step()
    model.save_pretrained("Model/blip-finetuned-vizwiz", from_pt=True)
    processor.save_pretrained("Model/blip-finetuned-vizwiz", from_pt=True)
    print("Saved model to Model/blip-finetuned-vizwiz")
print("The finetuning process is completed.")

In [None]:
# Fine-tune `model` on train_dataset, evaluate on valid_dataset after each
# epoch, save the model and processor to Model/blip-saved-model, and pickle
# the per-epoch loss / learning-rate history.
batch_size = 8
# Shuffle the training data so batches are not drawn in DataFrame order;
# validation order is left fixed.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=False, pin_memory=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# `verbose` is deprecated on LR schedulers; omitting it keeps the silent default.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1)

num_epochs = 1
info = []  # one (train loss, eval loss, learning rate) tuple per epoch
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    # tqdm wraps the loader directly; it takes the total from len(loader).
    for batch in tqdm(train_dataloader, desc='Training batch: ...'):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_masked = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            attention_mask=attention_masked,
                            labels=labels)

        loss = outputs.loss
        epoch_loss += loss.item()

        # Scaled backward pass / optimizer step for mixed-precision training.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    # No gradients are needed for validation; without no_grad() every forward
    # pass would still record an autograd graph and hold extra memory.
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc='Validating batch: ...'):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_masked = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                attention_mask=attention_masked,
                                labels=labels)

            eval_loss += outputs.loss.item()

    # Per-batch mean losses and the learning rate used during this epoch.
    train_loss = epoch_loss / len(train_dataloader)
    valid_loss = eval_loss / len(valid_dataloader)
    current_lr = optimizer.param_groups[0]["lr"]
    info.append((train_loss, valid_loss, current_lr))
    print("Epoch: {} - Training loss: {} - Eval Loss: {} - LR: {}".format(epoch+1, train_loss, valid_loss, current_lr))
    scheduler.step()

    model.save_pretrained("Model/blip-saved-model", from_pt=True)
    processor.save_pretrained("Model/blip-saved-model", from_pt=True)
    print("Saved model to Model/blip-saved-model")


# Write the history inside a context manager so the file is flushed and closed.
with open("tracking_information.pkl", "wb") as tracking_file:
    pickle.dump(info, tracking_file)
print("The finetuning process is completed.")

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

def visualize_vqa_result(processor, model, dataset, index, image_dir=None):
    """
    Visualizes the VQA results by displaying the image, question, actual answer, and predicted answer.

    Args:
        processor: BLIP processor for encoding images and text.
        model: Fine-tuned BLIP model.
        dataset: Table-like object with "image", "question", and
            "most_confident_answer" entries, read as dataset[column][index].
        index: Key of the sample to visualize. For a pandas DataFrame this is
            looked up through the column Series, i.e. by row label.
        image_dir: Directory that contains the image files. Defaults to the
            'val/val/' folder under the module-level `path`.

    Returns:
        None
    """
    import os

    if image_dir is None:
        image_dir = path + 'val/val/'
    image_path = os.path.join(image_dir, dataset["image"][index])
    question = dataset["question"][index]
    actual_answer = dataset["most_confident_answer"][index]

    # convert() returns a loaded image, so the file handle can be closed
    # before plotting and inference.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    plt.imshow(image)
    plt.axis("off")
    plt.title("Test Image")
    plt.show()

    # Put the inputs on whatever device the model lives on instead of
    # assuming CUDA (the notebook falls back to CPU when no GPU is present).
    inputs = processor(images=image, text=question, return_tensors="pt").to(model.device)
    output = model.generate(**inputs)
    predicted_answer = processor.decode(output[0], skip_special_tokens=True)

    print(f"Question: {question}")
    print(f"Actual Answer: {actual_answer}")
    print(f"Predicted Answer: {predicted_answer}")


In [None]:
# Display the val_df sample addressed by index 31 together with the model's prediction.
visualize_vqa_result(processor, model, val_df, index=31)