In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import time
import torch
import pandas as pd
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering
from tqdm import tqdm


# CONFIGURATION
# Every tunable value for the BLIP KV-cache benchmark in this cell lives here.

# Location of the fine-tuned BLIP VQA checkpoint; it is passed to
# BlipForQuestionAnswering.from_pretrained() further down in this cell.
MODEL_PATH = "/kaggle/input/fine-tuned-blip-vqa-lora/fine_tuned_blip_vqa_lora"

# Data paths
# VQA_CSV_PATH   : question CSV; this cell reads its `image_id` and `question` columns.
# ABO_META_PATH  : image metadata CSV; supplies the `image_id` -> `path` mapping used in the merge.
# ABO_IMAGE_PATH : root directory that each metadata `path` value is joined onto.
VQA_CSV_PATH = "/kaggle/input/vqadataset/VQADataset.csv"
ABO_META_PATH = "/kaggle/input/abo-small/metadata/images.csv"
ABO_IMAGE_PATH = "/kaggle/input/abo-small/small"

# Upper bound on the number of rows kept for the benchmark (applied with DataFrame.head()).
# The dataset may yield fewer rows than this after rows without an image path are dropped.
NUM_SAMPLES = 10000


# LOAD MODEL & PROCESSOR

# Pick the execution device up front: CUDA when it is usable, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor is loaded from the "Salesforce/blip-vqa-base" identifier,
# while the model weights come from the fine-tuned checkpoint at MODEL_PATH.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained(MODEL_PATH)

# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device).eval()


# LOAD DATA

vqa_raw = pd.read_csv(VQA_CSV_PATH)
abo_meta = pd.read_csv(ABO_META_PATH)

# Attach each question's image path from the ABO metadata, drop questions whose
# image_id has no path, renumber the rows, and keep at most NUM_SAMPLES of them.
vqa_df = (
    vqa_raw
    .merge(abo_meta[['image_id', 'path']], on='image_id', how='left')
    .dropna(subset=["path"])
    .reset_index(drop=True)
    .head(NUM_SAMPLES)
)


# INFERENCE FUNCTIONS

def run_inference(use_cache: bool):
    """Generate an answer for every row of ``vqa_df`` and time the generation step.

    For each row the image under ``ABO_IMAGE_PATH`` is opened, combined with the
    question by ``processor`` and passed to ``model.generate`` (greedy decoding,
    ``max_length=32``).

    Only the ``model.generate`` call is timed. Image loading and preprocessing are
    identical for both settings of ``use_cache`` and are affected by disk caching
    between runs, so including them would blur the comparison this benchmark is
    meant to make.

    Args:
        use_cache: Forwarded to ``model.generate(use_cache=...)``.

    Returns:
        tuple: ``(answers, avg_time)`` where ``answers`` is the list of decoded
        answer strings in row order and ``avg_time`` is the mean wall-clock
        seconds spent in ``model.generate`` per sample (``0.0`` when ``vqa_df``
        has no rows).
    """
    answers = []
    generation_seconds = 0.0
    failed_images = 0
    # Synchronisation is only meaningful (and only valid) when running on CUDA.
    on_gpu = device.type == "cuda"

    for _, row in tqdm(vqa_df.iterrows(), total=len(vqa_df), desc=f"{'With' if use_cache else 'Without'} KV Cache"):
        image_path = os.path.join(ABO_IMAGE_PATH, row['path'])
        question = row['question']

        try:
            image = Image.open(image_path).convert("RGB")
        except Exception:
            # Best-effort: keep the benchmark running with a blank image, but
            # count the failure so it is reported instead of silently ignored.
            failed_images += 1
            image = Image.new("RGB", (224, 224), (0, 0, 0))

        inputs = processor(image, question, return_tensors="pt").to(device)

        # CUDA work is queued asynchronously; wait for pending work before and
        # after the call so the clock brackets exactly the generation step.
        if on_gpu:
            torch.cuda.synchronize()
        tic = time.perf_counter()

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                pixel_values=inputs["pixel_values"],
                use_cache=use_cache,
                max_length=32,
                num_beams=1,
                do_sample=False
            )

        if on_gpu:
            torch.cuda.synchronize()
        generation_seconds += time.perf_counter() - tic

        answer = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        answers.append(answer)

    if failed_images:
        print(f"[warn] {failed_images} image(s) could not be loaded and were replaced by a blank image")

    # Guard against an empty dataset, which would otherwise divide by zero.
    avg_time = generation_seconds / len(answers) if answers else 0.0
    return answers, avg_time



# RUN BENCHMARK

# Ask PyTorch to release cached, unused GPU memory before each pass.
torch.cuda.empty_cache()
answers_no_cache, time_no_cache = run_inference(use_cache=False)

torch.cuda.empty_cache()
answers_with_cache, time_with_cache = run_inference(use_cache=True)

# RESULTS

# Compare the two runs answer by answer.
num_same = sum(a1 == a2 for a1, a2 in zip(answers_no_cache, answers_with_cache))
# Report against the number of samples actually evaluated: the dataset can hold
# fewer than NUM_SAMPLES rows once rows without an image path are dropped.
num_evaluated = len(answers_no_cache)

print("\n==============================")
print(f"Average time WITHOUT KV Cache: {time_no_cache:.4f} seconds/sample")
print(f"Average time WITH KV Cache  : {time_with_cache:.4f} seconds/sample")
print(f"Answer consistency          : {num_same}/{num_evaluated} identical")
print("==============================")


Without KV Cache: 100%|██████████| 10000/10000 [17:13<00:00,  9.68it/s]
With KV Cache: 100%|██████████| 10000/10000 [16:37<00:00, 10.03it/s]


Average time WITHOUT KV Cache: 0.1033 seconds/sample
Average time WITH KV Cache  : 0.0997 seconds/sample
Answer consistency          : 10000/10000 identical





In [None]:
import os
import time
import torch
import pandas as pd
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering
from tqdm import tqdm
import logging

# Send ERROR-and-above records from the root logger to a file in the Kaggle
# working directory.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() does nothing when the root logger is already
# configured (for example by a library that set up logging earlier in the
# session), and the errors logged below would never reach this file.
logging.basicConfig(
    filename='/kaggle/working/inference_errors.log',
    level=logging.ERROR,
    format='%(asctime)s - %(message)s',
    force=True,
)


# CONFIGURATION
# Every tunable value for the ViLT inference benchmark in this cell lives here.
# NOTE(review): MODEL_PATH, the data paths and NUM_SAMPLES are also assigned in the
# BLIP cell earlier in this notebook; whichever cell ran last determines their values.

# Location of the fine-tuned ViLT VQA checkpoint; it is passed to
# ViltForQuestionAnswering.from_pretrained() further down in this cell.
MODEL_PATH = "/kaggle/input/fine-tuned-vilt-vqa-lora/fine_tuned_vilt_vqa_lora/fine_tuned_vilt_vqa_lora"

# Data paths
# VQA_CSV_PATH   : question CSV; this cell reads its `image_id` and `question` columns.
# ABO_META_PATH  : image metadata CSV; supplies the `image_id` -> `path` mapping used in the merge.
# ABO_IMAGE_PATH : root directory that each metadata `path` value is joined onto.
VQA_CSV_PATH = "/kaggle/input/vqadataset/VQADataset.csv"
ABO_META_PATH = "/kaggle/input/abo-small/metadata/images.csv"
ABO_IMAGE_PATH = "/kaggle/input/abo-small/small"

# Upper bound on the number of rows kept for the benchmark (applied with DataFrame.head()).
# The dataset may yield fewer rows than this after rows without an image path are dropped.
NUM_SAMPLES = 5000

# LOAD MODEL & PROCESSOR

# Pick the execution device up front: CUDA when it is usable, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The processor is loaded from the "dandelin/vilt-b32-finetuned-vqa" identifier,
# while the model weights come from the fine-tuned checkpoint at MODEL_PATH.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

try:
    model = ViltForQuestionAnswering.from_pretrained(MODEL_PATH)
except Exception as load_error:
    # Record the failure in the error log, then let it propagate unchanged.
    logging.error(f"Failed to load model from {MODEL_PATH}: {load_error}")
    raise

# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device).eval()

# Lookup table from class index to answer string, used to decode the logits
# (the original author notes the indices span 0-3128).
answer_vocab = model.config.id2label


# LOAD DATA

try:
    vqa_raw = pd.read_csv(VQA_CSV_PATH)
    abo_meta = pd.read_csv(ABO_META_PATH)

    # Attach each question's image path from the ABO metadata, drop questions
    # whose image_id has no path, renumber the rows, and keep at most
    # NUM_SAMPLES of them.
    vqa_df = (
        vqa_raw
        .merge(abo_meta[['image_id', 'path']], on='image_id', how='left')
        .dropna(subset=["path"])
        .reset_index(drop=True)
        .head(NUM_SAMPLES)
    )
except Exception as data_error:
    # Record the failure in the error log, then let it propagate unchanged.
    logging.error(f"Failed to load or process dataset: {data_error}")
    raise


# INFERENCE FUNCTIONS

def run_inference(run_label: str = "Standard"):
    """Predict an answer for every row of ``vqa_df`` and time the whole pass.

    For each row the image under ``ABO_IMAGE_PATH`` is opened and resized to
    384x384, encoded together with the question by ``processor`` (padded or
    truncated to 40 tokens), and passed through ``model``. The answer is the
    ``answer_vocab`` entry for the highest logit. An image that cannot be loaded
    is logged and replaced by a black 384x384 image.

    Args:
        run_label: Text shown in the progress-bar description.

    Returns:
        tuple: ``(answers, avg_time)`` where ``answers`` is the list of predicted
        answer strings in row order and ``avg_time`` is the average wall-clock
        seconds per sample for the full loop (image loading, preprocessing and
        the forward pass). ``avg_time`` is ``0.0`` when ``vqa_df`` has no rows.
    """
    answers = []
    # perf_counter() is the monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    start_time = time.perf_counter()

    for _, row in tqdm(vqa_df.iterrows(), total=len(vqa_df), desc=f"Inference ({run_label})"):
        image_path = os.path.join(ABO_IMAGE_PATH, row['path'])
        question = row['question']

        try:
            image = Image.open(image_path).convert("RGB")
            image = image.resize((384, 384), Image.Resampling.LANCZOS)  # ViLT expects 384x384
        except Exception as e:
            # Best-effort: log the failure and continue with a blank image.
            logging.error(f"Error loading image {image_path}: {e}")
            image = Image.new("RGB", (384, 384), (0, 0, 0))

        inputs = processor(
            images=image,
            text=question,
            padding="max_length",
            max_length=40,
            truncation=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                pixel_values=inputs["pixel_values"]
            )

        # Get the predicted answer from logits
        logits = outputs.logits  # Shape: [1, 3129]
        pred_idx = torch.argmax(logits, dim=-1).item()  # Get index of highest logit
        answer = answer_vocab[pred_idx]  # Map index to answer string
        answers.append(answer)

    total_time = time.perf_counter() - start_time
    # Guard against an empty dataset, which would otherwise divide by zero.
    avg_time = total_time / len(answers) if answers else 0.0
    return answers, avg_time


# RUN BENCHMARK

# Two passes over the same data with the same model; before each one, ask
# PyTorch to release cached, unused GPU memory.
torch.cuda.empty_cache()
answers_run1, time_run1 = run_inference(run_label="Run 1")

torch.cuda.empty_cache()
answers_run2, time_run2 = run_inference(run_label="Run 2")


# RESULTS

# Compare the two runs answer by answer.
num_same = sum(a1 == a2 for a1, a2 in zip(answers_run1, answers_run2))
# Report against the number of samples actually evaluated: the dataset can hold
# fewer than NUM_SAMPLES rows once rows without an image path are dropped.
num_evaluated = len(answers_run1)

# Save results to CSV
results_df = pd.DataFrame({
    'question': vqa_df['question'],
    'answer_run1': answers_run1,
    'answer_run2': answers_run2
})
results_df.to_csv('/kaggle/working/inference_results.csv', index=False)

print("\n==============================")
print(f"Average time (Run 1): {time_run1:.4f} seconds/sample")
print(f"Average time (Run 2): {time_run2:.4f} seconds/sample")
print(f"Answer consistency  : {num_same}/{num_evaluated} identical")
print("Inference results saved to /kaggle/working/inference_results.csv")
print("==============================")

Inference (Run 1): 100%|██████████| 5000/5000 [32:22<00:00,  2.57it/s]
Inference (Run 2): 100%|██████████| 5000/5000 [32:13<00:00,  2.59it/s]


Average time (Run 1): 0.3885 seconds/sample
Average time (Run 2): 0.3868 seconds/sample
Answer consistency  : 5000/5000 identical
Inference results saved to /kaggle/working/inference_results.csv



