### Install Required Libraries

In [None]:
# Install necessary libraries (-q keeps pip output quiet).
# transformers provides the ViltProcessor / ViltForQuestionAnswering classes
# imported in a later cell.
# NOTE(review): no cell visible in this notebook imports anything from the
# DETR repository or uses accelerate directly — confirm both are still needed.
!pip install transformers accelerate git+https://github.com/facebookresearch/detr.git -q
# torch is imported by later cells for device selection and inference.
# NOTE(review): torchvision is not imported by any visible cell — presumably
# an indirect dependency; confirm.
!pip install torch torchvision -q


In [None]:
# Mount Google Drive to access image folder and CSV file.
# Later cells read and write files under /content/drive/MyDrive/, so this
# cell has to run before them. google.colab is only importable inside a
# Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

### Import Python Libraries

In [None]:
# Import necessary libraries
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image
import torch
import pandas as pd
import os
import re
from tqdm import tqdm


### Load ViLT Processor and Model

In [None]:
# Single checkpoint name so the processor and the model are always loaded
# from the same pretrained weights.
vilt_checkpoint = "dandelin/vilt-b32-finetuned-vqa"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Processor turns an (image, question) pair into model inputs.
processor = ViltProcessor.from_pretrained(vilt_checkpoint)

# Load the VQA model, then move its weights to the selected device.
model = ViltForQuestionAnswering.from_pretrained(vilt_checkpoint)
model = model.to(device)


### Load CSV and Format Questions

In [None]:
# Load the question CSV from Google Drive.
csv_file_path = '/content/drive/MyDrive/VR_proj2/proper_result_2_final.csv'  # Modify path accordingly
# Destination for the prompt-augmented copy; defined once so the save call and
# the status message below cannot drift apart.
output_csv_path = "/content/drive/MyDrive/VR_proj2/test_1_oneword.csv"
df = pd.read_csv(csv_file_path)

# Prefix every question with a one-word-answer instruction.
# The vectorised .str accessor is used instead of apply(lambda q: q.strip())
# because pandas reads empty cells as float NaN, which has no .strip() and
# would abort the whole cell; with .str the missing questions simply stay NaN.
df['question'] = "Answer in one word: " + df['question'].str.strip()

# Save the updated questions to a new CSV (the source file is left untouched).
df.to_csv(output_csv_path, index=False)
print(f"Updated CSV saved as {os.path.basename(output_csv_path)} with 'Answer in one word' prompt.")


### Run ViLT Model on Each Image-Question Pair

In [None]:
# Row window handled by this run. end_idx is exclusive, so the rows processed
# are start_idx .. end_idx - 1 (positions in df, not index labels).
start_idx = 50000
end_idx = 60000
# .copy() gives an independent frame, so adding the answer column below does
# not write into a slice of df (which triggers SettingWithCopyWarning).
subset_df = df.iloc[start_idx:end_idx].copy()
results = []

# Folder that the CSV's relative 'path' values are resolved against.
image_root = "/content/drive/MyDrive/VR_proj2/abo-images-small"
# Number of text positions the model's embedding supports; questions are
# truncated to this length so long ones do not fail inside the forward pass.
max_text_len = model.config.max_position_embeddings
# Compiled once: strips every character that is not a letter, digit or '_'.
# Note this also removes spaces, so multi-word labels are joined together.
non_word = re.compile(r"[^\w]")

# Process each row; exactly one entry is appended to results per row so the
# list lines up with subset_df when it is attached as a column afterwards.
for _, row in tqdm(subset_df.iterrows(), total=len(subset_df), desc="Running VILT on selected rows"):
    try:
        # Built inside the try block so a missing/NaN 'path' value is recorded
        # as an error for that row instead of aborting the whole run.
        image_path = os.path.join(image_root, row['path'])
        if not os.path.exists(image_path):
            results.append("image_not_found")
            continue

        # Load image; the context manager closes the file handle right away
        # (convert() returns a new, fully loaded image that outlives it).
        with Image.open(image_path) as img:
            image = img.convert("RGB")
        question = row["question"]

        # Prepare inputs
        inputs = processor(
            images=image,
            text=question,
            return_tensors="pt",
            truncation=True,
            max_length=max_text_len,
        ).to(device)

        # Forward pass without gradient tracking (inference only)
        with torch.no_grad():
            outputs = model(**inputs)

        # Index of the highest-scoring answer class. Kept under its own name
        # so it does not overwrite any loop variable.
        label_id = outputs.logits.argmax(dim=-1).item()

        # Use id2label mapping to get the answer, then remove punctuation
        answer = non_word.sub("", model.config.id2label[label_id])

        # Checked after cleaning so a label that becomes empty once
        # punctuation is stripped is also reported as no_answer.
        if not answer:
            answer = "no_answer"

    except Exception as e:
        # Best effort: record the failure for this row and keep going.
        answer = f"error: {e}"

    results.append(answer)

# Save results to a new CSV named after the processed row window
subset_df['vilt_answer'] = results
subset_df.to_csv(f"/content/drive/MyDrive/VR_proj2/vilt_vqa_results_{start_idx}_{end_idx}.csv", index=False)
