In [None]:
!pip install -q openai pandas tqdm gdown transformers==4.48.0 flash-attn sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/6.0 MB[0m [31m11.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/6.0 MB[0m [31m41.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.0/6.0 MB[0m [31m71.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9

In [None]:
import gdown

url = "https://drive.google.com/uc?id=1Ukyxc6e9vFNFrA6PjbPNuBUk6w1cEwRa"
output = "dataset.zip"
gdown.download(url, output, quiet=False)
!unzip -qq dataset.zip

Downloading...
From (original): https://drive.google.com/uc?id=1Ukyxc6e9vFNFrA6PjbPNuBUk6w1cEwRa
From (redirected): https://drive.google.com/uc?id=1Ukyxc6e9vFNFrA6PjbPNuBUk6w1cEwRa&confirm=t&uuid=c1156b85-86f5-4f5d-911a-18289541df81
To: /content/dataset.zip
100%|██████████| 4.99G/4.99G [00:49<00:00, 100MB/s]


In [None]:
import os
import re
import pandas as pd
from tqdm import tqdm
from PIL import Image
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoProcessor
import torch

CSV_PATH       = "vqa_dataset_test.csv"
IMAGE_DIR      = "/content/dataset/image_data/images/test"
OUTPUT_DIR     = "Phi35"
IMAGES_PER_CAT = 500
MIN_DAY        = 200
MIN_NIGHT      = 200

MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"

os.makedirs(OUTPUT_DIR, exist_ok=True)

processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    num_crops=16
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    _attn_implementation="flash_attention_2"
)

def is_sensible_question(question):
    """Return False for comparison questions that compare an object with itself.

    Only questions shaped like "Are there more X than Y?" are inspected; an
    optional trailing "s" and surrounding quotes/spaces are ignored when the
    two operands are compared. Every other question is considered sensible.
    """
    found = re.search(r"Are there more (.+?)s? than (.+?)s?\?", question, re.IGNORECASE)
    if found is None:
        return True
    # Normalise both operands the same way before comparing them.
    first, second = (found.group(idx).lower().strip(" '\"") for idx in (1, 2))
    return first != second

def find_image(root_dir, filename):
    """Search *root_dir* recursively for *filename* and return its path as a string.

    Raises FileNotFoundError when nothing matches. When several files match,
    a warning is printed and the first match found is returned.
    """
    hits = [*Path(root_dir).rglob(filename)]
    if len(hits) == 0:
        raise FileNotFoundError(f"No image named '{filename}' under '{root_dir}'")
    first_hit = hits[0]
    if len(hits) > 1:
        print(f"Warning: multiple matches for '{filename}', using '{first_hit}'")
    return str(first_hit)

# --- FILTER CATEGORIES HERE ---
categories_to_process = [
    "Object Counting",
    "Surrounding Description",
    "Object Description"
]

df = pd.read_csv(CSV_PATH)
df = df[df["category"].isin(categories_to_process)]
 # <-- Only these categories

system_prompt = (
    "You are a driving and road safety expert. Answer visual questions about road scenes and traffic objects with precision and expertise.\n\n"
    "For all spatial relationship questions, answer in multiple words, describing the spatial relation clearly.\n\n"
    "For all other questions, answer in one word only, with no period at the end. If the answer is a number, use digits. "
    "If the answer is a vehicle, be specific (for example: car, bus, truck, motorcycle, bicycle, etc).\n\n"
    "Do not provide explanations or extra context-just the answer as specified above."
)

# Run Phi-3.5-vision over a day/night-balanced sample of every category and
# write one result CSV per category into OUTPUT_DIR.

# Decoding settings are the same for every sample, so build them once.
generation_args = {
    "max_new_tokens": 64,
    "temperature": 0.0,
    "do_sample": False,
    "eos_token_id": processor.tokenizer.eos_token_id,
}

for category, group in df.groupby("category"):
    print(f"\nProcessing category: {category}")

    group = group.copy()
    # na=False: a missing filename counts as neither day nor night instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    group['is_day'] = group['filename'].str.contains('Day', na=False)
    group['is_night'] = group['filename'].str.contains('Night', na=False)

    day_rows = group[group['is_day']]
    night_rows = group[group['is_night']]

    n_day = min(len(day_rows), MIN_DAY)
    n_night = min(len(night_rows), MIN_NIGHT)

    sampled_day = day_rows.sample(n=n_day, random_state=42) if n_day > 0 else pd.DataFrame()
    sampled_night = night_rows.sample(n=n_night, random_state=42) if n_night > 0 else pd.DataFrame()

    # Drop both samples in one step: a filename containing both "Day" and
    # "Night" may be in both, and dropping the same label twice raises KeyError.
    already_taken = sampled_day.index.union(sampled_night.index)
    remaining_rows = group.drop(index=already_taken)
    # Never ask for more rows than are left; sample() raises ValueError otherwise.
    n_remaining = min(IMAGES_PER_CAT - (n_day + n_night), len(remaining_rows))
    sampled_remaining = remaining_rows.sample(n=n_remaining, random_state=42) if n_remaining > 0 else pd.DataFrame()

    sampled_rows = pd.concat([sampled_day, sampled_night, sampled_remaining])
    # A row selected by both the day and the night sample must be asked only once.
    sampled_rows = sampled_rows[~sampled_rows.index.duplicated()].reset_index(drop=True)

    results = []
    for _, row in tqdm(sampled_rows.iterrows(), total=len(sampled_rows), desc=f"VQA {category}"):
        question     = row["question"]
        ground_truth = row["answer"]

        if not is_sensible_question(question):
            continue

        try:
            image_path = find_image(IMAGE_DIR, row["filename"])
            # Close the file handle as soon as the pixels are converted.
            with Image.open(image_path) as img:
                image = img.convert("RGB")
        except Exception as e:
            print(f"Skipping {row['filename']}: {e}")
            continue

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"<|image_1|>\n{question}"}
        ]

        prompt = processor.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        inputs = processor(prompt, [image], return_tensors="pt").to(model.device)
        generate_ids = model.generate(**inputs, **generation_args)
        # Keep only the newly generated tokens (strip the echoed prompt).
        generate_ids = generate_ids[:, inputs['input_ids'].shape[-1]:]
        pred = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0].strip()

        results.append({
            "image_name":   row["filename"],
            "category":     category,
            "question":     question,
            "ground_truth": ground_truth,
            "prediction":   pred,
            # str(): the answer may be parsed by pandas as a number or NaN.
            "correct":      pred.lower() == str(ground_truth).strip().lower()
        })

    out_csv = os.path.join(OUTPUT_DIR, f"{category.replace(' ', '_')}.csv")
    pd.DataFrame(results).to_csv(out_csv, index=False)
    print(f"Results for category written to {out_csv}")

print("\nAll categories processed and results saved in", OUTPUT_DIR)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

processing_phi3_v.py:   0%|          | 0.00/22.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-vision-instruct:
- processing_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


preprocessor_config.json:   0%|          | 0.00/442 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.52k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.78k [00:00<?, ?B/s]

configuration_phi3_v.py:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-vision-instruct:
- configuration_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3_v.py:   0%|          | 0.00/88.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-vision-instruct:
- modeling_phi3_v.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/68.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.35G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]


Processing category: Object Counting


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48
VQA Object Counting:  56%|█████▌    | 278/500 [01:38<01:25,  2.59it/s]



VQA Object Counting: 100%|██████████| 500/500 [02:55<00:00,  2.85it/s]


Results for category written to Experiment_1_Category_HF_Phi35/Object_Counting.csv

Processing category: Object Description


VQA Object Description:  42%|████▏     | 208/500 [01:20<01:58,  2.47it/s]



VQA Object Description:  63%|██████▎   | 317/500 [02:02<01:09,  2.65it/s]



VQA Object Description:  66%|██████▌   | 328/500 [02:06<01:10,  2.43it/s]



VQA Object Description:  67%|██████▋   | 336/500 [02:09<01:02,  2.61it/s]



VQA Object Description:  72%|███████▏  | 362/500 [02:19<00:57,  2.41it/s]



VQA Object Description:  77%|███████▋  | 384/500 [02:28<00:43,  2.65it/s]



VQA Object Description: 100%|██████████| 500/500 [03:13<00:00,  2.58it/s]


Results for category written to Experiment_1_Category_HF_Phi35/Object_Description.csv

Processing category: Surrounding Description


VQA Surrounding Description:  42%|████▏     | 208/500 [01:29<02:03,  2.37it/s]



VQA Surrounding Description:  63%|██████▎   | 317/500 [02:10<01:09,  2.65it/s]



VQA Surrounding Description:  66%|██████▌   | 328/500 [02:15<01:04,  2.67it/s]



VQA Surrounding Description:  67%|██████▋   | 336/500 [02:18<01:00,  2.69it/s]



VQA Surrounding Description:  72%|███████▏  | 362/500 [02:28<00:51,  2.67it/s]



VQA Surrounding Description:  77%|███████▋  | 384/500 [02:37<00:45,  2.52it/s]



VQA Surrounding Description: 100%|██████████| 500/500 [03:25<00:00,  2.43it/s]

Results for category written to Experiment_1_Category_HF_Phi35/Surrounding_Description.csv

All categories processed and results saved in Experiment_1_Category_HF_Phi35





PaliGemma is a gated model, so a Hugging Face access token from an account that has been granted access to the model is required.

Log in to your Hugging Face account and open the model page below:
https://huggingface.co/google/paligemma2-3b-mix-224
Click "Acknowledge", then create an access token and store it in the Colab secrets under the name 'hf_key'.

In [None]:
import os
import re
import pandas as pd
from tqdm import tqdm
from PIL import Image
from pathlib import Path
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch
from huggingface_hub import login
from google.colab import userdata
from sklearn.metrics import classification_report, confusion_matrix

CSV_PATH       = "vqa_dataset_test.csv"
IMAGE_DIR      = "/content/dataset/image_data/images/test"
OUTPUT_DIR     = "PaliGemma"
IMAGES_PER_CAT = 500
MIN_DAY        = 200
MIN_NIGHT      = 200

# Use the mix checkpoint for VQA and general inference
MODEL_NAME = "google/paligemma2-3b-mix-224"
token = userdata.get('hf_key')

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Authenticate
login(token)

# Load processor and model with token
processor = AutoProcessor.from_pretrained(MODEL_NAME, token=token)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    token=token
)

def is_sensible_question(question):
    """Tell whether *question* is worth asking the model.

    A question of the form "Are there more X than Y?" is rejected (False) when
    X and Y are the same word once case, an optional plural "s" and surrounding
    quotes/spaces are disregarded. Anything else is accepted (True).
    """
    comparison = re.search(r"Are there more (.+?)s? than (.+?)s?\?", question, re.IGNORECASE)
    if not comparison:
        return True

    def _normalise(operand):
        # Same clean-up for both sides of the comparison.
        return operand.lower().strip(" '\"")

    same_operand = _normalise(comparison.group(1)) == _normalise(comparison.group(2))
    return not same_operand

def find_image(root_dir, filename):
    """Return the path (as str) of *filename* found anywhere under *root_dir*.

    A missing file raises FileNotFoundError. If the name occurs more than
    once, a warning is printed and the first match found is used.
    """
    candidates = list(Path(root_dir).rglob(filename))
    if candidates:
        if len(candidates) > 1:
            print(f"Warning: multiple matches for '{filename}', using '{candidates[0]}'")
        return str(candidates[0])
    raise FileNotFoundError(f"No image named '{filename}' under '{root_dir}'")

def extract_answer(output_text, question=None):
    """
    Pull the answer out of the decoded model output.

    When *question* is given and occurs in *output_text*, the text after its
    first occurrence is used; if that text still spans several lines,
    everything after its first line break is returned. Otherwise the last
    non-empty line of the output is returned (or the stripped output when
    there is none).
    """
    if question and question in output_text:
        # Text following the first occurrence of the question.
        _, _, tail = output_text.partition(question)
        tail = tail.strip()
        if "\n" not in tail:
            return tail
        # Answer sits below the first line break.
        return tail.split("\n", 1)[1].strip()

    # Fallback: walk backwards to the last line that has any content.
    for raw_line in reversed(output_text.strip().split("\n")):
        cleaned = raw_line.strip()
        if cleaned:
            return cleaned
    return output_text.strip()




# --- VQA INFERENCE CODE ---

categories_to_process = [
    "Object Counting",
    "Surrounding Description",
    "Object Description"
]

df = pd.read_csv(CSV_PATH)
df = df[df["category"].isin(categories_to_process)]

# Run PaliGemma over a day/night-balanced sample of every category and write
# one result CSV per category into OUTPUT_DIR.

# The instruction text is identical for every sample, so build it once.
system_prompt = (
    "You are a driving and road safety expert. Answer visual questions about road scenes and traffic objects with precision and expertise.\n\n"
    "For all spatial relationship questions, answer in multiple words, describing the spatial relation clearly.\n\n"
    "For all other questions, answer in one word only, with no period at the end. If the answer is a number, use digits. "
    "If the answer is a vehicle, be specific (for example: car, bus, truck, motorcycle, bicycle, etc).\n\n"
    "Do not provide explanations or extra context-just the answer as specified above.")

for category, group in df.groupby("category"):
    print(f"\nProcessing category: {category}")

    group = group.copy()
    # na=False: a missing filename counts as neither day nor night instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    group['is_day'] = group['filename'].str.contains('Day', na=False)
    group['is_night'] = group['filename'].str.contains('Night', na=False)

    day_rows = group[group['is_day']]
    night_rows = group[group['is_night']]

    n_day = min(len(day_rows), MIN_DAY)
    n_night = min(len(night_rows), MIN_NIGHT)

    sampled_day = day_rows.sample(n=n_day, random_state=42) if n_day > 0 else pd.DataFrame()
    sampled_night = night_rows.sample(n=n_night, random_state=42) if n_night > 0 else pd.DataFrame()

    # Drop both samples in one step: a filename containing both "Day" and
    # "Night" may be in both, and dropping the same label twice raises KeyError.
    already_taken = sampled_day.index.union(sampled_night.index)
    remaining_rows = group.drop(index=already_taken)
    # Never ask for more rows than are left; sample() raises ValueError otherwise.
    n_remaining = min(IMAGES_PER_CAT - (n_day + n_night), len(remaining_rows))
    sampled_remaining = remaining_rows.sample(n=n_remaining, random_state=42) if n_remaining > 0 else pd.DataFrame()

    sampled_rows = pd.concat([sampled_day, sampled_night, sampled_remaining])
    # A row selected by both the day and the night sample must be asked only once.
    sampled_rows = sampled_rows[~sampled_rows.index.duplicated()].reset_index(drop=True)

    results = []
    for _, row in tqdm(sampled_rows.iterrows(), total=len(sampled_rows), desc=f"VQA {category}"):
        question = row["question"]
        ground_truth = row["answer"]

        if not is_sensible_question(question):
            continue

        try:
            image_path = find_image(IMAGE_DIR, row["filename"])
            # Close the file handle as soon as the pixels are converted.
            with Image.open(image_path) as img:
                image = img.convert("RGB")
        except Exception as e:
            print(f"Skipping {row['filename']}: {e}")
            continue

        prompt = f"{system_prompt}\n\nanswer en {question}"
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[-1]
        output = model.generate(**inputs, max_new_tokens=64)
        # Decode only the newly generated tokens. Decoding the whole sequence
        # echoes the prompt, so an empty generation would otherwise turn the
        # prompt's last line into the "prediction".
        raw_output = processor.decode(output[0][prompt_len:], skip_special_tokens=True)
        pred = extract_answer(raw_output)

        results.append({
            "image_name":   row["filename"],
            "category":     category,
            "question":     question,
            "ground_truth": ground_truth,
            "prediction":   pred,
            # str(): the answer may be parsed by pandas as a number or NaN.
            "correct":      pred.lower() == str(ground_truth).strip().lower()
        })

    out_csv = os.path.join(OUTPUT_DIR, f"{category.replace(' ', '_')}.csv")
    pd.DataFrame(results).to_csv(out_csv, index=False)
    print(f"Results for category written to {out_csv}")

print("\nAll categories processed and results saved in", OUTPUT_DIR)




In [None]:
import os
import pandas as pd
import base64
import time
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI, RateLimitError
# from google.colab import userdata
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration
CSV_PATH       = "vqa_dataset_test.csv"
IMAGE_DIR      = "/content/dataset/image_data/images/test"
OUTPUT_DIR     = "4o"
MODEL_NAME     = "gpt-4o"
IMAGES_PER_CAT = 500
MIN_DAY        = 200
MIN_NIGHT      = 200
MAX_TOKENS     = 256
MAX_WORKERS    = 5
MAX_RETRIES    = 6
INITIAL_BACKOFF = 2

categories_to_process = [
    "Object Counting",
    "Surrounding Description",
    "Object Description"
]

system_prompt = ('''
You are a driving and road safety expert. Answer visual questions about road scenes and traffic objects with precision and expertise.
For all other questions, answer in one word only, with no period at the end. If the answer is a number, use digits.
If the answer is a vehicle, be specific (for example: car, bus, truck, motorcycle, bicycle, etc).
Do not provide explanations or extra context-just the answer as specified above.
''')

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Resolve the OpenAI key. The top-of-cell `userdata` import is commented out,
# so import it here; outside Colab fall back to the OPENAI_API_KEY variable.
try:
    from google.colab import userdata
    api_key = userdata.get('openai_api_key')
except ImportError:
    api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def find_image(root_dir: str, filename: str) -> str:
    """Find *filename* somewhere below *root_dir* and return its path.

    Raises FileNotFoundError if no file matches. With several matches a
    warning is printed and the first match found is returned.
    """
    found = [*Path(root_dir).rglob(filename)]
    match_count = len(found)
    if match_count == 0:
        raise FileNotFoundError(f"No image named '{filename}' under '{root_dir}'")
    if match_count > 1:
        print(f"Warning: multiple matches for '{filename}', using '{found[0]}'")
    return str(found[0])

def process_row(row, category):
    """Send one question/image pair to the chat model and score the reply.

    Args:
        row: Record providing the "filename", "question" and "answer" fields.
        category: Category label copied into the result.

    Returns:
        A dict with the keys image_name, category, question, ground_truth,
        prediction and correct. If no answer could be obtained, prediction is
        "RATE_LIMIT_ERROR" (HTTP 429 on every attempt) or "API_ERROR" (any
        other failure) and correct is False.

    Raises:
        FileNotFoundError: If the image cannot be located (from find_image).
    """
    img_path     = find_image(IMAGE_DIR, row["filename"])
    question     = row["question"]
    ground_truth = row["answer"]

    # Read and encode the image as base64
    with open(img_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode()

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}
            ]
        }
    ]

    failure_label = "RATE_LIMIT_ERROR"
    # Retry with exponential backoff, but only for rate-limit responses.
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                max_completion_tokens=MAX_TOKENS
            )
        except Exception as e:
            if getattr(e, 'status_code', None) == 429:
                if attempt == MAX_RETRIES - 1:
                    # No attempt left, so sleeping would only waste time.
                    break
                backoff = INITIAL_BACKOFF * (2 ** attempt)
                print(f"Rate limit hit. Retry {attempt+1}/{MAX_RETRIES} in {backoff:.1f}s...")
                time.sleep(backoff)
                continue
            print(f"Error: {e}")
            # Keep non-rate-limit failures distinguishable in the output CSV.
            failure_label = "API_ERROR"
            break

        # Scoring happens outside the try so that a parsing problem is not
        # misreported as an API failure. Guard against a missing text content.
        pred = (resp.choices[0].message.content or "").strip()
        return {
            "image_name":   row["filename"],
            "category":     category,
            "question":     question,
            "ground_truth": ground_truth,
            "prediction":   pred,
            # str(): the answer may be parsed by pandas as a number or NaN.
            "correct":      pred.lower() == str(ground_truth).strip().lower()
        }

    # No usable answer was obtained
    return {
        "image_name":   row["filename"],
        "category":     category,
        "question":     question,
        "ground_truth": ground_truth,
        "prediction":   failure_label,
        "correct":      False
    }

# Load and filter CSV
df = pd.read_csv(CSV_PATH)
df = df[df["category"].isin(categories_to_process)]

# Query the model for a day/night-balanced sample of every category, fanning
# the requests out over a thread pool, and write one CSV per category.
for category, group in df.groupby("category"):
    print(f"\nProcessing category: {category}")

    group = group.copy()
    # na=False: a missing filename counts as neither day nor night instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    group['is_day'] = group['filename'].str.contains('Day', na=False)
    group['is_night'] = group['filename'].str.contains('Night', na=False)

    day_rows = group[group['is_day']]
    night_rows = group[group['is_night']]

    n_day = min(len(day_rows), MIN_DAY)
    n_night = min(len(night_rows), MIN_NIGHT)

    sampled_day = day_rows.sample(n=n_day, random_state=42) if n_day > 0 else pd.DataFrame()
    sampled_night = night_rows.sample(n=n_night, random_state=42) if n_night > 0 else pd.DataFrame()

    # Drop both samples in one step: a filename containing both "Day" and
    # "Night" may be in both, and dropping the same label twice raises KeyError.
    already_taken = sampled_day.index.union(sampled_night.index)
    remaining_rows = group.drop(index=already_taken)
    # Never ask for more rows than are left; sample() raises ValueError otherwise.
    n_remaining = min(IMAGES_PER_CAT - (n_day + n_night), len(remaining_rows))
    sampled_remaining = remaining_rows.sample(n=n_remaining, random_state=42) if n_remaining > 0 else pd.DataFrame()

    sampled_rows = pd.concat([sampled_day, sampled_night, sampled_remaining])
    # A row selected by both the day and the night sample must be asked only once.
    sampled_rows = sampled_rows[~sampled_rows.index.duplicated()].reset_index(drop=True)

    results = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_row, row, category) for _, row in sampled_rows.iterrows()]
        # Results are collected in completion order, not in sampling order.
        for f in tqdm(as_completed(futures), total=len(futures), desc=f"VQA {category}"):
            try:
                results.append(f.result())
            except Exception as e:
                # E.g. a missing image: report it and drop that row.
                print(f"Error: {e}")

    out_csv = os.path.join(OUTPUT_DIR, f"{category.replace(' ', '_')}.csv")
    pd.DataFrame(results).to_csv(out_csv, index=False)
    print(f"Results for category written to {out_csv}")

print("\nAll categories processed and results saved in", OUTPUT_DIR)


In [None]:
import os
import pandas as pd
import base64
import time
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI, RateLimitError
# from google.colab import userdata
from concurrent.futures import ThreadPoolExecutor, as_completed

# Configuration
CSV_PATH       = "vqa_dataset_test.csv"
IMAGE_DIR      = "/content/dataset/image_data/images/test"
OUTPUT_DIR     = "4o_mini"
MODEL_NAME     = "gpt-4o-mini"
IMAGES_PER_CAT = 500
MIN_DAY        = 200
MIN_NIGHT      = 200
MAX_TOKENS     = 256
MAX_WORKERS    = 5
MAX_RETRIES    = 6
INITIAL_BACKOFF = 2

categories_to_process = [
    "Object Counting",
    "Surrounding Description",
    "Object Description"
]

system_prompt = ('''
You are a driving and road safety expert. Answer visual questions about road scenes and traffic objects with precision and expertise.
For all other questions, answer in one word only, with no period at the end. If the answer is a number, use digits.
If the answer is a vehicle, be specific (for example: car, bus, truck, motorcycle, bicycle, etc).
Do not provide explanations or extra context-just the answer as specified above.
''')

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Resolve the OpenAI key. The top-of-cell `userdata` import is commented out,
# so import it here; outside Colab fall back to the OPENAI_API_KEY variable.
try:
    from google.colab import userdata
    api_key = userdata.get('openai_api_key')
except ImportError:
    api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

def find_image(root_dir: str, filename: str) -> str:
    """Resolve *filename* to a full path by searching *root_dir* recursively.

    Raises FileNotFoundError when the file is absent. When the name is
    ambiguous, a warning is printed and the first match found wins.
    """
    results = list(Path(root_dir).rglob(filename))
    if not results:
        raise FileNotFoundError(f"No image named '{filename}' under '{root_dir}'")
    winner, *others = results
    if others:
        print(f"Warning: multiple matches for '{filename}', using '{winner}'")
    return str(winner)

def process_row(row, category):
    """Send one question/image pair to the chat model and score the reply.

    Args:
        row: Record providing the "filename", "question" and "answer" fields.
        category: Category label copied into the result.

    Returns:
        A dict with the keys image_name, category, question, ground_truth,
        prediction and correct. If no answer could be obtained, prediction is
        "RATE_LIMIT_ERROR" (HTTP 429 on every attempt) or "API_ERROR" (any
        other failure) and correct is False.

    Raises:
        FileNotFoundError: If the image cannot be located (from find_image).
    """
    img_path     = find_image(IMAGE_DIR, row["filename"])
    question     = row["question"]
    ground_truth = row["answer"]

    # Read and encode the image as base64
    with open(img_path, "rb") as f:
        img_b64 = base64.b64encode(f.read()).decode()

    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_b64}"}}
            ]
        }
    ]

    failure_label = "RATE_LIMIT_ERROR"
    # Retry with exponential backoff, but only for rate-limit responses.
    for attempt in range(MAX_RETRIES):
        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                max_completion_tokens=MAX_TOKENS
            )
        except Exception as e:
            if getattr(e, 'status_code', None) == 429:
                if attempt == MAX_RETRIES - 1:
                    # No attempt left, so sleeping would only waste time.
                    break
                backoff = INITIAL_BACKOFF * (2 ** attempt)
                print(f"Rate limit hit. Retry {attempt+1}/{MAX_RETRIES} in {backoff:.1f}s...")
                time.sleep(backoff)
                continue
            print(f"Error: {e}")
            # Keep non-rate-limit failures distinguishable in the output CSV.
            failure_label = "API_ERROR"
            break

        # Scoring happens outside the try so that a parsing problem is not
        # misreported as an API failure. Guard against a missing text content.
        pred = (resp.choices[0].message.content or "").strip()
        return {
            "image_name":   row["filename"],
            "category":     category,
            "question":     question,
            "ground_truth": ground_truth,
            "prediction":   pred,
            # str(): the answer may be parsed by pandas as a number or NaN.
            "correct":      pred.lower() == str(ground_truth).strip().lower()
        }

    # No usable answer was obtained
    return {
        "image_name":   row["filename"],
        "category":     category,
        "question":     question,
        "ground_truth": ground_truth,
        "prediction":   failure_label,
        "correct":      False
    }

# Load and filter CSV
df = pd.read_csv(CSV_PATH)
df = df[df["category"].isin(categories_to_process)]

# Query the model for a day/night-balanced sample of every category, fanning
# the requests out over a thread pool, and write one CSV per category.
for category, group in df.groupby("category"):
    print(f"\nProcessing category: {category}")

    group = group.copy()
    # na=False: a missing filename counts as neither day nor night instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    group['is_day'] = group['filename'].str.contains('Day', na=False)
    group['is_night'] = group['filename'].str.contains('Night', na=False)

    day_rows = group[group['is_day']]
    night_rows = group[group['is_night']]

    n_day = min(len(day_rows), MIN_DAY)
    n_night = min(len(night_rows), MIN_NIGHT)

    sampled_day = day_rows.sample(n=n_day, random_state=42) if n_day > 0 else pd.DataFrame()
    sampled_night = night_rows.sample(n=n_night, random_state=42) if n_night > 0 else pd.DataFrame()

    # Drop both samples in one step: a filename containing both "Day" and
    # "Night" may be in both, and dropping the same label twice raises KeyError.
    already_taken = sampled_day.index.union(sampled_night.index)
    remaining_rows = group.drop(index=already_taken)
    # Never ask for more rows than are left; sample() raises ValueError otherwise.
    n_remaining = min(IMAGES_PER_CAT - (n_day + n_night), len(remaining_rows))
    sampled_remaining = remaining_rows.sample(n=n_remaining, random_state=42) if n_remaining > 0 else pd.DataFrame()

    sampled_rows = pd.concat([sampled_day, sampled_night, sampled_remaining])
    # A row selected by both the day and the night sample must be asked only once.
    sampled_rows = sampled_rows[~sampled_rows.index.duplicated()].reset_index(drop=True)

    results = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_row, row, category) for _, row in sampled_rows.iterrows()]
        # Results are collected in completion order, not in sampling order.
        for f in tqdm(as_completed(futures), total=len(futures), desc=f"VQA {category}"):
            try:
                results.append(f.result())
            except Exception as e:
                # E.g. a missing image: report it and drop that row.
                print(f"Error: {e}")

    out_csv = os.path.join(OUTPUT_DIR, f"{category.replace(' ', '_')}.csv")
    pd.DataFrame(results).to_csv(out_csv, index=False)
    print(f"Results for category written to {out_csv}")

print("\nAll categories processed and results saved in", OUTPUT_DIR)


In [None]:
import glob
import os
import pandas as pd

# Folders to process. The names must match the OUTPUT_DIR values used by the
# inference cells exactly: the lookup is case-sensitive on Linux/Colab.
folders = ["4o", "PaliGemma", "Phi35", "4o_mini"]

for folder_path in folders:
    folder_name = os.path.basename(folder_path)
    combined_name = f"{folder_name}_combined.csv"

    # Find all per-category CSV files in the folder. The combined file of an
    # earlier run is skipped, otherwise re-running this cell would fold the
    # old combined rows back in and duplicate every result.
    csv_files = sorted(
        path for path in glob.glob(os.path.join(folder_path, "*.csv"))
        if os.path.basename(path) != combined_name
    )

    if csv_files:
        # Read and combine all CSV files
        dataframes = [pd.read_csv(file) for file in csv_files]
        combined_df = pd.concat(dataframes, ignore_index=True)
        print(f"Combined DataFrame for folder '{folder_name}':")
        print(combined_df.head())

        # Save the combined DataFrame to a new CSV file
        output_file = os.path.join(folder_path, combined_name)
        combined_df.to_csv(output_file, index=False)
        print(f"Saved combined CSV as: {output_file}")
    else:
        print(f"No CSV files found in folder: {folder_path}")

In [None]:
import os
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Directories containing the category CSVs. The names must match the
# OUTPUT_DIR values used by the inference cells exactly (case-sensitive).
CSV_DIRS = ["4o", "PaliGemma", "Phi35", "4o_mini"]

def print_and_write(text, file):
    """Echo *text* to stdout, then write it to *file* followed by a newline."""
    print(text)
    file.write("".join((text, "\n")))

def analyze_csvs_in_folders(folders):
    """Print and save accuracy statistics for every result CSV in *folders*.

    For each existing folder an 'analysis_report.txt' is (re)written inside
    it. For each '*.csv' in the folder the report holds overall accuracy,
    accuracy per category, yes/no confusion matrix and classification report,
    up to ten incorrect predictions, and the number of questions per category.
    Missing folders are reported and skipped; an error in one CSV is written
    to the report and does not stop the remaining files.

    Args:
        folders: Iterable of directory paths to analyse.
    """
    yes_no = ['no', 'yes']

    for folder in folders:
        if not os.path.exists(folder):
            print(f"❌ Folder does not exist: {folder}")
            continue

        report_txt = os.path.join(folder, 'analysis_report.txt')
        # Explicit UTF-8 so non-ASCII text in questions/predictions cannot
        # fail on a platform whose default encoding is narrower.
        with open(report_txt, 'w', encoding='utf-8') as report:
            print_and_write(f"\n{'#'*50}\nAnalyzing Folder: {folder}\n{'#'*50}\n", report)

            for csv_file in sorted(os.listdir(folder)):
                if not csv_file.endswith('.csv'):
                    continue
                file_path = os.path.join(folder, csv_file)
                sep = f"\n{'='*40}\nFile: {csv_file}\n{'='*40}\n"
                print_and_write(sep, report)
                try:
                    df = pd.read_csv(file_path)
                    df['correct'] = df['correct'].astype(bool)

                    # 1. Overall Accuracy
                    overall_accuracy = df['correct'].mean()
                    print_and_write(f"Overall Accuracy: {overall_accuracy:.2%}", report)

                    # 2. Accuracy by Category
                    cat_acc = df.groupby('category')['correct'].mean()
                    print_and_write("\nAccuracy by Category:", report)
                    print_and_write(cat_acc.to_string(), report)

                    # 3. Yes/No Questions Metrics
                    # astype(str): a column holding only numbers (e.g. counting
                    # answers) is parsed as numeric and has no .str accessor,
                    # which would abort the rest of this file's report.
                    truth = df['ground_truth'].astype(str).str.lower()
                    guess = df['prediction'].astype(str).str.lower()
                    is_yes_no = truth.isin(yes_no) & guess.isin(yes_no)
                    if is_yes_no.any():
                        print_and_write("\nYes/No Questions Metrics:", report)
                        print_and_write("Confusion Matrix:", report)
                        # Fixed labels keep the matrix 2x2 (rows/cols: no, yes)
                        # even when only one of the two answers occurs.
                        print_and_write(str(confusion_matrix(
                            truth[is_yes_no],
                            guess[is_yes_no],
                            labels=yes_no
                        )), report)
                        print_and_write("\nClassification Report:", report)
                        print_and_write(classification_report(
                            truth[is_yes_no],
                            guess[is_yes_no],
                            labels=yes_no,
                            zero_division=0
                        ), report)
                    else:
                        print_and_write("\nNo Yes/No questions for confusion matrix.", report)

                    # 4. List Incorrect Predictions
                    print_and_write("\nSome Incorrect Predictions:", report)
                    incorrect = df[~df['correct']][['image_name', 'category', 'question', 'ground_truth', 'prediction']].head(10)
                    print_and_write(incorrect.to_string(index=False), report)

                    # 5. Questions Per Category
                    print_and_write("\nQuestions per Category:", report)
                    print_and_write(df['category'].value_counts().to_string(), report)
                except Exception as e:
                    # Best effort: record the problem and move on to the next file.
                    print_and_write(f"Error processing {csv_file}: {e}", report)

        print(f"\nAnalysis complete for folder: {folder}. Stats written to {report_txt}")

# Run the analysis for all folders
analyze_csvs_in_folders(CSV_DIRS)