In [1]:
import requests
import json
import base64
import pandas as pd
import os
import cv2
import numpy as np

In [2]:
def image_to_base64(image_path):
    """Read the file at ``image_path`` in binary mode and return its contents
    base64-encoded as a text string.

    Raises whatever ``open`` raises (e.g. ``FileNotFoundError``) when the
    path cannot be read.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [3]:
# Model identifier placed in the "model" field of every chat-completion request payload.
model = "meta-llama/llama-3.2-90b-vision-instruct"

In [5]:
# Load the evaluation dataset. Rows 0-2 are later used as few-shot examples;
# the remaining rows are the ones actually sent to the model.
csv_file_path = "cleaned_vqa_gt_dataset.csv"
data = pd.read_csv(csv_file_path)
# Bare last expression so the notebook displays the available columns.
data.columns

Index(['ID', 'Query', 'Description'], dtype='object')

In [12]:
# System message sent as the first ("system" role) entry of every request payload.
# The text is runtime input to the model, so it is deliberately left byte-for-byte unchanged.
# NOTE(review): rule 4 asks for 1-2 sentence replies, yet the recorded API response is a long
# multi-section answer that ends with finish_reason "length" — the few-shot answers may be
# pulling the model away from this rule; confirm which behaviour is intended.
# NOTE(review): the two bullets under rule 1 open a quote that is never closed; verify whether
# example warning text was meant to follow.
system_prompt = """
You are a voice assistant for visually impaired Delhi Metro riders. Analyze the user's photo and query to provide safe, step-by-step navigation guidance. Follow these rules:

        1. SAFETY FIRST:
        - Immediately warn about dangers:
            • "Warning:
            • "Caution:

        2. NAVIGATION GUIDANCE:
        - Use detected objects (if available) for directions of vacant seats, elevator, ticket counter, etc.

        3. DELHI METRO FEATURES:
        - Mention specific features of the surrounding.

        4. RESPONSE FORMAT:
        - Keep responses short (1-2 sentences)
        - Always start with safety warnings if any
        - Use simple, directive language

        5. PROHIBITED ACTIONS:
        - Never say "look for" or mention colors
        - Don't describe bounding boxes, only what's inside them
        - If unsure: "Please ask metro staff."
        
        Examples are given for your reference only. Do not mention them in your response or copy from them. Examples describe the structure of response only.
"""

In [13]:
# Read the OpenRouter key from the environment so the secret never lives in the notebook
# (and therefore never ends up in version control or shared copies).
# SECURITY: this cell used to contain a literal key; treat that key as leaked and revoke it.
api_key = os.environ.get("OPENROUTER_API_KEY", "").strip()
if not api_key:
    # Fail here, in the configuration cell, rather than with a 401 on every request later.
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it in the environment that launches "
        "Jupyter before running this notebook."
    )

In [11]:
# Few-shot examples: the first three dataset rows provide the example images,
# queries and reference answers that are replayed ahead of every real query.
try:
    base64_image_1, base64_image_2, base64_image_3 = [
        image_to_base64(f"images_with_bbox/images_with_bbox/{example_id}.jpg")
        for example_id in data['ID'].iloc[:3]
    ]
except FileNotFoundError as e:
    print(f"Error loading image: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, which throws away all state instead of just stopping this cell.
    raise

# Unpacking fails loudly if the dataset has fewer than three rows.
image1_query, image2_query, image3_query = data['Query'].iloc[:3].tolist()
image1_answer, image2_answer, image3_answer = data['Description'].iloc[:3].tolist()


In [21]:
import csv

# --- Run configuration -------------------------------------------------------
IMAGE_DIR = "images_with_bbox/images_with_bbox"
API_URL = "https://openrouter.ai/api/v1/chat/completions"
output_file = "assistant_responses.csv"
# Upper bound, in seconds, for a single API call. Without a timeout `requests`
# waits indefinitely, so one stalled connection would hang the whole run.
REQUEST_TIMEOUT_SECONDS = 120


def _user_turn(text, image_b64):
    """Return a "user" chat message holding a text part and an inline base64 JPEG part."""
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": text},
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
        ],
    }


# Everything below is identical for every row, so it is built once instead of per iteration.
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}
few_shot_messages = [
    {"role": "system", "content": system_prompt},
    # Few-shot example 1
    _user_turn(f"Example Query 1: {image1_query}", base64_image_1),
    {"role": "assistant", "content": f"{image1_answer}"},
    # Few-shot example 2
    _user_turn(f"Example Query 2: {image2_query}", base64_image_2),
    {"role": "assistant", "content": f"{image2_answer}"},
    # Few-shot example 3
    _user_turn(f"Example Query 3: {image3_query}", base64_image_3),
    {"role": "assistant", "content": f"{image3_answer}"},
]

# Rows 0-2 are the few-shot examples, so querying starts at the fourth row.
# (Positional slicing matches the original `index < 3` skip for the default RangeIndex
# that read_csv produces.)
for _, row in data.iloc[3:].iterrows():
    # Double-quoted f-string: nesting the same quote type inside an f-string is a
    # SyntaxError on Python versions before 3.12.
    image_path = os.path.join(IMAGE_DIR, f"{row['ID']}.jpg")
    base64_url = image_to_base64(image_path)
    query = row['Query']

    payload = {
        "model": model,
        "temperature": 0.3,
        "frequency_penalty": 1.0,
        "presence_penalty": 0.9,
        # NOTE(review): the recorded response ends with finish_reason "length",
        # i.e. replies are being cut off at this cap — confirm that is intended.
        "max_tokens": 150,
        # Actual user query with the current image goes last, after the examples.
        "messages": few_shot_messages + [_user_turn(query, base64_url)],
    }

    try:
        response = requests.post(
            API_URL,
            headers=headers,
            data=json.dumps(payload),
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        # Treat a network failure/timeout like a non-200 reply: report it and move on
        # to the next row instead of aborting the whole run.
        print(f"Error: request for ID {row['ID']} failed: {exc}")
        continue

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        # A bounded slice of the body is usually enough to diagnose auth/quota problems.
        print(response.text[:500])
        continue

    result = response.json()
    print("API Response:")
    print(json.dumps(result, indent=2))

    choices = result.get('choices') or []
    if not choices:
        print(f"No choices returned for ID {row['ID']}; nothing saved.")
        continue

    # The API may return a null content field; store it as an empty string.
    assistant_message = choices[0]['message']['content'] or ""

    # Newlines are flattened so each record stays on one physical line. Commas are
    # kept: csv.writer quotes fields that contain them, so stripping them only
    # corrupted the text that is scored later.
    flattened_message = assistant_message.replace("\n", " ")

    # Append one row per response so partial progress survives an interrupted run.
    # The header is written when the file is missing or still empty.
    needs_header = not os.path.isfile(output_file) or os.path.getsize(output_file) == 0
    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if needs_header:
            writer.writerow(["ID", "Assistant_Response"])
        writer.writerow([row['ID'], flattened_message])

    print(f"Saved response for ID {row['ID']} to {output_file}")
    
        

API Response:
{
  "id": "gen-1745330457-Q0LqumnxCEkEgxcvazWZ",
  "provider": "Together",
  "model": "meta-llama/llama-3.2-90b-vision-instruct",
  "object": "chat.completion",
  "created": 1745330457,
  "choices": [
    {
      "logprobs": null,
      "finish_reason": "length",
      "native_finish_reason": "length",
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "**1. Immediate Safety Alert**  \nNo immediate hazards detected. You are standing safely inside the train, likely holding onto a handrail.\n\n---\n\n**2. Scene Overview**  \nYou're inside a metro train coach. The train is moderately to heavily crowded, especially near the doors. Most seats are occupied, with standing passengers holding overhead handrails and poles.\n\n---\n\n**3. Detected Objects (Based on Bounding Boxes)**  \n- **People**: 11 detected \u2013  \n  - Several standing near doors and holding overhead bars  \n  - A few seated to your right and left, mostly women  \n  - Clustere

In [6]:
# Reload the responses written by the request loop (columns: ID, Assistant_Response).
responses = pd.read_csv('assistant_responses.csv')

3     **1. Immediate Safety Alert**  \nNo immediate ...
4     **1. Immediate Safety Alert**  \nNo immediate ...
5     Immediate Safety Alert\nCAUTION: Multiple vert...
6     Immediate Safety Alert CAUTION: Center aisle i...
7     Immediate Safety Alert\nCAUTION: Train door is...
8     Immediate Safety Alert\nCAUTION: Standing pass...
9     1. Immediate Safety Alert\nCAUTION: Multiple v...
10    1. Immediate Safety Alert\r\nCAUTION: Crowded ...
11    1. **Immediate Safety Alert**  \r\nNo immediat...
12    1. **Immediate Safety Alert**  \r\nNo immediat...
13    1. **Immediate Safety Alert**  \r\nNo immediat...
14    1. **Immediate Safety Alert**  \r\nNo immediat...
15    1. **Immediate Safety Alert**  \n⚠️ No urgent ...
16    **1. Immediate Safety Alert**  \nNo urgent haz...
17    **1. Immediate Safety Alert**  \r\n✅ **Yes, th...
18    **1. Vacant Seat Locations Identified** ✅  \r\...
Name: Description, dtype: object

In [27]:
!pip install bert_score

Defaulting to user installation because normal site-packages is not writeable
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting transformers>=3.0.0 (from bert_score)
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers>=3.0.0->bert_score)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers>=3.0.0->bert_score)
  Using cached regex-2024.11.6-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers>=3.0.0->bert_score)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers>=3.0.0->bert_score)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
   ----------------

In [29]:
!pip install hf-xet

Defaulting to user installation because normal site-packages is not writeable
Collecting hf-xet
  Downloading hf_xet-1.0.3-cp37-abi3-win_amd64.whl.metadata (498 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-win_amd64.whl (4.1 MB)
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
   ------------

In [None]:
from bert_score import score


def calculate_bert_score(predictions, ground_truths):
    """Return the mean BERTScore precision, recall and F1 over all pairs.

    Args:
        predictions: list of candidate strings.
        ground_truths: list of reference strings; ``ground_truths[i]`` is the
            reference for ``predictions[i]``.

    Returns:
        A ``(precision, recall, f1)`` tuple of Python floats.

    Raises:
        ValueError: if the lists are empty or have different lengths.
    """
    if len(predictions) != len(ground_truths):
        raise ValueError(
            f"predictions ({len(predictions)}) and ground_truths ({len(ground_truths)}) "
            "must have the same length"
        )
    if not predictions:
        raise ValueError("no prediction/ground-truth pairs to score")
    P, R, F1 = score(predictions, ground_truths, lang="en", verbose=True)
    # .item() turns each mean into a plain float so it can be formatted with :.2f.
    return P.mean().item(), R.mean().item(), F1.mean().item()


# Pair each saved response with its ground truth by ID rather than by position:
# a failed request leaves a gap in the CSV and a re-run appends duplicate rows,
# either of which would misalign (or mismatch the length of) positional lists.
# The few-shot rows drop out automatically because no response is saved for them.
# NOTE(review): assumes IDs are unique in `data` — confirm.
scored_pairs = (
    responses
    .drop_duplicates(subset="ID", keep="last")  # keep the most recent answer per ID
    .merge(data[["ID", "Description"]], on="ID", how="inner")
)
predicted_answers = scored_pairs['Assistant_Response'].fillna("").astype(str).tolist()
ground_truth_answers = scored_pairs['Description'].astype(str).tolist()

precision, recall, f1_score = calculate_bert_score(predicted_answers, ground_truth_answers)
print(f"BERT Score - Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1_score:.2f}")
    

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:31<00:00, 31.58s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 12.49it/s]


done in 31.68 seconds, 0.50 sentences/sec
BERT Score - Precision: 0.85, Recall: 0.84, F1: 0.85


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:35<00:00, 35.83s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 18.80it/s]


done in 35.89 seconds, 0.45 sentences/sec


TypeError: unsupported format string passed to tuple.__format__

In [1]:
!pip install xet

Defaulting to user installation because normal site-packages is not writeable


ERROR: Could not find a version that satisfies the requirement xet (from versions: none)
ERROR: No matching distribution found for xet
