In [1]:
import os
import base64
import requests
import pandas as pd
from dotenv import load_dotenv
import ast
from tqdm.notebook import tqdm
import time

In [52]:
# Load variables from a local .env file (python-dotenv) into the process
# environment, then read the two API keys from it. Either value is None when
# the corresponding variable is not set.
load_dotenv()

OPENAI_API = os.environ.get("OPENAI_API_KEY")
CLAUDE_API = os.environ.get("CLAUDE_API_KEY")
# Do not echo the keys as a cell output: saved notebooks would leak them.

In [53]:
# Input locations.
# `image_path` and `csv_path` are relative to the notebook's working directory.
# The two remaining files live outside the repository; their locations default
# to the original author's machine but can be overridden through environment
# variables (e.g. in the .env file) so the notebook runs elsewhere unchanged.
image_path = "../dataset/images"
csv_path = "outputs/test_data_openai.csv"
full_csv = os.getenv(
    "VQA_FULL_CSV",
    "/home/sourove/code/Python/vqa-2/annotation/relevants/all/compiled_data.csv",
)
hedf_csv = os.getenv(
    "VQA_LEVEL_CSV",
    "/home/sourove/code/Python/vqa-2/vqa_next/outputs/category_level - Sheet1.csv",
)


full_df = pd.read_csv(full_csv)
he_df = pd.read_csv(hedf_csv)
test_df = pd.read_csv(csv_path)

# Left join: keep every row of `he_df` and attach the matching columns of
# `full_df` by question id.
# NOTE(review): presumably `id` is unique in `full_df`; duplicated ids would
# multiply rows here — confirm.
merged = pd.merge(he_df, full_df, how='left', on='id')
merged.head(2)

Unnamed: 0,Category,level,id,filename,question,options,answer,category
0,Food,Hard,F1CH3963,CHITTRON_3963.png,ছবিতে থাকা লোকটি কী করে?,"['কাঠের কাজ', 'রুটি বানায়', 'মাটির কাজ', 'ইটের...",রুটি বানায়,Food
1,Food,Hard,P6BN4733,BNATURE_4733.jpg,শুটকি কোন শহরে বেশি পাওয়া যায়?,"['ঢাকা', 'সিলেট', 'চট্টগ্রাম', 'যশোর']",চট্টগ্রাম,Food


In [54]:
# Attach any answers already stored in `test_df` to the level-tagged questions.
# Left join on every shared column, so rows without a stored answer keep NaN
# in the columns that come only from `test_df`.
merged_new = merged.merge(
    test_df,
    how='left',
    on=['id', 'filename', 'question', 'options', 'answer', 'category'],
    suffixes=('_df1', '_df2'),
)

# Missing-value count per column, plus the resulting frame shape.
(merged_new.isna().sum(), merged_new.shape)

(Category             0
 level                0
 id                   0
 filename             0
 question             0
 options              0
 answer               0
 category             0
 openai_response    112
 dtype: int64,
 (135, 9))

In [55]:
# Distinct values currently stored as OpenAI answers. NaN marks rows that have
# no stored answer yet; generate_answer() only calls the API for those rows.
set(merged_new['openai_response'])

{'"answer: 4"', 'answer: 1', 'answer: 2', 'answer: 3', 'answer: 4', nan}

In [56]:
def prompt_prep(question:str, options:list) -> str:
    """Build the text prompt for one multiple-choice question about an image.

    Args:
        question: Question text inserted after ``**question:**``.
        options: Answer choices. Exactly the entries at indices 0-3 are
            inserted as options 1-4, so fewer than four entries raises
            IndexError.

    Returns:
        The filled-in prompt, which instructs the model to reply in the form
        ``answer: <option number>``.

    NOTE(review): the examples inside the prompt wrap the reply in double
    quotes (`"answer: 1"`); presumably that is why some stored responses come
    back quoted — confirm that downstream parsing accepts both forms.
    """
    # The template below is runtime text sent to the model verbatim; the
    # trailing double spaces are markdown hard line breaks.
    return """

You are an expert multimodal AI assistant. You will be given an image as context. Based on this image, you will be asked a multiple-choice question with four options.  
Your task is to select the most accurate answer from the given options.  

### **Instructions:**  
- The question and options will be presented in the following format:  
    - **question:** QUESTION  
    - **options:**  
        1. option_a  
        2. option_b  
        3. option_c  
        4. option_d  

- Your response should **only contain the number** corresponding to the correct answer.  
  - Example:  
    - If **option 1 is correct**, respond with `"answer: 1"`  
    - If **option 3 is correct**, respond with `"answer: 3"`  

Now, answer the following question based on the provided image:  

**question:** {}  
**options:**  
1. {}  
2. {}  
3. {}  
4. {}  

# IMPORTANT You must follow the output format "answer: _correct_option_" only

    """.format(question, options[0], options[1],options[2], options[3])

In [57]:
import datetime

def append_to_file(file_name, data):
    """Append ``data`` as a single line to ``file_name``.

    The file is created if it does not exist. ``data`` is formatted with
    ``str()`` semantics (via an f-string) and followed by a newline; nothing
    else (no timestamp) is written.

    Args:
        file_name: Path of the text file to append to.
        data: Value to record, e.g. the cost of one API call.
    """
    with open(file_name, 'a') as file:
        file.write(f'{data}\n')

# append_to_file('openai_cost.txt', f'This is the data to append.')

### OpenAI

In [58]:
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file contents as a ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, 'utf-8')

In [60]:
def generate(question: str, image: str, options: list, imgtype: str, timeout: float = 120):
    """Ask gpt-4o-mini one multiple-choice question about an image.

    Args:
        question: Question text.
        image: Base64-encoded image data (as returned by ``encode_image``).
        options: The four answer choices, passed on to ``prompt_prep``.
        imgtype: MIME type used in the data URL, e.g. ``"image/png"``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection blocks the run forever.

    Returns:
        The decoded JSON body of the chat-completions response (a dict).

    Raises:
        requests.exceptions.HTTPError: the API answered with a 4xx/5xx status.
        requests.exceptions.Timeout: no response arrived within ``timeout``.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API}"
    }
    payload = {
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_prep(question, options)},
                    # The image travels inline as a base64 data URL.
                    {"type": "image_url", "image_url": {"url":
                        f"data:{imgtype};base64,{image}"}}
                ]
            }
        ],
        "max_tokens": 1000
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    # Fail here with the HTTP status instead of handing an error payload to
    # the caller, where it would only show up as KeyError: 'choices'.
    response.raise_for_status()

    return response.json()


In [61]:
# cost calculation
# Module-level counters, all starting at zero. Nothing in the visible notebook
# updates them.
total_cost = total_input = total_output = 0


def cost_calculation(response):
    """Compute the cost of one chat-completions response from its token usage.

    Prompt tokens are charged at .15 and completion tokens at .60 per
    1,000,000 tokens (presumably the USD list prices of gpt-4o-mini — verify
    against current pricing).

    Args:
        response: Decoded response dict containing
            ``usage.prompt_tokens`` and ``usage.completion_tokens``.

    Returns:
        The combined cost as a float.
    """
    usage = response['usage']
    input_cost = usage['prompt_tokens'] * .15 / 1000000
    output_cost = usage['completion_tokens'] * .60 / 1000000
    return input_cost + output_cost

In [62]:
def generate_answer(item):
    """Return the OpenAI answer for one question row, calling the API only if needed.

    Args:
        item: A row (Series or mapping) with the keys ``filename``,
            ``question``, ``options`` (string form of a Python list) and
            ``openai_response``.

    Returns:
        The stored ``openai_response`` when the row already has one; otherwise
        the text content of the first choice returned by ``generate``.
    """
    # Rows that already have an answer need neither the image nor the network,
    # so return before reading and encoding the file.
    if not pd.isna(item["openai_response"]):
        return item["openai_response"]

    filename = item['filename']
    # Case-insensitive, and ".jpeg" counts as JPEG too; anything else is
    # labelled PNG.
    if filename.lower().endswith((".jpg", ".jpeg")):
        imgtype = "image/jpeg"
    else:
        imgtype = "image/png"

    image = encode_image(f"{image_path}/{filename}")
    question = item["question"]
    # `options` is stored as the string form of a list; literal_eval parses it
    # without executing arbitrary code.
    options = ast.literal_eval(item["options"])

    response = generate(question, image, options, imgtype=imgtype)

    # Pause between consecutive API calls.
    time.sleep(1)
    return response['choices'][0]['message']['content']

In [49]:
# test_df = merged_new[0:10]

In [63]:
# Fill in the OpenAI answers row by row. generate_answer() returns the stored
# value for rows that already have one, so only NaN rows trigger an API call.
merged_new["openai_response"] = merged_new.apply(generate_answer, axis=1)

In [65]:
# Sanity check: number of missing values left in each column.
merged_new.isna().sum()

Category           0
level              0
id                 0
filename           0
question           0
options            0
answer             0
category           0
openai_response    0
dtype: int64

In [153]:
# test_df.to_csv("outputs/test_data_openai_cultural.csv", index=False)

In [66]:
# Save the level-tagged questions together with their OpenAI answers
# (the DataFrame index is not written).
merged_new.to_csv("outputs/level_tagged.csv", index=False)

In [None]:
## ID fix P9BO2437  P9BO2654 P2CH6109 P7BN7212 P1CH5416 BNATURE_1295 F1CH5299 P8CH228

### Claude

In [6]:
# img = f"{image_path}/CHITTRON_7834.png"

In [24]:
def encode_image_claude(image_path):
    """Return the contents of ``image_path`` as a standard-alphabet base64 string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 text (``str``) of the file's bytes.
    """
    with open(image_path, "rb") as image_file:
        payload = image_file.read()
    return base64.standard_b64encode(payload).decode('utf-8')

In [25]:
import anthropic

# Anthropic client used by generate_claude(); authenticated with the key read
# into CLAUDE_API.
client = anthropic.Anthropic(api_key=CLAUDE_API)

In [26]:
def generate_claude(question: str, filename: str, options: list,
                    max_retries: int = 3, retry_delay: float = 60):
    """Ask Claude 3.5 Sonnet one multiple-choice question about an image.

    Args:
        question: Question text.
        filename: Image file name, resolved relative to ``image_path``.
        options: The four answer choices, passed on to ``prompt_prep``.
        max_retries: Total number of attempts when the API reports a rate
            limit (must be at least 1).
        retry_delay: Seconds to wait between rate-limited attempts.

    Returns:
        The message object returned by ``client.messages.create``.

    Raises:
        ValueError: ``max_retries`` is smaller than 1.
        anthropic.RateLimitError: every attempt was rate limited.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # Case-insensitive, and ".jpeg" counts as JPEG too; anything else is
    # declared as PNG.
    is_jpeg = filename.lower().endswith((".jpg", ".jpeg"))
    image_media_type = "image/jpeg" if is_jpeg else "image/png"
    image_data = encode_image_claude(f"{image_path}/{filename}")

    for attempt in range(max_retries):
        try:
            return client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1024,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt_prep(question, options),
                        },
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_media_type,
                                "data": image_data,
                            },
                        },
                    ],
                }],
            )
        except anthropic.RateLimitError:
            # Out of attempts: let the caller see the rate-limit error.
            if attempt == max_retries - 1:
                raise
            print(f"Rate limit hit. Waiting {retry_delay} seconds before retry...")
            time.sleep(retry_delay)
    

In [33]:
# Name of the test_df column that receives Claude's answers in the loop below.
column_name = 'claude_response'
# Uncomment to create/reset the column before a fresh run:
# test_df.loc[:,column_name] = None

In [34]:
def generate_answer_claude(item):
    """Query Claude for one question row, log the call's cost, and return the answer.

    Args:
        item: A row (Series or mapping) with the keys ``filename``,
            ``question`` and ``options`` (string form of a Python list).

    Returns:
        The text of the first content block of Claude's reply.

    Side effects:
        Appends the cost of the call as one line to ``claude_cost.txt``.
    """
    filename = item['filename']
    question = item["question"]
    # `options` is stored as the string form of a list.
    options = ast.literal_eval(item["options"])

    message = generate_claude(question, filename, options)

    # Claude 3.5 Sonnet list prices per 1M tokens: 3 for input, 15 for output.
    # (3.75 is the prompt-cache *write* rate; these requests do not use prompt
    # caching, so input tokens are billed at the base rate.)
    cost = message.usage.input_tokens * 3 / 1000000 + message.usage.output_tokens * 15 / 1000000

    append_to_file('claude_cost.txt', cost)

    return message.content[0].text

In [35]:
# test_df["claude_response"] = test_df.apply(generate_answer_claude, axis=1)

In [47]:
# Select the rows to send to Claude: here only the single row at position 416.
# Widen the iloc range to process more rows in one run.
slicedf = test_df.iloc[416:417]
slicedf.head(3)

Unnamed: 0,filename,question,options,answer,category,id,openai_response,claude_response
416,CHITTRON_5465.png,ছবিতে কার ভাস্কর্য আছে?,"['ভাল্মিকি', 'বুধদেব', 'শীব', 'বলদেব']",ভাল্মিকি,Religious Events,P1CH5465,answer: 3,


In [48]:
# Ask Claude about every row of the slice and store each answer in test_df.
for idx, row in slicedf.iterrows():
    response = generate_answer_claude(row)

    # Write back by question id rather than by index label, so the answer
    # reaches every test_df row carrying this id.
    test_df.loc[test_df['id'] == str(row['id']), column_name] = response

    print(f"done: {row['filename']} and idx: ", idx, " answer: ", str(response))

    # Pause between consecutive API calls.
    time.sleep(1)

done: CHITTRON_5465.png and idx:  416  answer:  answer: 1


In [50]:
# Preview the first rows to check that the Claude answers were written back.
test_df.head(2)

Unnamed: 0,filename,question,options,answer,category,id,openai_response,claude_response
0,CHITTRON_7834.png,চিত্রের শিশুটি কোন ধরনের শিক্ষা গ্রহণ করছে?,"['চারুকলা', 'গান', 'একাডেমিক', 'নৃত্য']",নৃত্য,Arts & History,F3CH7834,answer: 4,answer: 4
1,CHITTRON_3642.png,বাংলার মর্মান্তিক ইতিহাস বহনকারী এই স্তম্ভটির ...,"['স্মৃতিসৌধ', 'শহীদ মিনার', 'রাজু ভাস্কর্য', '...",শহীদ মিনার,Arts & History,P9CH3642,answer: 2,answer: 2\n\nThe image shows the Shaheed Minar...


In [51]:
# Save test_df, now including the Claude answers (the DataFrame index is not
# written).
test_df.to_csv("outputs/test_data_claude.csv", index=False)

In [46]:
from PIL import Image


In [49]:
# def compress_image(input_path, max_size_kb=5000):
#     img = Image.open(input_path)
#     img.save(input_path, optimize=True, quality=50)

# compress_image()

### Complexity test