In [98]:
import os
import base64
import requests
import pandas as pd
from dotenv import load_dotenv
import ast
from tqdm.notebook import tqdm
import time

In [99]:
load_dotenv()

OPENAI_API = os.getenv("OPENAI_API_KEY") 
CLAUDE_API = os.getenv("CLAUDE_API_KEY")
# OPENAI_API

In [100]:
image_path = "../dataset/images"
csv_path = "outputs/test_data_claude.csv"

test_df = pd.read_csv(csv_path)
test_df.head()

Unnamed: 0,filename,question,options,answer,category,id,openai_response
0,CHITTRON_7834.png,চিত্রের শিশুটি কোন ধরনের শিক্ষা গ্রহণ করছে?,"['চারুকলা', 'গান', 'একাডেমিক', 'নৃত্য']",নৃত্য,Arts & History,F3CH7834,answer: 4
1,CHITTRON_3642.png,বাংলার মর্মান্তিক ইতিহাস বহনকারী এই স্তম্ভটির ...,"['স্মৃতিসৌধ', 'শহীদ মিনার', 'রাজু ভাস্কর্য', '...",শহীদ মিনার,Arts & History,P9CH3642,answer: 2
2,BORNON_215.jpg,ছবিতে দেখা মুদ্রাটি কোন দেশের?,"['বাংলাদেশ', 'শ্রীলঙ্কা', 'ভারত', 'পাকিস্তান']",বাংলাদেশ,Arts & History,P5BO215,answer: 1
3,BNATURE_3915.jpg,ছবিতে মেয়েটি মাইক্রোফোনটি হাতে ধরে কী করছে?,"['গান গাচ্ছে', 'নাচছে', 'কথা বলছে', 'পড়াশোনা ...",গান গাচ্ছে,Arts & History,P5BN3915,answer: 1
4,CHITTRON_496.png,প্রাচীনকালে বাংলায় এমন বড় আকৃতির মাটির পাত্র ক...,"['ধান-চাল সংরক্ষনে', 'পানি সংরক্ষনে', 'কাপড় সং...",ধান-চাল সংরক্ষনে,Arts & History,P1CH496,answer: 2


In [101]:
# test_df = test_df[:1]

In [102]:
def prompt_prep(question:str, options:list):
    return """

You are an expert multimodal AI assistant. You will be given an image as context. Based on this image, you will be asked a multiple-choice question with four options.  
Your task is to select the most accurate answer from the given options.  

### **Instructions:**  
- The question and options will be presented in the following format:  
    - **question:** QUESTION  
    - **options:**  
        1. option_a  
        2. option_b  
        3. option_c  
        4. option_d  

- Your response should **only contain the number** corresponding to the correct answer.  
  - Example:  
    - If **option 1 is correct**, respond with `"answer: 1"`  
    - If **option 3 is correct**, respond with `"answer: 3"`  

Now, answer the following question based on the provided image:  

**question:** {}  
**options:**  
1. {}  
2. {}  
3. {}  
4. {}  

# IMPORTANT You must follow the output format "answer: _correct_option_"
    """.format(question, options[0], options[1],options[2], options[3])

In [103]:
import datetime

def append_to_file(file_name, data):
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(file_name, 'a') as file:
        file.write(f'{data}\n')

# append_to_file('openai_cost.txt', f'This is the data to append.')

### openai

In [5]:
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')        

In [7]:
def generate(question: str, image: base64, options:list):

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API}"
    }
    payload = {
        
        "model": "gpt-4o-mini",
        "messages": [
            {
                "role": "user",
                "content": [
                    
                    {"type": "text", "text": prompt_prep(question, options)},
                    {"type": "image_url", "image_url": {"url": 
                        f"data:image/jpeg;base64,{image}"}}
                ]
            }
        ],
        "max_tokens": 30
    }  
    
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    
    return response.json() #['choices'][0]['message']['content']


In [8]:
# cost calculation
total_cost = 0
total_input = 0
total_output = 0

def cost_calculation(response):
    cost = response['usage']['prompt_tokens'] * .15 / 1000000+ response['usage']['completion_tokens'] * .60 / 1000000
    return cost

In [10]:
def generate_answer(item):
    image = encode_image(f"{image_path}/{item['filename']}")
    question = item["question"]
    options = ast.literal_eval(item["options"])
    response = generate(question, image, options)
    
    cost = response['usage']['prompt_tokens'] * .15 / 1000000+ response['usage']['completion_tokens'] * .60 / 1000000
    append_to_file('openai_cost.txt', cost)
    return response['choices'][0]['message']['content']

In [11]:
test_df["openai_response"] = test_df.apply(generate_answer, axis=1)

In [13]:
test_df

Unnamed: 0,filename,question,options,answer,category,id,openai_response
0,CHITTRON_7834.png,চিত্রের শিশুটি কোন ধরনের শিক্ষা গ্রহণ করছে?,"['চারুকলা', 'গান', 'একাডেমিক', 'নৃত্য']",নৃত্য,Arts & History,F3CH7834,answer: 4


In [12]:
test_df.to_csv("outputs/test_data_openai.csv", index=False)

### claude

In [6]:
# img = f"{image_path}/CHITTRON_7834.png"

In [104]:
def encode_image_claude(image_path):
    with open(image_path, "rb") as image_file:
        return base64.standard_b64encode(image_file.read()).decode('utf-8')        

In [105]:
import anthropic

client = anthropic.Anthropic(api_key=CLAUDE_API)

In [106]:
def generate_claude(question: str, filename: str, options:list):
    
    image_media_type = "image/jpeg" if filename.endswith(".jpg") else "image/png" 
    image_data = encode_image_claude(f"{image_path}/{filename}")
    max_retries = 3
    retry_delay = 60
    # print(prompt_prep(question, options))
    for attempt in range(max_retries):
        try:
            message = client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=1024,
                messages=[{   
                    "role": "user",
                    "content": [
                        {   "type": "text",
                            "text": prompt_prep(question, options)
                        },
                        {   "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": image_media_type,
                                "data": image_data,
                            },
                        }                
                    ],}],
            )            
            return message
            
        except anthropic.RateLimitError:
            if attempt < max_retries - 1:
                print(f"Rate limit hit. Waiting {retry_delay} seconds before retry...")
                time.sleep(retry_delay)
            else:
                raise
    

In [107]:
def generate_answer_claude(item):
    image = item['filename']
    question = item["question"]
    options = ast.literal_eval(item["options"])
    
    message = generate_claude(question, image, options)
    
    cost = message.usage.input_tokens * 3.75/1000000 + message.usage.output_tokens * 15/1000000
    
    append_to_file('claude_cost.txt', cost)
    
    return message.content[0].text

In [108]:
test_df["claude_response"] = test_df.apply(generate_answer_claude, axis=1)

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'messages.0.content.1.image.source.base64: image exceeds 5 MB maximum: 5262204 bytes > 5242880 bytes'}}

In [109]:
test_df.head()

Unnamed: 0,filename,question,options,answer,category,id,openai_response
0,CHITTRON_7834.png,চিত্রের শিশুটি কোন ধরনের শিক্ষা গ্রহণ করছে?,"['চারুকলা', 'গান', 'একাডেমিক', 'নৃত্য']",নৃত্য,Arts & History,F3CH7834,answer: 4
1,CHITTRON_3642.png,বাংলার মর্মান্তিক ইতিহাস বহনকারী এই স্তম্ভটির ...,"['স্মৃতিসৌধ', 'শহীদ মিনার', 'রাজু ভাস্কর্য', '...",শহীদ মিনার,Arts & History,P9CH3642,answer: 2
2,BORNON_215.jpg,ছবিতে দেখা মুদ্রাটি কোন দেশের?,"['বাংলাদেশ', 'শ্রীলঙ্কা', 'ভারত', 'পাকিস্তান']",বাংলাদেশ,Arts & History,P5BO215,answer: 1
3,BNATURE_3915.jpg,ছবিতে মেয়েটি মাইক্রোফোনটি হাতে ধরে কী করছে?,"['গান গাচ্ছে', 'নাচছে', 'কথা বলছে', 'পড়াশোনা ...",গান গাচ্ছে,Arts & History,P5BN3915,answer: 1
4,CHITTRON_496.png,প্রাচীনকালে বাংলায় এমন বড় আকৃতির মাটির পাত্র ক...,"['ধান-চাল সংরক্ষনে', 'পানি সংরক্ষনে', 'কাপড় সং...",ধান-চাল সংরক্ষনে,Arts & History,P1CH496,answer: 2


In [20]:
test_df.to_csv("outputs/test_data_claude.csv", index=False)