## Calling `litellm.batch_completion` to test LLM models' performance on a batch of questions


In [5]:
Resq_base_prompt = """
You are a helper bot who is especially skilled in spatial reasoning and other common sense reasoning tasks.
Please answer Yes/No question with only Yes or No. Donot include the explanation and reasoning process in the output"""

Resq_CoT_prompt = """
You are a helper bot who is especially skilled in spatial reasoning and other common sense reasoning tasks.
When answering the Yes/No questions, try to think step by step.
1. Carefully read the context and question.
2. Identify the key objects and their attributes, and spatial relations mentioned in the question.
3. Based on step by step reasoning, answer the question  with Yes or NO.
Note: No reasoning process or explanation is needed.
"""
# Prepare the system prompt
CoT_prompt = """
You are an expert in spatial reasoning. When answering questions, try to think step by step.
1. Carefully read the context and question.
2. Identify the key objects and their attributes mentioned in the question.
3. Identify the spatial relationships between the objects and blocks.
4. Select the appropriate answer(s) from the given choices. 
Important:
- Only include the answer(s) that are in the provided choices. eg, Answer: ['left', 'far from'], [Yes],[DK]. DK means Donot know. 
- Do not include any additional text outside of the answer. No reasoning process or explanation is needed.
"""

Visualization_prompt_V1 = """To answer the Yes/No question, please create a mental image or create a simple 2D layout ASCII diagram of the described spatial scene, 
Then based on the visualizaton of the scene, answer the questions with only Yes or No. 
Donot include the ASCII diagram. Donot include the reasoning process. Donot include the explanation. Just output the final answer in the list format."""
# ResQ: A Residual Question Answering Framework for Visual Commonsense Reasoning
Visualization_prompt = """
To assist the question answering, follow the steps to create a simple 2D layout ASCII diagram of the described spatial scene:
1. Identify the main components:
   - List all blocks (e.g., A, B, C)
   - List all objects within each block
2. Create a basic structure:
   - Draw rectangular outlines for each block
   - Order the blocks (eg, top to bottom, or from left to right)
3. Add objects to each block:
   - Use symbols for shapes: ○ for circle, △ for triangle, ■ for square, ▭ for rectangle
   - Include color and size information (e.g., "red small ○")
4. Position objects within blocks:
   - Place objects according to described locations (top, bottom, left, right, etc.)
Please answer the question by choosing the best answer or answers from the choices with the help of the 2D layout ASCII diagram. 
Donot include the ASCII diagram. Donot include the reasoning process. Donot include the explanation. Just output the final answer in the list format.
Examples: 
Question: Are all of the squares in B?   Choices: ['Yes', 'No', 'DK']
Answer: ['No']
Question: Which objects are not touching a square?the red triangle  or  the black circle?  Choices: [' the green square', ' the red square', 'both of them', 'none of them']
Answer: ['both of them']
Question:What is the relation between the green square and the thing below a yellow square?  Choices:['left', 'right', 'above', 'below', 'near to', 'far from', 'touching', 'DK']
Answer: ['left', 'far from'] 
"""
Rule_prompt = """
You are an advanced logical spatial reasoning module designed to answer questions about spatial relationships between objects. 
  Follow these steps:
    1. semantic parsing task: parse natural language spatial scene despcription into atomic facts: relation(object1, object2)
    2. Carefully analyze the given facts in the context.
    3. Determine if the parsed facts directly answer the question. If so, provide the answer.
    4. If the facts don't directly answer the question, apply logical inference rules, including but not limited to:
       a) Inverse rule: If A is above B, then B is below A.
       b) Transitive rule: If A is left of B and B is left of C, then A is left of C.
       c) Symmetric rule: If A is next to B, then B is next to A.
       d) Diagonal relationships: Consider combinations of vertical and horizontal positions.
    5. Use these rules to infer new spatial relationships from the given information.
    6. Provide the most logically sound answer based on the valid logical inferences.
[Important] Donot include the reasoning process and explanation in the answer. only provide the answer in the list format.
Examples: 
Question: Are all of the squares in B?   Choices: ['Yes', 'No', 'DK'] # DK means Donot know
Answer: ['No']
Question:What is the relation between the green square and the thing below a yellow square?  Choices:['left', 'right', 'above', 'below', 'near to', 'far from', 'touching', 'DK']
Answer: ['left', 'far from'] 
"""


# Set your Groq API key
os.environ['GROQ_API_KEY'] = 

# Define the model you want to evaluate
model = "groq/llama3-70b-8192"  # Make sure this is the correct model name for Groq


In [6]:
import pandas as pd
from litellm import batch_completion
import os
import time 



# Load your dataset
df = pd.read_csv('Resq.csv').sample(n=100, random_state=19)

models = ["ollama/openchat", "ollama/llama3","ollama/mistral","ollama/phi3:mini","ollama/gemma:2b"]

# Prepare messages for batch processing
messages = []
for _, row in df.iterrows():
    context = row["Story"]
    question = row["Question"]
    choices = row["Candidate_Answers"]
    user_prompt = f"Context: {context}\nQuestion: {question}\n Please choose the answer(s) from the \nCandidate_Answers: {choices}\n Your Answer: Yes or No? "
    
    messages.append([
        {'role': 'system', 'content': Visualization_prompt_V1 },
        {'role': 'user', 'content': user_prompt}
    ])

results = {model: [] for model in models}

def process_model(model):
    start_time = time.time()
    responses = batch_completion(
            model=model,
            api_base="http://localhost:11434",
            messages=messages,
            max_tokens=1000,
            top_p=0.9,
            temperature=0.1,
            num_retries=3,
           # max_budget=0.005,
            # batch_size=10
    )
    end_time = time.time()
    
    print(f"Time taken for {model}: {end_time - start_time:.2f} seconds")
    
    return responses

for model in models:
    try:
        model_responses = process_model(model)
        for response in model_responses:
            answer = response['choices'][0]['message']['content'].strip().lower()
            results[model].append(answer)   
        print(f"Processed all questions for model: {model}")
        
    except Exception as e:
        print(f"Error processing model {model}: {e}")
        results[model] = ["ERROR: Unable to get response"] * len(df)

for model in models:
    df[model] = results[model]
# Save results to CSV
df.to_csv('5_ollama_models_Resq_visual.csv', index=False)


Time taken for ollama/openchat: 110.75 seconds
Processed all questions for model: ollama/openchat
Time taken for ollama/llama3: 96.13 seconds
Processed all questions for model: ollama/llama3
Time taken for ollama/mistral: 94.85 seconds
Processed all questions for model: ollama/mistral

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'.


[1;31mGive Feedback / Get Help: https://github.com/Berri

In [7]:
models = ["ollama/openchat", "ollama/llama3","ollama/mistral","ollama/phi3:mini","ollama/gemma:2b"]

In [56]:
import pandas as pd
from litellm import batch_completion
import os
import time 


# Load your dataset
df = pd.read_csv('Resq.csv').sample(n=210, random_state=61)

models = ["ollama/openchat", "ollama/llama3","ollama/mistral","ollama/phi3:mini","ollama/gemma:2b"]

# Prepare messages for batch processing
messages = []
for _, row in df.iterrows():
    context = row["Story"]
    question = row["Question"]
    choices = row["Candidate_Answers"]
    user_prompt = f"Context: {context}\nQuestion: {question}\n Please choose the answer(s) from the \nCandidate_Answers: {choices}\n  "
    
    messages.append([
        {'role': 'system', 'content': Resq_CoT_prompt},
        {'role': 'user', 'content': user_prompt}
    ])

results = {model: [] for model in models}

def process_model(model):
    start_time = time.time()
    responses = batch_completion(
            model=model,
            messages=messages,
            max_tokens=1000,
            top_p=0.9,
            temperature=0.1,
            num_retries=3,
           # max_budget=0.005,
            # batch_size=10
    )
    end_time = time.time()
    
    print(f"Time taken for {model}: {end_time - start_time:.2f} seconds")
    
    return responses

for model in models:
    try:
        model_responses = process_model(model)
        for response in model_responses:
            answer = response['choices'][0]['message']['content'].strip().lower()
            results[model].append(answer)   
        print(f"Processed all questions for model: {model}")
        
    except Exception as e:
        print(f"Error processing model {model}: {e}")
        results[model] = ["ERROR: Unable to get response"] * len(df)

for model in models:
    df[model] = results[model]
# Save results to CSV
df.to_csv('5_ollama_models_Resq_CoT_evaluations.csv', index=False)


Time taken for ollama/openchat: 902.19 seconds
Processed all questions for model: ollama/openchat
Time taken for ollama/llama3: 422.85 seconds
Processed all questions for model: ollama/llama3
Time taken for ollama/mistral: 386.46 seconds
Processed all questions for model: ollama/mistral
Time taken for ollama/phi3:mini: 242.00 seconds
Processed all questions for model: ollama/phi3:mini
Time taken for ollama/gemma:2b: 1432.32 seconds
Processed all questions for model: ollama/gemma:2b


In [20]:
import pandas as pd 
import re
df_results = pd.read_csv('5_ollama_models_Resq_Base_evaluations_1.csv')
df_results_2 = pd.read_csv('5_ollama_models_Resq_CoT_evaluations.csv')

def extract_yes_no(text):
    match = re.search(r'\b\w+\b', text)
    return match.group(0) if match else None
    # match = re.search(r'(?:^|\n)\s*(yes|no)', str(text).lower())
    # return match.group(1) if match else None
def calculate_accuracy(true, pred):
    correct = 0
    total = len(true)
    pred= pred.apply(extract_yes_no)
    for t, p in zip(true, pred):
        if pd.isna(t) or pd.isna(p):
            total -= 1  # Skip this pair if either is NaN
            continue
        if t.strip().lower()== p.strip().lower():
            correct += 1
    return correct / total


models = ["ollama/openchat", "ollama/llama3","ollama/mistral","ollama/phi3:mini","ollama/gemma:2b"]

metrics = {}
for model in models:
    accuracy = calculate_accuracy(df_results['Answer'], df_results[model])
    metrics[model] = {'accuracy for Base Prompt': accuracy}
    print(f"Accuracy for Base Prompt {model}: {accuracy:.4f}")

# Create a DataFrame with the metrics
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)

metrics_2= {}
for model in models:
    accuracy = calculate_accuracy(df_results_2['Answer'], df_results[model])
    metrics_2[model] = {'accuracy for CoT Prompt': accuracy}
    print(f"Accuracy for CoT Prompt {model}: {accuracy:.4f}")

# Create a DataFrame with the metrics
metrics_df_2 = pd.DataFrame(metrics_2).T
print(metrics_df_2)

Accuracy for Base Prompt ollama/openchat: 0.6010
Accuracy for Base Prompt ollama/llama3: 0.5625
Accuracy for Base Prompt ollama/mistral: 0.6010
Accuracy for Base Prompt ollama/phi3:mini: 0.3750
Accuracy for Base Prompt ollama/gemma:2b: 0.5000
                  accuracy for Base Prompt
ollama/openchat                   0.600962
ollama/llama3                     0.562500
ollama/mistral                    0.600962
ollama/phi3:mini                  0.375000
ollama/gemma:2b                   0.500000
Accuracy for CoT Prompt ollama/openchat: 0.5381
Accuracy for CoT Prompt ollama/llama3: 0.4810
Accuracy for CoT Prompt ollama/mistral: 0.4810
Accuracy for CoT Prompt ollama/phi3:mini: 0.3762
Accuracy for CoT Prompt ollama/gemma:2b: 0.4762
                  accuracy for CoT Prompt
ollama/openchat                  0.538095
ollama/llama3                    0.480952
ollama/mistral                   0.480952
ollama/phi3:mini                 0.376190
ollama/gemma:2b                  0.476190


CoT
ollama/openchat                   0.680952
ollama/llama3                     0.685714
ollama/mistral                    0.638095
ollama/phi3:mini                  0.609524
ollama/gemma:2b                   0.180952
                   Prompt_2
ollama/openchat                      0.36
ollama/llama3                        0.56
ollama/mistral                       0.44
ollama/phi3:mini                     0.44
ollama/gemma:2b                      0.16
                  Base  Prompt
ollama/openchat                      0.700
ollama/llama3                        0.655
ollama/mistral                       0.640
ollama/phi3:mini                     0.475
ollama/gemma:2b                      0.520         
      2 Prompt
ollama/openchat                  0.490385
ollama/llama3                    0.495192
ollama/mistral                   0.442308
ollama/phi3:mini                 0.350962
ollama/gemma:2b                  0.500000

In [33]:
import pandas as pd 
import re
df_results = pd.read_csv('llama3_8B_reSQ.csv')

def extract_yes_no(text):
    match = re.search(r'\b\w+\b', text)
    return match.group(0) if match else None
    # match = re.search(r'(?:^|\n)\s*(yes|no)', str(text).lower())
    # return match.group(1) if match else None
def calculate_accuracy(true, pred):
    correct = 0
    total = len(true)
    pred= pred.apply(extract_yes_no)
    for t, p in zip(true, pred):
        if pd.isna(t) or pd.isna(p):
            total -= 1  # Skip this pair if either is NaN
            continue
        if t.strip().lower()== p.strip().lower():
            correct += 1
    return correct / total


accuracy = calculate_accuracy(df_results['Answer'], df_results["model_answer_visual"])

print(f"Accuracy for Base Prompt: {accuracy:.4f}")

Accuracy for Base Prompt: 0.6750


In [None]:
CoT: 75.50
Base: 74.00

In [14]:
import pandas as pd
from litellm import batch_completion
import os
import time 

# Load your dataset
df = pd.read_csv('dataset/SparQA_test.csv')

# Filter for the first 100 'YN' questions (or adjust as needed)
df_sample = df.sample(n=min(100, len(df)), random_state=42)

# Define the models you want to evaluate
models = ["ollama/openchat", "ollama/llama3","ollama/mistral","ollama/phi3:mini","ollama/gemma:2b"]

# Prepare the system prompt


# Prepare messages for batch processing
messages = []
for _, row in df_sample.iterrows():
    context = row["story"]
    question = row["question"]
    choices = row["choices"]
    user_prompt = f"Context: {context}\nQuestion: {question}\n Please choose the  best answer or answers from the choices.\nChoices: {choices}\n Answer: "
    
    messages.append([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt}
    ])

results = {model: [] for model in models}

def process_model(model):
    start_time = time.time()
    responses = batch_completion(
            model=model,
            messages=messages,
            max_tokens=1000,
            top_p=0.9,
            temperature=0.1,
            num_retries=3,
           # max_budget=0.005,
            # batch_size=10
    )
    end_time = time.time()
    
    print(f"Time taken for {model}: {end_time - start_time:.2f} seconds")
    
    return responses

for model in models:
    try:
        model_responses = process_model(model)
        for response in model_responses:
            answer = response['choices'][0]['message']['content'].strip()
            results[model].append(answer)
        
        print(f"Processed all questions for model: {model}")
    except Exception as e:
        print(f"Error processing model {model}: {e}")
        results[model] = ["ERROR: Unable to get response"] * len(df_sample)


# Prepare results for DataFrame
df_results = pd.DataFrame({
    'context' : df_sample['story'],
    'question': df_sample['question'],
    'true_answer': df_sample['answer'],
    'choices': df_sample['choices']  # Keep the original choices for reference
})

for model in models:
    df_results[model] = results[model]
# Save results to CSV
df_results.to_csv('4_ollama_Rule_2.csv', index=False)

Time taken for ollama/openchat: 2467.67 seconds
Processed all questions for model: ollama/openchat
Time taken for ollama/llama3: 1083.29 seconds
Processed all questions for model: ollama/llama3
Time taken for ollama/mistral: 1438.02 seconds
Processed all questions for model: ollama/mistral


In [None]:
import re
from typing import Union, List
import pandas as pd
import ast

import re
import ast
import pandas as pd
from typing import Union, List

def calculate_accuracy(true_answers, pred_answers):
    correct = 0
    total = len(true_answers)
    #Define a function to extract the answer part from a string
    def extract_answer(text):
        match = re.search(r'answer\s*:\s*([a-zA-Z]+(\s+[a-zA-Z]+)?(\s+[a-zA-Z]+)?)', text, re.IGNORECASE)
       # match = re.search(r'answer\s*:\s*(.*)', text, re.IGNORECASE)
        if match:
            return match.group(1).strip()
        return text  # Return the original text if no match is found
   
    # Apply the extraction function to each prediction, skipping non-string elements
    pred_answers = [extract_answer(pred) if isinstance(pred, str) else pred for pred in pred_answers]
    
    for true, pred in zip(true_answers, pred_answers):
        if pd.isna(true) or pd.isna(pred):
            total -= 1  # Skip this pair if either is NaN
            continue
        # true_set = set(true)# Convert true answers to lowercase
        # pred_set = set(pred)
        true_set = set(map(str.lower, true))  # Convert true answers to lowercase
        pred_set = set(map(str.lower, pred)) 
        if len(true_set) > 1:  # Multiple correct answers
            if true_set == pred_set:
                correct += 1  # Full match
            elif true_set.intersection(pred_set):
                correct += 0.4  # Partial match
        else:  # Single correct answer
            if true_set == pred_set:
                correct += 1
    
    return correct / total if total > 0 else 0


# Example usage:
df_results = pd.read_csv('2_ollama_models_visulization.csv')

models = ["ollama/phi3:mini","ollama/gemma:2b"]
metrics = {}
for model in models:
    accuracy = calculate_accuracy(df_results['true_answer'], df_results[model])
    metrics[model] = {'accuracy': accuracy}
    print(f"Accuracy for {model}: {accuracy:.2f}")

# Create a DataFrame with the metrics
metrics_df = pd.DataFrame(metrics).T
print("2 smaller models", metrics_df)
print("------------------------")

df_results = pd.read_csv('evaluation_0_shots.csv')

models = ["ollama/openchat", "ollama/llama3","ollama/mistral","ollama/phi3:mini","ollama/gemma:2b"]

metrics = {}
for model in models:
    accuracy = calculate_accuracy(df_results['true_answer'], df_results[model])
    metrics[model] = {'accuracy': accuracy}
    print(f"Accuracy for {model}: {accuracy:.2f}")

# Create a DataFrame with the metrics
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)


In [37]:
import pandas as pd
from litellm import batch_completion
import os
import time 


# Load your dataset
df = pd.read_csv('dataset/SparQA_test.csv')
df = df.sample(n=min(100, len(df)), random_state=182)

# Filter for the first 100 'YN' questions (or adjust as needed)
# df_filtered = df[df['q_type'] == 'YN'].head(100)

# Define the models you want to evaluate
models = ["ollama/phi3:mini","ollama/gemma:2b"]

# Prepare the system prompt

# Prepare messages for batch processing
messages = []
for _, row in df.iterrows():
    context = row["story"]
    question = row["question"]
    choices = row["choices"]
    user_prompt = f"Context: {context}\nQuestion: {question}\n Please choose the  best answer or answers from the choices.\nChoices: {choices}\n Answer: "
    
    messages.append([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt}
    ])

results = {model: [] for model in models}

def process_model(model):
    start_time = time.time()
    responses = batch_completion(
            model=model,
            messages=messages,
            max_tokens=1000,
            top_p=1,
            temperature=0.1,
            num_retries=3,
           # max_budget=0.005,
            # batch_size=10
    )
    end_time = time.time()
    
    print(f"Time taken for {model}: {end_time - start_time:.2f} seconds")
    
    return responses

for model in models:
    try:
        model_responses = process_model(model)
        for response in model_responses:
            answer = response['choices'][0]['message']['content'].strip()
            results[model].append(answer)
        
        print(f"Processed all questions for model: {model}")
    except Exception as e:
        print(f"Error processing model {model}: {e}")
        results[model] = ["ERROR: Unable to get response"] * len(df)


# Prepare results for DataFrame
df_results = pd.DataFrame({
    'context' : df['story'],
    'question': df['question'],
    'true_answer': df['answer'],
    'choices': df['choices']  # Keep the original choices for reference
})

for model in models:
    df_results[model] = results[model]
# Save results to CSV
df_results.to_csv('2_ollama_models_visulization.csv', index=False)
print("Evaluation_Visualization complete.")

Time taken for ollama/phi3:mini: 1006.03 seconds
Processed all questions for model: ollama/phi3:mini
Time taken for ollama/gemma:2b: 558.48 seconds
Processed all questions for model: ollama/gemma:2b
Evaluation_Visualization complete.


In [55]:
import pandas as pd
from litellm import batch_completion
import os
import time 

# Load your dataset
df = pd.read_csv('dataset/SparQA_test.csv')

# Filter for the first 100 'YN' questions (or adjust as needed)
# df_filtered = df[df['q_type'] == 'YN'].head(100)

# Define the models you want to evaluate
models = ["ollama/openchat", "ollama/llama3","ollama/gemma","ollama/mistral"]

# Prepare messages for batch processing
messages = []
for _, row in df.iterrows():
    context = row["story"]
    question = row["question"]
    choices = row["choices"]
    user_prompt = f"Context: {context}\nQuestion: {question}\n Please choose the  best answer or answers from the choices.\nChoices: {choices}\n Answer: "
    
    messages.append([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt}
    ])

results = {model: [] for model in models}

def process_model(model):
    start_time = time.time()
    responses = batch_completion(
            model=model,
            messages=messages,
            max_tokens=10,
            top_p=1,
            temperature=0.1,
            num_retries=3,
           # max_budget=0.005,
            # batch_size=10
    )
    end_time = time.time()
    
    print(f"Time taken for {model}: {end_time - start_time:.2f} seconds")
    
    return responses

for model in models:
    try:
        model_responses = process_model(model)
        for response in model_responses:
            answer = response['choices'][0]['message']['content'].strip()
            results[model].append(answer)
        
        print(f"Processed all questions for model: {model}")
    except Exception as e:
        print(f"Error processing model {model}: {e}")
        results[model] = ["ERROR: Unable to get response"] * len(df)


# Prepare results for DataFrame
df_results = pd.DataFrame({
    'context' : df['story'],
    'question': df['question'],
    'true_answer': df['answer'],
    'choices': df['choices']  # Keep the original choices for reference
})

for model in models:
    df_results[model] = results[model]
# Save results to CSV
df_results.to_csv('4_ollama_models_visulization.csv', index=False)
print("Evaluation_Visualization complete.")

Time taken for ollama/openchat: 1992.79 seconds
Processed all questions for model: ollama/openchat
Time taken for ollama/llama3: 2133.16 seconds
Processed all questions for model: ollama/llama3
Time taken for ollama/gemma: 2412.47 seconds
Processed all questions for model: ollama/gemma
Time taken for ollama/mistral: 1911.54 seconds
Processed all questions for model: ollama/mistral
Evaluation_Visualization complete.


In [30]:
import pandas as pd
from litellm import batch_completion
import os
import time 



# Load your dataset
df = pd.read_csv('dataset/SparQA_test.csv')
df = df.sample(n=min(50, len(df)), random_state=100)
# Filter for the first 100 'YN' questions (or adjust as needed)
# df_filtered = df[df['q_type'] == 'YN'].head(100)

# Define the models you want to evaluate
models = ["ollama/openchat", "ollama/llama3","ollama/mistral","ollama/phi3:mini","ollama/gemma:2b"]


# Prepare the system prompt
system_prompt = """
Pay specail attention to the output format. The answer should be a list. Donot include the reasoning process in the output answer.
"""

# Prepare messages for batch processing
messages = []
for _, row in df.iterrows():
    context = row["story"]
    question = row["question"]
    choices = row["choices"]
    user_prompt = f"Context: {context}\nQuestion: {question}\n Please choose the best answer or answers from the choices.\nChoices: {choices}\n Answer: "
    
    messages.append([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt}
    ])

results = {model: [] for model in models}

def process_model(model):
    start_time = time.time()
    responses = batch_completion(
            model=model,
            messages=messages,
            max_tokens=10,
            top_p=1,
            temperature=0.1,
            num_retries=3,
           # max_budget=0.005,
            # batch_size=10
    )
    end_time = time.time()
    
    print(f"Time taken for {model}: {end_time - start_time:.2f} seconds")
    
    return responses

for model in models:
    try:
        model_responses = process_model(model)
        for response in model_responses:
            answer = response['choices'][0]['message']['content'].strip()
            results[model].append(answer)
        
        print(f"Processed all questions for model: {model}")
    except Exception as e:
        print(f"Error processing model {model}: {e}")
        results[model] = ["ERROR: Unable to get response"] * len(df)


# Prepare results for DataFrame
df_results = pd.DataFrame({
    'context' : df['story'],
    'question': df['question'],
    'true_answer': df['answer'],
    'choices': df['choices']  # Keep the original choices for reference
})

for model in models:
    df_results[model] = results[model]
# Save results to CSV
df_results.to_csv('evaluation_0_shots_0.csv', index=False)

Time taken for ollama/openchat: 201.90 seconds
Processed all questions for model: ollama/openchat
Time taken for ollama/llama3: 264.30 seconds
Processed all questions for model: ollama/llama3
Time taken for ollama/mistral: 201.67 seconds
Processed all questions for model: ollama/mistral
Time taken for ollama/phi3:mini: 92.29 seconds
Processed all questions for model: ollama/phi3:mini
Time taken for ollama/gemma:2b: 68.55 seconds
Processed all questions for model: ollama/gemma:2b


In [None]:
!pip install litellm python-dotenv

import pandas as pd
import litellm
from litellm import testing_batch_completion
import time


# Load your dataset
df = pd.read_csv('dataset/SparQA_test.csv')

# Filter for the first 100 'YN' questions (or adjust as needed)
df_filtered = df[df['q_type'] == 'YN'].head(50)

# Define the models you want to evaluate
models = ["ollama/openchat", "ollama/llama2"]

# Prepare the system prompt
system_prompt = """
Given the context, please answer the question by choosing the best answer from the choices. Try to think step by step. The output should be 1 or 2 words, donot include the reasoning process.
"""


# Prepare messages for batch processing
messages = []
for _, row in df_filtered.iterrows():
    context = row["story"]
    question = row["question"]
    choices = row["choices"]
    user_prompt = f"Context: {context}\nQuestion: {question}\nChoices: {choices}"
    
    messages.append([
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt}],
        max_tokens=100,
        top_p=1          )

# Run the batch test
start_time = time.time()
result = testing_batch_completion(models=models, messages=messages)
end_time = time.time()

print(f"Total time taken: {end_time - start_time:.2f} seconds")

# Process results
processed_results = []
for i, (question, true_answer) in enumerate(zip(df_filtered['question'], df_filtered['answer'])):
    row = {'question': question, 'true_answer': true_answer.strip().lower()}
    for model in models:
        model_answer = result[model]['choices'][i]['message']['content'].strip().lower()
        row[model] = model_answer
    processed_results.append(row)

# Create DataFrame from results
results_df = pd.DataFrame(processed_results)

# Save results to CSV
results_df.to_csv('model_evaluation_results3.csv', index=False)
print("Evaluation complete. Results saved to 'model_evaluation_results3.csv'")

# Calculate and display metrics
def calculate_accuracy(true, pred):
    return sum(t == p for t, p in zip(true, pred)) / len(true)

metrics = {}
for model in models:
    accuracy = calculate_accuracy(results_df['true_answer'], results_df[model])
    metrics[model] = {'accuracy': accuracy}
    print(f"Accuracy for {model}: {accuracy:.2f}")

# Create a DataFrame with the metrics
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)
metrics_df.to_csv('model_evaluation_metrics.csv')