## Comparing LLMs on a Test Set using LiteLLM


In [3]:

import json
import os
from litellm import completion
from itertools import islice
import os

# Make sure the API-key variables exist without clobbering real keys that are
# already exported in the environment: setdefault only writes the empty
# placeholder when the variable is absent. Never paste real keys into the notebook.
os.environ.setdefault('OPENAI_API_KEY', "")
os.environ.setdefault('ANTHROPIC_API_KEY', "")



In [41]:

models = ['ollama/llama3', 'ollama/mistral']
# define what models you're testing, see: https://docs.litellm.ai/docs/providers

# Location of the JSON Lines test set and how many leading samples to evaluate.
TEST_SET_PATH = '/Users/rongwang/Desktop/毕业论文/dataset_spatial/spartqa_yn/spartqa_YN_test.jsonl'
N_SAMPLES = 100

# System prompt sent with every question. The wording is kept verbatim
# (typos included) so that reruns stay comparable with earlier results.
prompt = """
You are a helper bot who is especailly skilled in spatial reasoning and other common sense reasoning tasks.
Please answer the questions. You need not explain your answer, just answer "Yes" or "No" or "DK(Donot Know)".
"""

# One row per sample: [context, question, true_answer, <one answer per entry in models>]
results= []

# Decode the dataset explicitly as UTF-8 instead of relying on the platform's
# locale default, which differs between operating systems.
with open(TEST_SET_PATH, 'r', encoding='utf-8') as file:
    # islice consumes the file lazily, so only the first N_SAMPLES lines are parsed.
    samples = islice(file, N_SAMPLES)

    for line in samples:
        spartYN = json.loads(line)  # Parse the JSON data from each line
        context = spartYN['story']  # Directly access the 'story' string
        question = spartYN['question']  # Directly access the 'question' string
        true_answer = spartYN['answer'].strip()  # Access and strip the 'answer' string

        row = [context, question, true_answer]

        for model in models:
            response = completion( # using litellm.completion
                model=model,
                max_tokens=10,  # short budget: only "Yes" / "No" / "DK" is expected
                top_p=1,
                messages=[
                    {'role': 'system', 'content': prompt},
                    # NOTE(review): story and question are concatenated without a
                    # separator; confirm the stories end with whitespace.
                    {'role': 'user', 'content': context+ question}
                ]
            )
            # Guard against a missing (None) content so every cell in the row is a string.
            answer = response.choices[0].message['content'] or ""
            row.append(answer)
        results.append(row) # save results

#print(print("Calling:", model, "answer:", answer))

# Compare the accuracy 

In [45]:
def calculate_accuracy(results):
    """Compute the exact-match accuracy of every model in ``results``.

    Args:
        results: list of rows shaped
            ``[context, question, true_answer, answer_1, ..., answer_k]``.
            All rows must contain the same number of model answers; any
            number of models (not just two) is supported.

    Returns:
        A tuple ``(accuracies, errors)``. ``accuracies`` is a list with one
        float per model, in the order the answers appear in a row.
        ``errors`` is a list of every answer containing the text
        ``"error occurred"``. Empty ``results`` yields ``([], [])``.

    Raises:
        ValueError: if a row has fewer than three leading fields or the rows
            do not all carry the same number of answers.
    """
    if not results:
        # Nothing to score; avoids a division by zero below.
        return [], []

    n_fixed = 3  # context, question, true_answer
    n_models = len(results[0]) - n_fixed
    if n_models < 0:
        raise ValueError("each row needs context, question and true_answer")

    # Remove non-word and non-whitespace characters (e.g. "Yes." -> "Yes")
    pattern = r"[^\w\s]"
    correct_counts = [0] * n_models
    errors = []

    for row in results:
        if len(row) != n_fixed + n_models:
            raise ValueError(
                f"expected {n_models} model answers per row, got {len(row) - n_fixed}"
            )
        true_answer = row[2].lower()

        for idx, answer in enumerate(row[n_fixed:]):
            # A model may have produced no text at all; treat it as an empty answer.
            answer = "" if answer is None else str(answer)

            # Check for errors
            if "error occurred" in answer:
                errors.append(answer)

            # Check for correct answers
            normalized = re.sub(pattern, "", answer).strip().lower()
            if normalized == true_answer:
                correct_counts[idx] += 1

    # Calculate the accuracy for each model
    total_samples = len(results)
    return [count / total_samples for count in correct_counts], errors

# calculate_accuracy calls re.sub, so `re` must be imported before the call below.
import re 
# Score the collected answers; the result is a (list of accuracies, list of error strings) tuple.
re2= calculate_accuracy(results)
print(re2)

([0.48, 0.0], [])


In [38]:
# NOTE(review): this cell carries a lower execution count than the one above and its
# pasted output lists three accuracies, so it appears to come from an earlier run
# with a different model list -- confirm before relying on its output.
import re 
# Score the collected answers; the result is a (list of accuracies, list of error strings) tuple.
re1 = calculate_accuracy(results)
print(re1)

([0.014, 0.0, 0.474], [])


 ['ollama/llama3', 'ollama/mistral',"ollama/openchat"] 
([0.014, 0.0, 0.474])
import csv

# Write the results to a CSV file: one line per sample, no header row.
# The encoding is set explicitly so stories and model answers are written the
# same way on every platform instead of using the locale default.
with open('results.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(results)

## Visualizing Results

In [39]:
# Tabulate the results: three fixed columns followed by one answer column per model
import pandas as pd

columns = ["context", "question", "true_answer", *models]
df = pd.DataFrame(data=results, columns=columns)

df

Unnamed: 0,context,question,true_answer,ollama/llama3,ollama/mistral,ollama/openchat
0,"We have three blocks. Lets call them A, B and ...","Is the medium black square , above any medium ...",Yes,Let's think step by step!\n\nWe know that,"Based on the information provided, ""No"" the",No
1,"We have three blocks. Lets call them A, B and ...","Is there any medium yellow squares , above a s...",Yes,Let's break this down step by step!\n\nWe,DK. The information provided does not specify if,Yes
2,"There are three blocks, A, B and C. Block A is...",Is there a medium black thing that is to the r...,No,I'll think step by step!\n\nFrom the description,"DK. Based on the given information, it",Yes
3,"There are three blocks, A, B and C. Block A is...","Is the medium blue triangle , below the medium...",Yes,A spatial reasoning puzzle! I love these.\n\nLet,"Based on the information provided, the answer...",Yes
4,"There are two blocks, A and B. Block A is to t...","Are all medium yellow squares , below the medi...",DK,Let's break it down step by step.\n\nBlock,No. Medium yellow square number two is not,Yes
...,...,...,...,...,...,...
495,We have two blocks. Lets call them A and B. Bl...,Is there a square that is to the left of a blu...,Yes,Let's break it down step by step!\n\nWe,"Based on the information given, the answer is """,Yes
496,"There are three blocks, A, B and C. There is a...","Is the medium blue square , below any medium s...",Yes,I'll think step by step!\n\nLet's break,"Based on the information provided, the answer...",Yes
497,"There are three blocks, A, B and C. There is a...",Is there a medium object that is above a mediu...,No,A spatial reasoning puzzle!\n\nLet's break it ...,"Based on the information provided, the answer...",Yes
498,"We have three blocks. Lets call them A, B and ...","Is medium yellow square number one, below medi...",Yes,A spatial reasoning challenge!\n\nLet's break ...,"Based on the information provided, the answer...",Yes
