# Sample Usage

In [None]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

task = 'dialogue'

# Dialogue histories, one string per example.
src_list = [
    'hi , do you know much about the internet ? \n i know a lot about different sites and some website design , how about you ? \n\n'
]
# Additional context that the generated response is expected to draw on.
context_list = [
    'the 3 horizontal line menu on apps and websites is called a hamburger button .\n'
]
# Model outputs to be evaluated.
output_list = [
    'i do too . did you know the 3 horizontal line menu on apps and websites is called the hamburger button ?'
]

# Package the three parallel lists in the format the pre-trained evaluators consume.
data = convert_to_json(
    output_list=output_list,
    src_list=src_list,
    context_list=context_list,
)
# Build the evaluator for the chosen task, then score every dimension;
# print_result=True also prints the summary table of scores.
evaluator = get_evaluator(task)
eval_scores = evaluator.evaluate(data, print_result=True)

100%|██████████| 1/1 [00:00<00:00, 54.32it/s]
100%|██████████| 1/1 [00:00<00:00, 48.42it/s]
100%|██████████| 1/1 [00:00<00:00, 44.07it/s]
100%|██████████| 1/1 [00:00<00:00, 55.77it/s]
100%|██████████| 1/1 [00:00<00:00, 51.56it/s]


Evaluation scores are shown below:
+-------------------+----------+
|     Dimensions    |  Score   |
+-------------------+----------+
|    naturalness    | 0.950217 |
|     coherence     | 0.973135 |
|    engagingness   | 1.750486 |
|    groundedness   | 0.999566 |
| understandability | 0.946209 |
|      overall      | 1.123923 |
+-------------------+----------+





In [6]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

task = 'dialogue'

# Batch inputs, one tuple per example:
# (dialogue history, additional context, model output)
batch_examples = [
    (
        'hi , do you know much about the internet ? \n i know a lot about different sites and some website design , how about you ? \n\n',
        'the 3 horizontal line menu on apps and websites is called a hamburger button .\n',
        'i do too . did you know the 3 horizontal line menu on apps and websites is called the hamburger button ?',
    ),
    (
        'what is your favorite color ? \n i like blue a lot , but sometimes i prefer green . \n\n',
        'colors can reflect your mood and personality .\n',
        'i like blue as well . it is calming and reminds me of the ocean .',
    ),
]

# Split the per-example tuples back into the three parallel lists.
src_list, context_list, output_list = map(list, zip(*batch_examples))

# Package the batch in the format the pre-trained evaluators consume.
data = convert_to_json(
    output_list=output_list,
    src_list=src_list,
    context_list=context_list,
)

# Build the evaluator for the chosen task.
evaluator = get_evaluator(task)

# Score the whole batch without printing the summary table.
eval_scores = evaluator.evaluate(data, print_result=False)

# Show the score dictionary of each example, numbered from 1.
for example_number, score in enumerate(eval_scores, start=1):
    print(f"Scores for example {example_number}: {score}")


100%|██████████| 1/1 [00:00<00:00, 42.99it/s]
100%|██████████| 1/1 [00:00<00:00, 52.12it/s]
100%|██████████| 1/1 [00:00<00:00, 38.88it/s]
100%|██████████| 1/1 [00:00<00:00, 52.98it/s]
100%|██████████| 1/1 [00:00<00:00, 50.35it/s]

Scores for example 1: {'naturalness': 0.9502174201360719, 'coherence': 0.9731347836152868, 'engagingness': 1.7504860805525295, 'groundedness': 0.9995656267195939, 'understandability': 0.9462095037239142, 'overall': 1.1239226829494793}
Scores for example 2: {'naturalness': 0.9675946477090701, 'coherence': 0.998674558536015, 'engagingness': 1.9936029005678884, 'groundedness': 0.9857853472625128, 'understandability': 0.9632170362278318, 'overall': 1.1817748980606635}





In [7]:
eval_scores

[{'naturalness': 0.9502174201360719,
  'coherence': 0.9731347836152868,
  'engagingness': 1.7504860805525295,
  'groundedness': 0.9995656267195939,
  'understandability': 0.9462095037239142,
  'overall': 1.1239226829494793},
 {'naturalness': 0.9675946477090701,
  'coherence': 0.998674558536015,
  'engagingness': 1.9936029005678884,
  'groundedness': 0.9857853472625128,
  'understandability': 0.9632170362278318,
  'overall': 1.1817748980606635}]

# Response Generation Evaluation

In [None]:
import pandas as pd
import warnings
import logging

# Silence Python warnings and keep the `transformers` logger at ERROR so that
# library chatter does not drown out the evaluation output.
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

In [None]:
# Run configuration read by the cells below.

# Dataset name; selects ../Prompts/{Dataset}.csv and the ../Responses/{Dataset}/ folder.
# Options: Synthetic-PersonaChat, Blended Skill Talk, PEC, ConvAI2, FoCus, IT-ConvAI2
Dataset = "IT-ConvAI2"
# Model whose generated responses are evaluated; also the value written to the "Model" column.
# Options: Mistral-7B-Instruct, Llama3-1-8B-Instruct, Qwen2-7B-Instruct, Gemma-7B-Instruct,
#          gpt-3.5-turbo, gpt-4-turbo, gpt-4o-mini, gemini-1.5-pro
LLM_name = "gemini-1.5-pro"
# When True, the "-COT" suffixed response and result files are used.
COT_SETUP = True

In [3]:
# Load the prompt table (personas, context, reference response) for the selected dataset.
prompts_csv = f'../Prompts/{Dataset}.csv'
df = pd.read_csv(prompts_csv)
print("Shape:", df.shape)

df.head()

Shape: (1183, 3)


Unnamed: 0,personas,context,act_response
0,i love disneyland and mickey mouse.i love to s...,"User1: no , we recently purchased a new house ...",User2: yes i love mickey mouse such a cute lit...
1,i love to drink fancy tea.i have a big library...,User1: hi how are you doing ? i am okay how ab...,User2: i am doing good . just sipping tea . wh...
2,im a little girl.ive superpowers.i like to mak...,User1: what is your name ? are you a male or f...,User2: im a girl . i do not give out my name .
3,i love cats and have two cats.my favorite seas...,User1: hi ! do you like turtles ?,User2: i am much more of a cat person actually
4,i love cats and have two cats.my favorite seas...,User1: what are your kitties names ?,"User2: snow and winter , named after my favori..."


In [4]:
Dataset

'IT-ConvAI2'

In [5]:
# Dataset-specific persona markup to strip: the speaker tag plus brackets and
# quote characters. Datasets not listed here keep their personas untouched.
persona_markup_patterns = {
    "Blended Skill Talk": r'\[User 1 persona\]:|\[|\]|"|\'',
    "PEC": r'\[Responder persona\]:|\[|\]|"|\'',
}

persona_markup = persona_markup_patterns.get(Dataset)
if persona_markup is not None:
    df['personas'] = df['personas'].str.replace(persona_markup, '', regex=True).str.strip()

# Missing-value count per column, followed by a preview of the first rows.
print(df.isnull().sum())
df.head(6)

personas        0
context         0
act_response    0
dtype: int64


Unnamed: 0,personas,context,act_response
0,i love disneyland and mickey mouse.i love to s...,"User1: no , we recently purchased a new house ...",User2: yes i love mickey mouse such a cute lit...
1,i love to drink fancy tea.i have a big library...,User1: hi how are you doing ? i am okay how ab...,User2: i am doing good . just sipping tea . wh...
2,im a little girl.ive superpowers.i like to mak...,User1: what is your name ? are you a male or f...,User2: im a girl . i do not give out my name .
3,i love cats and have two cats.my favorite seas...,User1: hi ! do you like turtles ?,User2: i am much more of a cat person actually
4,i love cats and have two cats.my favorite seas...,User1: what are your kitties names ?,"User2: snow and winter , named after my favori..."
5,i love cats and have two cats.my favorite seas...,User1: how old are you ? i turned four on my b...,User2: i am an old woman . i won a gold medal ...


In [6]:
# File-name suffix that distinguishes chain-of-thought runs from plain runs.
if COT_SETUP:
    COT_ = "-COT"
else:
    COT_ = ""

# Generated responses of the selected model for the selected dataset.
response = pd.read_csv(f'../Responses/{Dataset}/{LLM_name}{COT_}.csv')
print("Shape:", response.shape)

# Heading and per-column missing-value counts, each on its own line.
print("\nMissing Values:", response.isnull().sum(), sep="\n")

response

Shape: (1183, 3)

Missing Values:
gen_response     9
reasoning        9
response_time    0
dtype: int64


Unnamed: 0,gen_response,reasoning,response_time
0,That's completely understandable! Buying a new...,The user mentioned not being able to afford a ...,22.281498
1,"I'm doing quite well, thank you for asking. E...","The persona enjoys fancy tea, has a large libr...",3.786208
2,"I'm Nimbus, a girl. I live up in the fluffy c...",I'm personalizing my response as a little girl...,3.242977
3,Hi! Turtles are pretty cool. I definitely pre...,The persona loves cats and has two. While the...,3.913722
4,My two wonderful kitties are named Mittens and...,The user asked about the names of my cats. My...,3.177750
...,...,...,...
1178,"Yeah, I produce music for artists. That's cool...",The user mentioned their job. My persona incl...,4.521672
1179,I can definitely understand why you'd be terri...,The user expresses fear of scorpions. My perso...,5.380517
1180,I'm working as a custodian right now to make e...,The user asks a direct question about my livel...,3.849250
1181,"While I'm not a tennis player myself, I can de...",The user asks about sports and mentions tennis...,6.353053


In [7]:
# Longest generated response, measured in whitespace-separated words;
# missing responses are skipped.
response_word_counts = response['gen_response'].dropna().map(lambda text: len(text.split()))
max_response_length = response_word_counts.max()

print(f"Maximum Response Length (in words): {max_response_length}")

Maximum Response Length (in words): 95


In [8]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop words, kept in a set for fast membership tests
stop_words = set(stopwords.words('english'))


def preprocess_text(text, remove_stop_words=True):
    """
    Normalize a piece of text into a space-separated string of tokens.

    The text is lowercased, stripped of every character in
    string.punctuation, and tokenized with word_tokenize.

    Args:
        text: Text to normalize; a null value (per pd.isnull) yields None.
        remove_stop_words (bool): When True, tokens found in `stop_words`
            are dropped.

    Returns:
        The remaining tokens joined by single spaces, or None for null input.
    """
    if pd.isnull(text):
        return None

    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_remover)
    tokens = word_tokenize(normalized)

    if not remove_stop_words:
        return ' '.join(tokens)
    return ' '.join(token for token in tokens if token not in stop_words)

# Evaluation table: prompt-side columns come from `df`, model-side columns
# from `response`; pandas lines the columns up on the row index.
eval_columns = {
    'personas': df['personas'],
    'context': df['context'],
    'gen_response': response['gen_response'],
    'response_time': response['response_time'],
}
eval_df = pd.DataFrame(eval_columns)

# Missing-value count per column, followed by a preview of the first rows.
print(eval_df.isnull().sum())
eval_df.head()

personas         0
context          0
gen_response     9
response_time    0
dtype: int64


Unnamed: 0,personas,context,gen_response,response_time
0,i love disneyland and mickey mouse.i love to s...,"User1: no , we recently purchased a new house ...",That's completely understandable! Buying a new...,22.281498
1,i love to drink fancy tea.i have a big library...,User1: hi how are you doing ? i am okay how ab...,"I'm doing quite well, thank you for asking. E...",3.786208
2,im a little girl.ive superpowers.i like to mak...,User1: what is your name ? are you a male or f...,"I'm Nimbus, a girl. I live up in the fluffy c...",3.242977
3,i love cats and have two cats.my favorite seas...,User1: hi ! do you like turtles ?,Hi! Turtles are pretty cool. I definitely pre...,3.913722
4,i love cats and have two cats.my favorite seas...,User1: what are your kitties names ?,My two wonderful kitties are named Mittens and...,3.17775


In [9]:
import torch
# Pick a device index based on CUDA availability.
# NOTE(review): none of the visible cells read `device` (get_evaluator is called
# with only the task name) — confirm whether it is still needed or should be
# passed to the evaluator.
device = 0 if torch.cuda.is_available() else -1  # device set to 0 for GPU, -1 for CPU
# Uncomment to force CPU:
# device = -1

**Finalized Input Mapping:**

- src_list: Use the context column (conversation history).
- context_list: Use the flattened and cleaned persona column.
- output_list: Use the gen_response (the response your model generates).

**Note:**

The `act_response` column (the reference response) is not passed to UniEval. The evaluation is reference-free: each generated response (`gen_response`) is scored only against the conversation history (`context`) and the persona information supplied as additional context.

In [10]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

def calculate_unieval_scores(personas, contexts, gen_responses, evaluator=None):
    """
    Calculates UniEval scores for a batch of inputs.

    Args:
        personas (list): Persona information used as additional context. An
            entry that is itself a list is joined into one space-separated string.
        contexts (list): Conversation histories leading to the responses.
        gen_responses (list): Generated responses to be evaluated.
        evaluator (optional): Evaluator to reuse. When omitted, the dialogue
            evaluator is created with get_evaluator('dialogue') on first use
            and cached on this function, so later calls (for example one per
            batch) do not build it again.

    Returns:
        list: A list of dictionaries containing UniEval scores for each input;
        an empty list when there is nothing to evaluate.

    Raises:
        ValueError: If the three inputs do not contain the same number of items.
    """
    # Flatten personas if they are lists
    personas = [' '.join(p) if isinstance(p, list) else p for p in personas]
    contexts = list(contexts)
    gen_responses = list(gen_responses)

    # The three inputs are parallel lists: item k of each describes example k.
    if not (len(personas) == len(contexts) == len(gen_responses)):
        raise ValueError(
            "personas, contexts and gen_responses must have the same length "
            f"(got {len(personas)}, {len(contexts)} and {len(gen_responses)})"
        )

    # Nothing to score: skip building the evaluator altogether.
    if not gen_responses:
        return []

    # Prepare inputs for UniEval
    data = convert_to_json(output_list=gen_responses, src_list=contexts, context_list=personas)

    if evaluator is None:
        # Build the dialogue evaluator once and keep it for subsequent calls.
        evaluator = getattr(calculate_unieval_scores, '_dialogue_evaluator', None)
        if evaluator is None:
            evaluator = get_evaluator('dialogue')
            calculate_unieval_scores._dialogue_evaluator = evaluator

    # Evaluate and obtain scores for all inputs
    return evaluator.evaluate(data, print_result=False)


In [11]:
# Worst-case UniEval result: every dimension, including the overall score,
# is set to 0.0.
worst_unieval_score = dict.fromkeys(
    (
        'naturalness',
        'coherence',
        'engagingness',
        'groundedness',
        'understandability',
        'overall',
    ),
    0.0,
)


In [12]:
from tqdm import tqdm
import pandas as pd

# Evaluate eval_df in batches and collect one UniEval score dictionary per row.
batch_size = 200  # Adjust batch size as needed

# Raw UniEval dimension name -> column label used in the result tables.
unieval_column_names = {
    'naturalness': "UniEval Naturalness",
    'coherence': "UniEval Coherence",
    'engagingness': "UniEval Engagingness",
    'groundedness': "UniEval Groundedness",
    'understandability': "UniEval Understandability",
    'overall': "UniEval Overall",
}

# One slot per eval_df row, filled by position so that every score stays
# attached to the response it was computed for (appending the valid scores
# first and the worst scores afterwards would shift rows within a batch).
all_unieval_scores = [None] * len(eval_df)

for i in tqdm(range(0, len(eval_df), batch_size), desc="Evaluating batches"):
    batch = eval_df.iloc[i:i+batch_size]

    # Extract relevant fields from the batch
    personas = batch['personas'].tolist()
    contexts = batch['context'].tolist()
    gen_responses = batch['gen_response'].tolist()

    # Only non-blank strings can be scored; NaN, None, blank or non-string
    # entries receive the worst score instead.
    valid_indices = [
        j for j, gen_response in enumerate(gen_responses)
        if isinstance(gen_response, str) and gen_response.strip() != ''
    ]
    valid_lookup = set(valid_indices)
    invalid_indices = [j for j in range(len(gen_responses)) if j not in valid_lookup]

    # Evaluate valid inputs
    if valid_indices:
        eval_scores = calculate_unieval_scores(
            [personas[j] for j in valid_indices],
            [contexts[j] for j in valid_indices],
            [gen_responses[j] for j in valid_indices],
        )
        if len(eval_scores) != len(valid_indices):
            raise ValueError(
                f"Expected {len(valid_indices)} score entries for the batch starting at row {i}, "
                f"got {len(eval_scores)}"
            )
        # Write each score back to the absolute row position it belongs to.
        for j, score in zip(valid_indices, eval_scores):
            all_unieval_scores[i + j] = score

    # Worst scores for invalid inputs (a fresh copy per row).
    for j in invalid_indices:
        all_unieval_scores[i + j] = dict(worst_unieval_score)

# Convert all scores into a DataFrame; columns are selected and renamed by
# dimension name rather than by position.
metrics_df = pd.DataFrame(
    all_unieval_scores, columns=list(unieval_column_names)
).rename(columns=unieval_column_names)

# Combine with the evaluation table. UniEval columns left over from a previous
# run of this cell are dropped first so that re-running does not duplicate them.
eval_df = pd.concat(
    [
        eval_df.drop(columns=list(unieval_column_names.values()), errors='ignore').reset_index(drop=True),
        metrics_df,
    ],
    axis=1,
)

metrics_df

Evaluating batches:   0%|          | 0/6 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Evaluating batches: 100%|██████████| 6/6 [08:06<00:00, 81.06s/it]


Unnamed: 0,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,0.932827,0.947502,3.620819,0.999758,0.929676,1.486116
1,0.911020,0.984557,3.783001,0.998065,0.909855,1.517300
2,0.923072,0.845386,1.655268,0.972248,0.912169,1.061629
3,0.962173,0.998624,2.809815,0.068020,0.957679,1.159262
4,0.968726,0.994031,0.997240,0.727822,0.966274,0.930818
...,...,...,...,...,...,...
1178,0.971041,0.883818,4.798749,0.997742,0.969830,1.724236
1179,0.949776,0.999081,2.006373,0.887707,0.936774,1.155942
1180,0.924758,0.990696,3.891174,0.995875,0.924021,1.545305
1181,0.963098,0.968711,1.930888,0.999239,0.959356,1.164258


In [13]:
# Per-metric mean and population standard deviation (ddof=0), both rounded
# to 2 decimal places.
avg_values = metrics_df.mean().round(2)
std_values = metrics_df.std(ddof=0).round(2)

# One "avg ± std" string per metric, with the LLM name placed in front.
formatted_values = avg_values.astype(str).str.cat(std_values.astype(str), sep=" ± ")
combined_values = [LLM_name, *formatted_values]

In [14]:
# Single-row table: the model name followed by its "avg ± std" value per metric.
result_columns = ['Model', *metrics_df.columns]
result_df = pd.DataFrame([combined_values], columns=result_columns)
result_df

Unnamed: 0,Model,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,gemini-1.5-pro,0.94 ± 0.09,0.9 ± 0.23,2.67 ± 1.28,0.76 ± 0.37,0.94 ± 0.09,1.24 ± 0.29


In [15]:
import pandas as pd

# Results workbook for this dataset / prompting setup.
output_path = f'../Evaluations/{Dataset}{COT_}-results.xlsx'

try:
    existing_df = pd.read_excel(output_path)
except FileNotFoundError:
    # No workbook yet: the new row becomes the whole table.
    existing_df = result_df
else:
    model_mask = existing_df['Model'] == LLM_name
    if model_mask.any():
        # The model already has a row: overwrite its values in place,
        # creating any column the workbook does not have yet.
        row_label = existing_df.index[model_mask][0]
        for column_name in result_df.columns:
            if column_name not in existing_df.columns:
                existing_df[column_name] = None
            existing_df.at[row_label, column_name] = result_df[column_name].values[0]
    else:
        # First result for this model: add it as a new row.
        existing_df = pd.concat([existing_df, result_df], ignore_index=True)

# Write the updated table back to the workbook.
existing_df.to_excel(output_path, index=False)

existing_df


Unnamed: 0,Model,P Consistency Score,C Score,UE Score,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,...,IDF Overlap,Persona Distance,response_time,Failure Ratio,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,Mistral-7B-Instruct,0.79 ± 0.41,0.02 ± 0.67\t,0.31 ± 0.67,0.01 ± 0.01,0.09 ± 0.08,0.01 ± 0.04,0.09 ± 0.07,0.11 ± 0.09,0.8 ± 0.18,...,0.07 ± 0.08,0.54 ± 0.2,6.9 ± 1.61,0.046 ± 0.00,0.9 ± 0.2,0.83 ± 0.32,1.98 ± 1.22,0.63 ± 0.44,0.9 ± 0.2,1.05 ± 0.34
1,Llama3-1-8B-Instruct,0.83 ± 0.38\t,0.05 ± 0.63,0.22 ± 0.58\t,0.01 ± 0.02,0.11 ± 0.09,0.02 ± 0.04,0.1 ± 0.08,0.12 ± 0.09,0.84 ± 0.07,...,0.07 ± 0.08,0.58 ± 0.13,8.17 ± 0.11,0.006 ± 0.00,0.94 ± 0.1,0.8 ± 0.32,1.81 ± 1.07,0.66 ± 0.41,0.93 ± 0.1,1.03 ± 0.25
2,Qwen2-7B-Instruct,0.87 ± 0.34\t,0.23 ± 0.67,0.38 ± 0.72\t,0.01 ± 0.01,0.09 ± 0.06,0.01 ± 0.03,0.08 ± 0.06,0.12 ± 0.09,0.82 ± 0.08,...,0.08 ± 0.07,0.63 ± 0.13,4.7 ± 0.94,0.008 ± 0.00,0.93 ± 0.1,0.87 ± 0.25,2.96 ± 1.22,0.84 ± 0.32,0.93 ± 0.11,1.31 ± 0.28
3,gpt-3.5-turbo,0.85 ± 0.36\t,0.1 ± 0.62,0.38 ± 0.74,0.01 ± 0.01,0.11 ± 0.08,0.02 ± 0.04,0.1 ± 0.07,0.13 ± 0.1,0.84 ± 0.03,...,0.08 ± 0.08,0.6 ± 0.13,1.4 ± 0.3,0.001 ± 0.00,0.96 ± 0.03,0.87 ± 0.25,2.28 ± 0.95,0.72 ± 0.39,0.96 ± 0.03,1.16 ± 0.21
4,gpt-4-turbo,0.83 ± 0.37\t,0.01 ± 0.58,0.42 ± 0.78,0.01 ± 0.01,0.11 ± 0.08,0.02 ± 0.04,0.1 ± 0.08,0.13 ± 0.1,0.84 ± 0.02,...,0.08 ± 0.07,0.59 ± 0.12,4.81 ± 1.45,0.0 ± 0.00,0.96 ± 0.02,0.91 ± 0.21,2.21 ± 0.9,0.78 ± 0.34,0.96 ± 0.03,1.16 ± 0.19
5,gpt-4o-mini,0.87 ± 0.33\t,0.14 ± 0.61,0.38 ± 0.74\t,0.01 ± 0.01,0.1 ± 0.06,0.01 ± 0.03,0.1 ± 0.06,0.16 ± 0.09,0.83 ± 0.04,...,0.09 ± 0.07,0.64 ± 0.11,2.2 ± 0.67,0.002 ± 0.00,0.95 ± 0.04,0.96 ± 0.13,3.17 ± 1.02,0.91 ± 0.23,0.95 ± 0.04,1.39 ± 0.22
6,Gemma-7B-Instruct,0.79 ± 0.41,0.19 ± 0.76,0.25 ± 0.56,0.01 ± 0.01,0.09 ± 0.08,0.01 ± 0.04,0.09 ± 0.07,0.12 ± 0.1,0.75 ± 0.26,...,0.12 ± 0.13,0.57 ± 0.24,8.64 ± 1.59,0.105 ± 0.00,0.85 ± 0.29,0.73 ± 0.4,2.42 ± 1.38,0.74 ± 0.41,0.85 ± 0.29,1.12 ± 0.45
7,gemini-1.5-pro,,0.03 ± 0.64,0.33 ± 0.7,,,,,,0.83 ± 0.08,...,,0.58 ± 0.15,4.83 ± 3.46,0.008 ± 0.00,0.94 ± 0.09,0.9 ± 0.23,2.67 ± 1.28,0.76 ± 0.37,0.94 ± 0.09,1.24 ± 0.29


In [16]:
# Read the results workbook back from disk to check what was written.
# NOTE(review): this rebinds `response`, the name earlier cells use for the
# generated-responses table loaded from ../Responses; re-running those cells
# after this one would operate on the results table instead. Consider a
# separate name once it is confirmed that no later cell relies on this binding.
response = pd.read_excel(f'../Evaluations/{Dataset}{COT_}-results.xlsx')
response

Unnamed: 0,Model,P Consistency Score,C Score,UE Score,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,...,IDF Overlap,Persona Distance,response_time,Failure Ratio,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,Mistral-7B-Instruct,0.79 ± 0.41,0.02 ± 0.67\t,0.31 ± 0.67,0.01 ± 0.01,0.09 ± 0.08,0.01 ± 0.04,0.09 ± 0.07,0.11 ± 0.09,0.8 ± 0.18,...,0.07 ± 0.08,0.54 ± 0.2,6.9 ± 1.61,0.046 ± 0.00,0.9 ± 0.2,0.83 ± 0.32,1.98 ± 1.22,0.63 ± 0.44,0.9 ± 0.2,1.05 ± 0.34
1,Llama3-1-8B-Instruct,0.83 ± 0.38\t,0.05 ± 0.63,0.22 ± 0.58\t,0.01 ± 0.02,0.11 ± 0.09,0.02 ± 0.04,0.1 ± 0.08,0.12 ± 0.09,0.84 ± 0.07,...,0.07 ± 0.08,0.58 ± 0.13,8.17 ± 0.11,0.006 ± 0.00,0.94 ± 0.1,0.8 ± 0.32,1.81 ± 1.07,0.66 ± 0.41,0.93 ± 0.1,1.03 ± 0.25
2,Qwen2-7B-Instruct,0.87 ± 0.34\t,0.23 ± 0.67,0.38 ± 0.72\t,0.01 ± 0.01,0.09 ± 0.06,0.01 ± 0.03,0.08 ± 0.06,0.12 ± 0.09,0.82 ± 0.08,...,0.08 ± 0.07,0.63 ± 0.13,4.7 ± 0.94,0.008 ± 0.00,0.93 ± 0.1,0.87 ± 0.25,2.96 ± 1.22,0.84 ± 0.32,0.93 ± 0.11,1.31 ± 0.28
3,gpt-3.5-turbo,0.85 ± 0.36\t,0.1 ± 0.62,0.38 ± 0.74,0.01 ± 0.01,0.11 ± 0.08,0.02 ± 0.04,0.1 ± 0.07,0.13 ± 0.1,0.84 ± 0.03,...,0.08 ± 0.08,0.6 ± 0.13,1.4 ± 0.3,0.001 ± 0.00,0.96 ± 0.03,0.87 ± 0.25,2.28 ± 0.95,0.72 ± 0.39,0.96 ± 0.03,1.16 ± 0.21
4,gpt-4-turbo,0.83 ± 0.37\t,0.01 ± 0.58,0.42 ± 0.78,0.01 ± 0.01,0.11 ± 0.08,0.02 ± 0.04,0.1 ± 0.08,0.13 ± 0.1,0.84 ± 0.02,...,0.08 ± 0.07,0.59 ± 0.12,4.81 ± 1.45,0.0 ± 0.00,0.96 ± 0.02,0.91 ± 0.21,2.21 ± 0.9,0.78 ± 0.34,0.96 ± 0.03,1.16 ± 0.19
5,gpt-4o-mini,0.87 ± 0.33\t,0.14 ± 0.61,0.38 ± 0.74\t,0.01 ± 0.01,0.1 ± 0.06,0.01 ± 0.03,0.1 ± 0.06,0.16 ± 0.09,0.83 ± 0.04,...,0.09 ± 0.07,0.64 ± 0.11,2.2 ± 0.67,0.002 ± 0.00,0.95 ± 0.04,0.96 ± 0.13,3.17 ± 1.02,0.91 ± 0.23,0.95 ± 0.04,1.39 ± 0.22
6,Gemma-7B-Instruct,0.79 ± 0.41,0.19 ± 0.76,0.25 ± 0.56,0.01 ± 0.01,0.09 ± 0.08,0.01 ± 0.04,0.09 ± 0.07,0.12 ± 0.1,0.75 ± 0.26,...,0.12 ± 0.13,0.57 ± 0.24,8.64 ± 1.59,0.105 ± 0.00,0.85 ± 0.29,0.73 ± 0.4,2.42 ± 1.38,0.74 ± 0.41,0.85 ± 0.29,1.12 ± 0.45
7,gemini-1.5-pro,,0.03 ± 0.64,0.33 ± 0.7,,,,,,0.83 ± 0.08,...,,0.58 ± 0.15,4.83 ± 3.46,0.008 ± 0.00,0.94 ± 0.09,0.9 ± 0.23,2.67 ± 1.28,0.76 ± 0.37,0.94 ± 0.09,1.24 ± 0.29
