# Sample Usage

In [None]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

# Task name handed to get_evaluator to pick the evaluator
task = 'dialogue'

# Dialogue histories, one string per example
src_list = [
    'hi , do you know much about the internet ? \n i know a lot about different sites and some website design , how about you ? \n\n',
]
# Additional context the generated response is expected to draw on
context_list = [
    'the 3 horizontal line menu on apps and websites is called a hamburger button .\n',
]
# Model outputs to be scored
output_list = [
    'i do too . did you know the 3 horizontal line menu on apps and websites is called the hamburger button ?',
]

# Pack the three parallel lists into the structure the evaluator consumes
data = convert_to_json(
    output_list=output_list,
    src_list=src_list,
    context_list=context_list,
)

# Build the evaluator for the chosen task, then score and print the result table
evaluator = get_evaluator(task)
eval_scores = evaluator.evaluate(data, print_result=True)

100%|██████████| 1/1 [00:00<00:00, 54.32it/s]
100%|██████████| 1/1 [00:00<00:00, 48.42it/s]
100%|██████████| 1/1 [00:00<00:00, 44.07it/s]
100%|██████████| 1/1 [00:00<00:00, 55.77it/s]
100%|██████████| 1/1 [00:00<00:00, 51.56it/s]


Evaluation scores are shown below:
+-------------------+----------+
|     Dimensions    |  Score   |
+-------------------+----------+
|    naturalness    | 0.950217 |
|     coherence     | 0.973135 |
|    engagingness   | 1.750486 |
|    groundedness   | 0.999566 |
| understandability | 0.946209 |
|      overall      | 1.123923 |
+-------------------+----------+





In [6]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

task = 'dialogue'

# Batch inputs, written one example at a time as (dialogue history, context, model output)
examples = [
    (
        'hi , do you know much about the internet ? \n i know a lot about different sites and some website design , how about you ? \n\n',
        'the 3 horizontal line menu on apps and websites is called a hamburger button .\n',
        'i do too . did you know the 3 horizontal line menu on apps and websites is called the hamburger button ?',
    ),
    (
        'what is your favorite color ? \n i like blue a lot , but sometimes i prefer green . \n\n',
        'colors can reflect your mood and personality .\n',
        'i like blue as well . it is calming and reminds me of the ocean .',
    ),
]

# Unzip the examples into the three parallel lists convert_to_json is called with
src_list, context_list, output_list = (list(column) for column in zip(*examples))

# Prepare data for pre-trained evaluators
data = convert_to_json(
    output_list=output_list,
    src_list=src_list,
    context_list=context_list,
)

# Initialize evaluator for a specific task
evaluator = get_evaluator(task)

# Score the whole batch without printing the summary table
eval_scores = evaluator.evaluate(data, print_result=False)

# Print one line of scores per example, numbered from 1
for i, score in enumerate(eval_scores):
    print(f"Scores for example {i + 1}: {score}")


100%|██████████| 1/1 [00:00<00:00, 42.99it/s]
100%|██████████| 1/1 [00:00<00:00, 52.12it/s]
100%|██████████| 1/1 [00:00<00:00, 38.88it/s]
100%|██████████| 1/1 [00:00<00:00, 52.98it/s]
100%|██████████| 1/1 [00:00<00:00, 50.35it/s]

Scores for example 1: {'naturalness': 0.9502174201360719, 'coherence': 0.9731347836152868, 'engagingness': 1.7504860805525295, 'groundedness': 0.9995656267195939, 'understandability': 0.9462095037239142, 'overall': 1.1239226829494793}
Scores for example 2: {'naturalness': 0.9675946477090701, 'coherence': 0.998674558536015, 'engagingness': 1.9936029005678884, 'groundedness': 0.9857853472625128, 'understandability': 0.9632170362278318, 'overall': 1.1817748980606635}





In [7]:
# Display the per-example score dictionaries returned by evaluator.evaluate
eval_scores

[{'naturalness': 0.9502174201360719,
  'coherence': 0.9731347836152868,
  'engagingness': 1.7504860805525295,
  'groundedness': 0.9995656267195939,
  'understandability': 0.9462095037239142,
  'overall': 1.1239226829494793},
 {'naturalness': 0.9675946477090701,
  'coherence': 0.998674558536015,
  'engagingness': 1.9936029005678884,
  'groundedness': 0.9857853472625128,
  'understandability': 0.9632170362278318,
  'overall': 1.1817748980606635}]

# Response Generation Evaluation

In [1]:
import pandas as pd
import warnings
import logging

# Suppress Python warnings for the rest of the session
warnings.filterwarnings("ignore")

# Only let ERROR-level (and above) messages from the transformers logger through
logging.getLogger("transformers").setLevel(logging.ERROR)


In [2]:
# Dataset to evaluate.
# Options: Synthetic-PersonaChat, Blended Skill Talk, PEC, ConvAI2, FoCus, IT-ConvAI2
Dataset = "FoCus"

# Model whose generated responses are evaluated.
# Options: Mistral-7B-Instruct, Llama3-1-8B-Instruct, Qwen2-7B-Instruct, Gemma-7B-Instruct,
#          gpt-3.5-turbo, gpt-4-turbo, gpt-4o-mini
LLM_name = "Gemma-7B-Instruct"

# When True, the "-COT" variants of the response and result files are used
COT_SETUP = True

In [3]:
# Load the prompt table for the selected dataset and report its size
prompts_csv = f'../Prompts/{Dataset}.csv'
df = pd.read_csv(prompts_csv)
print("Shape:", df.shape)

df.head()

Shape: (1000, 3)


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see..."


In [4]:
# Show which dataset is currently selected
Dataset

'FoCus'

In [5]:
# Dataset-specific persona clean-up: strip the persona label, square brackets and quotes.
# Only the datasets listed here are touched; every other dataset is left unchanged.
persona_cleanup_patterns = {
    "Blended Skill Talk": r'\[User 1 persona\]:|\[|\]|"|\'',
    "PEC": r'\[Responder persona\]:|\[|\]|"|\'',
}

cleanup_pattern = persona_cleanup_patterns.get(Dataset)
if cleanup_pattern is not None:
    df['personas'] = df['personas'].str.replace(cleanup_pattern, '', regex=True).str.strip()


# Missing-value count per column, followed by a preview of the first rows
print(df.isnull().sum())
df.head(6)

personas        0
context         0
act_response    0
dtype: int64


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see..."
5,I would like to go to University.I live in Mic...,User1: I think Ive been there before but I don...,User2: They offer 132 bachelors degree program...


In [6]:
# File-name suffix: "-COT" when the chain-of-thought setup is enabled, empty otherwise
if COT_SETUP:
    COT_ = "-COT"
else:
    COT_ = ""

# Load the generated responses for the selected dataset / model / setup
response = pd.read_csv(f'../Responses/{Dataset}/{LLM_name}{COT_}.csv')
print("Shape:", response.shape)

# Per-column count of missing values
print("\nMissing Values:")
print(response.isnull().sum())

response

Shape: (1000, 2)

Missing Values:
gen_response     150
response_time      0
dtype: int64


Unnamed: 0,gen_response,response_time
0,The Nazareth House is a place you would enjoy ...,9.720944
1,This house is definitely one of the ones I wou...,8.374085
2,"I think you would definitely love this place, ...",7.135097
3,Hello! I'm glad you're interested in Technisch...,11.084497
4,,9.350121
...,...,...
995,The museum come pharmacy in which you are work...,9.927329
996,"This is a fascinating archaeological site, Mah...",11.045374
997,Hello! This is Armagh County Museum. It is a m...,11.122940
998,Nyanga National Park is a perfect place for yo...,8.307516


In [7]:
# Longest generated response, counted in whitespace-separated words (missing responses are skipped)
generated_texts = response['gen_response'].dropna()
max_response_length = generated_texts.map(lambda text: len(text.split())).max()

print(f"Maximum Response Length (in words): {max_response_length}")

Maximum Response Length (in words): 117


In [8]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop words as a set, used by preprocess_text to filter tokens.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available locally — confirm.
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text, remove_stop_words=True):
    """
    Normalise a piece of text for comparison.

    The text is lower-cased, stripped of every character in
    string.punctuation, tokenised with word_tokenize and re-joined
    with single spaces.

    Args:
        text: The text to normalise; null values (as judged by pd.isnull) are passed through as None.
        remove_stop_words (bool): When True, tokens found in stop_words are dropped.

    Returns:
        str or None: The normalised text, or None for null input.
    """
    if pd.isnull(text):
        return None

    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_table)
    tokens = word_tokenize(normalized)

    if not remove_stop_words:
        return ' '.join(tokens)
    return ' '.join(token for token in tokens if token not in stop_words)

# Assemble the evaluation table: prompt-side columns from df, response-side columns from response
eval_df = pd.DataFrame({
    **{column: df[column] for column in ('personas', 'context')},
    **{column: response[column] for column in ('gen_response', 'response_time')},
})

# Missing-value count per column, followed by a preview of the first rows
print(eval_df.isnull().sum())
eval_df.head()

personas           0
context            0
gen_response     150
response_time      0
dtype: int64


Unnamed: 0,personas,context,gen_response,response_time
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,The Nazareth House is a place you would enjoy ...,9.720944
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",This house is definitely one of the ones I wou...,8.374085
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","I think you would definitely love this place, ...",7.135097
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,Hello! I'm glad you're interested in Technisch...,11.084497
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,,9.350121


In [9]:
import torch
# Device index: 0 selects the GPU when CUDA is available, -1 selects the CPU.
# To force CPU, set device = -1 instead.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

In [10]:
# Show the selected device index (0 = GPU, -1 = CPU)
device

0

**Finalized Input Mapping:**

- `src_list`: Use the `context` column (conversation history).
- `context_list`: Use the flattened and cleaned `personas` column.
- `output_list`: Use the `gen_response` column (the response your model generates).

**Note:**

The `act_response` column (the true, or reference, response) is not required as an input for the UniEval evaluation process, because UniEval evaluates the generated response (`gen_response`) based on how well it fits the provided context (conversation history) and the additional persona information.

In [11]:
from utils import convert_to_json
from metric.evaluator import get_evaluator

def calculate_unieval_scores(personas, contexts, gen_responses, evaluator=None):
    """
    Calculates UniEval scores for a batch of inputs.

    Args:
        personas (list): Persona information used as additional context.
            Entries that are lists are joined into one space-separated string.
        contexts (list): Conversation histories leading to the responses.
        gen_responses (list): Generated responses to be evaluated.
        evaluator (optional): An already-initialised dialogue evaluator to reuse
            across calls. When omitted, a new one is created with
            get_evaluator('dialogue') on every call.

    Returns:
        list: The result of evaluator.evaluate for the batch (one score
        dictionary per input), or an empty list when there are no inputs.

    Raises:
        ValueError: If the three input lists do not have the same length.
    """
    # The three lists are parallel; a length mismatch would pair responses with the wrong context
    if not (len(personas) == len(contexts) == len(gen_responses)):
        raise ValueError(
            "personas, contexts and gen_responses must have the same length "
            f"(got {len(personas)}, {len(contexts)}, {len(gen_responses)})"
        )

    # Nothing to score: skip building an evaluator altogether
    if len(gen_responses) == 0:
        return []

    # Flatten personas if they are lists
    personas = [' '.join(p) if isinstance(p, list) else p for p in personas]

    # Prepare inputs for UniEval
    data = convert_to_json(output_list=gen_responses, src_list=contexts, context_list=personas)

    # Initialize the evaluator for dialogue tasks unless the caller supplied one
    if evaluator is None:
        evaluator = get_evaluator('dialogue')

    # Evaluate and obtain scores for all inputs
    eval_scores = evaluator.evaluate(data, print_result=False)

    return eval_scores


In [12]:
# Worst-case UniEval result: every dimension scored 0.0
worst_unieval_score = dict.fromkeys(
    (
        'naturalness',
        'coherence',
        'engagingness',
        'groundedness',
        'understandability',
        'overall',
    ),
    0.0,
)


In [None]:
from tqdm import tqdm
import pandas as pd

# Score eval_df in fixed-size batches, producing exactly one score dict per row, in row order.
batch_size = 200  # Adjust batch size as needed

# Evaluator dimension key -> column name used in the report
unieval_column_names = {
    'naturalness': "UniEval Naturalness",
    'coherence': "UniEval Coherence",
    'engagingness': "UniEval Engagingness",
    'groundedness': "UniEval Groundedness",
    'understandability': "UniEval Understandability",
    'overall': "UniEval Overall",
}

# List to store all UniEval scores (position k holds the scores of eval_df row k)
all_unieval_scores = []

for i in tqdm(range(0, len(eval_df), batch_size), desc="Evaluating batches"):
    batch = eval_df.iloc[i:i+batch_size]

    # Extract relevant fields from the batch
    personas = batch['personas'].tolist()
    contexts = batch['context'].tolist()
    gen_responses = batch['gen_response'].tolist()

    # Rows whose response is missing or blank cannot be scored
    valid_indices = [
        j for j, gen_response in enumerate(gen_responses)
        if pd.notna(gen_response) and gen_response.strip() != ''
    ]

    # Prepare valid inputs
    valid_personas = [personas[j] for j in valid_indices]
    valid_contexts = [contexts[j] for j in valid_indices]
    valid_gen_responses = [gen_responses[j] for j in valid_indices]

    # Start every row at the worst score; rows that can be scored are overwritten
    # at their own position so the results stay aligned with eval_df.
    batch_scores = [dict(worst_unieval_score) for _ in gen_responses]

    # Evaluate valid inputs
    if valid_indices:
        eval_scores = calculate_unieval_scores(valid_personas, valid_contexts, valid_gen_responses)
        if len(eval_scores) != len(valid_indices):
            raise ValueError(
                f"Expected {len(valid_indices)} score entries for the batch starting at row {i}, "
                f"got {len(eval_scores)}"
            )
        for j, score in zip(valid_indices, eval_scores):
            batch_scores[j] = score

    all_unieval_scores.extend(batch_scores)

# Convert all scores into a DataFrame, selecting and renaming the dimensions by key
# rather than by position so a different key order cannot mislabel a column.
metrics_df = pd.DataFrame(
    all_unieval_scores, columns=list(unieval_column_names)
).rename(columns=unieval_column_names)

# Combine with original DataFrame; metric columns left over from an earlier run of
# this cell are dropped first so re-running does not duplicate them.
eval_df = pd.concat(
    [
        eval_df.drop(columns=list(metrics_df.columns), errors='ignore').reset_index(drop=True),
        metrics_df.reset_index(drop=True),
    ],
    axis=1,
)

metrics_df

Evaluating batches:   0%|          | 0/5 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
# Per-metric mean and population standard deviation (ddof=0), each rounded to 2 decimals
avg_values = metrics_df.mean().round(2)
std_values = metrics_df.std(ddof=0).round(2)

# One "avg ± std" string per metric, with the model name placed in front
formatted_metrics = avg_values.astype(str) + " ± " + std_values.astype(str)
combined_values = [LLM_name, *formatted_metrics.tolist()]

In [None]:
# Single-row summary table: the model name followed by one "avg ± std" cell per metric
result_columns = ['Model', *metrics_df.columns]
result_df = pd.DataFrame(data=[combined_values], columns=result_columns)
result_df

Unnamed: 0,Model,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,Gemma-7B-Instruct,0.54 ± 0.46,0.57 ± 0.49,2.1 ± 1.97,0.55 ± 0.49,0.54 ± 0.46,0.86 ± 0.76


In [None]:
import pandas as pd

# Insert or update this model's row in the per-dataset results workbook
output_path = f'../Evaluations/{Dataset}{COT_}-results.xlsx'

try:
    existing_df = pd.read_excel(output_path)
except FileNotFoundError:
    # No workbook yet: the new row becomes the whole table
    existing_df = result_df
else:
    # Index labels of the rows already recorded for this model
    model_rows = existing_df.index[existing_df['Model'] == LLM_name]

    if len(model_rows) == 0:
        # Model not present yet: append its row
        existing_df = pd.concat([existing_df, result_df], ignore_index=True)
    else:
        # Model already present: overwrite its first row, adding any column that is missing
        existing_index = model_rows[0]
        for col in result_df.columns:
            if col not in existing_df.columns:
                existing_df[col] = None
            existing_df.at[existing_index, col] = result_df[col].values[0]

# Write the table back to the same workbook
existing_df.to_excel(output_path, index=False)

existing_df


Unnamed: 0,Model,P Consistency Score,C Score,UE Score,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,...,IDF Overlap,Persona Distance,response_time,Failure Ratio,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,Llama3-1-8B-Instruct,0.49 ± 0.5,-0.22 ± 0.87,0.17 ± 0.48\t,0.02 ± 0.08,0.11 ± 0.15,0.03 ± 0.1,0.1 ± 0.14,0.11 ± 0.14,0.47 ± 0.42,...,0.06 ± 0.1,0.33 ± 0.33,4.54 ± 0.21,0.448 ± 0.00,0.51 ± 0.46,0.55 ± 0.49,1.71 ± 1.88,0.42 ± 0.47,0.51 ± 0.46,0.74 ± 0.7
1,gpt-3.5-turbo,0.82 ± 0.39,0.26 ± 0.75,0.37 ± 0.67,0.04 ± 0.09,0.2 ± 0.14,0.06 ± 0.12,0.18 ± 0.14,0.2 ± 0.15,0.78 ± 0.25,...,0.07 ± 0.07,0.51 ± 0.21,1.3 ± 0.42,0.09 ± 0.00,0.84 ± 0.27,0.91 ± 0.29,2.31 ± 1.27,0.72 ± 0.39,0.84 ± 0.27,1.12 ± 0.42
2,gpt-4o-mini,0.81 ± 0.39\t,0.16 ± 0.71,0.35 ± 0.69,0.03 ± 0.05,0.19 ± 0.11,0.06 ± 0.08,0.17 ± 0.1,0.22 ± 0.15,0.78 ± 0.24,...,0.06 ± 0.05,0.54 ± 0.19,1.43 ± 0.66,0.086 ± 0.00,0.85 ± 0.27,0.91 ± 0.28,2.61 ± 1.2,0.72 ± 0.36,0.86 ± 0.26,1.19 ± 0.41
3,gpt-4-turbo,0.84 ± 0.36,0.14 ± 0.66,0.39 ± 0.73,0.04 ± 0.08,0.23 ± 0.13,0.08 ± 0.11,0.2 ± 0.13,0.23 ± 0.16,0.84 ± 0.16,...,0.05 ± 0.05,0.51 ± 0.15,3.32 ± 1.24,0.035 ± 0.00,0.91 ± 0.18,0.96 ± 0.18,2.2 ± 0.94,0.66 ± 0.38,0.91 ± 0.18,1.13 ± 0.28
4,Qwen2-7B-Instruct,0.45 ± 0.5,-0.31 ± 0.84,0.17 ± 0.48,0.01 ± 0.02,0.08 ± 0.1,0.02 ± 0.04,0.08 ± 0.09,0.1 ± 0.13,0.41 ± 0.42,...,0.04 ± 0.06,0.29 ± 0.3,3.74 ± 0.76,0.509 ± 0.00,0.46 ± 0.47,0.49 ± 0.5,1.72 ± 1.93,0.43 ± 0.47,0.46 ± 0.47,0.71 ± 0.75
5,Mistral-7B-Instruct,0.52 ± 0.5\t,-0.23 ± 0.82,0.21 ± 0.52,0.01 ± 0.04,0.11 ± 0.12,0.03 ± 0.06,0.1 ± 0.11,0.11 ± 0.14,0.48 ± 0.42,...,0.05 ± 0.07,0.3 ± 0.28,4.39 ± 0.36,0.435 ± 0.00,0.53 ± 0.46,0.56 ± 0.49,1.54 ± 1.55,0.45 ± 0.46,0.53 ± 0.46,0.72 ± 0.65
6,Gemma-7B-Instruct,0.53 ± 0.5,-0.13 ± 0.89,0.22 ± 0.53,0.01 ± 0.01,0.09 ± 0.1,0.01 ± 0.03,0.08 ± 0.09,0.09 ± 0.1,0.48 ± 0.42,...,0.08 ± 0.1,0.39 ± 0.34,5.17 ± 1.03,0.425 ± 0.00,0.54 ± 0.46,0.57 ± 0.49,2.1 ± 1.97,0.55 ± 0.49,0.54 ± 0.46,0.86 ± 0.76


In [None]:
# Read the results workbook back from the path the results were saved to and display it.
# NOTE(review): this rebinds `response`, which earlier held the generated-responses table —
# confirm nothing after this cell relies on the old value.
response = pd.read_excel(f'../Evaluations/{Dataset}{COT_}-results.xlsx')
response

Unnamed: 0,Model,P Consistency Score,C Score,UE Score,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,...,IDF Overlap,Persona Distance,response_time,Failure Ratio,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,Llama3-1-8B-Instruct,0.49 ± 0.5,-0.22 ± 0.87,0.17 ± 0.48\t,0.02 ± 0.08,0.11 ± 0.15,0.03 ± 0.1,0.1 ± 0.14,0.11 ± 0.14,0.47 ± 0.42,...,0.06 ± 0.1,0.33 ± 0.33,4.54 ± 0.21,0.448 ± 0.00,0.51 ± 0.46,0.55 ± 0.49,1.71 ± 1.88,0.42 ± 0.47,0.51 ± 0.46,0.74 ± 0.7
1,gpt-3.5-turbo,0.82 ± 0.39,0.26 ± 0.75,0.37 ± 0.67,0.04 ± 0.09,0.2 ± 0.14,0.06 ± 0.12,0.18 ± 0.14,0.2 ± 0.15,0.78 ± 0.25,...,0.07 ± 0.07,0.51 ± 0.21,1.3 ± 0.42,0.09 ± 0.00,0.84 ± 0.27,0.91 ± 0.29,2.31 ± 1.27,0.72 ± 0.39,0.84 ± 0.27,1.12 ± 0.42
2,gpt-4o-mini,0.81 ± 0.39\t,0.16 ± 0.71,0.35 ± 0.69,0.03 ± 0.05,0.19 ± 0.11,0.06 ± 0.08,0.17 ± 0.1,0.22 ± 0.15,0.78 ± 0.24,...,0.06 ± 0.05,0.54 ± 0.19,1.43 ± 0.66,0.086 ± 0.00,0.85 ± 0.27,0.91 ± 0.28,2.61 ± 1.2,0.72 ± 0.36,0.86 ± 0.26,1.19 ± 0.41
3,gpt-4-turbo,0.84 ± 0.36,0.14 ± 0.66,0.39 ± 0.73,0.04 ± 0.08,0.23 ± 0.13,0.08 ± 0.11,0.2 ± 0.13,0.23 ± 0.16,0.84 ± 0.16,...,0.05 ± 0.05,0.51 ± 0.15,3.32 ± 1.24,0.035 ± 0.00,0.91 ± 0.18,0.96 ± 0.18,2.2 ± 0.94,0.66 ± 0.38,0.91 ± 0.18,1.13 ± 0.28
4,Qwen2-7B-Instruct,0.45 ± 0.5,-0.31 ± 0.84,0.17 ± 0.48,0.01 ± 0.02,0.08 ± 0.1,0.02 ± 0.04,0.08 ± 0.09,0.1 ± 0.13,0.41 ± 0.42,...,0.04 ± 0.06,0.29 ± 0.3,3.74 ± 0.76,0.509 ± 0.00,0.46 ± 0.47,0.49 ± 0.5,1.72 ± 1.93,0.43 ± 0.47,0.46 ± 0.47,0.71 ± 0.75
5,Mistral-7B-Instruct,0.52 ± 0.5\t,-0.23 ± 0.82,0.21 ± 0.52,0.01 ± 0.04,0.11 ± 0.12,0.03 ± 0.06,0.1 ± 0.11,0.11 ± 0.14,0.48 ± 0.42,...,0.05 ± 0.07,0.3 ± 0.28,4.39 ± 0.36,0.435 ± 0.00,0.53 ± 0.46,0.56 ± 0.49,1.54 ± 1.55,0.45 ± 0.46,0.53 ± 0.46,0.72 ± 0.65
6,Gemma-7B-Instruct,0.53 ± 0.5,-0.13 ± 0.89,0.22 ± 0.53,0.01 ± 0.01,0.09 ± 0.1,0.01 ± 0.03,0.08 ± 0.09,0.09 ± 0.1,0.48 ± 0.42,...,0.08 ± 0.1,0.39 ± 0.34,5.17 ± 1.03,0.425 ± 0.00,0.54 ± 0.46,0.57 ± 0.49,2.1 ± 1.97,0.55 ± 0.49,0.54 ± 0.46,0.86 ± 0.76
