## Performance Evaluation

In [None]:
# pip install bert_score openpyxl gensim requests nltk

In [1]:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [1]:
import pandas as pd
import warnings
import logging

warnings.filterwarnings("ignore")
logging.getLogger('transformers').setLevel(logging.ERROR)

# Set the logging level to ERROR to ignore warnings
logging.getLogger("transformers").setLevel(logging.ERROR)


In [2]:
Dataset = "FoCus"                                # Synthetic-PersonaChat, Blended Skill Talk, PEC, ConvAI2, FoCus, IT-ConvAI2
LLM_name = "Gemma-7B-Instruct"                               # Mistral-7B-Instruct, Llama3-1-8B-Instruct, Qwen2-7B-Instruct,  Gemma-7B-Instruct, gpt-3.5-turbo, gpt-4-turbo, gpt-4o-mini
COT_SETUP = False

In [3]:
df = pd.read_csv(f'./Prompts/{Dataset}.csv')
print("Shape:", df.shape)

df.head()

Shape: (1000, 3)


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see..."


In [4]:
Dataset

'FoCus'

In [5]:
### Only For: FoCus, IT-ConvAI2
if Dataset == "FoCus" or Dataset == "IT-ConvAI2":
    df['act_response'] = df['act_response'].apply(lambda x: x.split(':', 1)[1].strip() if ':' in x else x.strip())

# ### Only For: Blended Skill Talk
if Dataset == "Blended Skill Talk":
    df['personas'] = df['personas'].str.replace(r'\[User 1 persona\]:|\[|\]|"|\'', '', regex=True).str.strip()

# ### Only For: PEC
if Dataset == "PEC":
    df['personas'] = df['personas'].str.replace(r'\[Responder persona\]:|\[|\]|"|\'', '', regex=True).str.strip()


print(df.isnull().sum())
df.head(6)

personas        0
context         0
act_response    0
dtype: int64


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,The history of the house you are interested in...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",This house was use as a stop for slaves trying...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","Sure, you will like to know that this place wa..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,Technische Universität Darmstadt in the top 25...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"I suggest a place, for your wish of see librar..."
5,I would like to go to University.I live in Mic...,User1: I think Ive been there before but I don...,They offer 132 bachelors degree programs and 1...


In [6]:
COT_ = "-COT" if COT_SETUP else ""
 
response = pd.read_csv(f'Responses/{Dataset}/{LLM_name}{COT_}.csv')
print("Shape:", response.shape)

print("\nMissing Values:")
print(response.isnull().sum())

response

Shape: (1000, 2)

Missing Values:
gen_response     425
response_time      0
dtype: int64


Unnamed: 0,gen_response,response_time
0,,5.545006
1,This is such a cool historic house! I've alway...,5.607118
2,"I'm glad you found this place amazing, it coul...",5.250293
3,"Hello, I'm interested in studying science in G...",3.402483
4,,5.625911
...,...,...
995,The museum is a pharmacy in which I am working...,5.616063
996,This is a fascinating archaeological site. Mah...,4.874345
997,Hello! This is Armagh County Museum. It is a m...,4.586782
998,I'm glad you're interested in Nyanga National ...,5.682001


In [7]:
# Calculate maximum number of words in each column
max_response_length = response['gen_response'].dropna().apply(lambda x: len(x.split())).max()

print(f"Maximum Response Length (in words): {max_response_length}")

Maximum Response Length (in words): 80


In [8]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Initialize stop words
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text, remove_stop_words=True):
    if pd.isnull(text):
        return None
    text = text.lower()  # Lowercasing
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removing punctuation
    tokens = word_tokenize(text)  # Tokenization
    if remove_stop_words:
        tokens = [word for word in tokens if word not in stop_words]  # Removing stop words
    return ' '.join(tokens)  # Join tokens back into a single string

# Create eval_df
eval_df = pd.DataFrame({
    'personas': df['personas'],
    'act_response': df['act_response'],
    'gen_response': response['gen_response'],
    'response_time': response['response_time']
})

print(eval_df.isnull().sum())
eval_df.head()

personas           0
act_response       0
gen_response     425
response_time      0
dtype: int64


Unnamed: 0,personas,act_response,gen_response,response_time
0,I would like to visit the Nazareth House again...,The history of the house you are interested in...,,5.545006
1,I have been to Vermont a few times to go skiin...,This house was use as a stop for slaves trying...,This is such a cool historic house! I've alway...,5.607118
2,I am fascinated by the Spanish Colonial Reviva...,"Sure, you will like to know that this place wa...","I'm glad you found this place amazing, it coul...",5.250293
3,I want to become a college student.I want to s...,Technische Universität Darmstadt in the top 25...,"Hello, I'm interested in studying science in G...",3.402483
4,I like to visit england.I love church.I would ...,"I suggest a place, for your wish of see librar...",,5.625911


In [9]:
import torch
device = 0 if torch.cuda.is_available() else -1  # device set to 0 for GPU, -1 for CPU
# device = -1

In [10]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
import bert_score
from tqdm import tqdm
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim
import torch
from nltk.corpus import stopwords
from transformers import pipeline, BertForSequenceClassification, BertTokenizer


# Initialize ROUGE scorer
rouge = Rouge()

# Lists to store the metrics
bleu_scores = []
rouge_scores = []
meteor_scores = []
bertscore_prec = []
bertscore_rec = []
bertscore_f1 = []
distinct_1 = []
distinct_2 = []
ue_scores = []
c_scores = []
consistency_scores = []
idf_scores = []
persona_distance_scores = []


bert_snli_dir = "Fine-tuning/output/bert_snli"
bert_snli_model = BertForSequenceClassification.from_pretrained(bert_snli_dir)
bert_snli_tokenizer = BertTokenizer.from_pretrained(bert_snli_dir)

# Initialize the NLI pipeline for UE Score
bert_on_snli = pipeline('text-classification', model = bert_snli_model, tokenizer = bert_snli_tokenizer, device=0)

bert_dnli_dir = "Fine-tuning/output/bert_dnli"
bert_dnli_model = BertForSequenceClassification.from_pretrained(bert_dnli_dir)
bert_dnli_tokenizer = BertTokenizer.from_pretrained(bert_dnli_dir)

# Initialize the NLI pipeline
bert_on_dnli = pipeline('text-classification', model = bert_dnli_model, tokenizer = bert_dnli_tokenizer, device=0)


# Initialize the Word2Vec Model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)
stop_words = set(stopwords.words('english'))


# Initialize smoothing function
smoothing_function = SmoothingFunction().method1

# Helper functions
def compute_bleu(reference, hypothesis):
    reference = [str(reference).replace('\n', ' ').split()]
    hypothesis = str(hypothesis).replace('\n', ' ').split()
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothing_function)

def compute_rouge(reference, hypothesis):
    scores = rouge.get_scores(str(hypothesis).replace('\n', ' '), str(reference).replace('\n', ' '), avg=True)
    return scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']

def compute_meteor(reference, hypothesis):
    reference = [str(reference).replace('\n', ' ').split()]
    hypothesis = str(hypothesis).replace('\n', ' ').split()
    return meteor_score(reference, hypothesis)

def compute_distinct_ngrams(text, n):
    tokens = str(text).replace('\n', ' ').split()
    ngrams = list(zip(*[tokens[i:] for i in range(n)]))
    distinct_ngrams = len(set(ngrams))
    total_ngrams = len(ngrams)
    return distinct_ngrams / total_ngrams if total_ngrams > 0 else 0


# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

def calculate_c_score(gen_response, persona):
    """
    Calculate the C score based on the entailment results between a generated response (R)
    and a given persona (P).

    Returns:
    int: C-score with possible values:
         1 for entailment (positive),
         0 for neutral,
         -1 for contradiction (negative).
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'negative',
        'LABEL_1': 'neutral',
        'LABEL_2': 'positive'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_dnli(f"{persona} {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Determine C score based on entailment results
    if label_pr == 'positive':
        return 1
    elif label_pr == 'neutral':
        return 0
    elif label_pr == 'negative':
        return -1
    else:
        raise ValueError(f"Unexpected label encountered: {label_pr}")


def calculate_consistency_score(gen_response, persona):
    """
    Calculate the Consistency Score based on the binary entailment results 
    between a generated response (R) and a given persona (P).

    Returns:
    int: Consistency Score with binary values:
         1 for entailment or neutral,
         0 for contradiction.
    """

    # Define the label mapping for binary classification
    label_mapping = {
        'LABEL_0': 'negative',
        'LABEL_1': 'neutral',
        'LABEL_2': 'positive'
    }

    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_dnli(f"{persona} {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Determine Consistency Score based on binary entailment results
    if label_pr in ['positive', 'neutral']:
        return 1
    elif label_pr == 'negative':
        return 0
    else:
        raise ValueError(f"Unexpected label encountered: {label_pr}")


def calculate_ue_score(act_response, gen_response, persona):
    """
    Calculate the UE score based on entailment between persona, actual response, and generated response.

    Returns:
    int: UE score with possible values 2, 1, or 0.
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'entailment',
        'LABEL_1': 'neutral',
        'LABEL_2': 'contradiction'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_snli(f"{persona} [SEP] {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Check entailment between actual response (Q) and generated response (R)
    result_qr = bert_on_snli(f"{act_response} [SEP] {gen_response}")
    label_qr = label_mapping.get(result_qr[0]['label'], 'unknown')

    # Determine UE score based on entailment results
    if label_pr == 'entailment' and label_qr == 'entailment':
        return 2
    elif label_pr == 'entailment':
        return 1
    else:
        return 0



def calculate_idf_weighted_overlap(persona, response):
    # Fit TF-IDF on both texts and calculate cosine similarity

    processed_persona = preprocess_text(persona)
    processed_response = preprocess_text(response)
    persona_new = str(processed_persona) if not isinstance(processed_persona, str) else processed_persona
    response_new = str(processed_response) if not isinstance(processed_response, str) else processed_response
    texts = [persona_new, response_new]

    # texts = [persona, response]
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]


def compute_persona_distance(persona, response, model, stop_words):
    # Tokenize and filter stopwords
    persona_tokens = [word for word in persona.lower().split() if word not in stop_words]
    response_tokens = [word for word in response.lower().split() if word not in stop_words]
    
    # Get word vectors
    persona_vecs = [model[word] for word in persona_tokens if word in model]
    response_vecs = [model[word] for word in response_tokens if word in model]
    
    # If no vectors found, return zero similarity
    if not persona_vecs or not response_vecs:
        return 0.0
    
    # Compute average vectors
    persona_avg_vec = np.mean(persona_vecs, axis=0)
    response_avg_vec = np.mean(response_vecs, axis=0)
    
    # Compute cosine similarity
    return cosine_similarity([persona_avg_vec], [response_avg_vec])[0][0]



Device set to use cuda:0
Device set to use cuda:0


In [11]:
import logging

# Set the logging level to ERROR to suppress warnings about training
logging.getLogger("transformers").setLevel(logging.ERROR)

# Default worst-case values
worst_bleu = 0.0
worst_rouge = (0.0, 0.0, 0.0)
worst_meteor = 0.0
worst_bertscore = (0.0, 0.0, 0.0)
worst_distinct = 0.0
worst_c_score = -1.0
worst_consistency_score = 0.0
worst_idf_score = 0.0
worst_ue_score = 0.0
worst_persona_distance_score = 0.0

In [12]:
# Initialize a counter for invalid gen_response
invalid_gen_res_count = 0

# Iterate over each row
for index, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
    personas = row['personas']
    act_response = row['act_response']
    gen_response = row['gen_response']

    # Check for NaN or None in gen_response
    if pd.isna(gen_response):
        invalid_gen_res_count += 1
        
        bleu_scores.append(worst_bleu)
        rouge_scores.append(worst_rouge)
        meteor_scores.append(worst_meteor)
        bertscore_prec.append(worst_bertscore[0])
        bertscore_rec.append(worst_bertscore[1])
        bertscore_f1.append(worst_bertscore[2])
        distinct_1.append(worst_distinct)
        distinct_2.append(worst_distinct)
        c_scores.append(worst_c_score)
        consistency_scores.append(worst_consistency_score)
        idf_scores.append(worst_idf_score)
        persona_distance_scores.append(worst_persona_distance_score)
        ue_scores.append(worst_ue_score)

        continue

    bleu = compute_bleu(act_response, gen_response)
    bleu_scores.append(bleu)
    
    rouge_1, rouge_2, rouge_l = compute_rouge(act_response, gen_response)
    rouge_scores.append((rouge_1, rouge_2, rouge_l))
    
    meteor = compute_meteor(act_response, gen_response)
    meteor_scores.append(meteor)
    
    P, R, F1 = bert_score.score([gen_response], [act_response], lang="en", verbose=False)
    bertscore_prec.append(P.mean().item())
    bertscore_rec.append(R.mean().item())
    bertscore_f1.append(F1.mean().item())
    
    distinct_1.append(compute_distinct_ngrams(gen_response, 1))
    
    distinct_2.append(compute_distinct_ngrams(gen_response, 2))
    
    c_scores.append(calculate_c_score(personas, gen_response))
    
    consistency_scores.append(calculate_consistency_score(personas, gen_response))
    
    ue_scores.append(calculate_ue_score(act_response, gen_response, personas))

    idf_scores.append(calculate_idf_weighted_overlap(personas, gen_response))
    
    persona_distance = compute_persona_distance(personas, gen_response, word2vec_model, stop_words)
    persona_distance_scores.append(persona_distance)


# Compile metrics into DataFrame
metrics_df = pd.DataFrame({
    'BLEU': bleu_scores,
    'R1': [score[0] for score in rouge_scores],
    'R2': [score[1] for score in rouge_scores],
    'RL': [score[2] for score in rouge_scores],
    'METEOR': meteor_scores,
    'BERTScore_Prec': bertscore_prec,
    'BERTScore_Rec': bertscore_rec,
    'BERTScore_F1': bertscore_f1,
    'Dist1': distinct_1,
    'Dist2': distinct_2,
    'C Score': c_scores,
    'P Consistency Score': consistency_scores,
    'IDF Overlap': idf_scores,
    'UE Score': ue_scores,
    'Persona Distance': persona_distance_scores
})

metrics_df

100%|██████████| 1000/1000 [12:59<00:00,  1.28it/s]


Unnamed: 0,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,BERTScore_Rec,BERTScore_F1,Dist1,Dist2,C Score,P Consistency Score,IDF Overlap,UE Score,Persona Distance
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1.0,0.0,0.000000,0.0,0.000000
1,0.005120,0.205882,0.000000,0.176471,0.161943,0.848211,0.872643,0.860254,0.827586,0.964912,1.0,1.0,0.238574,0.0,0.857521
2,0.012124,0.236842,0.044944,0.236842,0.202705,0.851323,0.847875,0.849595,0.827586,0.982456,1.0,1.0,0.149118,0.0,0.690035
3,0.022863,0.171429,0.055556,0.171429,0.153216,0.857064,0.851652,0.854349,0.941176,1.000000,0.0,1.0,0.068436,0.0,0.640356
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-1.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.008049,0.196721,0.027027,0.163934,0.169082,0.811197,0.837593,0.824184,0.714286,0.935484,1.0,1.0,0.410034,1.0,0.751643
996,0.004392,0.044444,0.000000,0.044444,0.037879,0.832957,0.867341,0.849801,0.833333,1.000000,1.0,1.0,0.191487,0.0,0.836204
997,0.016133,0.250000,0.088889,0.200000,0.350067,0.856305,0.890171,0.872910,0.815789,0.972973,1.0,1.0,0.055004,1.0,0.593685
998,0.008649,0.210526,0.024096,0.184211,0.238983,0.840385,0.834732,0.837549,0.841270,0.951613,0.0,1.0,0.040507,0.0,0.677568


In [13]:
# Add the 'response_time' column to 'metrics_df'
metrics_df['response_time'] = eval_df['response_time']

In [14]:
# Calculate the mean (average) and standard deviation, rounded to 2 decimal places
avg_values = metrics_df.mean().round(2)
std_values = metrics_df.std(ddof=0).round(2)  # Use ddof=0 for population standard deviation

# Combine the average and standard deviation into the format "avg ± std"
combined_values = avg_values.astype(str) + " ± " + std_values.astype(str)

# Insert the LLM name at the beginning of the combined values
combined_values = combined_values.tolist()
combined_values.insert(0, LLM_name)

# Create a DataFrame for the combined average ± std row
result_df = pd.DataFrame([combined_values], columns=['Model'] + metrics_df.columns.tolist())

# Add the ratio of invalid gen_response
invalid_gen_res_ratio = invalid_gen_res_count / len(eval_df)
result_df['Failure Ratio'] = f"{round(invalid_gen_res_ratio, 3)} ± 0.00"  # No std for Failure Ratio

result_df

Unnamed: 0,Model,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,BERTScore_Rec,BERTScore_F1,Dist1,Dist2,C Score,P Consistency Score,IDF Overlap,UE Score,Persona Distance,response_time,Failure Ratio
0,Gemma-7B-Instruct,0.01 ± 0.01,0.09 ± 0.1,0.01 ± 0.03,0.08 ± 0.09,0.09 ± 0.1,0.48 ± 0.42,0.49 ± 0.42,0.49 ± 0.42,0.48 ± 0.42,0.56 ± 0.49,-0.13 ± 0.89,0.53 ± 0.5,0.08 ± 0.1,0.22 ± 0.53,0.39 ± 0.34,5.17 ± 1.03,0.425 ± 0.00


In [15]:
# Load the existing Excel file and update or append the average row
output_path = f'./Evaluations/{Dataset}{COT_}-results.xlsx'

try:
    # Load existing data
    existing_df = pd.read_excel(output_path)
    # Check if the model name already exists
    if LLM_name in existing_df['Model'].values:
        # Update the row with the same model name
        existing_df.loc[existing_df['Model'] == LLM_name, :] = result_df.values
    else:
        # Append the new data
        existing_df = pd.concat([existing_df, result_df], ignore_index=True)
except FileNotFoundError:
    # If the file does not exist, create a new DataFrame
    existing_df = result_df

# Save the updated DataFrame to an Excel file
existing_df.to_excel(output_path, index=False)

existing_df

Unnamed: 0,Model,P Consistency Score,C Score,UE Score,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,...,IDF Overlap,Persona Distance,response_time,Failure Ratio,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,Llama3-1-8B-Instruct,0.49 ± 0.5,-0.22 ± 0.87,0.17 ± 0.48\t,0.02 ± 0.08,0.11 ± 0.15,0.03 ± 0.1,0.1 ± 0.14,0.11 ± 0.14,0.47 ± 0.42,...,0.06 ± 0.1,0.33 ± 0.33,4.54 ± 0.21,0.448 ± 0.00,0.51 ± 0.46,0.55 ± 0.49,1.71 ± 1.88,0.42 ± 0.47,0.51 ± 0.46,0.74 ± 0.7
1,gpt-3.5-turbo,0.82 ± 0.39,0.26 ± 0.75,0.37 ± 0.67,0.04 ± 0.09,0.2 ± 0.14,0.06 ± 0.12,0.18 ± 0.14,0.2 ± 0.15,0.78 ± 0.25,...,0.07 ± 0.07,0.51 ± 0.21,1.3 ± 0.42,0.09 ± 0.00,0.84 ± 0.27,0.91 ± 0.29,2.31 ± 1.27,0.72 ± 0.39,0.84 ± 0.27,1.12 ± 0.42
2,gpt-4o-mini,0.81 ± 0.39\t,0.16 ± 0.71,0.35 ± 0.69,0.03 ± 0.05,0.19 ± 0.11,0.06 ± 0.08,0.17 ± 0.1,0.22 ± 0.15,0.78 ± 0.24,...,0.06 ± 0.05,0.54 ± 0.19,1.43 ± 0.66,0.086 ± 0.00,0.85 ± 0.27,0.91 ± 0.28,2.61 ± 1.2,0.72 ± 0.36,0.86 ± 0.26,1.19 ± 0.41
3,gpt-4-turbo,0.84 ± 0.36,0.14 ± 0.66,0.39 ± 0.73,0.04 ± 0.08,0.23 ± 0.13,0.08 ± 0.11,0.2 ± 0.13,0.23 ± 0.16,0.84 ± 0.16,...,0.05 ± 0.05,0.51 ± 0.15,3.32 ± 1.24,0.035 ± 0.00,0.91 ± 0.18,0.96 ± 0.18,2.2 ± 0.94,0.66 ± 0.38,0.91 ± 0.18,1.13 ± 0.28
4,Qwen2-7B-Instruct,0.45 ± 0.5,-0.31 ± 0.84,0.17 ± 0.48,0.01 ± 0.02,0.08 ± 0.1,0.02 ± 0.04,0.08 ± 0.09,0.1 ± 0.13,0.41 ± 0.42,...,0.04 ± 0.06,0.29 ± 0.3,3.74 ± 0.76,0.509 ± 0.00,0.46 ± 0.47,0.49 ± 0.5,1.72 ± 1.93,0.43 ± 0.47,0.46 ± 0.47,0.71 ± 0.75
5,Mistral-7B-Instruct,0.52 ± 0.5\t,-0.23 ± 0.82,0.21 ± 0.52,0.01 ± 0.04,0.11 ± 0.12,0.03 ± 0.06,0.1 ± 0.11,0.11 ± 0.14,0.48 ± 0.42,...,0.05 ± 0.07,0.3 ± 0.28,4.39 ± 0.36,0.435 ± 0.00,0.53 ± 0.46,0.56 ± 0.49,1.54 ± 1.55,0.45 ± 0.46,0.53 ± 0.46,0.72 ± 0.65
6,Gemma-7B-Instruct,0.53 ± 0.5,-0.13 ± 0.89,0.22 ± 0.53,0.01 ± 0.01,0.09 ± 0.1,0.01 ± 0.03,0.08 ± 0.09,0.09 ± 0.1,0.48 ± 0.42,...,0.08 ± 0.1,0.39 ± 0.34,5.17 ± 1.03,0.425 ± 0.00,,,,,,


In [16]:
response = pd.read_excel(f'./Evaluations/{Dataset}{COT_}-results.xlsx')
response

Unnamed: 0,Model,P Consistency Score,C Score,UE Score,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,...,IDF Overlap,Persona Distance,response_time,Failure Ratio,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,Llama3-1-8B-Instruct,0.49 ± 0.5,-0.22 ± 0.87,0.17 ± 0.48\t,0.02 ± 0.08,0.11 ± 0.15,0.03 ± 0.1,0.1 ± 0.14,0.11 ± 0.14,0.47 ± 0.42,...,0.06 ± 0.1,0.33 ± 0.33,4.54 ± 0.21,0.448 ± 0.00,0.51 ± 0.46,0.55 ± 0.49,1.71 ± 1.88,0.42 ± 0.47,0.51 ± 0.46,0.74 ± 0.7
1,gpt-3.5-turbo,0.82 ± 0.39,0.26 ± 0.75,0.37 ± 0.67,0.04 ± 0.09,0.2 ± 0.14,0.06 ± 0.12,0.18 ± 0.14,0.2 ± 0.15,0.78 ± 0.25,...,0.07 ± 0.07,0.51 ± 0.21,1.3 ± 0.42,0.09 ± 0.00,0.84 ± 0.27,0.91 ± 0.29,2.31 ± 1.27,0.72 ± 0.39,0.84 ± 0.27,1.12 ± 0.42
2,gpt-4o-mini,0.81 ± 0.39\t,0.16 ± 0.71,0.35 ± 0.69,0.03 ± 0.05,0.19 ± 0.11,0.06 ± 0.08,0.17 ± 0.1,0.22 ± 0.15,0.78 ± 0.24,...,0.06 ± 0.05,0.54 ± 0.19,1.43 ± 0.66,0.086 ± 0.00,0.85 ± 0.27,0.91 ± 0.28,2.61 ± 1.2,0.72 ± 0.36,0.86 ± 0.26,1.19 ± 0.41
3,gpt-4-turbo,0.84 ± 0.36,0.14 ± 0.66,0.39 ± 0.73,0.04 ± 0.08,0.23 ± 0.13,0.08 ± 0.11,0.2 ± 0.13,0.23 ± 0.16,0.84 ± 0.16,...,0.05 ± 0.05,0.51 ± 0.15,3.32 ± 1.24,0.035 ± 0.00,0.91 ± 0.18,0.96 ± 0.18,2.2 ± 0.94,0.66 ± 0.38,0.91 ± 0.18,1.13 ± 0.28
4,Qwen2-7B-Instruct,0.45 ± 0.5,-0.31 ± 0.84,0.17 ± 0.48,0.01 ± 0.02,0.08 ± 0.1,0.02 ± 0.04,0.08 ± 0.09,0.1 ± 0.13,0.41 ± 0.42,...,0.04 ± 0.06,0.29 ± 0.3,3.74 ± 0.76,0.509 ± 0.00,0.46 ± 0.47,0.49 ± 0.5,1.72 ± 1.93,0.43 ± 0.47,0.46 ± 0.47,0.71 ± 0.75
5,Mistral-7B-Instruct,0.52 ± 0.5\t,-0.23 ± 0.82,0.21 ± 0.52,0.01 ± 0.04,0.11 ± 0.12,0.03 ± 0.06,0.1 ± 0.11,0.11 ± 0.14,0.48 ± 0.42,...,0.05 ± 0.07,0.3 ± 0.28,4.39 ± 0.36,0.435 ± 0.00,0.53 ± 0.46,0.56 ± 0.49,1.54 ± 1.55,0.45 ± 0.46,0.53 ± 0.46,0.72 ± 0.65
6,Gemma-7B-Instruct,0.53 ± 0.5,-0.13 ± 0.89,0.22 ± 0.53,0.01 ± 0.01,0.09 ± 0.1,0.01 ± 0.03,0.08 ± 0.09,0.09 ± 0.1,0.48 ± 0.42,...,0.08 ± 0.1,0.39 ± 0.34,5.17 ± 1.03,0.425 ± 0.00,,,,,,


## Testing the Functions

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import numpy as np
import gensim
import torch

device = 0 if torch.cuda.is_available() else -1  # device set to 0 for GPU, -1 for CPU

word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)

nli_model = pipeline('text-classification', model='facebook/bart-large-mnli', device=device)

persona_text = "I am a software engineer. I love coding in Python. I also enjoy hiking during weekends."
response_text = "Coding in Python is one of my favorite activities. On weekends, I often go hiking."
gen_response_text = "On weekends, I often go hiking, and Python coding is something I really enjoy."



In [None]:
def calculate_persona_distance(persona, response, word2vec):
    
    # persona and response are already pre-processed and stopwords are removed
    
    persona_tokens = [word for word in persona.lower().split()]
    response_tokens = [word for word in response.lower().split()]
    
    
    # Get embeddings for tokens if they exist in the word2vec model
    persona_embeddings = [word2vec[word] for word in persona_tokens if word in word2vec]
    response_embeddings = [word2vec[word] for word in response_tokens if word in word2vec]
    
    # Calculate similarity matrices M_i for each persona keyword embedding p_i
    similarity_matrices = []
    for p_i in persona_embeddings:
        similarity_matrix = cosine_similarity([p_i], response_embeddings)
        print(similarity_matrix)
        similarity_matrices.append(np.max(similarity_matrix))
    
    # Calculate the P.Distance
    p_distance = np.mean(similarity_matrices)
    
    return p_distance



# Example usage:
calculate_persona_distance(persona_text, gen_response_text, word2vec_model)


[[-0.00336669  1.         -0.01056319  0.13576175  0.11221063  0.19117984
   0.09335288  0.1638258   1.          0.2615446 ]]
[[ 0.18068513  0.25550863 -0.03758506  0.21812178  0.02882948  0.05052864
   0.34898528  0.13838342  0.2555086   0.21406093]]
[[0.00118619 0.09245683 0.03459306 0.02209745 0.14510264 0.51879144
  0.06418717 0.0685657  0.09245681 0.0643034 ]]
[[-0.00336669  1.         -0.01056319  0.13576175  0.11221063  0.19117984
   0.09335288  0.1638258   1.          0.2615446 ]]
[[0.04461894 0.16269058 0.1994664  0.20273286 0.06281446 0.03281252
  0.06216637 0.31501308 0.16269056 0.41482794]]
[[-2.1985812e-02  1.9117984e-01  1.2112212e-01  2.7989851e-02
   1.6500372e-01  1.0000000e+00 -3.7418772e-04  1.0601519e-01
   1.9117984e-01  6.1645295e-02]]
[[0.3958767  0.01996717 0.14580318 0.15322813 0.0025511  0.02053415
  0.243254   0.08290297 0.01996717 0.18737666]]
[[-0.00336669  1.         -0.01056319  0.13576175  0.11221063  0.19117984
   0.09335288  0.1638258   1.          0.2

0.5649009

In [None]:
def calculate_coh_con_score(act_response, gen_response, persona):
    # Check entailment between full Persona (P) and Generated Response (gen_response)
    nli_result_pr = nli_model(f"{persona} [SEP] {gen_response}")
    label_pr = nli_result_pr[0]['label'].lower()
    print(f"Persona Premise: {persona}")
    print(f"Generated Response Hypothesis: {gen_response}")
    print(f"NLI Result for (P, gen_response): {nli_result_pr}")
    is_persona_entails_response = label_pr == 'entailment'

    # Check entailment between full Actual Response (act_response) and Generated Response (gen_response)
    nli_result_qr = nli_model(f"{act_response} [SEP] {gen_response}")
    label_qr = nli_result_qr[0]['label'].lower()
    print(f"Actual Response Premise: {act_response}")
    print(f"Generated Response Hypothesis: {gen_response}")
    print(f"NLI Result for (act_response, gen_response): {nli_result_qr}")
    is_act_entails_response = label_qr == 'entailment'

    # Assign the Coh-Con.Score based on the entailment results
    if is_persona_entails_response and is_act_entails_response:
        score = 2
    elif is_persona_entails_response:
        score = 1
    else:
        score = 0
    
    return score

In [None]:
calculate_coh_con_score(response_text, gen_response_text, persona_text)

Persona Premise: I am a software engineer. I love coding in Python. I also enjoy hiking during weekends.
Generated Response Hypothesis: On weekends, I often go hiking, and Python coding is something I really enjoy.
NLI Result for (P, gen_response): [{'label': 'entailment', 'score': 0.9853878617286682}]
Actual Response Premise: Coding in Python is one of my favorite activities. On weekends, I often go hiking.
Generated Response Hypothesis: On weekends, I often go hiking, and Python coding is something I really enjoy.
NLI Result for (act_response, gen_response): [{'label': 'entailment', 'score': 0.9843922853469849}]


2

### Testing the fine-tuned BERT on SNLI dataset

Labels of the SNLI dataset:

- 0: entailment
- 1: neutral
- 2: contradiction


UE-score:

- 2: R is aligned with P and Q
- 1: R is alinged with P
- 0: no alignment

In [None]:
from transformers import pipeline, BertForSequenceClassification, BertTokenizer

# Load the model and tokenizer (assuming the model is already fine-tuned on SNLI)
model_dir = "Fine-tuning/output/bert_snli"
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Initialize the NLI pipeline
bert_on_snli = pipeline('text-classification', model=model, tokenizer=tokenizer, device=0)

def calculate_ue_score(act_response, gen_response, persona):
    """
    Calculate the UE score based on entailment between persona, actual response, and generated response.

    Returns:
    int: UE score with possible values 2, 1, or 0.
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'entailment',
        'LABEL_1': 'neutral',
        'LABEL_2': 'contradiction'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_snli(f"{persona} [SEP] {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Check entailment between actual response (Q) and generated response (R)
    result_qr = bert_on_snli(f"{act_response} [SEP] {gen_response}")
    label_qr = label_mapping.get(result_qr[0]['label'], 'unknown')

    # Determine UE score based on entailment results
    if label_pr == 'entailment' and label_qr == 'entailment':
        return 2
    elif label_pr == 'entailment':
        return 1
    else:
        return 0


In [None]:
# Actual response
act_response = "Coding in Python is one of my favorite activities. On weekends, I often go hiking."

# Generated response
gen_response_text = "On weekends, I often go hiking, and Python coding is something I really enjoy."

persona_text = "I am a software engineer. I love coding in Python. I also enjoy hiking during weekends."


In [None]:
calculate_ue_score(act_response, gen_response_text, persona_text)

2

### Testing the fine-tuned BERT on DNLI dataset

Labels of the DNLI dataset:

- 0: negative
- 1: neutral
- 2: positive

C-score:

- 1: R,P entailment
- 0: R,P neutral
- -1: R,P contradiction

In [4]:
from transformers import pipeline, BertForSequenceClassification, BertTokenizer

# Load the model and tokenizer (assuming the model is already fine-tuned on DNLI)
model_dir = "Fine-tuning/output/bert_dnli"
model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)

# Initialize the NLI pipeline
bert_on_dnli = pipeline('text-classification', model=model, tokenizer=tokenizer, device=0)

def calculate_c_score(gen_response, persona):
    """
    Calculate the C score based on the entailment results between a generated response (R)
    and a given persona (P).

    Returns:
    int: C-score with possible values:
         1 for entailment (positive),
         0 for neutral,
         -1 for contradiction (negative).
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'negative',
        'LABEL_1': 'neutral',
        'LABEL_2': 'positive'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_dnli(f"{persona} {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Determine C score based on entailment results
    if label_pr == 'positive':
        return 1
    elif label_pr == 'neutral':
        return 0
    elif label_pr == 'negative':
        return -1
    else:
        raise ValueError(f"Unexpected label encountered: {label_pr}")

In [5]:
# Generated response
gen_response_text = "On weekends, I often go hiking, and Python coding is something I really enjoy."

persona_text = "I am a software engineer. I love coding in Python. I also enjoy hiking during weekends."


In [6]:
calculate_c_score(gen_response_text, persona_text)

1