## Statistical Significance Test

In [1]:
# pip install gensim requests

In [3]:
import pandas as pd
import warnings
import logging

warnings.filterwarnings("ignore")

logging.getLogger('transformers').setLevel(logging.ERROR)

# Set the logging level to ERROR to ignore warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

In [None]:
Dataset = "FoCus"                               
LLM_name = "Mistral-7B-Instruct"           # Llama3-1-8B-Instruct,  gpt-3.5-turbo, gpt-4o-mini, Gemma-7B-Instruct

In [5]:
df = pd.read_csv(f'./Prompts/{Dataset}.csv')
print("Shape:", df.shape)

df.head()

Shape: (1000, 3)


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see..."


In [6]:
### Only For: FoCus, IT-ConvAI2
df['act_response'] = df['act_response'].apply(lambda x: x.split(':', 1)[1].strip() if ':' in x else x.strip())


print(df.isnull().sum())
df.head(6)

personas        0
context         0
act_response    0
dtype: int64


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,The history of the house you are interested in...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",This house was use as a stop for slaves trying...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","Sure, you will like to know that this place wa..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,Technische Universität Darmstadt in the top 25...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"I suggest a place, for your wish of see librar..."
5,I would like to go to University.I live in Mic...,User1: I think Ive been there before but I don...,They offer 132 bachelors degree programs and 1...


In [None]:
response = pd.read_csv(f'Responses/{Dataset}/{LLM_name}{"-COT"}.csv')
print("Shape:", response.shape)

print("\nMissing Values:")
print(response.isnull().sum())

response

Shape: (1000, 2)

Missing Values:
gen_response     102
response_time      0
dtype: int64


Unnamed: 0,gen_response,response_time
0,Nazareth House is a historic benevolent instit...,6.746092
1,The Truman Galusha House is a fascinating hist...,8.609739
2,The Marion Palace Theatre is a stunning exampl...,6.194755
3,Technische Universität Darmstadt is a great ch...,7.916744
4,"The Boston Stump, as it's commonly known, is a...",7.536268
...,...,...
995,"You're working in a fascinating place, a museu...",8.595941
996,Mahasthangarh is an ancient city located in th...,8.572887
997,Armagh County Museum is a museum located in Ar...,8.644711
998,The Nyanga National Park is a perfect destinat...,7.161740


In [8]:
# Calculate maximum number of words in each column
max_response_length = response['gen_response'].dropna().apply(lambda x: len(x.split())).max()

print(f"Maximum Response Length (in words): {max_response_length}")

Maximum Response Length (in words): 131


In [9]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Initialize stop words
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text, remove_stop_words=True):
    if pd.isnull(text):
        return None
    text = text.lower()  # Lowercasing
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removing punctuation
    tokens = word_tokenize(text)  # Tokenization
    if remove_stop_words:
        tokens = [word for word in tokens if word not in stop_words]  # Removing stop words
    return ' '.join(tokens)  # Join tokens back into a single string

# Create eval_df
eval_df = pd.DataFrame({
    'personas': df['personas'],
    'act_response': df['act_response'],
    'context': df['context'],
    'gen_response': response['gen_response'],
    'response_time': response['response_time']
})

print(eval_df.isnull().sum())
eval_df.head()

personas           0
act_response       0
context            0
gen_response     102
response_time      0
dtype: int64


Unnamed: 0,personas,act_response,context,gen_response,response_time
0,I would like to visit the Nazareth House again...,The history of the house you are interested in...,User1: I think Ive been there before but I don...,Nazareth House is a historic benevolent instit...,6.746092
1,I have been to Vermont a few times to go skiin...,This house was use as a stop for slaves trying...,"User1: Wow, this is amazing! What is this?\nUs...",The Truman Galusha House is a fascinating hist...,8.609739
2,I am fascinated by the Spanish Colonial Reviva...,"Sure, you will like to know that this place wa...","User1: Wow, this is amazing! What is this?\nUs...",The Marion Palace Theatre is a stunning exampl...,6.194755
3,I want to become a college student.I want to s...,Technische Universität Darmstadt in the top 25...,User1: Where is this place?\nUser2: Hello! Wel...,Technische Universität Darmstadt is a great ch...,7.916744
4,I like to visit england.I love church.I would ...,"I suggest a place, for your wish of see librar...",User1: Where is this place?\nUser2: This place...,"The Boston Stump, as it's commonly known, is a...",7.536268


In [None]:
import torch
device = 0 if torch.cuda.is_available() else -1  # device set to 0 for GPU, -1 for CPU

In [None]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from nltk.corpus import stopwords
from transformers import pipeline, BertForSequenceClassification, BertTokenizer

import sys
sys.path.append('/home/salehafzoon/Desktop/Perso-LLM-Benchmarking/UniEval')

from UniEval.metric.evaluator import get_evaluator
from UniEval.utils import convert_to_json


ue_scores = []
c_scores = []
persona_distance_scores = []


bert_snli_dir = "Fine-tuning/output/bert_snli"
bert_snli_model = BertForSequenceClassification.from_pretrained(bert_snli_dir)
bert_snli_tokenizer = BertTokenizer.from_pretrained(bert_snli_dir)

# Initialize the NLI pipeline for UE Score
bert_on_snli = pipeline('text-classification', model = bert_snli_model, tokenizer = bert_snli_tokenizer, device=0)

bert_dnli_dir = "Fine-tuning/output/bert_dnli"
bert_dnli_model = BertForSequenceClassification.from_pretrained(bert_dnli_dir)
bert_dnli_tokenizer = BertTokenizer.from_pretrained(bert_dnli_dir)

# Initialize the NLI pipeline
bert_on_dnli = pipeline('text-classification', model = bert_dnli_model, tokenizer = bert_dnli_tokenizer, device=0)


# Initialize the Word2Vec Model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)
stop_words = set(stopwords.words('english'))


# Initialize smoothing function
smoothing_function = SmoothingFunction().method1


def compute_distinct_ngrams(text, n):
    tokens = str(text).replace('\n', ' ').split()
    ngrams = list(zip(*[tokens[i:] for i in range(n)]))
    distinct_ngrams = len(set(ngrams))
    total_ngrams = len(ngrams)
    return distinct_ngrams / total_ngrams if total_ngrams > 0 else 0


# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

def calculate_c_score(gen_response, persona):
    """
    Calculate the C score based on the entailment results between a generated response (R)
    and a given persona (P).

    Returns:
    int: C-score with possible values:
         1 for entailment (positive),
         0 for neutral,
         -1 for contradiction (negative).
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'negative',
        'LABEL_1': 'neutral',
        'LABEL_2': 'positive'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_dnli(f"{persona} {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Determine C score based on entailment results
    if label_pr == 'positive':
        return 1
    elif label_pr == 'neutral':
        return 0
    elif label_pr == 'negative':
        return -1
    else:
        raise ValueError(f"Unexpected label encountered: {label_pr}")



def calculate_ue_score(act_response, gen_response, persona):
    """
    Calculate the UE score based on entailment between persona, actual response, and generated response.

    Returns:
    int: UE score with possible values 2, 1, or 0.
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'entailment',
        'LABEL_1': 'neutral',
        'LABEL_2': 'contradiction'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_snli(f"{persona} [SEP] {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Check entailment between actual response (Q) and generated response (R)
    result_qr = bert_on_snli(f"{act_response} [SEP] {gen_response}")
    label_qr = label_mapping.get(result_qr[0]['label'], 'unknown')

    # Determine UE score based on entailment results
    if label_pr == 'entailment' and label_qr == 'entailment':
        return 2
    elif label_pr == 'entailment':
        return 1
    else:
        return 0



def compute_persona_distance(persona, response, model, stop_words):
    # Tokenize and filter stopwords
    persona_tokens = [word for word in persona.lower().split() if word not in stop_words]
    response_tokens = [word for word in response.lower().split() if word not in stop_words]
    
    # Get word vectors
    persona_vecs = [model[word] for word in persona_tokens if word in model]
    response_vecs = [model[word] for word in response_tokens if word in model]
    
    # If no vectors found, return zero similarity
    if not persona_vecs or not response_vecs:
        return 0.0
    
    # Compute average vectors
    persona_avg_vec = np.mean(persona_vecs, axis=0)
    response_avg_vec = np.mean(response_vecs, axis=0)
    
    # Compute cosine similarity
    return cosine_similarity([persona_avg_vec], [response_avg_vec])[0][0]


def calculate_unieval_scores(personas, contexts, gen_responses):
    """
    Calculates UniEval scores for a batch of inputs.

    Args:
        personas (list): List of persona information as additional context.
        contexts (list): List of conversation histories leading to the responses.
        gen_responses (list): List of generated responses to be evaluated.

    Returns:
        list: A list of dictionaries containing UniEval scores for each input.
    """
    # Flatten personas if they are lists
    personas = [' '.join(p) if isinstance(p, list) else p for p in personas]

    # Prepare inputs for UniEval
    data = convert_to_json(output_list=gen_responses, src_list=contexts, context_list=personas)

    # Initialize the evaluator for dialogue tasks
    evaluator = get_evaluator('dialogue')

    # Evaluate and obtain scores for all inputs
    eval_scores = evaluator.evaluate(data, print_result=False)

    return eval_scores


Device set to use cuda:0
Device set to use cuda:0


In [None]:
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

worst_c_score = -1.0
worst_ue_score = 0.0
worst_persona_distance_score = 0.0

worst_unieval_score = {
    'naturalness': 0.0,
    'coherence': 0.0,
    'engagingness': 0.0,
    'groundedness': 0.0,
    'understandability': 0.0,
    'overall': 0.0
}

In [None]:
# Initialize a counter for invalid gen_response
invalid_gen_res_count = 0

# Iterate over each row
for index, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
    personas = row['personas']
    act_response = row['act_response']
    gen_response = row['gen_response']

    # Check for NaN or None in gen_response
    if pd.isna(gen_response):
        invalid_gen_res_count += 1
        
        c_scores.append(worst_c_score)
        persona_distance_scores.append(worst_persona_distance_score)
        ue_scores.append(worst_ue_score)

        continue
    
    c_scores.append(calculate_c_score(personas, gen_response))
    
    ue_scores.append(calculate_ue_score(act_response, gen_response, personas))

    persona_distance = compute_persona_distance(personas, gen_response, word2vec_model, stop_words)
    persona_distance_scores.append(persona_distance)


# Compile metrics into DataFrame
metrics_df_1  = pd.DataFrame({
    'C Score': c_scores,
    'UE Score': ue_scores,
    'Persona Distance': persona_distance_scores
})

metrics_df_1 

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:09<00:00, 100.94it/s]


Unnamed: 0,C Score,UE Score,Persona Distance
0,1.0,2.0,0.604714
1,1.0,0.0,0.734168
2,1.0,0.0,0.737407
3,1.0,0.0,0.761614
4,1.0,0.0,0.504854
...,...,...,...
995,0.0,0.0,0.627368
996,0.0,0.0,0.425594
997,1.0,0.0,0.361952
998,1.0,0.0,0.539867


In [15]:
from tqdm import tqdm
import pandas as pd

# Function to evaluate in batches or the entire DataFrame
batch_size = 200  # Adjust batch size as needed

# List to store all UniEval scores
all_unieval_scores = []

# Split into batches if necessary
for i in tqdm(range(0, len(eval_df), batch_size), desc="Evaluating batches"):
    batch = eval_df.iloc[i:i+batch_size]

    # Extract relevant fields from the batch
    personas = batch['personas'].tolist()
    contexts = batch['context'].tolist()
    gen_responses = batch['gen_response'].tolist()

    # Check for NaN responses and handle them
    valid_indices = [j for j, response in enumerate(gen_responses) if pd.notna(response) and response.strip() != '']
    invalid_indices = [j for j, response in enumerate(gen_responses) if j not in valid_indices]

    # Prepare valid inputs
    valid_personas = [personas[j] for j in valid_indices]
    valid_contexts = [contexts[j] for j in valid_indices]
    valid_gen_responses = [gen_responses[j] for j in valid_indices]

    # Evaluate valid inputs
    if valid_personas:
        eval_scores = calculate_unieval_scores(valid_personas, valid_contexts, valid_gen_responses)
        all_unieval_scores.extend(eval_scores)

    # Append worst scores for invalid inputs
    all_unieval_scores.extend([worst_unieval_score] * len(invalid_indices))

# Convert all scores into a DataFrame
metrics_df_2  = pd.DataFrame(all_unieval_scores)

# Rename columns for clarity
metrics_df_2 .columns = [
    "UniEval Naturalness",
    "UniEval Coherence",
    "UniEval Engagingness",
    "UniEval Groundedness",
    "UniEval Understandability",
    "UniEval Overall"
]

metrics_df_2 

Evaluating batches: 100%|██████████| 5/5 [15:39<00:00, 187.93s/it]


Unnamed: 0,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,0.930204,0.998417,3.990297,0.998103,0.936348,1.570674
1,0.918790,0.997200,2.969362,0.909232,0.927470,1.344411
2,0.947152,0.998238,3.981049,0.990371,0.943944,1.572151
3,0.821111,0.998489,4.949582,0.998149,0.858559,1.725178
4,0.863987,0.998076,3.965243,0.997227,0.870674,1.539041
...,...,...,...,...,...,...
995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [16]:
# Combine both metric sets
metrics_df = pd.concat([metrics_df_1.reset_index(drop=True), metrics_df_2.reset_index(drop=True)], axis=1)


In [17]:
# Add the 'response_time' column to 'metrics_df'
metrics_df['response_time'] = eval_df['response_time']
metrics_df

Unnamed: 0,C Score,UE Score,Persona Distance,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall,response_time
0,1.0,2.0,0.604714,0.930204,0.998417,3.990297,0.998103,0.936348,1.570674,6.746092
1,1.0,0.0,0.734168,0.918790,0.997200,2.969362,0.909232,0.927470,1.344411,8.609739
2,1.0,0.0,0.737407,0.947152,0.998238,3.981049,0.990371,0.943944,1.572151,6.194755
3,1.0,0.0,0.761614,0.821111,0.998489,4.949582,0.998149,0.858559,1.725178,7.916744
4,1.0,0.0,0.504854,0.863987,0.998076,3.965243,0.997227,0.870674,1.539041,7.536268
...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.627368,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.595941
996,0.0,0.0,0.425594,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.572887
997,1.0,0.0,0.361952,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.644711
998,1.0,0.0,0.539867,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7.161740


In [None]:
metrics_df.to_csv(f'Metric_values/{Dataset}_{LLM_name}-COT_metrics.csv', index=False)

# Test

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel, wilcoxon

# --------------------------
# CONFIGURE THESE
# --------------------------
DATASET = "FoCus"

# Llama3-1-8B-Instruct,  gpt-3.5-turbo, gpt-4o-mini

# LLMs
models = {
    'Llama3-1-8B': f'Metric_values/{DATASET}_Llama3-1-8B-Instruct-COT_metrics.csv',
    'GPT-4O-Mini': f'Metric_values/{DATASET}_gpt-4o-mini-COT_metrics.csv',
}

# List of metrics to test
metrics_to_compare = [
    'C Score',
    'BERTScore_Prec',
    'BERTScore_Rec',
    'BERTScore_F1',
    'Dist1',
    'Dist2',
    'Persona Distance',
    'UniEval Naturalness',
    'UniEval Coherence',
    'UniEval Overall'
]

# --------------------------
# TEST FUNCTION
# --------------------------
def run_statistical_tests(df1, df2, model1, model2):
    print(f"\n🔍 COMPARISON: {model1} vs {model2}")
    for metric in metrics_to_compare:
        try:
            scores1 = df1[metric].values
            scores2 = df2[metric].values

            # Paired t-test
            t_stat, p_t = ttest_rel(scores1, scores2)

            # Wilcoxon test
            w_stat, p_w = wilcoxon(scores1, scores2)

            print(f"\n📌 Metric: {metric}")
            print(f"  Paired t-test:     t = {t_stat:.4f}, p = {p_t:.5f}")
            print(f"  Wilcoxon test:     W = {w_stat:.4f}, p = {p_w:.5f}")
        except Exception as e:
            print(f"  ❌ Could not compute {metric}: {e}")

# --------------------------
# LOAD CSVs & COMPARE
# --------------------------

# COT comparison
df_c1 = pd.read_csv(models['Llama3-1-8B'])
df_c2 = pd.read_csv(models['GPT-4O-Mini'])
run_statistical_tests(df_c1, df_c2, 'Llama3-1-8B', 'GPT-4O-Mini')


🔍 COMPARISON: Llama3-1-8B vs GPT-4O-Mini

📌 Metric: C Score
  Paired t-test:     t = 9.6888, p = 0.00000
  Wilcoxon test:     W = 30110.0000, p = 0.00000

📌 Metric: BERTScore_Prec
  Paired t-test:     t = -8.9430, p = 0.00000
  Wilcoxon test:     W = 55052.0000, p = 0.00000

📌 Metric: BERTScore_Rec
  Paired t-test:     t = -10.4733, p = 0.00000
  Wilcoxon test:     W = 35670.0000, p = 0.00000

📌 Metric: BERTScore_F1
  Paired t-test:     t = -9.6948, p = 0.00000
  Wilcoxon test:     W = 39380.0000, p = 0.00000

📌 Metric: Dist1
  Paired t-test:     t = -15.0159, p = 0.00000
  Wilcoxon test:     W = 78414.5000, p = 0.00000

📌 Metric: Dist2
  Paired t-test:     t = -8.7999, p = 0.00000
  Wilcoxon test:     W = 47954.0000, p = 0.00000

📌 Metric: Persona Distance
  Paired t-test:     t = 4.3339, p = 0.00002
  Wilcoxon test:     W = 150969.0000, p = 0.00000

📌 Metric: UniEval Naturalness
  Paired t-test:     t = -12.1395, p = 0.00000
  Wilcoxon test:     W = 96850.0000, p = 0.00000

📌 Metric