## Performance Evaluation

In [None]:
# pip install bert_score openpyxl gensim requests nltk

In [1]:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [53]:
import pandas as pd
import warnings
import logging

warnings.filterwarnings("ignore")
logging.getLogger('transformers').setLevel(logging.ERROR)

# Set the logging level to ERROR to ignore warnings
logging.getLogger("transformers").setLevel(logging.ERROR)


In [54]:
Dataset = "FoCus"                                   # Synthetic-PersonaChat, Blended Skill Talk, PEC, ConvAI2, FoCus, IT-ConvAI2
LLM_name = "Llama3-1-8B-Instruct"                         # Mistral-7B-Instruct, Llama3-1-8B-Instruct, Qwen2-7B-Instruct,  Gemma-7B-Instruct, gpt-3.5-turbo, gpt-4-turbo, gpt-4o-mini, gemini-1.5-pro
COT_SETUP = True

In [55]:
df = pd.read_csv(f'./Prompts/{Dataset}.csv')
print("Shape:", df.shape)

df.head()

Shape: (1000, 3)


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see..."


In [56]:
Dataset

'FoCus'

In [57]:
### Only For: FoCus, IT-ConvAI2
if Dataset == "FoCus" or Dataset == "IT-ConvAI2":
    df['act_response'] = df['act_response'].apply(lambda x: x.split(':', 1)[1].strip() if ':' in x else x.strip())

# ### Only For: Blended Skill Talk
if Dataset == "Blended Skill Talk":
    df['personas'] = df['personas'].str.replace(r'\[User 1 persona\]:|\[|\]|"|\'', '', regex=True).str.strip()

# ### Only For: PEC
if Dataset == "PEC":
    df['personas'] = df['personas'].str.replace(r'\[Responder persona\]:|\[|\]|"|\'', '', regex=True).str.strip()


print(df.isnull().sum())
df.head(6)

personas        0
context         0
act_response    0
dtype: int64


Unnamed: 0,personas,context,act_response
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,The history of the house you are interested in...
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",This house was use as a stop for slaves trying...
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","Sure, you will like to know that this place wa..."
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,Technische Universität Darmstadt in the top 25...
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"I suggest a place, for your wish of see librar..."
5,I would like to go to University.I live in Mic...,User1: I think Ive been there before but I don...,They offer 132 bachelors degree programs and 1...


In [58]:
COT_ = "-COT" if COT_SETUP else ""
 
response = pd.read_csv(f'Responses/{Dataset}/{LLM_name}{COT_}.csv')
print("Shape:", response.shape)

print("\nMissing Values:")
print(response.isnull().sum())

response

Shape: (1000, 2)

Missing Values:
gen_response     50
response_time     0
dtype: int64


Unnamed: 0,gen_response,response_time
0,"Nazareth House is a truly remarkable place, an...",8.849191
1,I'm glad you're interested in historic houses!...,8.896329
2,The Marion Palace Theatre's architecture is a ...,8.867732
3,Technische Universität Darmstadt is a great ch...,9.021704
4,,8.908265
...,...,...
995,I think you're referring to the museum come ph...,8.841254
996,It's great to know that Mahasthangarh has such...,8.828058
997,Armagh County Museum is a must-visit destinati...,8.916159
998,The Nyanga National Park is a great destinatio...,8.955311


In [59]:
# Calculate maximum number of words in each column
max_response_length = response['gen_response'].dropna().apply(lambda x: len(x.split())).max()

print(f"Maximum Response Length (in words): {max_response_length}")

Maximum Response Length (in words): 116


In [60]:
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Initialize stop words
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text, remove_stop_words=True):
    if pd.isnull(text):
        return None
    text = text.lower()  # Lowercasing
    text = text.translate(str.maketrans('', '', string.punctuation))  # Removing punctuation
    tokens = word_tokenize(text)  # Tokenization
    if remove_stop_words:
        tokens = [word for word in tokens if word not in stop_words]  # Removing stop words
    return ' '.join(tokens)  # Join tokens back into a single string

# Create eval_df
eval_df = pd.DataFrame({
    'personas': df['personas'],
    'act_response': df['act_response'],
    'gen_response': response['gen_response'],
    'response_time': response['response_time']
})

print(eval_df.isnull().sum())
eval_df.head()

personas          0
act_response      0
gen_response     50
response_time     0
dtype: int64


Unnamed: 0,personas,act_response,gen_response,response_time
0,I would like to visit the Nazareth House again...,The history of the house you are interested in...,"Nazareth House is a truly remarkable place, an...",8.849191
1,I have been to Vermont a few times to go skiin...,This house was use as a stop for slaves trying...,I'm glad you're interested in historic houses!...,8.896329
2,I am fascinated by the Spanish Colonial Reviva...,"Sure, you will like to know that this place wa...",The Marion Palace Theatre's architecture is a ...,8.867732
3,I want to become a college student.I want to s...,Technische Universität Darmstadt in the top 25...,Technische Universität Darmstadt is a great ch...,9.021704
4,I like to visit england.I love church.I would ...,"I suggest a place, for your wish of see librar...",,8.908265


In [10]:
import torch
device = 0 if torch.cuda.is_available() else -1  # device set to 0 for GPU, -1 for CPU
# device = -1

In [11]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
import bert_score
from tqdm import tqdm
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim
import torch
from nltk.corpus import stopwords
from transformers import pipeline, BertForSequenceClassification, BertTokenizer


# Initialize ROUGE scorer
rouge = Rouge()

# Lists to store the metrics
bleu_scores = []
rouge_scores = []
meteor_scores = []
bertscore_prec = []
bertscore_rec = []
bertscore_f1 = []
distinct_1 = []
distinct_2 = []
ue_scores = []
c_scores = []
consistency_scores = []
idf_scores = []
persona_distance_scores = []


bert_snli_dir = "Fine-tuning/output/bert_snli"
bert_snli_model = BertForSequenceClassification.from_pretrained(bert_snli_dir)
bert_snli_tokenizer = BertTokenizer.from_pretrained(bert_snli_dir)

# Initialize the NLI pipeline for UE Score
bert_on_snli = pipeline('text-classification', model = bert_snli_model, tokenizer = bert_snli_tokenizer, device=0)

bert_dnli_dir = "Fine-tuning/output/bert_dnli"
bert_dnli_model = BertForSequenceClassification.from_pretrained(bert_dnli_dir)
bert_dnli_tokenizer = BertTokenizer.from_pretrained(bert_dnli_dir)

# Initialize the NLI pipeline
bert_on_dnli = pipeline('text-classification', model = bert_dnli_model, tokenizer = bert_dnli_tokenizer, device=0)


# Initialize the Word2Vec Model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format("./GoogleNews-vectors-negative300.bin", binary=True)
stop_words = set(stopwords.words('english'))


# Initialize smoothing function
smoothing_function = SmoothingFunction().method1

# Helper functions
def compute_bleu(reference, hypothesis):
    reference = [str(reference).replace('\n', ' ').split()]
    hypothesis = str(hypothesis).replace('\n', ' ').split()
    return sentence_bleu(reference, hypothesis, smoothing_function=smoothing_function)

def compute_rouge(reference, hypothesis):
    scores = rouge.get_scores(str(hypothesis).replace('\n', ' '), str(reference).replace('\n', ' '), avg=True)
    return scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']

def compute_meteor(reference, hypothesis):
    reference = [str(reference).replace('\n', ' ').split()]
    hypothesis = str(hypothesis).replace('\n', ' ').split()
    return meteor_score(reference, hypothesis)

def compute_distinct_ngrams(text, n):
    tokens = str(text).replace('\n', ' ').split()
    ngrams = list(zip(*[tokens[i:] for i in range(n)]))
    distinct_ngrams = len(set(ngrams))
    total_ngrams = len(ngrams)
    return distinct_ngrams / total_ngrams if total_ngrams > 0 else 0


# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

def calculate_c_score(gen_response, persona):
    """
    Calculate the C score based on the entailment results between a generated response (R)
    and a given persona (P).

    Returns:
    int: C-score with possible values:
         1 for entailment (positive),
         0 for neutral,
         -1 for contradiction (negative).
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'negative',
        'LABEL_1': 'neutral',
        'LABEL_2': 'positive'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_dnli(f"{persona} {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Determine C score based on entailment results
    if label_pr == 'positive':
        return 1
    elif label_pr == 'neutral':
        return 0
    elif label_pr == 'negative':
        return -1
    else:
        raise ValueError(f"Unexpected label encountered: {label_pr}")


def calculate_consistency_score(gen_response, persona):
    """
    Calculate the Consistency Score based on the binary entailment results 
    between a generated response (R) and a given persona (P).

    Returns:
    int: Consistency Score with binary values:
         1 for entailment or neutral,
         0 for contradiction.
    """

    # Define the label mapping for binary classification
    label_mapping = {
        'LABEL_0': 'negative',
        'LABEL_1': 'neutral',
        'LABEL_2': 'positive'
    }

    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_dnli(f"{persona} {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Determine Consistency Score based on binary entailment results
    if label_pr in ['positive', 'neutral']:
        return 1
    elif label_pr == 'negative':
        return 0
    else:
        raise ValueError(f"Unexpected label encountered: {label_pr}")


def calculate_ue_score(act_response, gen_response, persona):
    """
    Calculate the UE score based on entailment between persona, actual response, and generated response.

    Returns:
    int: UE score with possible values 2, 1, or 0.
    """

    # Define the label mapping to interpret the NLI model's output
    label_mapping = {
        'LABEL_0': 'entailment',
        'LABEL_1': 'neutral',
        'LABEL_2': 'contradiction'
    }
    
    # Check entailment between persona (P) and generated response (R)
    result_pr = bert_on_snli(f"{persona} [SEP] {gen_response}")
    label_pr = label_mapping.get(result_pr[0]['label'], 'unknown')

    # Check entailment between actual response (Q) and generated response (R)
    result_qr = bert_on_snli(f"{act_response} [SEP] {gen_response}")
    label_qr = label_mapping.get(result_qr[0]['label'], 'unknown')

    # Determine UE score based on entailment results
    if label_pr == 'entailment' and label_qr == 'entailment':
        return 2
    elif label_pr == 'entailment':
        return 1
    else:
        return 0



def calculate_idf_weighted_overlap(persona, response):
    # Fit TF-IDF on both texts and calculate cosine similarity

    processed_persona = preprocess_text(persona)
    processed_response = preprocess_text(response)
    persona_new = str(processed_persona) if not isinstance(processed_persona, str) else processed_persona
    response_new = str(processed_response) if not isinstance(processed_response, str) else processed_response
    texts = [persona_new, response_new]

    # texts = [persona, response]
    
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]


def compute_persona_distance(persona, response, model, stop_words):
    # Tokenize and filter stopwords
    persona_tokens = [word for word in persona.lower().split() if word not in stop_words]
    response_tokens = [word for word in response.lower().split() if word not in stop_words]
    
    # Get word vectors
    persona_vecs = [model[word] for word in persona_tokens if word in model]
    response_vecs = [model[word] for word in response_tokens if word in model]
    
    # If no vectors found, return zero similarity
    if not persona_vecs or not response_vecs:
        return 0.0
    
    # Compute average vectors
    persona_avg_vec = np.mean(persona_vecs, axis=0)
    response_avg_vec = np.mean(response_vecs, axis=0)
    
    # Compute cosine similarity
    return cosine_similarity([persona_avg_vec], [response_avg_vec])[0][0]



Device set to use cuda:0
Device set to use cuda:0


In [12]:
import logging

# Set the logging level to ERROR to suppress warnings about training
logging.getLogger("transformers").setLevel(logging.ERROR)

# Default worst-case values
worst_bleu = 0.0
worst_rouge = (0.0, 0.0, 0.0)
worst_meteor = 0.0
worst_bertscore = (0.0, 0.0, 0.0)
worst_distinct = 0.0
worst_c_score = -1.0
worst_consistency_score = 0.0
worst_idf_score = 0.0
worst_ue_score = 0.0
worst_persona_distance_score = 0.0

In [13]:
# Initialize a counter for invalid gen_response
invalid_gen_res_count = 0

# Iterate over each row
for index, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
    personas = row['personas']
    act_response = row['act_response']
    gen_response = row['gen_response']

    # Check for NaN or None in gen_response
    if pd.isna(gen_response):
        invalid_gen_res_count += 1
        
        # bleu_scores.append(worst_bleu)
        # rouge_scores.append(worst_rouge)
        # meteor_scores.append(worst_meteor)
        bertscore_prec.append(worst_bertscore[0])
        bertscore_rec.append(worst_bertscore[1])
        bertscore_f1.append(worst_bertscore[2])
        distinct_1.append(worst_distinct)
        distinct_2.append(worst_distinct)
        c_scores.append(worst_c_score)
        # consistency_scores.append(worst_consistency_score)
        # idf_scores.append(worst_idf_score)
        persona_distance_scores.append(worst_persona_distance_score)
        ue_scores.append(worst_ue_score)

        continue

    # bleu = compute_bleu(act_response, gen_response)
    # bleu_scores.append(bleu)
    
    # rouge_1, rouge_2, rouge_l = compute_rouge(act_response, gen_response)
    # rouge_scores.append((rouge_1, rouge_2, rouge_l))
    
    # meteor = compute_meteor(act_response, gen_response)
    # meteor_scores.append(meteor)
    
    P, R, F1 = bert_score.score([gen_response], [act_response], lang="en", verbose=False)
    bertscore_prec.append(P.mean().item())
    bertscore_rec.append(R.mean().item())
    bertscore_f1.append(F1.mean().item())
    
    distinct_1.append(compute_distinct_ngrams(gen_response, 1))
    
    distinct_2.append(compute_distinct_ngrams(gen_response, 2))
    
    c_scores.append(calculate_c_score(personas, gen_response))
    
    # consistency_scores.append(calculate_consistency_score(personas, gen_response))
    
    ue_scores.append(calculate_ue_score(act_response, gen_response, personas))

    # idf_scores.append(calculate_idf_weighted_overlap(personas, gen_response))
    
    persona_distance = compute_persona_distance(personas, gen_response, word2vec_model, stop_words)
    persona_distance_scores.append(persona_distance)


# Compile metrics into DataFrame
metrics_df = pd.DataFrame({
    # 'BLEU': bleu_scores,
    # 'R1': [score[0] for score in rouge_scores],
    # 'R2': [score[1] for score in rouge_scores],
    # 'RL': [score[2] for score in rouge_scores],
    # 'METEOR': meteor_scores,
    'BERTScore_Prec': bertscore_prec,
    'BERTScore_Rec': bertscore_rec,
    'BERTScore_F1': bertscore_f1,
    'Dist1': distinct_1,
    'Dist2': distinct_2,
    'C Score': c_scores,
    # 'P Consistency Score': consistency_scores,
    # 'IDF Overlap': idf_scores,
    'UE Score': ue_scores,
    'Persona Distance': persona_distance_scores
})

metrics_df

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [18:03<00:00,  1.08s/it]


Unnamed: 0,BERTScore_Prec,BERTScore_Rec,BERTScore_F1,Dist1,Dist2,C Score,UE Score,Persona Distance
0,0.846562,0.816107,0.831056,0.702128,0.956989,1.0,2.0,0.562215
1,0.837872,0.889504,0.862917,0.881356,1.000000,1.0,0.0,0.743971
2,0.841942,0.870469,0.855968,0.829268,1.000000,1.0,0.0,0.760262
3,0.847506,0.925325,0.884708,0.864198,1.000000,1.0,0.0,0.601566
4,0.836625,0.870675,0.853311,0.830986,1.000000,1.0,0.0,0.662243
...,...,...,...,...,...,...,...,...
995,0.826543,0.835938,0.831214,0.918033,1.000000,0.0,0.0,0.619469
996,0.829818,0.911610,0.868793,0.921875,1.000000,0.0,0.0,0.644514
997,0.831442,0.945189,0.884674,0.859375,1.000000,1.0,1.0,0.671408
998,0.855791,0.855816,0.855804,0.896552,1.000000,0.0,0.0,0.536340


In [14]:
# Add the 'response_time' column to 'metrics_df'
metrics_df['response_time'] = eval_df['response_time']

In [15]:
# Calculate the mean (average) and standard deviation, rounded to 2 decimal places
avg_values = metrics_df.mean().round(2)
std_values = metrics_df.std(ddof=0).round(2)  # Use ddof=0 for population standard deviation

# Combine the average and standard deviation into the format "avg ± std"
combined_values = avg_values.astype(str) + " ± " + std_values.astype(str)

# Insert the LLM name at the beginning of the combined values
combined_values = combined_values.tolist()
combined_values.insert(0, LLM_name)

# Create a DataFrame for the combined average ± std row
result_df = pd.DataFrame([combined_values], columns=['Model'] + metrics_df.columns.tolist())

# Add the ratio of invalid gen_response
invalid_gen_res_ratio = invalid_gen_res_count / len(eval_df)
result_df['Failure Ratio'] = f"{round(invalid_gen_res_ratio, 3)} ± 0.00"  # No std for Failure Ratio

result_df

Unnamed: 0,Model,BERTScore_Prec,BERTScore_Rec,BERTScore_F1,Dist1,Dist2,C Score,UE Score,Persona Distance,response_time,Failure Ratio
0,gemini-1.5-pro,0.7 ± 0.32,0.73 ± 0.34,0.71 ± 0.33,0.71 ± 0.33,0.82 ± 0.38,0.15 ± 0.79,0.32 ± 0.67,0.48 ± 0.24,4.75 ± 3.43,0.173 ± 0.00


In [None]:
# Load the existing Excel file and update or append the average row
output_path = f'./Evaluations/{Dataset}{COT_}-results.xlsx'

try:
    # Load existing data
    existing_df = pd.read_excel(output_path)
    # Check if the model name already exists
    if LLM_name in existing_df['Model'].values:
        # Update the row with the same model name
        existing_df.loc[existing_df['Model'] == LLM_name, :] = result_df.values
    else:
        # Append the new data
        existing_df = pd.concat([existing_df, result_df], ignore_index=True)
except FileNotFoundError:
    # If the file does not exist, create a new DataFrame
    existing_df = result_df

# Save the updated DataFrame to an Excel file
existing_df.to_excel(output_path, index=False)

existing_df

In [None]:
response = pd.read_excel(f'./Evaluations/{Dataset}{COT_}-results.xlsx')
response

Unnamed: 0,Model,P Consistency Score,C Score,UE Score,BLEU,R1,R2,RL,METEOR,BERTScore_Prec,...,IDF Overlap,Persona Distance,response_time,Failure Ratio,UniEval Naturalness,UniEval Coherence,UniEval Engagingness,UniEval Groundedness,UniEval Understandability,UniEval Overall
0,gpt-4-turbo,0.84 ± 0.37,-0.05 ± 0.53,0.50 ± 0.82,0.01 ± 0.01,0.11 ± 0.07,0.01 ± 0.03,0.1 ± 0.07,0.12 ± 0.08,0.84 ± 0.03,...,0.06 ± 0.07,0.47 ± 0.13,5.15 ± 1.13,0.001 ± 0.00,0.96 ± 0.03,0.98 ± 0.09,2.54 ± 0.79,0.55 ± 0.4,0.95 ± 0.03,1.2 ± 0.19
1,Mistral-7B-Instruct,0.78 ± 0.41,-0.08 ± 0.59,0.44 ± 0.79,0.01 ± 0.01,0.1 ± 0.07,0.01 ± 0.02,0.09 ± 0.07,0.11 ± 0.08,0.8 ± 0.18,...,0.05 ± 0.07,0.43 ± 0.17,6.71 ± 1.85,0.046 ± 0.00,0.9 ± 0.2,0.94 ± 0.22,2.54 ± 1.28,0.39 ± 0.43,0.9 ± 0.2,1.13 ± 0.35
2,Llama3-1-8B-Instruct,0.83 ± 0.37,-0.05 ± 0.53,0.37 ± 0.73\t,0.01 ± 0.01,0.11 ± 0.08,0.01 ± 0.03,0.1 ± 0.07,0.11 ± 0.08,0.85 ± 0.02,...,0.06 ± 0.08,0.46 ± 0.14,8.69 ± 0.15,0.0 ± 0.00,0.95 ± 0.02,0.94 ± 0.19,2.01 ± 0.94,0.49 ± 0.44,0.95 ± 0.01,1.07 ± 0.21
3,gpt-3.5-turbo,0.83 ± 0.38,0.02 ± 0.6,0.57 ± 0.86,0.01 ± 0.01,0.11 ± 0.07,0.01 ± 0.03,0.1 ± 0.06,0.12 ± 0.08,0.82 ± 0.13,...,0.07 ± 0.09,0.48 ± 0.16,1.56 ± 0.47,0.023 ± 0.00,0.92 ± 0.15,0.96 ± 0.16,2.81 ± 0.98,0.54 ± 0.44,0.93 ± 0.14,1.23 ± 0.28
4,gpt-4o-mini,0.84 ± 0.37,-0.05 ± 0.52,0.39 ± 0.76,0.01 ± 0.01,0.12 ± 0.07,0.01 ± 0.03,0.11 ± 0.06,0.14 ± 0.08,0.84 ± 0.02,...,0.06 ± 0.06,0.5 ± 0.13,1.63 ± 0.42,0.0 ± 0.00,0.95 ± 0.02,0.99 ± 0.03,3.23 ± 0.89,0.72 ± 0.35,0.95 ± 0.02,1.37 ± 0.2
5,Qwen2-7B-Instruct,0.84 ± 0.36,0.01 ± 0.57\t,0.52 ± 0.84,0.01 ± 0.01,0.09 ± 0.06,0.01 ± 0.02,0.09 ± 0.06,0.11 ± 0.07,0.83 ± 0.05,...,0.06 ± 0.06,0.49 ± 0.13,4.8 ± 0.88,0.003 ± 0.00,0.93 ± 0.07,0.97 ± 0.13,3.14 ± 1.17,0.72 ± 0.37,0.94 ± 0.06,1.34 ± 0.25
6,Gemma-7B-Instruct,0.77 ± 0.42,-0.03 ± 0.65,0.35 ± 0.71,0.01 ± 0.01,0.12 ± 0.08,0.01 ± 0.03,0.11 ± 0.07,0.12 ± 0.09,0.78 ± 0.23,...,0.08 ± 0.12,0.46 ± 0.19,7.47 ± 1.25,0.078 ± 0.00,0.87 ± 0.26,0.91 ± 0.28,2.91 ± 1.36,0.47 ± 0.45,0.87 ± 0.26,1.2 ± 0.42
7,gemini-1.5-pro,,-0.1 ± 0.55,0.44 ± 0.79,,,,,,0.82 ± 0.1,...,,0.48 ± 0.14,5.21 ± 5.4,0.015 ± 0.00,,,,,,


# Example Review

In [1]:
import pandas as pd

In [2]:
prompt = pd.read_csv(f'./Prompts/FoCus.csv')
print("Shape:", prompt.shape)


Shape: (1000, 3)


In [3]:
qwen2_res = pd.read_csv(f'Responses/FoCus/Qwen2-7B-Instruct.csv')
print("Qwen2-7B Average Response Length (in words):", round(qwen2_res['gen_response'].str.split().str.len().mean(),2))
qwen2_df = pd.concat([prompt, qwen2_res], axis=1)
qwen2_df.head()

Qwen2-7B Average Response Length (in words): 56.68


Unnamed: 0,personas,context,act_response,gen_response,response_time
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...,Nazareth House is a historic place located on ...,3.733489
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...,,4.19715
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p...",The Marion Palace Theatre is a stunning exampl...,2.774832
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...,,4.265017
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see...",,4.194262


In [4]:
gemini_res = pd.read_csv(f'Responses/FoCus/gemini-1.5-pro-COT.csv')
print("Gemini 1.5 Pro Average Response Length (in words):", round(gemini_res['gen_response'].str.split().str.len().mean(),2))
gemini_df = pd.concat([prompt, gemini_res], axis=1)
gemini_df.head()

Gemini 1.5 Pro Average Response Length (in words): 55.35


Unnamed: 0,personas,context,act_response,gen_response,reasoning,response_time
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...,Nazareth House has a rich history rooted in be...,The user persona indicates a strong interest i...,5.792017
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...,It's fascinating to think about what life was ...,"The persona indicates an interest in history, ...",5.167329
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p...",The Marion Palace Theatre is a stunning exampl...,The user persona expresses interest in Histori...,4.482702
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...,Technische Universität Darmstadt consistently ...,The user persona expresses a strong interest i...,5.374144
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see...",Given your interest in libraries and church or...,The user persona expresses interest in England...,4.855018


In [5]:
llama3_res = pd.read_csv(f'Responses/FoCus/Llama3-1-8B-Instruct-COT.csv')
print("Llama 3.1-8B Average Response Length (in words):", round(llama3_res['gen_response'].str.split().str.len().mean(),2))
llama3_df = pd.concat([prompt, llama3_res], axis=1)
llama3_df.head()

Llama 3.1-8B Average Response Length (in words): 66.84


Unnamed: 0,personas,context,act_response,gen_response,response_time
0,I would like to visit the Nazareth House again...,User1: I think Ive been there before but I don...,User2: The history of the house you are intere...,"Nazareth House is a truly remarkable place, an...",8.849191
1,I have been to Vermont a few times to go skiin...,"User1: Wow, this is amazing! What is this?\nUs...",User2: This house was use as a stop for slaves...,I'm glad you're interested in historic houses!...,8.896329
2,I am fascinated by the Spanish Colonial Reviva...,"User1: Wow, this is amazing! What is this?\nUs...","User2: Sure, you will like to know that this p...",The Marion Palace Theatre's architecture is a ...,8.867732
3,I want to become a college student.I want to s...,User1: Where is this place?\nUser2: Hello! Wel...,User2: Technische Universität Darmstadt in the...,Technische Universität Darmstadt is a great ch...,9.021704
4,I like to visit england.I love church.I would ...,User1: Where is this place?\nUser2: This place...,"User2: I suggest a place, for your wish of see...",,8.908265


In [6]:
idx = 858  # or any row index you want

In [7]:
print("Personas:", prompt.iloc[idx]['personas'])
print("\nContext:", prompt.iloc[idx]['context'])
print("\nActual Response:", prompt.iloc[idx]['act_response'])

Personas: My father served in the Navy.I live in California.I like basketball.Ive never been on a ship before.I would like to visit Panama.

Context: User1: I think Ive been there before but I dont remember the name of this place.
User2: This is the USS Midway, which was formerly an aircraft carrier for the United States Navy, the branch of military that your father served in.
User1: Where is the ship right now?
User2: The ship is moored at her final location as a museum in San Diego, California, your home state!
User1: When did it open as a museum?
User2: The USS Midway was opened to the public as a museum on June 7th, 2004.
User1: When was this ship decommisioned?

Actual Response: User2: The USS Midway was decommissioned in a ceremony at Naval Air Station North Island on April 11th, 1992.


In [8]:
print("Qwen2 7B:")
print("\tResponse Length:", len(qwen2_df.iloc[idx]['gen_response']))
print("\tResponse:", qwen2_df.iloc[idx]['gen_response'])

Qwen2 7B:
	Response Length: 154
	Response: Wow, it's amazing that the USS Midway is now a museum in San Diego. I'm glad it opened to the public in 2004. I wonder when exactly it was decommissioned?


In [9]:
print("Gemini 1.5 Pro:")
print("\tResponse Length:", len(gemini_df.iloc[idx]['gen_response']))
print("\tResponse:", gemini_df.iloc[idx]['gen_response'])

Gemini 1.5 Pro:
	Response Length: 261
	Response: The USS Midway was decommissioned on April 11, 1992.  Given your father's Navy background and the fact that you live in California, a visit might be a particularly meaningful experience.  It's a pretty impressive piece of history right there in your home state!


In [10]:
print("Llama3.1 8B:")
print("\tResponse Length:", len(llama3_df.iloc[idx]['gen_response']))
print("\tResponse:", llama3_df.iloc[idx]['gen_response'])

Llama3.1 8B:
	Response Length: 345
	Response: I think it would be really cool to visit the USS Midway in San Diego, California. It's a great way to learn about naval history and the role of the US Navy. Plus, it's located in my home state, so it's definitely on my bucket list. Who knows, maybe one day I'll even get to visit Panama and see the Panama Canal, which is a major shipping route.
