In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np


In [2]:
# Load the results CSV and keep only the first 300 rows.
df = pd.read_csv("results-gpt4o-mini.csv").iloc[:300]


In [3]:
# Name of the pretrained sentence-embedding model to load.
model_name = "multi-qa-mpnet-base-dot-v1"

# Instantiate the encoder used for every embedding computed below.
embedding_model = SentenceTransformer(model_name_or_path=model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [4]:
# LLM-generated answer of the first row.
answer_llm = df["answer_llm"].iloc[0]


### Question 1

In [24]:
# Encode the first LLM answer and show the first component of its embedding.
answer_embedding = embedding_model.encode(answer_llm)
answer_embedding[0]

-0.42244688

In [20]:
# Accumulators for the similarity scores: raw dot products and dot products
# of the length-normalized vectors.
evaluations, evaluations_normalized = [], []

def normalizaed(v):
    """Return ``v`` divided by its Euclidean (L2) norm.

    The norm is computed as the square root of the sum of the squared
    elements of ``v``. There is no guard for a zero norm.
    """
    squared_sum = np.sum(v * v)
    return v / np.sqrt(squared_sum)

# Score every (LLM answer, original answer) pair by embedding similarity.
#
# The two columns are walked positionally with zip() instead of being looked
# up as df[col][i]: on an integer index that form is a *label* lookup, so it
# only works while the index is exactly 0..len(df)-1 and breaks (or selects
# the wrong row) once df has been filtered or re-indexed.
for text_llm, text_orig in zip(df["answer_llm"], df["answer_orig"]):
    v_llm = embedding_model.encode(text_llm)
    v_orig = embedding_model.encode(text_orig)

    # Raw dot product of the two embeddings.
    v = v_llm.dot(v_orig)
    evaluations.append(v)

    # Dot product of the unit-length vectors, i.e. cosine similarity.
    v_llm_norm = normalizaed(v_llm)
    v_orig_norm = normalizaed(v_orig)
    v_norm = v_llm_norm.dot(v_orig_norm)
    evaluations_normalized.append(v_norm)


### Question 2

In [21]:
# 75th percentile of the raw dot-product scores collected in `evaluations`.
percentile_75 = np.percentile(evaluations, q=75)

print(f"The 75th percentile of the scores is: {percentile_75}")

The 75th percentile of the scores is: 31.674302101135254


### Question 3

In [23]:
# 75th percentile of the scores collected in `evaluations_normalized`.
# Note: this reuses (and overwrites) the name `percentile_75`.
percentile_75 = np.percentile(evaluations_normalized, q=75)

print(f"The 75th percentile of the normalized scores is: {percentile_75}")

The 75th percentile of the normalized scores is: 0.8362348079681396


In [30]:
# Document id stored at index label 10.
df.loc[10, 'document']

'5170565b'

In [63]:
# NOTE: this import would ideally live in the notebook's first import cell.
from rouge import Rouge

rouge_scorer = Rouge()

# Position of the answer pair whose ROUGE scores are inspected below.
DOC_INDEX = 10

# Score only the pair of interest. The previous version scored every row of
# df and then discarded all results but one; get_scores evaluates each
# hypothesis/reference pair independently, so the selected dict is identical.
# Passing two strings returns a one-element list, hence the trailing [0].
scores = rouge_scorer.get_scores(
    df['answer_llm'].iloc[DOC_INDEX],
    df['answer_orig'].iloc[DOC_INDEX],
)[0]

### Question 4

In [64]:
# Display the ROUGE result currently bound to `scores`.
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [66]:
# F-measure of the rouge-1 entry in `scores`.
rouge_1 = scores['rouge-1']
rouge_1['f']

0.45454544954545456

### Question 5

In [53]:
# Average the F-measure across the ROUGE variants of each result, then
# average those per-result means.
#
# `scores` is a single result dict when the notebook is run top to bottom
# (it is assigned from one element of get_scores' output), but may also be a
# list of such dicts. Iterating a dict directly would yield its string keys
# and make `score_dict[metric]` fail, so normalise to a list of dicts first.
score_dicts = [scores] if isinstance(scores, dict) else list(scores)

avg_f = []
for score_dict in score_dicts:
    f_scores = [score_dict[metric]['f'] for metric in score_dict]
    avg_f.append(sum(f_scores) / len(f_scores))

print(f"The average of the F-scores is: {sum(avg_f) / len(avg_f):.6f}")

The average of the F-scores is: 0.354900


### Question 6

In [58]:
# Build a long-format table: one row per (answer pair, ROUGE variant) holding
# recall, precision and F-measure.
#
# Loop variables are deliberately named so that they do not rebind the
# notebook-level `scores` (the earlier version reused that name for both the
# per-pair result and the per-metric dict, clobbering it for other cells).
# Columns are iterated positionally with zip() so the loop does not depend on
# df having a 0..n-1 integer index.
data_list = []
for hyp_text, ref_text in zip(df['answer_llm'], df['answer_orig']):
    pair_scores = rouge_scorer.get_scores(hyp_text, ref_text)
    for item in pair_scores:
        for metric, values in item.items():
            row = {'Metric': metric, 'R': values['r'], 'P': values['p'], 'F': values['f']}
            data_list.append(row)

# Create the DataFrame once from the accumulated records.
df_new = pd.DataFrame(data_list)



In [59]:
# Display the per-metric ROUGE table.
df_new

Unnamed: 0,Metric,R,P,F
0,rouge-1,0.061224,0.214286,0.095238
1,rouge-2,0.017544,0.071429,0.028169
2,rouge-l,0.061224,0.214286,0.095238
3,rouge-1,0.081633,0.266667,0.125000
4,rouge-2,0.035088,0.133333,0.055556
...,...,...,...,...
895,rouge-2,0.135593,0.129032,0.132231
896,rouge-l,0.285714,0.326531,0.304762
897,rouge-1,0.125000,0.318182,0.179487
898,rouge-2,0.016949,0.038462,0.023529


In [61]:
# Mean F-measure over the rows whose metric is rouge-2.
is_rouge_2 = df_new["Metric"] == "rouge-2"
df_new.loc[is_rouge_2, "F"].mean()

0.20696501983423318