### Getting the data

In [1]:
import pandas as pd

# Load the evaluation results and keep only the first 300 rows.
df = pd.read_csv('./data/results-gpt4o-mini.csv').head(300)

# Q1.

In [2]:
from sentence_transformers import SentenceTransformer
# Identifier of the sentence-transformers model used for every embedding below.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name_or_path=model_name)

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Q1: embed the LLM answer of the first row and show the first component
# of the resulting vector.
answer_llm = df['answer_llm'].iloc[0]
answer_llm_vec = embedding_model.encode(answer_llm)
print(answer_llm_vec[0])

-0.42244655


# Q2.

In [4]:
import numpy as np

In [5]:
# Q2: score each row by the dot product between the embedding of the LLM
# answer and the embedding of the original answer.
evaluations = []

for reference_text, generated_text in zip(df['answer_orig'], df['answer_llm']):
    reference_vec = embedding_model.encode(reference_text)
    generated_vec = embedding_model.encode(generated_text)
    evaluations.append(generated_vec.dot(reference_vec))

In [6]:
# 75th percentile of the scores currently held in `evaluations`.
p75_score = np.percentile(evaluations, q=75)
print(p75_score)

31.6743106842041


# Q3.

In [7]:
def normalize(v):
    """Scale a vector to unit Euclidean (L2) length.

    Parameters
    ----------
    v : array supporting elementwise ``*``, ``/`` and ``.sum()``
        (e.g. ``numpy.ndarray``). The vector to scale.

    Returns
    -------
    ``v`` divided by the square root of the sum of its squared elements.
    If that norm is zero the vector is divided by 1 instead, so an all-zero
    input comes back as all zeros rather than NaNs.
    """
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Dividing by zero would produce NaNs (plus a RuntimeWarning) that
        # propagate into every dot product and percentile computed afterwards.
        norm = 1.0
    return v / norm

# Q3: same comparison as before, but both embeddings are scaled with
# `normalize` first, so each score is the dot product of two normalized vectors
# (i.e. their cosine similarity).
evaluations = []

for idx, row in df.iterrows():
    unit_orig = normalize(embedding_model.encode(row['answer_orig']))
    unit_llm = normalize(embedding_model.encode(row['answer_llm']))
    evaluations.append(unit_llm.dot(unit_orig))

In [8]:
# 75th percentile of the scores currently held in `evaluations`.
p75_score = np.percentile(evaluations, q=75)
print(p75_score)

0.8362347632646561


# Q4.

In [9]:
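# Requires the `rouge` package (this notebook was run with 1.0.1). To install it
# into the kernel's environment, uncomment:
# %pip install rouge==1.0.1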

In [10]:
import rouge
from rouge import Rouge

# Record which version of the rouge package produced the numbers below.
print("Version: ", rouge.__version__)

# One scorer instance, reused by the later ROUGE cells.
rouge_scorer = Rouge()

# Q4: take the record at positional index 10 as a plain dict and show
# which document it belongs to.
r = df.iloc[10].to_dict()
print(r['document'])

# get_scores returns a list; its first element holds the 'rouge-1',
# 'rouge-2' and 'rouge-l' entries read by the following cells.
pair_scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])
scores = pair_scores[0]

Version:  1.0.1
5170565b


In [11]:
# ROUGE-1 F-score stored in `scores`.
rouge_1_f = scores['rouge-1']['f']
print(rouge_1_f)

0.45454544954545456


# Q5.

In [12]:
# Q5: unweighted mean of the ROUGE-1, ROUGE-2 and ROUGE-L F-scores in `scores`.
f_rouge_1 = scores['rouge-1']['f']
f_rouge_2 = scores['rouge-2']['f']
f_rouge_l = scores['rouge-l']['f']
print((f_rouge_1 + f_rouge_2 + f_rouge_l) / 3)

0.35490034990035496


# Q6.

In [14]:

# Q6: ROUGE F-scores for every (LLM answer, original answer) pair in `df`.
#
# The loop deliberately uses its own names (`row`, `row_scores`). Reusing `r`
# and `scores` here would overwrite the single-record results that the Q4/Q5
# cells print, so re-running those cells after this one would silently report
# the last row of `df` instead of the record they were computed for.
evaluations = []

for idx, row in df.iterrows():
    # get_scores returns a list; its first element holds the per-metric scores.
    row_scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]

    rouge_1 = row_scores['rouge-1']['f']
    rouge_2 = row_scores['rouge-2']['f']
    rouge_l = row_scores['rouge-l']['f']
    rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3

    # One record per row; collected as dicts so they can be turned into a
    # DataFrame in a single step afterwards.
    evaluations.append({
        'rouge_1': rouge_1,
        'rouge_2': rouge_2,
        'rouge_l': rouge_l,
        'mean_rouge': rouge_avg,
    })

In [15]:
# Mean of the 'rouge_2' column over all records in `evaluations`.
rouge_df = pd.DataFrame(evaluations)
print(rouge_df['rouge_2'].mean())

0.20696501983423318
