In [69]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np
from rouge import Rouge

In [4]:
github_url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/04-monitoring/data/results-gpt4o-mini.csv"
url = f'{github_url}?raw=1'
df = pd.read_csv(url)

In [6]:
df = df.iloc[:300]

## Question 1

In [47]:
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [48]:
answer_llm = df.iloc[0].answer_llm

In [49]:
v = embedding_model.encode(answer_llm)

In [42]:
v[0]

-0.42244673

## Question 2

In [15]:
results_gpt4o = df.to_dict(orient='records')

In [27]:
def compute_similarity(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

In [28]:
evaluations = []

for record in tqdm(results_gpt4o):
    sim = compute_similarity(record)
    evaluations.append(sim)

100%|█████████████████████████████████████████████████████████████████████████████████| 300/300 [00:09<00:00, 32.60it/s]


In [30]:
df['cosine'] = evaluations
df['cosine'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cosine'] = evaluations


count    300.000000
mean      27.495996
std        6.384744
min        4.547922
25%       24.307846
50%       28.336862
75%       31.674308
max       39.476017
Name: cosine, dtype: float64

## Question 3

In [63]:
def normalize_vectors(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

In [64]:
def compute_similarity_norm(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)

    v_llm = normalize_vectors(v_llm)
    v_orig = normalize_vectors(v_orig)
    
    return v_llm.dot(v_orig)

In [65]:
norm_vec = normalize_vectors(v)

In [66]:
evaluations_norm = []

for record in tqdm(results_gpt4o):
    sim = compute_similarity_norm(record)
    evaluations_norm.append(sim)

100%|█████████████████████████████████████████████████████████████████████████████████| 300/300 [00:08<00:00, 33.56it/s]


In [67]:
df['cosine_norm'] = evaluations_norm
df['cosine_norm'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cosine_norm'] = evaluations_norm


count    300.000000
mean       0.728392
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine_norm, dtype: float64

## Question 4

In [68]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [71]:
rouge_scorer = Rouge()

r = results_gpt4o[10]

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [73]:
scores['rouge-1']['f']

0.45454544954545456

## Question 5

In [77]:
rouge_1 = scores['rouge-1']['f']
rouge_2 = scores['rouge-2']['f']
rouge_l = scores['rouge-l']['f']
rouge_avg = (rouge_1 + rouge_2 + rouge_l) / 3

In [78]:
rouge_avg

0.35490034990035496

## Question 6

In [90]:
def compute_rouge(r):
    scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
    rouge_1 = scores['rouge-1']['f']
    rouge_2 = scores['rouge-2']['f']
    rouge_l = scores['rouge-l']['f']
    rouge_dict = {'rouge_1':rouge_1, "rouge_2":rouge_2, "rouge_l":rouge_l}
    return rouge_dict

In [91]:
rouge_scores = []

for record in tqdm(results_gpt4o):
    sim = compute_rouge(record)
    rouge_scores.append(sim)

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 671.85it/s]


In [94]:
df_rouge = pd.DataFrame(rouge_scores)

In [95]:
df_rouge['rouge_2'].mean()

0.20696501983423318