In [1]:
import pandas as pd

In [2]:
# Location of the gpt-4o-mini results CSV in the llm-zoomcamp repository.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"
# The "?raw=1" query is appended to the blob URL before handing it to read_csv
# (presumably so GitHub serves the file itself rather than the HTML page).
url = github_url + "?raw=1"

# Read the CSV and keep only the first 300 rows.
df = pd.read_csv(url).iloc[:300]

In [3]:
from sentence_transformers import SentenceTransformer

# Identifier handed to SentenceTransformer to select the embedding model.
model_name = "multi-qa-mpnet-base-dot-v1"
# Built once here; the similarity functions below take it as their default `model`.
embedding_model = SentenceTransformer(model_name)

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Q1. Getting the embeddings model

# Take the LLM answer from the first row, embed it, and print the first
# component of the resulting vector.
answer_llm = df["answer_llm"].iloc[0]
embedding_vector = embedding_model.encode(answer_llm)
first_value = embedding_vector[0]
print(first_value)

-0.42244658


In [21]:
def compute_similarity(record, model=embedding_model):
    """Return the dot product between the embeddings of a record's two answers.

    Args:
        record: Mapping that provides the "answer_llm" and "answer_orig" texts.
        model: Object exposing ``encode(text)``. Defaults to ``embedding_model``
            as it is bound when this cell is executed.

    Returns:
        ``v_llm.dot(v_orig)``: the unnormalised dot product of the two
        encoded answers.
    """
    # Each local holds the record field it is named after.
    answer_llm = record["answer_llm"]
    answer_orig = record["answer_orig"]

    v_llm = model.encode(answer_llm)
    v_orig = model.encode(answer_orig)

    return v_llm.dot(v_orig)

In [37]:
# Q2. Computing the dot product

from tqdm import tqdm

# One dict per DataFrame row, so each can be passed to compute_similarity.
results_gpt35 = df.to_dict(orient="records")

# Score every record; tqdm reports progress while iterating.
similarity_35 = [compute_similarity(record) for record in tqdm(results_gpt35)]

# Store the scores next to the answers they were computed from.
df["evaluations"] = similarity_35

df["evaluations"].describe()

100%|██████████| 300/300 [01:22<00:00,  3.65it/s]


count    300.000000
mean      27.495996
std        6.384742
min        4.547924
25%       24.307847
50%       28.336872
75%       31.674312
max       39.476013
Name: evaluations, dtype: float64

In [35]:
import numpy as np
def normalized_vector(v):
    """Scale ``v`` to unit Euclidean length.

    Args:
        v: 1-D array-like of numbers (NumPy array, list, tuple).

    Returns:
        ``v / ||v||`` as a NumPy array. A vector whose norm is zero has no
        direction, so an all-zero float array of the same shape is returned
        instead of the NaNs a 0/0 division would produce.
    """
    # Accept plain sequences as well as arrays; arrays pass through unchanged.
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        return np.zeros_like(v, dtype=float)
    return v / norm

In [36]:
# Q3. Computing the cosine

def compute_cosine_similarity(record, model=embedding_model):
    """Return the cosine similarity between a record's two answer embeddings.

    Args:
        record: Mapping that provides the "answer_llm" and "answer_orig" texts.
        model: Object exposing ``encode(text)``. Defaults to ``embedding_model``
            as it is bound when this cell is executed.

    Returns:
        The dot product of the two embeddings after each has been passed
        through ``normalized_vector``.
    """
    # Each local holds the record field it is named after.
    answer_llm = record["answer_llm"]
    answer_orig = record["answer_orig"]

    v_llm = model.encode(answer_llm)
    v_orig = model.encode(answer_orig)

    # Use the notebook's own helper so the cell runs on a fresh kernel.
    v_llm_norm = normalized_vector(v_llm)
    v_orig_norm = normalized_vector(v_orig)
    return v_llm_norm.dot(v_orig_norm)


# One dict per DataFrame row, so each can be passed to compute_cosine_similarity.
results_gpt35 = df.to_dict(orient="records")

# Cosine similarity for every record; tqdm reports progress while iterating.
similarity_35 = [compute_cosine_similarity(record) for record in tqdm(results_gpt35)]

# Written to the same "evaluations" column, replacing whatever it held before.
df["evaluations"] = similarity_35

df["evaluations"].describe()

100%|██████████| 300/300 [01:18<00:00,  3.83it/s]


count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: evaluations, dtype: float64

In [53]:
# Q4. ROUGE

from rouge import Rouge

# Single scorer instance; later cells reuse it.
rouge_scorer = Rouge()

# Score the row labelled 10: LLM answer is passed first, original answer second.
# get_scores returns a list, of which the first element is kept.
scores = rouge_scorer.get_scores(
    df.loc[10, "answer_llm"],
    df.loc[10, "answer_orig"],
)[0]

scores["rouge-1"]["f"]

0.45454544954545456

In [54]:
# Q5. Average rouge score
# Mean of the "f" value across every entry of `scores`.
sum(metric["f"] for metric in scores.values()) / len(scores)

0.35490034990035496

In [63]:
# Q6. Average rouge score for all the data points

# Compute the ROUGE-2 "f" value row by row. The row Series is indexed by
# column label: integer keys such as x[0] on a label-indexed Series are
# deprecated positional access in pandas and emit a FutureWarning.
df["rouge_2_f"] = df[["answer_llm", "answer_orig"]].apply(
    lambda row: rouge_scorer.get_scores(row["answer_llm"], row["answer_orig"])[0]["rouge-2"]["f"],
    axis=1,
)
df["rouge_2_f"].describe()

  lambda x: rouge_scorer.get_scores(x[0], x[1])[0]["rouge-2"]["f"], axis=1


count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
Name: rouge_2_f, dtype: float64