[homework question link](https://github.com/DataTalksClub/llm-zoomcamp/blob/main/cohorts/2024/04-monitoring/homework.md)

In [1]:
import numpy as np
import pandas as pd
from rouge import Rouge
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the gpt-4o-mini evaluation results straight from the course repository.
# The `?raw=1` query string is appended to the GitHub "blob" URL before it is
# handed to pandas.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [3]:
# Keep only the first 300 rows for the rest of the notebook.
df = df.head(300)

# Q1. Getting the embeddings model

In [4]:
# Sentence-transformers checkpoint used for every embedding computed below.
model_name = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(model_name_or_path=model_name)

In [28]:
# Q1: embed the LLM answer of the first row and show the first component
# of the resulting vector.
answer_llm = df['answer_llm'].iloc[0]
first_embedding = embedding_model.encode(answer_llm)
first_embedding[0]

np.float32(-0.42244655)

# Q2. Computing the dot product

In [29]:
def evaluate_score(document):
    """Return the dot product between the two answer embeddings of a record.

    ``document`` must support lookup by the keys ``'answer_orig'`` and
    ``'answer_llm'``. Each value is encoded with the module-level
    ``embedding_model`` (original answer first, LLM answer second) and the
    dot product of the two vectors is returned.
    """
    vec_orig, vec_llm = (
        embedding_model.encode(document[field])
        for field in ('answer_orig', 'answer_llm')
    )
    return vec_orig.dot(vec_llm)

In [30]:
# Sanity check: score the first record, passed in as a plain dict.
first_record = df.iloc[0].to_dict()
evaluate_score(first_record)

np.float32(17.515987)

In [36]:
# Dot-product similarity for every row (each row is handed to the scorer as-is).
df['score'] = df.apply(evaluate_score, axis=1)

In [38]:
# Summary statistics of the dot-product scores.
df.loc[:, 'score'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547923
25%       24.307844
50%       28.336870
75%       31.674309
max       39.476013
Name: score, dtype: float64

# Q3. Computing the cosine

In [43]:
def evaluate_norm_score(document):
    """Return the cosine similarity between the two answer embeddings of a record.

    The values under ``'answer_orig'`` and ``'answer_llm'`` are encoded with
    the module-level ``embedding_model``; each vector is divided by its L2
    norm and the dot product of the two unit vectors is returned.
    """
    def _unit(vec):
        # Divide by the Euclidean length: sqrt of the sum of squared components.
        return vec / np.sqrt(np.sum(vec * vec))

    emb_orig = embedding_model.encode(document['answer_orig'])
    emb_llm = embedding_model.encode(document['answer_llm'])
    return _unit(emb_orig).dot(_unit(emb_llm))

In [44]:
# Cosine similarity for every row.
df['norm_score'] = df.apply(evaluate_norm_score, axis=1)

In [45]:
# Summary statistics of the cosine-similarity scores.
df.loc[:, 'norm_score'].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: norm_score, dtype: float64

# Q4. Rouge

In [7]:
# Q4: ROUGE scores for a single record (row 10).
# `Rouge` is imported in the notebook's import cell alongside the other
# dependencies rather than in the middle of the analysis.
rouge_scorer = Rouge()
r = df.iloc[10]

# The LLM answer is passed first and the original answer second. get_scores
# returns a list; [0] picks the entry for this single pair.
scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

# Q5. Average rouge score

In [11]:
# Q5: average of the 'f' values over every metric in the `scores` dict.
f_scores = [metric['f'] for metric in scores.values()]
np.mean(f_scores)

np.float64(0.35490034990035496)

# Q6. Average rouge score for all the data points

In [19]:
def calc_rouge_2(document):
    """Return the ROUGE-2 'f' value for one record.

    ``document['answer_llm']`` is passed to the module-level ``rouge_scorer``
    as the first argument and ``document['answer_orig']`` as the second; the
    ``['rouge-2']['f']`` entry of the first returned result is returned.
    """
    hypothesis, reference = document['answer_llm'], document['answer_orig']
    first_result = rouge_scorer.get_scores(hypothesis, reference)[0]
    return first_result['rouge-2']['f']

# ROUGE-2 'f' value for every row.
df['rouge-2'] = df.apply(calc_rouge_2, axis=1)

In [20]:
# Summary statistics of the per-row ROUGE-2 values.
df.loc[:, 'rouge-2'].describe()

count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
Name: rouge-2, dtype: float64