In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')


In [7]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


## Q1

In [22]:
import minsearch

index = minsearch.Index(
    text_fields = ["question", "section", "text"],
    keyword_fields = ["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x737fab402090>

In [25]:

boost = {'question': 1.5, 'section': 0.1}
query = "Can I join the course if it has already started?"
filter_dict = {"course": "data-engineering-zoomcamp"}

results = index.search(query, filter_dict=filter_dict, boost_dict=boost)
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a482086d'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and

In [27]:
def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results


In [28]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:13<00:00, 335.03it/s]


In [30]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [32]:
hit_rate(relevance_total)

0.848714069591528

## Q2

In [137]:
# Embeddings:

from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

cv = TfidfVectorizer(min_df=3)
svd_model = TruncatedSVD(n_components=128, random_state=1)

pipeline = make_pipeline(
    cv,
    svd_model
)
X = pipeline.fit_transform(texts)
X

array([[ 0.20189188, -0.19028114, -0.10261914, ...,  0.03719206,
         0.02850986, -0.04641277],
       [ 0.2723704 , -0.33653397, -0.1445361 , ..., -0.0499137 ,
         0.01132394,  0.02318573],
       [ 0.25137243, -0.24366293, -0.11105337, ...,  0.0322307 ,
        -0.02414921, -0.02599206],
       ...,
       [ 0.21850466,  0.2859507 ,  0.13110213, ...,  0.03990522,
        -0.02636175,  0.0350963 ],
       [ 0.01265053,  0.01110092, -0.02217507, ..., -0.02871288,
        -0.01579063, -0.08238173],
       [ 0.19543413, -0.03891868,  0.2853495 , ...,  0.11603444,
         0.03531262, -0.04113139]], shape=(948, 128))

In [138]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)
vindex

<minsearch.vector.VectorSearch at 0x737f9f4c6ff0>

In [139]:
q = "Can I join the course if it has already started?"
filter_dict = {"course": "data-engineering-zoomcamp"}

# query_vector = cv.transform([query])
# embedded_text = svd_model.transform(query_vector)
# or:
embedded_text = pipeline.transform([q])


In [140]:
results = vindex.search(embedded_text[0], filter_dict=filter_dict, num_results=5)
results

[{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
  'section': 'General course-related questions',
  'question': 'Course - Can I get support if I take the course in the self-paced mode?',
  'course': 'data-engineering-zoomcamp',
  'id': 'eb56ae98'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?

In [141]:
def minsearch_vector_search(index, query1, course1):
    # boost = {'question': 1.5, 'section': 0.1}

    # query_vector = cv.transform([query])
    # embedded_text = svd_model.transform(query_vector)
    # or:
    embedded_text1 = pipeline.transform([query1])

    filter_dict1 = {"course": course1}

    results = index.search(
        embedded_text1[0], 
        filter_dict=filter_dict1, 
        num_results=5,
    )
    
    # results = index.search(
    #     query=query,
    #     filter_dict={'course': course},
    #     boost_dict=boost,
    #     num_results=5
    # )

    return results

In [142]:
minsearch_vector_search(vindex, "Can I join the course if it has already started?", "data-engineering-zoomcamp")

[{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
  'section': 'General course-related questions',
  'question': 'Course - Can I get support if I take the course in the self-paced mode?',
  'course': 'data-engineering-zoomcamp',
  'id': 'eb56ae98'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?

In [143]:
relevance_total2 = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_vector_search(vindex, query1=q['question'], course1=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total2.append(relevance)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 597.51it/s]


In [144]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [145]:
mrr(relevance_total2)

0.3571284489590088

# Q3

In [161]:
combined_texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    combined_texts.append(t)

combinedX = pipeline.fit_transform(combined_texts)
combinedX

array([[ 0.27652746, -0.12406338,  0.08165308, ...,  0.03232327,
        -0.01848302,  0.01792042],
       [ 0.13093798, -0.09290264, -0.05869318, ..., -0.0188873 ,
        -0.01244898,  0.02283528],
       [ 0.27345148, -0.11976017,  0.09454852, ..., -0.00291146,
         0.03255537,  0.07401423],
       ...,
       [ 0.28059404,  0.21363922,  0.07458614, ..., -0.00085369,
        -0.03680951,  0.08412727],
       [ 0.10485423,  0.02658987, -0.05758003, ..., -0.03745725,
        -0.02261345,  0.02970269],
       [ 0.13592913,  0.02690379, -0.10047022, ..., -0.02532567,
        -0.01282716, -0.01722296]], shape=(948, 128))

In [148]:
vindexCombined = VectorSearch(keyword_fields={'course'})
vindexCombined.fit(combinedX, documents)
vindexCombined

<minsearch.vector.VectorSearch at 0x737f9f9afa40>

In [177]:
# id = ground_truth[0]['document']

# def find_doc_by_id(id):
#     for doc in documents:
#         if (doc['id'] == id):
#             return doc
#     raise Exception()

# find_doc_by_id(id)

In [176]:
relevance_total3 = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_vector_search(vindexCombined, query1=q['question'], course1=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total3.append(relevance)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:08<00:00, 569.83it/s]


In [179]:
hit_rate(relevance_total3)

0.8210503566025502

# Q4

In [188]:
# combined_texts2 = []

# for doc in documents:
#     t = doc['question'] + ' ' + doc['text']
#     combined_texts2.append(t)

# combined_texts2
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [186]:
from qdrant_client import QdrantClient, models
client = QdrantClient("http://localhost:6333")

collection_name = "evaluation-02"
EMBEDDING_DIMENSIONALITY=512

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size= EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [191]:
model_handle = "jinaai/jina-embeddings-v2-small-en"
points = []
id = 0

for doc in documents:
    text = doc['question'] + ' ' + doc['text']

    point = models.PointStruct(
        id=id,
        vector=models.Document(text=text, model=model_handle), #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            "id": doc['id']
        } #save all needed metadata fields
    )
    points.append(point)

    id += 1

points

[PointStruct(id=0, vector=Document(text="Course - When will the course start? The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", model='jinaai/jina-embeddings-v2-small-en', options=None), payload={'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with an

In [192]:
from tqdm.auto import tqdm # loadbar
client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files: 100%|███████████████████████████████████████████████████████████████| 5/5 [00:10<00:00,  2.09s/it]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [194]:
def search_in_Qdrant(query, course, limit=5):

    results = client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query,
            model=model_handle
        ),
        query_filter=models.Filter( # filter by course name
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return results

search_in_Qdrant("Can I join the course if it has already started?", "data-engineering-zoomcamp")

QueryResponse(points=[ScoredPoint(id=2, version=0, score=0.8838817, payload={'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'course': 'data-engineering-zoomcamp', 'id': '7842b56a'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=7, version=0, score=0.86161625, payload={'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'course': 'data-engineering-zoomcamp', 'id': 'a482086d'}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=0, version=0, score=0.8395

In [223]:
relevance_total4 = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = search_in_Qdrant(q['question'], q['course'])
    relevance = [d.payload['id'] == doc_id for d in results.points]
    relevance_total4.append(relevance)

100%|███████████████████████████████████████████████████████████████████████████| 4627/4627 [01:05<00:00, 70.43it/s]


In [225]:
r = search_in_Qdrant(q['question'], q['course'])
print(r.points[0].payload['id'])

886d1617


In [227]:
relevance_total4

[[True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, True],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, Fals

In [229]:
mrr(relevance_total4)

0.8517722066133576

## Q5

In [259]:
import numpy as np

# def cosine(u, v):
#     u = normalize(u)
#     v = normalize(v)
#     return u.dot(v)


# def normalize(u):
#     norm = np.sqrt(u.dot(u))
#     return u / norm

def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)



In [268]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

data = df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question
Xfit = pipeline.fit(data)
Xfit

In [261]:
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp


In [295]:
# pipeline.transform(Xfit)

for row in df_results.iterrows():
    v_llm = Xfit.transform([eow['answer_llm'][0]])
    v_orig = Xfit.transform([df_results['answer_orig'][0]])
    cosine(v_llm[0], v_orig[0])

AttributeError: 'DataFrame' object has no attribute 'to_list'

In [333]:
# v_llm[0].dot(v_orig[0])

v_llms = []

for row in df_results['answer_llm']:
    v_llms.append(Xfit.transform([row]))
v_llms

v_origs = []

for row in df_results['answer_orig']:
    v_origs.append(Xfit.transform([row]))
v_origs


cosines = []
for i,v in enumerate(v_llms):
    cosines.append(cosine(v_llms[i][0], v_origs[i][0]))
cosines

[np.float64(0.46352620160029906),
 np.float64(0.7815651064829413),
 np.float64(0.8891577173455298),
 np.float64(0.6149615816691363),
 np.float64(0.6240861551352468),
 np.float64(0.9345519699817656),
 np.float64(0.883219091637175),
 np.float64(0.8827707138393344),
 np.float64(0.9779470935543948),
 np.float64(0.7924301017857226),
 np.float64(0.9428574444641923),
 np.float64(0.9740634681019089),
 np.float64(0.9910122289434198),
 np.float64(0.9908956356315888),
 np.float64(0.9879144788820641),
 np.float64(0.9284031711293226),
 np.float64(0.9400237827972099),
 np.float64(0.8279205649663369),
 np.float64(0.7955631678529784),
 np.float64(0.9070399794363231),
 np.float64(0.8651161589043759),
 np.float64(0.918565813019574),
 np.float64(0.9727699610284664),
 np.float64(0.588167618222869),
 np.float64(0.9733784064834398),
 np.float64(0.9507923258178012),
 np.float64(0.6954534449875964),
 np.float64(0.9327305653874071),
 np.float64(0.9807723672101395),
 np.float64(0.9562263710922675),
 np.float64(

In [334]:
np.mean(cosines)

np.float64(0.8415841233490402)

# Q6

In [335]:
!pip install rouge


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [345]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
# scores['rouge-1']['f']
scores


{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [346]:
rougef1s = []
for i,row in df_results.iterrows():
    scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]
    rougef1s.append(scores['rouge-1']['f'])
rougef1s

[0.09523809178130524,
 0.12499999641113292,
 0.41558441095631643,
 0.2162162117421476,
 0.14207649881095297,
 0.43137254522106894,
 0.41269840791131274,
 0.30434782321361065,
 0.5172413747919143,
 0.34374999517578125,
 0.45454544954545456,
 0.6060606010606061,
 0.753623183415249,
 0.7297297247881666,
 0.6849315018952901,
 0.4772727222727273,
 0.47999999515022235,
 0.3888888841358025,
 0.2857142807256236,
 0.37499999505,
 0.24719100710011369,
 0.2682926793664485,
 0.5242718399095109,
 0.18918918665814466,
 0.5048543641813555,
 0.6206896501724138,
 0.3333333290589569,
 0.4130434736011342,
 0.633663361446917,
 0.359550557257922,
 0.4489795870054144,
 0.5217391255198489,
 0.4090909041322315,
 0.25714285306122453,
 0.16666666172839517,
 0.19999999601250007,
 0.29999999601250005,
 0.42857142429705225,
 0.23255813514332083,
 0.7096774143756504,
 0.4848484798530762,
 0.3934426180166622,
 0.5245901590002687,
 0.3692307642414202,
 0.22222221852839513,
 0.11111110797839514,
 0.15789473383656513,


In [347]:
np.mean(rougef1s)

np.float64(0.3516946452113943)