In [1]:
import pandas as pd

In [2]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
df = pd.DataFrame(documents)

In [5]:
# specifying the columns to enforce an order
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [6]:
# to implement text search, we will use scikit-learn
# Lets take an example to understand how CountVectorizer works - basically CountVectorizer turns text into vectors

In [7]:
doc_examples = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
cv = CountVectorizer()

In [10]:
cv.fit(doc_examples)

In [11]:
# This gives the different terms in our documents.
cv.get_feature_names_out()

array(['15th', '2024', 'after', 'and', 'before', 'cloud', 'course',
       'date', 'for', 'github', 'google', 'homeworks', 'jan', 'listed',
       'not', 'on', 'participation', 'prerequisites', 'python',
       'registration', 'required', 'setup', 'start', 'starts', 'submit'],
      dtype=object)

In [12]:
X = cv.transform(doc_examples)

In [13]:
X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 27 stored elements and shape (5, 25)>

In [14]:
# This vectorizes the words in each document 
#(each element of the doc_examples array is treated as a document) and 1 indicates that particular elemnet was present in that document
X.todense()

matrix([[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
         0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
         0, 0, 0, 0],
        [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         1, 0, 0, 0]])

In [15]:
vectorized_df = pd.DataFrame(X.todense(), columns = cv.get_feature_names_out())

In [16]:
vectorized_df

Unnamed: 0,15th,2024,after,and,before,cloud,course,date,for,github,...,on,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,1,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,1,0,0,0,0
4,0,0,0,1,1,1,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [17]:
# A better view to transpose this and view it document wise
vectorized_df.T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
after,0,0,1,0,0
and,0,0,0,0,1
before,0,0,0,0,1
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
for,0,0,0,1,0
github,0,1,0,0,0


In [18]:
# To remove unnecessary words
cv = CountVectorizer(stop_words = 'english')

In [19]:
cv.fit(doc_examples)

In [20]:
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

Vector Spaces
 - turns docs into vectors
 - term-document matrix:
     - rows contain documents
     - columns contain the word/tokens
     - values re 0s and 1s based on which word document mapping
 - bag of words
    - word order is lost
    - sparse matrix

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(doc_examples)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4
15th,0.46,0.0,0.0,0.0,0.0
2024,0.46,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.46
course,0.37,0.0,0.0,0.0,0.37
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.46
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.46,0.0,0.0,0.0,0.0
listed,0.0,0.58,0.0,0.0,0.0


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# min_df: parameter to filter out the words based on document frequency. here it means word should be present in atleast 5 documents.
cv = CountVectorizer(stop_words='english', min_df = 5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


If I replace it with Tfidf, it will assign weights to these words based on their frequency of occurence in each document

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [24]:
query = "Do I need to know python to sign up for the January course?"
# Apply the vectorizer to turn this query into a document
q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 1333))

In [25]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

In [26]:
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

In [27]:
test = dict(zip(names, X.toarray()[3]))
test

{'01': np.float64(0.0),
 '02': np.float64(0.0),
 '03': np.float64(0.0),
 '04': np.float64(0.0),
 '05': np.float64(0.0),
 '06': np.float64(0.0),
 '09': np.float64(0.0),
 '10': np.float64(0.0),
 '100': np.float64(0.0),
 '11': np.float64(0.0),
 '12': np.float64(0.0),
 '127': np.float64(0.0),
 '13': np.float64(0.0),
 '14': np.float64(0.0),
 '15': np.float64(0.0),
 '16': np.float64(0.0),
 '17': np.float64(0.0),
 '19': np.float64(0.0),
 '1st': np.float64(0.0),
 '20': np.float64(0.0),
 '2019': np.float64(0.0),
 '2020': np.float64(0.0),
 '2021': np.float64(0.0),
 '2022': np.float64(0.0),
 '2023': np.float64(0.0),
 '2024': np.float64(0.0),
 '21': np.float64(0.0),
 '22': np.float64(0.0),
 '24': np.float64(0.0),
 '25': np.float64(0.0),
 '2pacx': np.float64(0.0),
 '30': np.float64(0.0),
 '35': np.float64(0.0),
 '403': np.float64(0.0),
 '42': np.float64(0.0),
 '50': np.float64(0.0),
 '5000': np.float64(0.0),
 '5431': np.float64(0.0),
 '5432': np.float64(0.0),
 '60': np.float64(0.0),
 '600': np.floa

In [28]:
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.42896052],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.14842753]], shape=(948, 1333))

In [29]:
#cosine similarity - Just gives us how similar this search is to our document
X.dot(q.T).todense()

matrix([[0.19464486],
        [0.        ],
        [0.        ],
        [0.06011641],
        [0.04932915],
        [0.        ],
        [0.        ],
        [0.13477565],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.15899187],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.07431408],
        [0.        ],
        [0.        ],
        [0.05779673],
        [0.07243428],
        [0.        ],
        [0.05174293],
        [0.16373635],
        [0.08076031],
        [0.        ],
        [0.09755254],
        [0.        ],
        [0.21069625],
        [0.12067781],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.06381749],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.00910541],
        [0.02835681],
        [0.05480112],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
cosine_similarity(X,q)

array([[0.19464486],
       [0.        ],
       [0.        ],
       [0.06011641],
       [0.04932915],
       [0.        ],
       [0.        ],
       [0.13477565],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.15899187],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.07431408],
       [0.        ],
       [0.        ],
       [0.05779673],
       [0.07243428],
       [0.        ],
       [0.05174293],
       [0.16373635],
       [0.08076031],
       [0.        ],
       [0.09755254],
       [0.        ],
       [0.21069625],
       [0.12067781],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.06381749],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00910541],
       [0.02835681],
       [0.05480112],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.024

In [32]:
score = cosine_similarity(X,q).flatten()

In [33]:
import numpy as np

In [34]:
np.argsort(score)

array([718, 699, 700, 701, 703, 704, 707, 710, 711, 712, 717, 698, 476,
       477, 478, 479, 480, 481, 482, 483, 486, 487, 833, 820, 821, 822,
       823, 824, 825, 827, 828, 830, 832, 488, 834, 467, 468, 471, 472,
       473, 474, 475, 697, 526, 514, 515, 516, 517, 518, 519, 520, 523,
       524, 525, 513, 527, 528, 530, 532, 533, 534, 535, 536, 537, 538,
       499, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 819, 501,
       504, 505, 506, 507, 508, 509, 510, 512, 864, 386, 851, 853, 854,
       855, 859, 860, 861, 862, 863, 385, 865, 387, 389, 390, 392, 397,
       399, 400, 402, 404, 369, 358, 359, 360, 361, 362, 363,  32, 366,
       367, 368, 405, 370, 371, 376, 377, 379, 380, 382, 383, 384, 442,
       839, 840, 841, 843, 846, 847, 848, 850, 437, 441, 836, 443, 444,
       447, 453, 460, 461, 462, 463, 466, 420, 407, 408, 409, 410, 412,
       414, 416, 417, 418, 419, 542, 421, 422, 423, 426, 427, 428, 429,
       430, 432, 654, 643, 644, 645, 646, 647, 649, 650, 651, 65

In [35]:
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445])

In [36]:
df.iloc[445].text

'Check this article. If you know everything in this article, you know enough. If you don’t, read the article and join the coursIntroduction to Pythone too :)\nIntroduction to Python – Machine Learning Bookcamp\nYou can follow this English course from the OpenClassrooms e-learning platform, which is free and covers the python basics for data analysis: Learn Python Basics for Data Analysis - OpenClassrooms . It is important to know some basics such as: how to run a Jupyter notebook, how to import libraries (and what libraries are), how to declare a variable (and what variables are) and some important operations regarding data analysis.\n(Mélanie Fouesnard)'

In [37]:
fields = ['section','question','text']

In [38]:
df

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
943,mlops-zoomcamp,Module 6: Best practices,Github actions: Permission denied error when e...,Problem description\nThis is the step in the c...
944,mlops-zoomcamp,Module 6: Best practices,Managing Multiple Docker Containers with docke...,Problem description\nWhen a docker-compose fil...
945,mlops-zoomcamp,Module 6: Best practices,AWS regions need to match docker-compose,Problem description\nIf you are having problem...
946,mlops-zoomcamp,Module 6: Best practices,Isort Pre-commit,Problem description\nPre-commit command was fa...


In [39]:
matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer(stop_words = "english", min_df=5)
    x = cv.fit_transform(df[f])
    matrices[f]=x
    vectorizers[f]=cv
    

In [40]:
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 23808 stored elements and shape (948, 1333)>}

In [46]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [47]:
n = len(df)

In [65]:
score = np.zeros(n)
query = "Do I need to know python to sign up for the January course?"

boosts = {
    'question':3
    # 'text':0.5
}

for f in fields:
    q = vectorizers[f].transform([query])
    x = matrices[f]

    boost = boosts.get(f,1.0)

    f_score = cosine_similarity(x,q).flatten()
    score = score + boost*f_score

In [66]:
filters = {
    'course':'data-engineering-zoomcamp'
}

In [67]:
for field, course in filters.items():
    mask = (df[field] == course).astype(int).values
    score = score*mask


In [68]:
score

array([1.81162054, 1.81276416, 1.46676396, 1.54901763, 1.86209332,
       1.81276416, 1.12836823, 1.94753981, 1.45142523, 1.81276416,
       1.64009203, 1.44783445, 0.49512426, 0.49512426, 0.49512426,
       0.56943834, 0.49512426, 1.43618161, 2.33928659, 1.16009076,
       0.49512426, 0.54686719, 0.65886061, 1.32911676, 0.49512426,
       0.5926768 , 0.49512426, 0.70582051, 0.61580207, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.10486851, 1.81276416,
       1.0334609 , 0.49512426, 0.49512426, 0.49512426, 0.50422966,
       0.52348107, 1.18003727, 0.49512426, 0.49512426, 0.        ,
       0.        , 0.        , 0.02469964, 0.05129386, 0.06013439,
       0.05252658, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.04169018, 0.        , 0.        , 0.        , 0.0075293 ,
       0.        , 0.        , 0.01971463, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [69]:
idx = np.argsort(score)[-5:]

In [70]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
18,data-engineering-zoomcamp,General course-related questions,Leaderboard - I am not on the leaderboard / ho...,When you set up your account you are automatic...


Putting it all together in 1 pass

In [71]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

Semantic Search - how is it better than plain text search

In [42]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = vectorizers['text']

In [43]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23808 stored elements and shape (948, 1333)>

In [44]:
cv

In [45]:
svd = TruncatedSVD(n_components = 16)
X_emb = svd.fit_transform(X)

SVD reduces the dimensionality of the original feature matrix without loss of information. It does so by capturing the semantic similarity of various words and grouping them into a single word.

In [46]:
X_emb

array([[ 0.0965319 , -0.08228884, -0.10190404, ..., -0.09427398,
         0.04782527, -0.03280886],
       [ 0.15234128, -0.19151704, -0.24645635, ..., -0.02430393,
        -0.11063945,  0.01636998],
       [ 0.04487854, -0.04480579, -0.04699856, ..., -0.0662199 ,
         0.01353867, -0.03127877],
       ...,
       [ 0.24652441,  0.19542534, -0.09529171, ...,  0.03891652,
        -0.05129595,  0.01166217],
       [ 0.16743285, -0.01278212,  0.08260269, ..., -0.12955809,
        -0.16884539, -0.02333225],
       [ 0.17204069, -0.01411986, -0.0167405 , ..., -0.04441762,
        -0.0850512 , -0.03736247]], shape=(948, 16))

In [47]:
query = "Do I need to know python to sign up for the January course?"
Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.09945216, -0.0530443 ,  0.07065988, -0.04253499,  0.02417932,
       -0.04466523,  0.00994159, -0.1325207 , -0.08355707,  0.12658423,
        0.11577347,  0.04053737,  0.02743288,  0.06219855,  0.01327778,
        0.0071884 ])

In [48]:
np.dot(Q_emb[0],X_emb[0])

np.float64(0.09461254691460305)

In [49]:
score = cosine_similarity(X_emb,Q_emb).flatten()
idx = np.argsort(score)[-10:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
565,machine-learning-zoomcamp,4. Evaluation Metrics for Classification,Use AUC to evaluate feature importance of nume...,Check the solutions from the 2021 iteration of...
812,mlops-zoomcamp,+-General course questions,What’s the difference between the 2023 and 202...,The difference is the Orchestration and Monito...
768,machine-learning-zoomcamp,Miscellaneous,Do you pass a project based on the average of ...,Alexey Grigorev: “It’s based on all the scores...
6,data-engineering-zoomcamp,General course-related questions,Course - Is the current cohort going to be dif...,Yes. For the 2024 edition we are using Mage AI...
445,machine-learning-zoomcamp,General course-related questions,How much Python should I know?,Check this article. If you know everything in ...
443,machine-learning-zoomcamp,General course-related questions,Will I get a certificate?,"Yes, if you finish at least 2 out of 3 project..."
503,machine-learning-zoomcamp,2. Machine Learning for Regression,The answer I get for one of the homework quest...,That’s normal. We all have different environme...
19,data-engineering-zoomcamp,General course-related questions,Environment - Is Python 3.9 still the recommen...,"Yes, for simplicity (of troubleshooting agains..."
455,machine-learning-zoomcamp,General course-related questions,The course videos are from the previous iterat...,We won’t re-record the course videos. The focu...
806,machine-learning-zoomcamp,Miscellaneous,"Can I do the course in other languages, like R...","Technically, yes. Advisable? Not really. Reaso..."


In [51]:
X_emb[0]

array([ 0.0965319 , -0.08228884, -0.10190404, -0.08028651,  0.06628635,
       -0.06631423,  0.02500221, -0.16297332, -0.22722151,  0.28012428,
        0.08835949,  0.0418152 , -0.10510302, -0.09427398,  0.04782527,
       -0.03280886])

The problem with the above approach is that there are negative elements in the array. So what does a negative correlation of a topic to a document mean? It doesnt make sense. To overcome that, we have Non-Negative Matrix Factorization
SVD creates values with negative numbers. It's difficult to interpet them.

NMF (Non-Negative Matrix Factorization) is a similar concept, except for non-negative input matrices it produces non-negative results.

We can interpret each of the columns (features) of the embeddings as different topic/concents and to what extent this document is about this concept.

In [52]:
from sklearn.decomposition import NMF

nmf = NMF(n_components = 16)
X_nmf_emb = nmf.fit_transform(X)
X_nmf_emb[0]

array([0.00610587, 0.00590069, 0.        , 0.        , 0.08690468,
       0.        , 0.00105108, 0.        , 0.00218175, 0.01250785,
       0.0003156 , 0.        , 0.        , 0.00730296, 0.00469219,
       0.00943706])

Now its a little bit more easier since the max of the values refers to the topic for this document.

In [53]:
Q = cv.transform([query])
Q_nmf_emb = nmf.transform(Q)
Q_nmf_emb[0]

array([0.00326309, 0.        , 0.        , 0.        , 0.01979199,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.1323118 , 0.01276841, 0.        ,
       0.        ])

In [54]:
score = cosine_similarity(X_nmf_emb,Q_nmf_emb).flatten()
idx = np.argsort(score)[-10:]
df.iloc[idx]

Unnamed: 0,course,section,question,text
864,mlops-zoomcamp,Module 2: Experiment tracking,MlflowClient object has no attribute 'list_exp...,Since the version 1.29 the list_experiments me...
323,data-engineering-zoomcamp,Module 5: pyspark,Py4JJavaError - ModuleNotFoundError: No module...,You need to look for the Py4J file and note th...
455,machine-learning-zoomcamp,General course-related questions,The course videos are from the previous iterat...,We won’t re-record the course videos. The focu...
398,data-engineering-zoomcamp,Project,How to run python as start up script?,You need to redefine the python environment va...
324,data-engineering-zoomcamp,Module 5: pyspark,Py4J Error - ModuleNotFoundError: No module na...,"If below does not work, then download the late..."
521,machine-learning-zoomcamp,3. Machine Learning for Classification,What sklearn version is Alexey using in the yo...,Version 0.24.2 and Python 3.8.11\n(Added by Di...
720,machine-learning-zoomcamp,9. Serverless Deep Learning,Using Tensorflow 2.15 for AWS deployment,Using the 2.14 version with python 3.11 works ...
806,machine-learning-zoomcamp,Miscellaneous,"Can I do the course in other languages, like R...","Technically, yes. Advisable? Not really. Reaso..."
19,data-engineering-zoomcamp,General course-related questions,Environment - Is Python 3.9 still the recommen...,"Yes, for simplicity (of troubleshooting agains..."
373,data-engineering-zoomcamp,Module 6: streaming with kafka,Error importing cimpl dll when running avro ex...,ImportError: DLL load failed while importing c...


In [56]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [57]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [58]:
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [59]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [60]:
hidden_states.shape

torch.Size([2, 15, 768])

In [61]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [62]:
sentence_embeddings

tensor([[ 0.3600, -0.1607,  0.3545,  ...,  0.0429,  0.0348, -0.0382],
        [ 0.1785, -0.5000,  0.2528,  ..., -0.1141, -0.3361,  0.4110]])