In [1]:
import pandas as pd 

In [2]:
import requests

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name 
        documents.append(doc)

In [3]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


__Basics of Text Search__

__Information Retrieval__ - The process of obtaining relevant information from large datasets based on user queries.

__Vector Spaces__ - A mathematical representation where text is converted into vectors (points in space) allowing for quantitative comparison.

__Bag of Words__ - A simple text representation model treating each document as a collection of words disregarding grammar and word order but keeping multiplicity.

__TF-IDF (Term Frequency-Inverse Document Frequency)__ - A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.

# Implementing Basic text search 

In [5]:
# keyword filtering 
df[df.course == 'data-engineering-zoomcamp'].head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [6]:
#vectorization
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]


In [7]:
# count vectoriser 
from sklearn.feature_extraction.text import CountVectorizer

#cv =CountVectorizer(stop_words='english')
cv = CountVectorizer()

In [8]:
cv.fit(docs_example)

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"


In [9]:
names = cv.get_feature_names_out()
names

array(['and', 'by', 'catalog', 'cloud', 'course', 'details', 'end', 'for',
       'google', 'homework', 'in', 'january', 'listed', 'month', 'no',
       'now', 'of', 'prerequisites', 'python', 'register', 'setup',
       'submit'], dtype=object)

In [10]:
X = cv.fit_transform(docs_example)

In [11]:
X.toarray()

array([[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]])

In [12]:


df_docs = pd.DataFrame(X.toarray(), columns= names).T
df_docs

Unnamed: 0,0,1,2,3,4
and,0,0,0,0,1
by,0,0,1,0,0
catalog,0,1,0,0,0
cloud,0,0,0,0,1
course,1,1,1,1,1
details,1,0,0,0,0
end,0,0,1,0,0
for,0,0,0,1,0
google,0,0,0,0,1
homework,0,0,1,0,0


In [13]:
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs


Unnamed: 0,0,1,2,3,4
catalog,0,1,0,0,0
cloud,0,0,0,0,1
course,1,1,1,1,1
details,1,0,0,0,0
end,0,0,1,0,0
google,0,0,0,0,1
homework,0,0,1,0,0
january,1,1,1,1,1
listed,0,1,0,0,0
month,0,0,1,0,0


In [14]:
#TF_IDF
from sklearn.feature_extraction.text import TfidfVectorizer


cv =TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns= names).T
df_docs

Unnamed: 0,0,1,2,3,4
catalog,0.0,0.567502,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.473826
course,0.328427,0.270418,0.225781,0.359594,0.225781
details,0.68924,0.0,0.0,0.0,0.0
end,0.0,0.0,0.473826,0.0,0.0
google,0.0,0.0,0.0,0.0,0.473826
homework,0.0,0.0,0.473826,0.0,0.0
january,0.328427,0.270418,0.225781,0.359594,0.225781
listed,0.0,0.567502,0.0,0.0,0.0
month,0.0,0.0,0.473826,0.0,0.0


__Query-Document Similarity__
We represent the query in the same vector space - i.e. using the same vectorizer:

In [15]:
query =  "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.829279  , 0.        , 0.        , 0.        ]])

In [16]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'catalog': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.39515588491314224),
 'details': np.float64(0.0),
 'end': np.float64(0.0),
 'google': np.float64(0.0),
 'homework': np.float64(0.0),
 'january': np.float64(0.39515588491314224),
 'listed': np.float64(0.0),
 'month': np.float64(0.0),
 'prerequisites': np.float64(0.0),
 'python': np.float64(0.8292789960182417),
 'register': np.float64(0.0),
 'setup': np.float64(0.0),
 'submit': np.float64(0.0)}

In [17]:
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'catalog': np.float64(0.5675015398728066),
 'cloud': np.float64(0.0),
 'course': np.float64(0.2704175244456293),
 'details': np.float64(0.0),
 'end': np.float64(0.0),
 'google': np.float64(0.0),
 'homework': np.float64(0.0),
 'january': np.float64(0.2704175244456293),
 'listed': np.float64(0.5675015398728066),
 'month': np.float64(0.0),
 'prerequisites': np.float64(0.45785666908911726),
 'python': np.float64(0.0),
 'register': np.float64(0.0),
 'setup': np.float64(0.0),
 'submit': np.float64(0.0)}

The more words in common - the better the matching score. Let's calculate it:

In [18]:
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

(df_qd['query'] * df_qd['doc']).sum()

np.float64(0.21371415233666782)

In [19]:
X.dot(q.T).toarray()

array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X, q)

array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

__In practice, we usually use cosine similarity__

The TF-IDF vectorizer already outputs a normalized vectors, so the results are identical

In [21]:
df.columns

Index(['course', 'section', 'question', 'text'], dtype='object')

Vectorizing all the documents
 do it for all the documents:

In [22]:
fields = ['section', 'question', 'text']
transformers ={}
matrices={}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv 
    matrices[field] = X




In [23]:
transformers['text'].get_feature_names_out()

array(['001', '01', '02', ..., 'zones', 'zoom', 'zoomcamp'],
      shape=(2118,), dtype=object)

In [24]:
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26463 stored elements and shape (948, 2118)>

Search
Let's now do search with the text field:

In [27]:
query = "I just signed up. Is it too late to join the course?"

q = transformers['text'].transform([query])

score = cosine_similarity(matrices['text'], q).flatten()

In [28]:
# i=only do the search for data engineering course 
mask = (df.course == 'data-engineering-zoomcamp').values
score = score * mask
score[:10]

array([0.3336047 , 0.        , 0.        , 0.1328874 , 0.        ,
       0.        , 0.        , 0.12722114, 0.        , 0.        ])

In [29]:
import numpy as np

idx = np.argsort(-score)[:10] #get the top results. np.argpartition is a more efficient way of doing the same thing
idx 

array([  0,  15,  22,  27,  38, 287,   3,   7, 113,  11])

In [30]:
score[idx]

array([0.3336047 , 0.23530268, 0.22668   , 0.1894954 , 0.16484429,
       0.13921764, 0.1328874 , 0.12722114, 0.1207499 , 0.10830554])

In [31]:
df.iloc[idx].text # get the text

0      The purpose of this document is to capture fre...
15     No, late submissions are not allowed. But if t...
22     It's up to you which platform and environment ...
27     You can do most of the course without a cloud....
38     You will have two attempts for a project. If t...
287    This error could result if you are using some ...
3      You don't need it. You're accepted. You can al...
7      Yes, we will keep all the materials after the ...
113    In the join queries, if we mention the column ...
11     No, you can only get a certificate if you fini...
Name: text, dtype: object

In [32]:
fields

['section', 'question', 'text']

Search with all the fields & boosting + filtering
We do it for all the fields.

We boost one of the fields - question - to give it more importance than to others

In [34]:
boost ={'question': 3.0}

score = np.zeros(len(df)) # Create an array of zeros, one for each document

for f in fields: # This loop will run for 'section', then 'question', then 'text'
    b = boost.get(f, 1.0)  #  Get the boost value for the current field
    q = transformers[f].transform([query])  #Vectorize the query using the field-specific transformer
    s = cosine_similarity(matrices[f], q).flatten() #Calculate similarity against the field-specific matrix
    score = score + b * s

In [35]:
filters = {
    'course':'data-enginering-zoomcamp'}

for field, value in filters.items():
    mask = (df[field] == value).values
    score = score * mask  #use of boolean arithmetic. In NumPy, when you multiply a number by a boolean:

    #number * True is number * 1, which equals the number.

   # number * False is number * 0, which equals 0.

In [38]:
# getting the result 
idx = np.argsort(-score)[:10] #returns the indices of the elements in sorted (ascending) order
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the cours

# putting it together: create a class 



In [40]:
class TextSearch:
    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_param={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_param)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f]= cv 

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([queery])
            s = cosine_similarity(self.matrices[f], q).flatten()

        for field, value in filters.items():
            mask = (slef.df[field] ==value).values
            score = score * amsk

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

    

In [50]:
# Import necessary libraries for data manipulation (pandas), numerical operations (numpy),
# text vectorization (TfidfVectorizer), and similarity calculation (cosine_similarity).
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define a class, which is a blueprint for creating a search engine object.
class TextSearch:
    
    # The __init__ method is the constructor. It runs automatically when a new TextSearch object is created.
    # 'self' refers to the instance of the object itself.
    # 'text_fields' is a list of column names that should be indexed for searching.
    def __init__(self, text_fields):
        # Store the list of fields to search within the object for later use.
        self.text_fields = text_fields
        # Initialize an empty dictionary to store the TF-IDF numerical matrices for each field.
        self.matrices = {}
        # Initialize an empty dictionary to store the trained TF-IDF vectorizer for each field.
        self.vectorizers = {}

    # The 'fit' method is used to index the documents. It learns the vocabulary from the provided data.
    # 'records' is the data, expected as a list of dictionaries.
    # 'vectorizer_param' is an optional dictionary for custom TfidfVectorizer settings.
    def fit(self, records, vectorizer_param={}):
        # Convert the list of dictionaries into a pandas DataFrame for easier data handling.
        self.df = pd.DataFrame(records)

        # Loop through each field specified in the text_fields list.
        for f in self.text_fields:
            # Create a new TfidfVectorizer instance, unpacking any custom parameters.
            cv = TfidfVectorizer(**vectorizer_param)
            # 'fit_transform' learns the vocabulary from the text in the current field (column)
            # and transforms that text into a numerical TF-IDF sparse matrix.
            X = cv.fit_transform(self.df[f])
            
            # Store the resulting numerical matrix in the 'matrices' dictionary, keyed by the field name.
            self.matrices[f] = X
            # Store the trained vectorizer itself in the 'vectorizers' dictionary, keyed by the field name.
            # This is crucial for transforming search queries the exact same way.
            self.vectorizers[f]= cv 

    # The 'search' method finds relevant documents for a given query.
    # 'query' is the user's search string.
    # 'n_results' is the maximum number of results to return.
    # 'boost' is a dictionary to give more weight to matches in certain fields.
    # 'filters' is a dictionary to apply strict filtering to the results.
    def search(self, query, n_results=10, boost={}, filters={}):
        # Initialize a "scorecard" array of zeros, with one slot for every document.
        score = np.zeros(len(self.df))

        # --- SCORING STAGE ---
        # Loop through each of the indexed text fields.
        for f in self.text_fields:
            # Get the boost weight for the current field, defaulting to 1.0 if not specified.
            b = boost.get(f, 1.0)
            # Transform the user's query into a TF-IDF vector using the specific vectorizer for this field.
            q = self.vectorizers[f].transform([query]) # 
            # Calculate the cosine similarity between the query vector and all document vectors for this field.
            s = cosine_similarity(self.matrices[f], q).flatten()
            # This crucial line adds the boosted scores for the current field to the total score.
            # (This line was logically implied but missing from the original user-provided code block).
            # score = score + b * s

        # --- FILTERING STAGE ---
        # Loop through each key-value pair in the filters dictionary.
        for field, value in filters.items():
            # Create a boolean mask: an array of True/False values where True means the document matches the filter.
            mask = (self.df[field] == value).values 
            # Multiply the score by the mask. This zeroes out scores for documents that don't match (score * False == 0).
            score = score * mask 

            #  RESULT GENERATION STAGE         
            # Use np.argsort on the negated scores to get the indices of the top results in descending order.
        idx = np.argsort(-score)[:n_results]
            # Select the full rows for the top indices from the original DataFrame.
        results = self.df.iloc[idx]
            # Convert the resulting DataFrame to a list of dictionaries and return it.
        return results.to_dict(orient='records')

The TextSearch class is a blueprint for a simple but powerful text search engine. It is designed to take a collection of documents (as a list of dictionaries), index specific text fields, and perform relevance-ranked searches with advanced features like boosting and filtering.
Key Features

    Multi-Field Indexing: Instead of searching all text at once, it creates separate search indexes for each field you specify (e.g., title, abstract, body).

    TF-IDF Ranking: It uses the standard Term Frequency-Inverse Document Frequency (TF-IDF) algorithm to measure how relevant a document is to a query.

    Field Boosting: You can give more importance to matches in certain fields. For example, a match in a title field can be made to contribute more to the final score than a match in the body.

    Post-Search Filtering: You can apply strict filters to the search results, ensuring that only documents meeting certain criteria (e.g., course == 'data-engineering-zoomcamp') are returned.

How It Works: A Two-Step Process

The class operates in two main phases:
1. Fitting (.fit() method)

This is the indexing phase. You call this method once with your entire dataset.

    It converts your data into a pandas DataFrame.

    It loops through each text field you designated for searching (e.g., 'question', 'text').

    For each field, it creates and trains a TfidfVectorizer, which learns the unique vocabulary of that field and calculates term weights.

    It stores both the trained vectorizer and the resulting numerical matrix for each field, preparing them for the search phase.

2. Searching (.search() method)

This is the query phase. You call this method whenever a user wants to search for something.

    It initializes a "scorecard" with a score of zero for every document.

    It iterates through each indexed field, calculates the cosine similarity between the user's query and all documents for that field, and adds these scores to the scorecard.

    If boosts are provided, it multiplies the scores from a field by the boost factor before adding them to the total.

    If filters are provided, it zeroes out the scores of any documents that do not match the filter criteria.

    Finally, it ranks the documents by their final scores, selects the top N results, and returns them as a clean list of dictionaries.

In [51]:
# using the class 
index = TextSearch(text_fields=['section', 'question', 'text'])
index.fit(documents)

index.search( query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines 