In [1]:
import polars as pl
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
with open("documents.pkl", "rb") as f:
    documents = pickle.load(f)

In [3]:
df = pl.read_parquet('df_qa.parquet')
df = df.filter( pl.col('course').eq('data-engineering-zoomcamp') )
df.head()

course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What are the prerequi…","""GitHub - DataTalksClub data-en…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I still join the …","""Yes, even if you don't registe…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - I have registered for…","""You don't need it. You're acce…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What can I do before …","""You can start by installing an…"


In [4]:
df.columns

['course', 'section', 'question', 'text']

In [5]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X



In [6]:
transformers['text'].get_feature_names_out()

array(['01', '02', '03', ..., 'zones', 'zoom', 'zoomcamp'],
      shape=(1177,), dtype=object)

In [7]:
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12020 stored elements and shape (435, 1177)>

435 documents with 1177 words tracked.

In [8]:
query = "I just signed up. Is it too late to join the course?"

Compare query to all known documents:

In [9]:
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

print(score[:10])
print(np.mean(score))
print(np.median(score))
print(np.max(score))

[0.49093979 0.         0.         0.20300683 0.         0.
 0.         0.19419469 0.         0.        ]
0.011982965599973302
0.0
0.4909397896827705


In [10]:
idx = np.argsort(-score)[:10]
idx

array([  0,  22,  27, 287,   3,   7, 113,  11, 395, 148])

In [11]:
score[idx]

array([0.49093979, 0.32949342, 0.28618013, 0.2120385 , 0.20300683,
       0.19419469, 0.16673395, 0.16553512, 0.14632906, 0.1414606 ])

In [12]:
print(query)

df.with_row_index().filter(pl.col("index").is_in(idx))['text'].to_list()


I just signed up. Is it too late to join the course?


["The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
 "No, you can

In [13]:
boost = {'question': 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

In [14]:


idx = np.argsort(-score)[:10]
results = df.with_row_index().filter(pl.col("index").is_in(idx))
results.to_dicts()



[{'index': 0,
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."},
 {'index': 1,
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'index': 2,
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'quest

In [15]:


class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pl.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            s = self.df[field]
            mask = s.set(s != value,'0').set(s==value,'1').cast(pl.Int32).to_numpy()
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.with_row_index().filter(pl.col("index").is_in(idx))
        return results.to_dicts()



In [16]:
index = TextSearch(text_fields=['section', 'question', 'text'])

In [17]:
index.fit(documents)

In [18]:
index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'index': 0,
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'index': 2,
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
