In [1]:
import polars as pl
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity




In [2]:
# Read df_qa.parquet and keep only the rows whose `course` column is
# 'data-engineering-zoomcamp'.
df = (
    pl.read_parquet('df_qa.parquet')
    .filter(pl.col('course') == 'data-engineering-zoomcamp')
)
# Preview the first rows of the filtered frame.
df.head()

course,section,question,text
str,str,str,str
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - When will the course …","""The purpose of this document i…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What are the prerequi…","""GitHub - DataTalksClub data-en…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - Can I still join the …","""Yes, even if you don't registe…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - I have registered for…","""You don't need it. You're acce…"
"""data-engineering-zoomcamp""","""General course-related questio…","""Course - What can I do before …","""You can start by installing an…"


## Vectorize Documents

- Turn documents into vectors
- document-term matrix
    - rows: documents
    - columns: words/tokens
- bag of words
    - word counts only

In [3]:
# Five short example documents used by the vectorization cells below.
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud",
]

# Bag-of-words vectorizer that drops English stop words.
cv = CountVectorizer(stop_words='english')

# Learn the vocabulary and build the sparse count matrix in a single step.
X = cv.fit_transform(docs_example)


In [4]:
# Vocabulary learned by the vectorizer, in column order.
print(cv.get_feature_names_out())

# Dense copy of the sparse matrix, shown as a frame with default column names.
pl.from_numpy(X.toarray())

['catalog' 'cloud' 'course' 'details' 'end' 'google' 'homework' 'january'
 'listed' 'month' 'prerequisites' 'python' 'register' 'setup' 'submit']


column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
0,0,1,1,0,0,0,1,0,0,0,0,1,0,0
1,0,1,0,0,0,0,1,1,0,1,0,0,0,0
0,0,1,0,1,0,1,1,0,1,0,0,0,0,1
0,0,1,0,0,0,0,1,0,0,1,0,1,0,0
0,1,1,0,0,1,0,1,0,0,0,1,0,1,0


In [5]:
# Same dense matrix, with each column named after its vocabulary token.
df_cv = pl.from_numpy(
    X.toarray(),
    schema=cv.get_feature_names_out().tolist(),
)
# Transpose for display: tokens become rows, documents become columns.
df_cv.transpose(include_header=True, header_name="Word")

Word,column_0,column_1,column_2,column_3,column_4
str,i64,i64,i64,i64,i64
"""catalog""",0,1,0,0,0
"""cloud""",0,0,0,0,1
"""course""",1,1,1,1,1
"""details""",1,0,0,0,0
"""end""",0,0,1,0,0
…,…,…,…,…,…
"""prerequisites""",0,1,0,1,0
"""python""",0,0,0,0,1
"""register""",1,0,0,1,0
"""setup""",0,0,0,0,1


### FROM BAG OF WORDS TO TF-IDF

In [6]:

# NOTE: this rebinds `cv` and `X` from the count-based objects created earlier
# to their TF-IDF counterparts; the cells below read these TF-IDF versions.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# One row per document, one column per vocabulary token.
df_docs = pl.DataFrame(X.toarray(), schema=names.tolist())

# Round the float columns for display only, then transpose so tokens are rows.
df_docs_rounded = df_docs.with_columns(pl.selectors.float().round(2))
df_docs_rounded.transpose(include_header=True, header_name="Word")


Word,column_0,column_1,column_2,column_3,column_4
str,f64,f64,f64,f64,f64
"""catalog""",0.0,0.57,0.0,0.0,0.0
"""cloud""",0.0,0.0,0.0,0.0,0.47
"""course""",0.33,0.27,0.23,0.36,0.23
"""details""",0.69,0.0,0.0,0.0,0.0
"""end""",0.0,0.0,0.47,0.0,0.0
…,…,…,…,…,…
"""prerequisites""",0.0,0.46,0.0,0.61,0.0
"""python""",0.0,0.0,0.0,0.0,0.47
"""register""",0.56,0.0,0.0,0.61,0.0
"""setup""",0.0,0.0,0.0,0.0,0.47


In [7]:
# Free-text question to match against the example documents.
query = "Do I need to know python to sign up for the January course?"



# Vectorize the query with the already-fitted `cv` (transform only, no refit),
# so its columns line up with the vocabulary used for X.
q = cv.transform([query])
# Dense view of the single query row, for display.
q.toarray()



array([[0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.829279  , 0.        , 0.        , 0.        ]])

In [8]:
# Map each vocabulary token to its weight in the query vector.
query_dict = {token: weight for token, weight in zip(names, q.toarray()[0])}
# Same mapping for the document at row index 1 of X.
doc_dict = {token: weight for token, weight in zip(names, X.toarray()[1])}
print(query_dict, doc_dict, sep='\n')

{'catalog': np.float64(0.0), 'cloud': np.float64(0.0), 'course': np.float64(0.39515588491314224), 'details': np.float64(0.0), 'end': np.float64(0.0), 'google': np.float64(0.0), 'homework': np.float64(0.0), 'january': np.float64(0.39515588491314224), 'listed': np.float64(0.0), 'month': np.float64(0.0), 'prerequisites': np.float64(0.0), 'python': np.float64(0.8292789960182417), 'register': np.float64(0.0), 'setup': np.float64(0.0), 'submit': np.float64(0.0)}
{'catalog': np.float64(0.5675015398728066), 'cloud': np.float64(0.0), 'course': np.float64(0.2704175244456293), 'details': np.float64(0.0), 'end': np.float64(0.0), 'google': np.float64(0.0), 'homework': np.float64(0.0), 'january': np.float64(0.2704175244456293), 'listed': np.float64(0.5675015398728066), 'month': np.float64(0.0), 'prerequisites': np.float64(0.45785666908911726), 'python': np.float64(0.0), 'register': np.float64(0.0), 'setup': np.float64(0.0), 'submit': np.float64(0.0)}


In [9]:
# Two-row frame: first row is the query weights, second row the document weights.
df_qd = pl.DataFrame([query_dict, doc_dict])

# Prepend a label column so the displayed rows are identifiable.
row_labels = pl.DataFrame({'index': ['query', 'doc']})
pl.concat([row_labels, df_qd], how='horizontal')


index,catalog,cloud,course,details,end,google,homework,january,listed,month,prerequisites,python,register,setup,submit
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""query""",0.0,0.0,0.395156,0.0,0.0,0.0,0.0,0.395156,0.0,0.0,0.0,0.829279,0.0,0.0,0.0
"""doc""",0.567502,0.0,0.270418,0.0,0.0,0.0,0.0,0.270418,0.567502,0.0,0.457857,0.0,0.0,0.0,0.0


In [10]:
print('query similarity to row 1')
# The column-wise product multiplies the query weight by the document weight
# for each token; summing across columns yields their dot product.
token_products = df_qd.product()
token_products.sum_horizontal().item()

query similarity to row 1


0.21371415233666782

In [11]:
print('query similarity to all docs')

# Sparse matrix product of every document row with the query column vector,
# giving one dot product per document.
(X @ q.transpose()).toarray()



query similarity to all docs


array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

In [12]:
print('equivalent to cosine similarity')

# The plain dot products match cosine similarity here because TfidfVectorizer
# L2-normalizes each row by default, so every vector already has unit length.
cosine_similarity(X, Y=q)

equivalent to cosine similarity


array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

In [13]:
print(query)

# Select the best-matching document from the similarity scores instead of
# hard-coding its position, so this cell stays correct if `query` or
# `docs_example` changes. For the current data the result is index 4.
scores = cosine_similarity(X, q).ravel()
best_idx = int(np.argmax(scores))
print(docs_example[best_idx])

Do I need to know python to sign up for the January course?
January course setup: Python and Google Cloud
