# Overview 

Let's implement TF-IDF on numpy.

In [1]:
import torch
import timeit

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch.nn.functional as F

In [2]:
import pandas as pd

# Movie DataSet

Let's download the list of movies from Kaggle:
https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/versions/7?resource=download

In [3]:
# Load the movie titles. header=None makes pandas treat the first row as data
# rather than as column names; the column is labelled 'title' explicitly.
df = pd.read_csv('movies.csv', names=['title'], header=None)
df.head()

Unnamed: 0,title
0,Toy Story
1,Jumanji
2,Grumpier Old Men
3,Waiting to Exhale
4,Father of the Bride Part II


In [4]:
# Sanity check: str.isascii() returns True for a plain-ASCII title.
'Jumanji'.isascii()

True

In [5]:
# Peek at the raw titles as a NumPy array (dtype=object, i.e. Python strings).
df['title'].values

array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Betrayal',
       'Satana likuyushchiy', 'Queerama'], dtype=object)

In [6]:
# Sanity check: str.isnumeric() returns False for a title made of letters and a space.
'Toy Story'.isnumeric()

False

In [85]:
# Example documents
def entity_filter(x):
    """Tell whether ``x`` is free of digits and the listed punctuation marks.

    Args:
        x: The title to inspect.

    Returns:
        ``False`` if any banned character occurs in ``x``, ``True`` otherwise
        (including for an empty string).
    """
    # Same characters, in the same order, as the original explicit list.
    banned = "!&',-.0123456789:$()*+/?_#@[]"
    return not any(symbol in x for symbol in banned)

# Build the corpus from the first 2000 titles: keep only pure-ASCII titles that
# pass entity_filter, and remove the spaces from each kept title.
# The isinstance guard skips missing titles: pandas loads empty CSV fields as
# float NaN, which has no .isascii() and would otherwise raise AttributeError.
documents = [
    title.replace(' ', '')
    for title in df['title'].values[:2000]
    if isinstance(title, str) and title.isascii() and entity_filter(title)
]
len(documents)

# documents = [
#     "Toy Story",
#     "Jumanji",
#     "Grumpier Old Men",
#     "Waiting to Exhale",
#     "Father of the Bride Part II",
# ]

1575

In [86]:
# Preview the first ten cleaned titles (spaces removed, original casing kept).
documents[:10]

['ToyStory',
 'Jumanji',
 'GrumpierOldMen',
 'WaitingtoExhale',
 'FatheroftheBridePartII',
 'Heat',
 'Sabrina',
 'TomandHuck',
 'SuddenDeath',
 'GoldenEye']

# Build TF-IDF Vectors

In [87]:
# Initialize the TF-IDF vectorizer.
# Features are character trigrams (analyzer='char', ngram_range=(3, 3)); the
# documents were already filtered to ASCII titles with spaces removed.
# `stop_words` is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word' and emits a warning that it is unused otherwise.
vectorizer = TfidfVectorizer(
    ngram_range=(3, 3),
    analyzer='char',
)

# Fit the vectorizer on the documents and transform the documents into TF-IDF vectors
tfidf_vectors = vectorizer.fit_transform(documents)

# Convert the sparse TF-IDF vectors to dense PyTorch tensors
tfidf_tensors = torch.tensor(tfidf_vectors.toarray())

In [88]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so it
# is converted to a list to keep the printed output in the same format.
try:
    vocab = vectorizer.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0 only has the old method
    vocab = vectorizer.get_feature_names()
print(len(vocab))
print(vocab[:20])

3599
['aab', 'aad', 'aam', 'aan', 'aba', 'abb', 'abe', 'abi', 'abl', 'abo', 'abr', 'abs', 'aby', 'aca', 'ace', 'ach', 'ack', 'acl', 'aco', 'acr']




In [89]:
# One row per document and one column per character trigram in the vocabulary.
tfidf_tensors.shape

torch.Size([1575, 3599])

In [90]:
# Show the dense TF-IDF matrix; torch abbreviates the printout for large tensors.
tfidf_tensors

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

# Query TF-IDF using cosine similarity

In [98]:
def query(query, verbose=False, top_k=4):
    """Find the indexed title most similar to ``query`` by TF-IDF cosine similarity.

    Relies on the notebook-level ``vectorizer``, ``tfidf_tensors`` and
    ``documents`` built in the cells above.

    Args:
        query: Free-text title to look up. Spaces are removed before
            vectorising, mirroring how ``documents`` was built.
        verbose: When true, print the ``top_k`` best matches with their
            similarity score and index.
        top_k: Number of matches printed in verbose mode (default 4, the
            previously hard-coded value). Has no effect on the return value.

    Returns:
        A ``(similarity, title)`` tuple for the single best match.

    NOTE(review): if the query shares no trigram with the fitted vocabulary,
    its TF-IDF vector is all zeros, every similarity is 0.0 and the returned
    title is then not a meaningful match. Callers may want to check the score.
    """
    query = query.replace(' ', '')
    # Transform the query into a TF-IDF vector
    query_vector = vectorizer.transform([query])

    # Convert the sparse query vector to a dense PyTorch tensor
    query_tensor = torch.tensor(query_vector.toarray())

    # Compute the cosine similarity between the query vector and the document
    # vectors; the result has a single row (one query) with one score per document.
    similarities = cosine_similarity(query_tensor, tfidf_tensors)

    # Document indices ordered from most to least similar
    top_indices = similarities.argsort()[0][::-1]

    # Print the most similar documents
    if verbose:
        for index in top_indices[:top_k]:
            print(f"[{similarities[0][index]}] '{documents[index]}' index: {index}")

    return (similarities[0][top_indices[0]], documents[top_indices[0]])

In [99]:
%%timeit
# Time one complete lookup: vectorise the query, score it against every document, sort.
query("toy story")

33.8 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Latency

With 30K movie titles (30,508 records), the latency on my local 13" MacBook Pro (2 GHz CPU) is 8.19 ms per query.

In [100]:
# Exact title: the space is stripped inside query(), so this matches 'ToyStory'.
query("Toy Story", verbose=True)

[1.0000000000000002] 'ToyStory' index: 0
[0.3118583022867405] 'Boys' index: 554
[0.28242702697614114] 'WestSideStory' index: 1462
[0.2443935369039572] 'TheStoryofXinghua' index: 661


(1.0000000000000002, 'ToyStory')

In [101]:
# One misspelled character ('toi' instead of 'toy').
query("toi story", verbose=True)

[0.38712832948148573] 'ToyStory' index: 0
[0.2852736590361008] 'WestSideStory' index: 1462
[0.24685682267673317] 'TheStoryofXinghua' index: 661
[0.2396733191488809] 'Firestorm' index: 1319


(0.38712832948148573, 'ToyStory')

In [102]:
# Two misspelled characters ('toi stori'): fewer trigrams are shared with 'ToyStory'.
query("toi stori", verbose=True)

[0.2539964924901078] 'Notorious' index: 723
[0.23834292600419213] 'Firestorm' index: 1319
[0.23212660280069453] 'StefanoQuantestorie' index: 639
[0.22691129764942763] 'Histoiresextraordinaires' index: 607


(0.2539964924901078, 'Notorious')