# Overview 

Let's implement TF-IDF title search using scikit-learn and PyTorch.

In [174]:
import torch
import timeit

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch.nn.functional as F

In [4]:
import pandas as pd

# Movie DataSet

Let's download the list of movies from Kaggle:
https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/versions/7?resource=download

In [125]:
# Load the movie titles. With header=None the first line of the CSV is read as
# data rather than as column names; the column is labelled 'title' explicitly.
df = pd.read_csv('movies.csv', header=None, names=['title'])
# Preview the first rows to confirm the load.
df.head()

Unnamed: 0,title
0,Toy Story
1,Jumanji
2,Grumpier Old Men
3,Waiting to Exhale
4,Father of the Bride Part II


In [81]:
# Scratch check of str.isascii(), the predicate used later to keep ASCII-only titles.
'Jumanji'.isascii()

True

In [53]:
# Peek at the raw title column as a NumPy array.
df['title'].values

array(['Toy Story', 'Jumanji', 'Grumpier Old Men', ..., 'Betrayal',
       'Satana likuyushchiy', 'Queerama'], dtype=object)

In [128]:
# Scratch check of str.isnumeric() on a sample title.
'Toy Story'.isnumeric()

False

In [158]:
# Example documents
def entity_filter(x):
    """Return True when ``x`` contains none of the excluded characters.

    The excluded set is the ten decimal digits plus a fixed list of
    punctuation marks and symbols. An empty ``x`` is accepted.
    """
    # Same characters, in the same order, as a single string.
    excluded = "!&',-.0123456789:$()*+/?_#@[]"
    return all(symbol not in x for symbol in excluded)

# Keep only plain-ASCII string titles that contain none of the characters
# rejected by entity_filter. The isinstance check skips non-string entries
# (pandas represents missing values as float NaN), which would otherwise raise
# AttributeError on `.isascii()`.
documents = [
    title
    for title in df['title'].values
    if isinstance(title, str) and title.isascii() and entity_filter(title)
]
len(documents)

30508

In [159]:
# Show the first ten titles that survived filtering.
documents[:10]

['Toy Story',
 'Jumanji',
 'Grumpier Old Men',
 'Waiting to Exhale',
 'Father of the Bride Part II',
 'Heat',
 'Sabrina',
 'Tom and Huck',
 'Sudden Death',
 'GoldenEye']

# Build TF-IDF Vectors

In [196]:
# Initialize the TF-IDF vectorizer.
# The 'char' analyzer builds features from individual characters rather than words;
# the titles in `documents` were filtered beforehand to ASCII text without digits
# or the rejected punctuation.
vectorizer = TfidfVectorizer(analyzer='char')

# Fit the vectorizer on the documents and transform the documents into TF-IDF vectors
tfidf_vectors = vectorizer.fit_transform(documents)

# Convert the sparse TF-IDF vectors to a dense PyTorch tensor
tfidf_tensors = torch.tensor(tfidf_vectors.toarray())

In [161]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # The newer API returns a NumPy array; convert it to a plain list of str so
    # the printout keeps the list format of the older API.
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))
print(vocab)

27
[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [162]:
# (number of documents, vocabulary size)
tfidf_tensors.shape

torch.Size([30508, 27])

In [163]:
# Inspect the dense TF-IDF matrix (PyTorch prints an abbreviated view).
tfidf_tensors

tensor([[0.1744, 0.0000, 0.0000,  ..., 0.0000, 0.7269, 0.0000],
        [0.0000, 0.1510, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2991, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.5007, 0.4644,  ..., 0.0000, 0.4791, 0.0000],
        [0.1159, 0.3785, 0.0000,  ..., 0.0000, 0.4830, 0.0000],
        [0.0000, 0.3797, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       dtype=torch.float64)

# Query TF-IDF using cosine similarity

In [192]:
def query(query, verbose=False, top_k=4):
    """Find the document whose TF-IDF vector is most similar to ``query``.

    Relies on the module-level ``vectorizer``, ``tfidf_tensors`` and
    ``documents`` built in earlier cells.

    Args:
        query: Text to look up. Note that inside this function the name
            ``query`` refers to this argument, not to the function itself.
        verbose: When True, print the ``top_k`` best matches together with
            their similarity score and their index into ``documents``.
        top_k: Number of matches printed in verbose mode. Defaults to 4, the
            value that used to be hard-coded.

    Returns:
        A ``(similarity, document)`` tuple for the single best match.
    """
    # Transform the query into a TF-IDF vector using the fitted vocabulary
    query_vector = vectorizer.transform([query])

    # Convert the sparse query vector to a dense PyTorch tensor
    query_tensor = torch.tensor(query_vector.toarray())

    # Cosine similarity of the query against every document; the result has a
    # single row because there is a single query.
    similarities = cosine_similarity(query_tensor, tfidf_tensors)
    scores = similarities[0]

    # Document indices ordered from most to least similar
    top_indices = scores.argsort()[::-1]

    # Print the most similar documents
    if verbose:
        for index in top_indices[:top_k]:
            print(f"[{scores[index]}] '{documents[index]}' index: {index}")

    best = top_indices[0]
    return (scores[best], documents[best])

In [193]:
%%timeit
# Time a single non-verbose lookup.
query("toy story")

8.36 ms ± 273 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Latency

With roughly 30K movie titles (30,508 records) on my local 13" MacBook Pro (2 GHz CPU), a query takes about 8.4 ms on average (see the `%%timeit` result above).

In [194]:
# One misspelled word ("toi" for "toy"); verbose=True prints the top matches.
query("toi story", verbose=True)

[0.9145063650632923] 'Toy Story' index: 0
[0.8929888558046775] 'Toy Story That Time Forgot' index: 18194
[0.8785127869677261] 'Sorority Boys' index: 3985
[0.8734695238497956] 'Monstrosity' index: 13409


(0.9145063650632923, 'Toy Story')

In [195]:
# Both words misspelled; in the recorded output the best match is no longer 'Toy Story'.
query("toi stori", verbose=True)

[0.9313211800828696] 'Trois' index: 2489
[0.909499148967656] 'Otis' index: 11501
[0.9061484299383746] 'Trio' index: 12618
[0.9061484299383746] 'Riot' index: 20711


(0.9313211800828696, 'Trois')