# Similarity search

## Word embeddings

In [44]:
import pandas as pd
import re

def tokenize(text: str) -> list[str]:
    return re.findall(r"[A-Za-z]+", str(text).lower())

def compute_vocabulary(file: str) -> dict[str, int]:
    vocabulary: dict[str, int] = {}  
    data = pd.read_csv(file)
    i = 0
    for _, row in data.iterrows():
        title, overview, genre, crew = tokenize(row['title']), tokenize(row['overview']), tokenize(row['genre']), tokenize(row['crew'])
        for token in title + overview:
            if token not in vocabulary:
                vocabulary[token] = i
                i += 1
            else: continue
    return vocabulary

In [45]:
print(tokenize('Pirates of the Caribbean'))

['pirates', 'of', 'the', 'caribbean']


In [46]:
vocab = compute_vocabulary('movies.csv')
print(f"{len(vocab)} unique words in movies.csv")

29632 unique words in movies.csv


### Embedding similarity search

In [47]:
import numpy as np
from numpy.linalg import norm

class EmbeddingIndex:
    def __init__(self, vocabulary: dict[str, int]) -> None:
        self.word_embeddings_: dict[str, np.array] = {}
        self.document_embeddings_ = None 
        self.vocabulary_: dict[str, int] = vocabulary
        self.movie_titles_: list[str] = []

    def build_from_file(self, file: str) -> None:
        data = pd.read_csv(file) 
        docs_as_vec = [] 
        for _, row in data.iterrows():
            title, overview, genre, crew = tokenize(row['title']), tokenize(row['overview']), tokenize(row['genre']), tokenize(row['crew'])
            document_embedding = np.zeros(len(self.vocabulary_))
            for word in title + overview:
                document_embedding[self.vocabulary_[word]] += 1
            docs_as_vec.append(document_embedding)
            self.movie_titles_.append(row['title']) 
        self.document_embeddings_ = np.array(docs_as_vec)
        
        for word in self.vocabulary_:
            word_embedding = np.zeros(len(self.vocabulary_))
            word_embedding[self.vocabulary_[word]] = 1
            self.word_embeddings_[word] = word_embedding

    def cosine_similarity(self, x: np.array, y: np.array) -> float:
        dot = np.dot(x, y)
        xy_norm = norm(x) * norm(y)
        return dot / xy_norm

    def process_query(self, quyer: str):
        query_words = tokenize(quyer)
        query_vector = np.zeros(len(self.vocabulary_))
        for word in query_words:
            query_vector[self.vocabulary_[word]] += 1

        similarities = list(self.cosine_similarity(self.document_embeddings_, query_vector))
        similarities_sorted = list(sorted([(doc_id, sim) for doc_id, sim in enumerate(similarities)], key=lambda x: x[1], reverse=True))
        return similarities_sorted

    def generate_output(self, query: str, print_threshold: int = 10) -> None:
        results = self.process_query(query)
        print(f"Query: {query}") 
        for i, (doc_id, cos_sim) in enumerate(results):
            cos_sim_to_percent = (cos_sim + 1) / 2
            print(f"Title: {self.movie_titles_[doc_id]}\t Match: {cos_sim_to_percent}")
            if i > print_threshold:
                break

In [48]:
SearchEngine = EmbeddingIndex(vocab)
SearchEngine.build_from_file('movies.csv')

### Query: iron man

In [49]:
SearchEngine.generate_output('iron man')

Query: iron man
Title: Iron Man & Captain America: Heroes United	 Match: 0.5029070013099445
Title: Iron Man & Hulk: Heroes United	 Match: 0.5021802509824583
Title: Iron Man 2	 Match: 0.5014535006549723
Title: Sex and Zen II	 Match: 0.5014535006549723
Title: The Curious Case of Benjamin Button	 Match: 0.5014535006549723
Title: Spider-Man: All Roads Lead to No Way Home	 Match: 0.5014535006549723
Title: Iron Man: Rise of Technovore	 Match: 0.5014535006549723
Title: TEKKEN: A Man Called X	 Match: 0.5014535006549723
Title: Detective Conan: The Private Eyes' Requiem	 Match: 0.5014535006549723
Title: Me Before You	 Match: 0.5010901254912292
Title: Spider-Man 3	 Match: 0.5010901254912292
Title: Ong Bak 2	 Match: 0.5010901254912292


### Query: star wars

In [50]:
SearchEngine.generate_output('star wars')

Query: star wars
Title: Star Wars: The Clone Wars	 Match: 0.5021802509824583
Title: Star Trek: The Captains' Summit	 Match: 0.5018168758187154
Title: Empire of Dreams: The Story of the Star Wars Trilogy	 Match: 0.5014535006549723
Title: LEGO Star Wars Holiday Special	 Match: 0.5014535006549723
Title: Rogue One: A Star Wars Story	 Match: 0.5010901254912292
Title: Doraemon: Nobita's New Great Adventure Into the Underworld - The Seven Magic Users	 Match: 0.5010901254912292
Title: A Star Is Born	 Match: 0.5010901254912292
Title: Phineas and Ferb: Star Wars	 Match: 0.5010901254912292
Title: Doraemon: Nobita's Little Star Wars 2021	 Match: 0.5007267503274861
Title: Star Wars	 Match: 0.5007267503274861
Title: Star Wars: The Force Awakens	 Match: 0.5007267503274861
Title: Star Wars: The Rise of Skywalker	 Match: 0.5007267503274861


### Query: anime

In [51]:
SearchEngine.generate_output('anime')

Query: anime
Title: Kimetsu Orchestra Concert	 Match: 0.5015416702543849
Title: The Animatrix	 Match: 0.5010277801695899
Title: Laid-Back Camp The Movie	 Match: 0.500513890084795
Title: ODDTAXI in the Woods	 Match: 0.500513890084795
Title: Dragon Ball: Yo! Son Goku and His Friends Return!!	 Match: 0.500513890084795
Title: Death Note Relight 1: Visions of a God	 Match: 0.500513890084795
Title: Halo Legends	 Match: 0.500513890084795
Title: Naruto to Boruto: The Live 2019	 Match: 0.500513890084795
Title: Evangelion: 1.0 You Are (Not) Alone	 Match: 0.500513890084795
Title: Steins;Gate: The Movie - Load Region of Déjà Vu	 Match: 0.500513890084795
Title: Phantom of the Kill: Zero's Rebellion	 Match: 0.500513890084795
Title: Death Note Relight 2: L's Successors	 Match: 0.500513890084795
