# Vector space model

In [1]:
import numpy as np
import pandas as pd
import re

cities = pd.read_csv('movies.csv')
cities.head()

Unnamed: 0,title,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


## Vector space inverted index search

In [4]:
from scipy.sparse import coo_matrix

class InvertedIndex:
    def __init__(self) -> None:
        self.inverted_lists_: dict[str, list[tuple[int, float]]] = {}
        self.td_matrix_ = None
        self.td_sparse_matrix_ = None 
        self.vocab: dict[str, int] = {}

        self.doc_ids_: list[int] = []
        self.movie_titles_: list[str] = []

    def tokenize(self, text: str) -> list[str]:
        return re.findall(r"[A-Za-z]+", str(text).lower())
    
    def build_from_file(self, file: str) -> None:
        df = pd.read_csv(file)
        doc_id: int = 1
        for _, row in df.iterrows():
            title, genre ,overview, crew = self.tokenize(row['title']), self.tokenize(row['genre']), self.tokenize(row['overview']), self.tokenize(row['crew'])
            for word in title + overview + crew + genre:
                if word not in self.inverted_lists_:
                    self.inverted_lists_[word] = [(doc_id, 1)]
                    self.vocab[word] = len(self.inverted_lists_) - 1
                else:
                    if self.inverted_lists_[word][-1][0] != doc_id:
                        self.inverted_lists_[word].append((doc_id, 1))
                    else:
                        new_score = self.inverted_lists_[word][-1][1] + 1
                        self.inverted_lists_[word].append((doc_id, new_score))
            self.doc_ids_.append(doc_id)
            self.movie_titles_.append(row['title'])
            doc_id += 1

        # Time for linear algebra!
        # Build td matrix.
        self.td_matrix_ = np.zeros((len(self.inverted_lists_), len(self.doc_ids_))) # num_unique_words x num_documents
        for row, word in enumerate(self.inverted_lists_):
            for doc_id, tf in self.inverted_lists_[word]:
                self.td_matrix_[row][doc_id-1] = tf

        self.td_sparse_matrix_ = coo_matrix(self.td_matrix_)

    def process_query(self, query: str) -> list[tuple[int, int]]:
        query_words = self.tokenize(query)
        query_vector = np.zeros(len(self.inverted_lists_))
        
        for word in query_words:
            if word in self.inverted_lists_:
                query_vector[self.vocab[word]] += 1
        # Sparse matrix for fast computation. 
        query_scores = query_vector.T @ self.td_sparse_matrix_
        # Slow variant. 
        # query_scores = query_vector.T @ self.td_matrix_
        results = list(sorted([(i+1, score) for i, score in enumerate(query_scores)], key=lambda x: x[1], reverse=True))
        return results

    def generate_output(self, query: str, print_threshold: int = 10) -> None:
        results = self.process_query(query)
        print(f"Query: {query}") 
        for i, (doc_id, score) in enumerate(results):
            print(f"{self.movie_titles_[doc_id-1]}")
            if i > print_threshold:
                break

### Sparse matrix for fast computation

In [5]:
SearchEngine = InvertedIndex()
SearchEngine.build_from_file('movies.csv')
print(SearchEngine.td_matrix_)
print(SearchEngine.td_sparse_matrix_)

[[5. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [2. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
  (0, 0)	5.0
  (0, 114)	3.0
  (0, 115)	3.0
  (0, 774)	1.0
  (0, 1030)	1.0
  (0, 1380)	2.0
  (0, 1555)	3.0
  (0, 2083)	1.0
  (0, 2996)	5.0
  (0, 2997)	5.0
  (0, 3073)	1.0
  (0, 3096)	1.0
  (0, 4103)	6.0
  (0, 4104)	6.0
  (0, 5709)	2.0
  (0, 6723)	3.0
  (0, 7478)	1.0
  (0, 8097)	1.0
  (0, 8322)	2.0
  (0, 8428)	2.0
  (1, 0)	1.0
  (1, 26)	1.0
  (1, 165)	2.0
  (1, 277)	1.0
  (1, 430)	1.0
  :	:
  (70422, 10168)	1.0
  (70423, 10169)	1.0
  (70424, 10169)	1.0
  (70425, 10169)	1.0
  (70426, 10170)	2.0
  (70427, 10170)	1.0
  (70428, 10170)	1.0
  (70429, 10171)	1.0
  (70430, 10172)	1.0
  (70431, 10172)	1.0
  (70432, 10172)	1.0
  (70433, 10172)	1.0
  (70434, 10172)	1.0
  (70435, 10172)	1.0
  (70436, 10172)	2.0
  (70437, 10172)	1.0
  (70438, 10174)	1.0
  (70439, 10174)	1.0
  (70440, 10174)	1.0
  (70441, 10175)	1.0
  (70442, 10175)	1.0
  (70443, 1

### Query: Iron Man

In [6]:
SearchEngine.generate_output('iron man')

Query: iron man
Iron Man & Captain America: Heroes United
Iron Man & Hulk: Heroes United
Iron Man 2
Life of Brian
Iron Man: Rise of Technovore
Spider-Man: Across the Spider-Verse
Spider-Man: Homecoming
Iron Man 3
Iron Man
Raped by an Angel
3-Iron
LEGO Marvel Super Heroes: Avengers Reassembled!


### Query: Action

In [7]:
SearchEngine.generate_output('action')

Query: action
Last Action Hero
Bruce Lee: A Warrior's Journey
Missing in Action 2: The Beginning
Shazam! Fury of the Gods
Die Hart
Memory
Avengers: Age of Ultron
Incredibles 2
Rambo
The Action Pack Saves Christmas
Rush Hour
I Am Number Four


### Query: Love

In [8]:
SearchEngine.generate_output('love')

Query: love
Love Me, Love Me Not
Please Enlighten Me
The Moment You Fall in Love
Urusei Yatsura: Always My Darling
Billboard Dad
Sweet Sex and Love
Till We Meet Again
Love Affair
Endless Love
AI Love You
I've Always Liked You
A Frozen Flower


### Query: Anime

In [9]:
SearchEngine.generate_output('Anime')

Query: Anime
Kimetsu Orchestra Concert
The Animatrix
Laid-Back Camp The Movie
ODDTAXI in the Woods
Dragon Ball: Yo! Son Goku and His Friends Return!!
Death Note Relight 1: Visions of a God
Halo Legends
Naruto to Boruto: The Live 2019
Evangelion: 1.0 You Are (Not) Alone
Steins;Gate: The Movie - Load Region of Déjà Vu
Phantom of the Kill: Zero's Rebellion
Death Note Relight 2: L's Successors
