In [None]:
import pickle

In [None]:
!pip install faiss-gpu



In [7]:
def movies_data_load():
    with open('movies.pickle', 'rb') as f:
        movies = pickle.load(f)
    return movies

movies_data = movies_data_load()
movies_vector = movies_data["vector"]
movies_names = movies_data["name"]
movies_data

{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

In [8]:
import faiss
faiss.MatrixStats(movies_vector).comments.split("\n")

['analyzing 1682 vectors of size 64',
 'no NaN or Infs in data',
 'all vectors are distinct',
 'range of L2 norms=[0.747558, 1.80436] (0 null vectors)',
 'matrix contains no 0s',
 'no constant dimensions',
 'no dimension has a too large mean',
 'stddevs per dimension are in [0.112036 0.158214]',
 '']

In [9]:
movie_index = faiss.IndexFlatL2(movies_vector.shape[1])
movie_index.add(movies_vector)

In [10]:
movies_search_vector = movies_vector[90:91]
distances, indices = movie_index.search(movies_search_vector, 10)

In [None]:
print(f"Similar movie to {movies_names[90]} are:\n")
print([movies_names[j] for j in indices[0]])

Similar movie to Nightmare Before Christmas, The (1993) are:

['Nightmare Before Christmas, The (1993)', 'Heavy Metal (1981)', 'Sirens (1994)', 'Beauty and the Beast (1991)', 'Akira (1988)', 'Fantasia (1940)', 'Benny & Joon (1993)', 'Barbarella (1968)', "Pete's Dragon (1977)", 'James and the Giant Peach (1996)']


### Exaustive Search

In [11]:
exaustive_quantizer = faiss.IndexFlatL2(movies_vector.shape[1])
exaustive_index = faiss.IndexIVFFlat(exaustive_quantizer, 
                           movies_vector.shape[1], 
                           100,             
                           8)               
exaustive_index.train(movies_vector)
exaustive_index.add(movies_vector)

In [12]:
exaustive_search_vector = movies_vector[90:91]
distances, indices = movie_index.search(movies_search_vector, 10)

In [13]:
print(f"Similar moview to {movies_names[90]} are:\n")
print([movies_names[i] for i in indices[0]])

Similar moview to Nightmare Before Christmas, The (1993) are:

['Nightmare Before Christmas, The (1993)', 'Heavy Metal (1981)', 'Sirens (1994)', 'Beauty and the Beast (1991)', 'Akira (1988)', 'Fantasia (1940)', 'Benny & Joon (1993)', 'Barbarella (1968)', "Pete's Dragon (1977)", 'James and the Giant Peach (1996)']


### Product Quantization

In [None]:
product_quantizer = faiss.IndexFlatL2(movies_vector.shape[1])
product_index = faiss.IndexIVFPQ(product_quantizer, 
                         movies_vector.shape[1], 
                         100,             
                         8,                
                         8)               
product_index.train(movies_vector)
product_index.add(movies_vector)

In [None]:
product_quantization_search_vector = movies_vector[90:91]
distances, indices = product_index.search(product_quantization_search_vector, 10)

In [None]:
print(f"Similar moview to {movies_names[90]} are:\n")
print([movies_names[k] for k in indices[0]])

Similar moview to Nightmare Before Christmas, The (1993) are:

['Nightmare Before Christmas, The (1993)', 'Heavy Metal (1981)', 'Pink Floyd - The Wall (1982)', 'Akira (1988)', 'Hamlet (1996)', 'Full Metal Jacket (1987)', 'Supercop (1992)', 'Scream of Stone (Schrei aus Stein) (1991)', 'Scream of Stone (Schrei aus Stein) (1991)', 'Scream of Stone (Schrei aus Stein) (1991)']


### Trees and Forests

In [None]:
!pip install apache_beam
!pip install 'scikit_learn~=0.23.0'  # For gaussian_random_matrix.
!pip install annoy

Collecting apache_beam
  Downloading apache_beam-2.34.0-cp37-cp37m-manylinux2010_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 7.4 MB/s 
[?25hCollecting fastavro<2,>=0.21.4
  Downloading fastavro-1.4.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 56.5 MB/s 
[?25hCollecting future<1.0.0,>=0.18.2
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 55.7 MB/s 
Collecting hdfs<3.0.0,>=2.1.0
  Downloading hdfs-2.6.0-py3-none-any.whl (33 kB)
Collecting dill<0.3.2,>=0.3.1.1
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[K     |████████████████████████████████| 151 kB 61.6 MB/s 
Collecting avro-python3!=1.9.2,<1.10.0,>=1.8.1
  Downloading avro-python3-1.9.2.1.tar.gz (37 kB)
Collecting requests<3.0.0,>=2.24.0
  Downloading requests-2.26.0-py2.py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 804 kB/s 
Collecting orjson<4.0
  Downloading

In [None]:
import annoy

In [None]:
def load_annoy_data():
    with open('movies.pickle', 'rb') as f:
        annoy_data = pickle.load(f)
    return annoy_data

annoy_data = load_annoy_data()
annoy_data

{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

In [None]:
class tree_index():
    def __init__(self, annoy_vectors, tree_labels):
        self.dimention = annoy_vectors.shape[1]
        self.annoy_vectors = annoy_vectors.astype('float32')
        self.tree_labels = tree_labels


    def build(self, total_trees=5):
        self.index = annoy.AnnoyIndex(self.dimention)
        for i, vec in enumerate(self.annoy_vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(total_trees)
        
    def query(self, vector, k=11):
        indices = self.index.get_nns_by_vector(vector.tolist(), k)
        return [self.tree_labels[l] for l in indices]

In [None]:
tree_vector_index = tree_index(annoy_data["vector"], annoy_data["name"])
tree_vector_index.build()

  if __name__ == '__main__':


In [None]:
movie_annoy_vector, name_of_movie = annoy_data['vector'][90], annoy_data['name'][90]
same_movies = '\n* '.join(tree_vector_index.query(movie_annoy_vector))
print(f"Similar moview to {name_of_movie} are:\n* {same_movies}")

Similar moview to Nightmare Before Christmas, The (1993) are:
* Nightmare Before Christmas, The (1993)
* Aladdin (1992)
* Blade Runner (1982)
* Aliens (1986)
* Pink Floyd - The Wall (1982)
* Brazil (1985)
* Wrong Trousers, The (1993)
* Alien (1979)
* Return of the Jedi (1983)
* E.T. the Extra-Terrestrial (1982)
* Grand Day Out, A (1992)


### HNSW

In [None]:
!pip install nmslib

Collecting nmslib
  Downloading nmslib-2.1.1-cp37-cp37m-manylinux2010_x86_64.whl (13.5 MB)
[K     |████████████████████████████████| 13.5 MB 8.2 MB/s 
[?25hCollecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 61.7 MB/s 
Installing collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [None]:
import nmslib

In [None]:
def load_graph_data():
    with open('movies.pickle', 'rb') as f:
        graph_data = pickle.load(f)
    return graph_data

graph_data = load_graph_data()
graph_data

{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

In [None]:
class hnsw_index():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels

    def build(self):
        self.index = nmslib.init(method='hnsw', space='cosinesimil')
        self.index.addDataPointBatch(self.vectors)
        self.index.createIndex({'post': 2})
        
    def query(self, vector, k=10):
        indices = self.index.knnQuery(vector, k=k)
        return [self.labels[z] for z in indices[0]]

In [None]:
graph_hnsw_index = hnsw_index(graph_data["vector"], graph_data["name"])
graph_hnsw_index.build()

In [None]:
graph_movie_vector, namee_of_movie = graph_data['vector'][90], graph_data['name'][90]
same_shows = '\n* '.join(graph_hnsw_index.query(graph_movie_vector))
print(f"Similar moview to {namee_of_movie} are:\n* {same_shows}")

Similar moview to Nightmare Before Christmas, The (1993) are:
* Nightmare Before Christmas, The (1993)
* Beauty and the Beast (1991)
* Fantasia (1940)
* Heavy Metal (1981)
* Aladdin (1992)
* Snow White and the Seven Dwarfs (1937)
* Batman (1989)
* James and the Giant Peach (1996)
* Blade Runner (1982)
* Aliens (1986)


### LSH

In [None]:
def load_lsh_data():
    with open('movies.pickle', 'rb') as f:
        lsh_data = pickle.load(f)
    return lsh_data

lsh_data = load_lsh_data()
lsh_data

{'name': array(['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', ...,
        'Sliding Doors (1998)', 'You So Crazy (1994)',
        'Scream of Stone (Schrei aus Stein) (1991)'], dtype=object),
 'vector': array([[-0.01780608, -0.14265831,  0.10308606, ...,  0.09659795,
         -0.17529577, -0.03061521],
        [-0.03357764,  0.16418771,  0.21801303, ...,  0.16502103,
         -0.09166156,  0.05047869],
        [-0.2761452 , -0.01991325, -0.04969981, ...,  0.0258275 ,
         -0.08328608, -0.0152858 ],
        ...,
        [ 0.05142734, -0.01683608, -0.20441587, ...,  0.00045828,
          0.14679626,  0.2462584 ],
        [ 0.04491899, -0.02819411, -0.09472758, ..., -0.02152078,
          0.16223577,  0.19897607],
        [ 0.02531924,  0.03099714,  0.06437534, ..., -0.07260127,
          0.0467432 ,  0.07893164]], dtype=float32)}

In [None]:
class lsh():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels


    def build(self, number_of_partition=8, search_in_x_partitions=2, subvector_size=8):
        lsh_quantizer = faiss.IndexFlatL2(self.dimention)
        self.lsh_index = faiss.IndexIVFPQ(lsh_quantizer, self.dimention, number_of_partition, search_in_x_partitions, subvector_size)
        self.lsh_index.train(self.vectors)
        self.lsh_index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.lsh_index.search(vectors, k) 
        return [self.labels[a] for a in indices[0]]

In [None]:
lsh_index = lsh(lsh_data["vector"], lsh_data["name"])
lsh_index.build()

In [None]:
lsh_vector, name_of_movie = lsh_data['vector'][90:91], lsh_data['name'][90]
same_movies = '\n* '.join(lsh_index.query(lsh_vector))
print(f"Similar moview to {name_of_movie} are:\n* {same_movies}")

Similar moview to Nightmare Before Christmas, The (1993) are:
* Nightmare Before Christmas, The (1993)
* Fantasia (1940)
* Brazil (1985)
* Monty Python's Life of Brian (1979)
* This Is Spinal Tap (1984)
* Hunt for Red October, The (1990)
* Sneakers (1992)
* Lion King, The (1994)
* Clockwork Orange, A (1971)
* Full Metal Jacket (1987)


**References:**

- https://towardsdatascience.com/comprehensive-guide-to-approximate-nearest-neighbors-algorithms-8b94f057d6b6

- https://github.com/eyaltrabelsi/my-notebooks/tree/87d4727e777bec04fee89553e4bc83fad7294c8a/Lectures/search_in_practice-approximate_nearest_neighbors