In [7]:
import pickle
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import annoy
import pandas as pd
model = SentenceTransformer('all-MiniLM-L6-v2')

## Loading Dataset

In [8]:
data = pd.read_csv('movies_metadata.csv')

  data = pd.read_csv('movies_metadata.csv')


In [9]:
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


## Embeddings Creation

In [13]:
df = data[['id','title']].copy()

In [22]:
df['title'].tolist()[:5]

['Toy Story',
 'Jumanji',
 'Grumpier Old Men',
 'Waiting to Exhale',
 'Father of the Bride Part II']

In [24]:
embeddings=model.encode(df['title'].tolist())
embeddings.shape[1]

## FAISS (Facebook AI Similarity Search) | Title based similarity search
* https://www.youtube.com/watch?v=sKyvsdEv6rk


In [30]:
index = faiss.IndexFlatL2(embeddings.shape[1]) #L2 -> Eucledian Distance
print(index.is_trained)
index.add(embeddings)


(45466, 384)

## Inference

In [34]:
k = 5 # Number of recommendations
xq = model.encode(["Toy Story"])

In [35]:
%%time
D, I = index.search(xq, k)  # search
print(I)  # k-nearest neigbors of the query vector | nprobe == 1: 6495 26392 61709 49932 | nprobe == 10: 362

[[    0 15348  2997 25800]]
CPU times: user 3.79 ms, sys: 3.44 ms, total: 7.23 ms
Wall time: 11.1 ms


In [42]:
for i in I[0]:
    print('Title',df['title'].loc[i])

Title Toy Story
Title Toy Story 3
Title Toy Story 2
Title Toy Story That Time Forgot


## Approximate Nearrest Neighbour using Annoy 

In [43]:
index = annoy.AnnoyIndex(embeddings.shape[1])

  index = annoy.AnnoyIndex(embeddings.shape[1])


In [48]:
for i,vec in enumerate(embeddings):
    index.add_item(i,vec.tolist())

In [50]:
index.build(5)

True

In [51]:
k = 5 # Number of recommendations
xq = model.encode(["Toy Story"])

In [62]:
flat_list = []
for sublist in np.array(xq.tolist()).astype('float'):
    for item in sublist:
        flat_list.append(item)

## Inference

In [72]:
indices = index.get_nns_by_vector(flat_list, 6, search_k=5)    
indices

[0, 15348, 2997, 2142, 44167, 22718]

In [73]:
for i in indices:
    print('Title:',df['title'].loc[i])

Title: Toy Story
Title: Toy Story 3
Title: Toy Story 2
Title: Toys
Title: Kid's Story
Title: The Lego Movie
