In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file that Kaggle mounted under the read-only input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import math
# Show full cell contents (no truncation of long descriptions). `None` is the
# supported "unlimited" value; the old `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install faiss-gpu ##Installing GPU version of faiss



In [None]:
! pip install sentence_transformers ## For textual similarity, using pretrained models

In [None]:
import faiss
from sentence_transformers import SentenceTransformer, util

In [None]:
# Load the pretrained 'paraphrase-distilroberta-base-v1' sentence-transformer.
# device="cuda" places it on the GPU, so this cell requires a GPU-enabled session.
model = SentenceTransformer('paraphrase-distilroberta-base-v1',device="cuda") ## Loads the DistilRoBERTa model on the GPU

### Load the movies data

In [None]:
# Read the IMDb movies table from the Kaggle input directory.
imdb_movies=pd.read_csv("/kaggle/input/imdb-extensive-dataset/IMDb movies.csv")
# Last expression of the cell: displays (number of rows, number of columns).
imdb_movies.shape

In [None]:
# Preview the first rows of the freshly loaded table.
imdb_movies.head()

In [None]:
## Keep only the movies that have a description, then renumber the rows from 0
## so the row index is a gap-free sequence.
has_description=pd.notnull(imdb_movies['description'])
imdb_movies=imdb_movies.loc[has_description].reset_index(drop=True)

The data contains a `description` column, which we will use to search for similar movies by plot summary. Sentence Transformers will be used to encode the descriptions into vectors, which are then indexed with FAISS. To add vectors to FAISS under our own identifiers, each sentence needs a unique numeric ID. For this we will use the row index of each movie description.

In [None]:
# Store the row index as an explicit `id` column; it is used later as the
# identifier of each description's vector in the FAISS index.
imdb_movies['id']=imdb_movies.index


In [None]:
# Preview the table again to check the `id` column.
imdb_movies.head()

## Extract the Embeddings for movie description

In [None]:
# Pull the descriptions out as a plain Python list, in row order, for the encoder.
sentences=imdb_movies['description'].tolist()
print("Number of Sentences in Movie Description ",len(sentences))

In [None]:
# Encode every movie description into an embedding vector.
embeddings=model.encode(sentences)
# L2-normalise the embeddings. The return value is not captured, so this relies
# on faiss.normalize_L2 modifying `embeddings` in place.
faiss.normalize_L2(embeddings) ## Normalising the Embeddings

In [None]:
# Sanity check on the embedding matrix; presumably (number of descriptions,
# embedding dimension) — TODO confirm against the printed value.
print("Shape of the EMbeddings is ",embeddings.shape)

**There are many types of FAISS Indices - you can use a Simple FLAT Index, or you can use the concept of Inverted Index and Product Quantisation to index the data.**

**IVF with Product Quantisation lets us compress the vectors efficiently and runs an approximate nearest-neighbour search over the compressed codes.**

With IVF with Product Quantisation, the vectors are first partitioned into clusters using k-means. This step is known as training the index.

In [None]:
## Build and train an IVF + Product-Quantisation index over the description embeddings.
## The dimensionality is read from the embeddings themselves (768 for the RoBERTa-based
## model used here), so this cell keeps working if a different encoder is used.

dim=embeddings.shape[1]
ncentroids=50 ## Hyperparameter: number of k-means clusters the vectors are split into
m=16 ## Hyperparameter: number of sub-vectors used by Product Quantisation
nbits=8 ## Number of bits used to encode each sub-vector
## Product Quantisation splits each vector into m equal parts, so dim must be a multiple of m.
assert dim % m == 0, "embedding dimension must be divisible by m"
quantiser = faiss.IndexFlatL2(dim)
index = faiss.IndexIVFPQ (quantiser, dim,ncentroids, m , nbits)
index.train(embeddings) ## This step does the clustering and creates the clusters
print(index.is_trained)
## Save the trained (still empty) index so training does not have to be repeated.
faiss.write_index(index, "trained.index")


In [None]:
### Add the embeddings to the trained index, keyed by each movie's `id`.
## FAISS expects 64-bit integer ids, so the dtype is requested explicitly
## instead of relying on the platform's default integer width.
ids=np.array(imdb_movies['id'].tolist(),dtype=np.int64)
index.add_with_ids(embeddings,ids)
## Number of vectors now stored in the index.
print(index.ntotal)



In [None]:
# Save the index, now holding the added vectors, to "block.index".
faiss.write_index(index,"block.index")


### Let us use the FAISS index to search for similar movie plots

In [None]:
def searchFAISSIndex(data,id_col_name,query,index,nprobe,model,topk=20):
    """Find the rows of `data` whose indexed vectors are closest to `query`.

    Parameters
    ----------
    data : DataFrame holding the searchable rows; must contain `id_col_name`.
    id_col_name : name of the column in `data` that holds the ids stored in `index`.
    query : free-text query string.
    index : FAISS index to search. Its `nprobe` attribute is overwritten with
        `nprobe`, and that change remains on the index after the call.
    nprobe : value assigned to `index.nprobe` before searching.
    model : encoder exposing `encode(list_of_str)`.
    topk : number of neighbours requested from the index.

    Returns
    -------
    DataFrame with the matching rows of `data` joined (inner join on
    `id_col_name`) with two extra columns, `cosine_sim` and `L2_score`,
    sorted by `cosine_sim` in descending order.
    """
    # Encode the single query and shape it as a one-row batch for FAISS.
    query_vector=model.encode([query])[0].reshape(1,-1)
    faiss.normalize_L2(query_vector)

    index.nprobe=nprobe
    distances,neighbours=index.search(query_vector,topk)

    # One query was sent, so only the first row of each result matrix is relevant.
    hit_ids=neighbours[0]
    hit_scores=distances[0]

    hits=pd.DataFrame({
        id_col_name: hit_ids,
        'cosine_sim': [calculateInnerProduct(score) for score in hit_scores],
        'L2_score': hit_scores,
    })

    # Narrow `data` to the returned ids before joining the scores onto it.
    matched=data[data[id_col_name].isin(hit_ids)]
    ranked=pd.merge(matched,hits,on=id_col_name)
    return ranked.sort_values('cosine_sim',ascending=False)

In [None]:

def calculateInnerProduct(L2_score):
    """Convert a FAISS L2 score between two unit vectors into their cosine similarity.

    FAISS L2 indexes report the *squared* Euclidean distance. For unit-length
    vectors a and b, ||a - b||^2 = 2 - 2 * (a . b), hence
    a . b = (2 - L2_score) / 2. The score must therefore not be squared again.

    Parameters
    ----------
    L2_score : squared L2 distance as returned by `index.search`.

    Returns
    -------
    float : the inner product (cosine similarity) of the two unit vectors.
    """
    # float() keeps the result a plain Python float even for numpy scalar input.
    return (2-float(L2_score))/2

In [None]:
query="A seventeen-year-old aristocrat falls in love with a kind but poor artist"
# Run the search (nprobe=10, top 20 hits) and keep only the columns worth displaying.
display_columns=['id','description','title','cosine_sim','L2_score']
search_result=searchFAISSIndex(
    imdb_movies,
    "id",
    query,
    index,
    nprobe=10,
    model=model,
    topk=20,
)[display_columns]

In [None]:
# Display the matches found for the query in the previous cell.
search_result

In [None]:
query="Former Football player  to train an  football team"
# Run the search (nprobe=10, top 20 hits) and keep only the columns worth displaying.
display_columns=['id','description','title','cosine_sim','L2_score']
search_result=searchFAISSIndex(
    imdb_movies,
    "id",
    query,
    index,
    nprobe=10,
    model=model,
    topk=20,
)[display_columns]
search_result