In [8]:
import pandas as pd
import numpy as np
import time

import re


In [9]:
!python -m spacy download en_core_web_md
!pip install spacy_universal_sentence_encoder
!pip install faiss-cpu
!pip install datasketch
!pip install nmslib

2022-11-25 10:11:16.181971: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.4.1/en_core_web_md-3.4.1-py3-none-any.whl (42.8 MB)
[K     |████████████████████████████████| 42.8 MB 1.2 MB/s 
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy_universal_sentence_encoder
  Downloading spacy_universal_sentence_encoder-0.4.5.tar.gz (13 kB)
Building wheels for collected packages: spacy-universal-sentence-encoder
  Building whe

In [10]:
import spacy
from datasketch import MinHash, MinHashLSHForest
import faiss
import spacy_universal_sentence_encoder
import nmslib


In [4]:
movie_data = pd.read_csv("/content/netflix_movies.csv")
movie_data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [5]:
movie_data.shape

(8807, 12)

In [6]:
data = movie_data.drop(['show_id', 'type', 'director','cast','country','date_added','release_year','rating','duration'], axis=1)
data.head()

Unnamed: 0,title,listed_in,description
0,Dick Johnson Is Dead,Documentaries,"As her father nears the end of his life, filmm..."
1,Blood & Water,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,Ganglands,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,Jailbirds New Orleans,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,Kota Factory,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Locality Sensitive Hashing(LSH)

In [11]:
#load english language module of spacy
nlp = spacy.load('en_core_web_md')

# get stop words list
stopwords = nlp.Defaults.stop_words

## Preprocess your data

In [12]:
def preprocess(text):
    text = re.sub(r'[^\w\s]','',text)
    #convert to lower case
    tokens = text.lower()
    #split into tokens
    tokens = tokens.split()
    cleaned=[]
    for x in tokens:
      if not x in (stopwords):      
          cleaned.append('{a}'.format(a=x))
    #return the canonical form
    return ' '.join(cleaned)

## Create Minhash Forest for Queries

In [13]:
def get_forest(final_dataset, perms):
    minhash = []
    for text in data['listed_in']:
        tokens = preprocess(text)
        m = MinHash(num_perm=perms)
        for s in tokens:
            m.update(s.encode('utf8'))
        minhash.append(m)
    forest = MinHashLSHForest(num_perm=perms)
    for i,m in enumerate(minhash):
        forest.add(i,m)
    forest.index()    
    return forest

## Evaluate Queries

In [14]:
def predict(text, database, perms, num_results, forest):
    tokens = preprocess(text)
    m = MinHash(num_perm=perms)
    for s in tokens:
        m.update(s.encode('utf8'))    
    idx_array = np.array(forest.query(m, num_results))
    if len(idx_array) == 0:
        return None 
    result = database.iloc[idx_array]['title']    
    return result

## Choose your parameters

In [15]:
permutations = 128
num_recommendations = 5

## Recommendation check for given query

In [16]:
forest = get_forest(data, permutations)

In [37]:
title = 'Ganglands'

print("The Most Similar Movie To: '{title}' are listed below:".format(title=title))


result = predict(title, data, permutations, num_recommendations, forest)
print("\n")

for x,y in enumerate(result):
    print("{x}.) {y}".format(x=x+1,y=y))

The Most Similar Movie To: 'Ganglands' are listed below:


1.) Go! Live Your Way
2.) Pocoyo
3.) Heidi, bienvenida a casa
4.) El Chavo
5.) Legend Quest: Masters of Myth


# Google Universal Sentence Encoder is used to create vectors

In [20]:
nlp = spacy_universal_sentence_encoder.load_model('en_use_md')

Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB



In [21]:
list_title = data.iloc[:,2].values
vector_title=[]
for x in list_title:
  vector_title.append(nlp(x).vector)

In [22]:
list_title = data.iloc[:,2].values
vector_for_netflix_title=[]
for x in list_title:
  vector_for_netflix_title.append(nlp(x).vector)
#creating dictionary with title and list_title 
title_dict=dict()
title_dict['title'] = data.iloc[:,0].values
title_dict['list_title']=np.array(vector_for_netflix_title)

In [23]:
title_dict['title']

array(['Dick Johnson Is Dead', 'Blood & Water', 'Ganglands', ...,
       'Zombieland', 'Zoom', 'Zubaan'], dtype=object)

In [25]:
title_dict['list_title'].shape

(8807, 512)

# Exhaustive Search

In [26]:
class Exhaustive():
    def __init__(self, vectors, labels):
      #get the shape of the vector
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
   
    def build(self):
        self.index = faiss.IndexFlatL2(self.dimension,) #scaling down the input vector 
        self.index.add(self.vectors)
        
    def query(self, vectors, k=11):
        distances, indices = self.index.search(vectors, k)
        # I expect only query on one vector thus the slice
        return [self.labels[i] for i in indices[0]]

In [27]:
index = Exhaustive(title_dict["list_title"], title_dict['title'])
index.build()

In [36]:
movie_listed = title_dict['list_title'][0:1]
print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=title_dict['title'][0]))
print("\n")
for x,y in enumerate(index.query(movie_listed)):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Dick Johnson Is Dead' are listed below:


1.) Woodshock
2.) God Bless the Broken Road
3.) North & South
4.) Russian Doll
5.) The Good Place
6.) Keeping the Bees
7.) Riding Faith
8.) Ultras
9.) My Life as a Zucchini


# Product Quantization

In [31]:
class IVPQIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
    
    def build(self, 
              number_of_partition=8, 
              search_in_x_partitions=2, 
              subvector_size=8):
        quantizer = faiss.IndexFlatL2(self.dimension)
        self.index = faiss.IndexIVFPQ(quantizer, 
                                      self.dimension, 
                                      number_of_partition, 
                                      search_in_x_partitions, 
                                      subvector_size)
        self.index.train(self.vectors)
        self.index.add(self.vectors)
        
    def query(self, vectors, k=10):
        distances, indices = self.index.search(vectors, k) 
        
        return [self.labels[i] for i in indices[0]]

In [33]:
index = IVPQIndex(title_dict["list_title"], title_dict['title'])
index.build()

In [38]:
movie_listed = title_dict['list_title'][0:1]
print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=title_dict['title'][0]))
print("\n")

for x,y in enumerate(index.query(movie_listed)):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Dick Johnson Is Dead' are listed below:


1.) Woodshock
2.) God Bless the Broken Road
3.) North & South
4.) Russian Doll
5.) The Good Place
6.) Keeping the Bees
7.) Riding Faith
8.) Ultras
9.) My Life as a Zucchini


# HNSW

In [39]:
class NMSLIBIndex():
    def __init__(self, vectors, labels):
        self.dimention = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels
    def build(self):
        self.index = nmslib.init(method='hnsw', space='cosinesimil')
        self.index.addDataPointBatch(self.vectors)
        self.index.createIndex({'post': 2})
        
    def query(self, vector, k=11):
        indices = self.index.knnQuery(vector, k=k)
        return [self.labels[i] for i in indices[0]]

In [40]:
index = NMSLIBIndex(title_dict["list_title"], title_dict['title'])
index.build()

In [43]:
movie_listed=index.query(title_dict['list_title'][0])

print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=movie_listed[0]))
print("\n")

for x,y in enumerate(movie_listed):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Dick Johnson Is Dead' are listed below:


1.) The Other Side of the Wind
2.) Maska
3.) Khelti Hai Zindagi Aankh Micholi
4.) Lionheart
5.) Up Among  The Stars
6.) Riding Faith
7.) Pihu
8.) Angela's Christmas Wish
9.) Khoobsurat
10.) A Boy Name Flora A


# Annoy

In [47]:
!pip install annoy
import annoy
from annoy import AnnoyIndex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [50]:
class AnnoyIndex():
    def __init__(self, vectors, labels):
        self.dimension = vectors.shape[1]
        self.vectors = vectors.astype('float32')
        self.labels = labels    
   
    def build(self, number_of_trees=5):
        self.index = annoy.AnnoyIndex(self.dimension)
        for i, vec in enumerate(self.vectors):
            self.index.add_item(i, vec.tolist())
        self.index.build(number_of_trees)
        
    def query(self, vector, k=11):
        indices = self.index.get_nns_by_vector(
              vector.tolist(), 
              k)                                           
        return [self.labels[i] for i in indices]

In [51]:
index = AnnoyIndex(title_dict["list_title"], title_dict['title'])
index.build()

  


In [54]:
movie_listed=index.query(title_dict['list_title'][10])

print("The Most Similar movies To: '{movie_title}' are listed below:".format(movie_title=movie_listed[0]))
print("\n")

for x,y in enumerate(movie_listed):
  if x!=0:
    print("{x}.) {y}".format(x=x,y=y))

The Most Similar movies To: 'Vendetta: Truth, Lies and The Mafia' are listed below:


1.) Bullet Head
2.) Don
3.) Legal Hash
4.) Special 26
5.) Killing Them Softly
6.) Sabotage
7.) Chupan Chupai
8.) Deuces
9.) Thug Life
10.) The Heroes of Evil
