# Finding similar movies/TV shows using their descriptions

Created for learning purposes.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing
import nltk # Language processing tools
# Fetch the 'punkt' resource through NLTK's downloader
# (presumably the tokenizer data needed by nltk.word_tokenize, which is used later on).
nltk.download('punkt')

In [None]:
# Read the Netflix titles CSV into a DataFrame
netflix_data = pd.read_csv(filepath_or_buffer="../input/netflix-shows/netflix_titles.csv")

In [None]:
# Display the DataFrame to check how the data looks
netflix_data

In [None]:
# Use the dataset's own show_id column (the Netflix database convention) as the row index
netflix_data = netflix_data.set_index('show_id')

In [None]:
# Tokenize every description with NLTK; the result holds one list of tokens per description
descriptions_tokenized = list(map(nltk.word_tokenize, netflix_data['description']))

In [None]:
# Check the tokens of the first description
descriptions_tokenized[0]

# Explanation
# descriptions_tokenized[i][j]
# i - index of the description
# j - index of the word within the chosen description
# Example: descriptions_tokenized[14][5] - the sixth word of the fifteenth description (because we count from 0)

In [None]:
# Give each word a unique integer ID so it is easier to work with later.
# The easiest way is a gensim Dictionary, which holds every distinct word exactly once.
from gensim import corpora

dictionary = corpora.Dictionary(documents=descriptions_tokenized)
print(dictionary)

In [None]:
# The dictionary reported 21381 unique words; now count how many words there are in total.
# Every description has a different number of words, so add up the length of each token list.
sum(map(len, descriptions_tokenized))

In [None]:
# If the dictionary were huge, we could shrink it by dropping low-frequency words (say, those that appear only once).
# Every word now has an ID; look up the ID assigned to one of them.
dictionary.token2id['island']

In [None]:
# Convert every tokenized description into numbers (word identifiers).
# This representation is called a bag of words (bow).
descriptions_bow = list(map(dictionary.doc2bow, descriptions_tokenized))

In [None]:
# Each tuple is (word ID, number of times that word occurs in the document).
# Index 89 selects the document at position 89, i.e. the 90th description (positions start at 0).
descriptions_bow[89]

In [None]:
# Build a model that represents documents as vectors, so similarities can be found with math.
# Start with TF-IDF, which derives its weights from word frequencies.
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(corpus=descriptions_bow)

# Transform the string "shoot enemies" to see how it works.
# In each output pair the first value is the word ID and the second is its TF-IDF weight.
words = "shoot enemies".lower().split()
words_bow = dictionary.doc2bow(words)
print(tfidf[words_bow])

In [None]:
# Build a sparse-matrix similarity index over the TF-IDF vectors of all descriptions.
from gensim import similarities

# num_features is the width of the index matrix, so it must be the vocabulary size
# (number of distinct word IDs), which is len(dictionary).
# dictionary.num_pos is the total number of processed tokens, not the number of unique ones,
# and would make the matrix far wider than the ID space actually used.
index = similarities.SparseMatrixSimilarity(tfidf[descriptions_bow], num_features=len(dictionary))

In [None]:
# The model can now be used to find similar descriptions. Test it with a description from the dataset:
# the TV series La casa de papel. Look it up by title to get its show_id.
netflix_data.loc[netflix_data['title'].str.find("La casa") >= 0]

In [None]:
# Tokens of the description at position 3488
# (presumably the La casa de papel row found by the title lookup — verify for your dataset version).
descriptions_tokenized[3488]

In [None]:
# Bag-of-words form of the description at position 3488
# (presumably the La casa de papel row found by the title lookup — verify for your dataset version).
descriptions_bow[3488]

In [None]:
# Get the query's bag of words and its TF-IDF representation.
# NOTE(review): 3488 is a hard-coded list position, presumably the La casa de papel row — confirm against the title lookup.
query_bow = descriptions_bow[3488]
query_tfidf = tfidf[query_bow]

In [None]:
# Query the similarity index with the TF-IDF vector to get the list of similarity scores
sims = index[query_tfidf]

In [None]:
# Rank all descriptions by similarity (highest first) and list the 15 titles closest to La casa de papel
sorted_similar = list(enumerate(sims))
sorted_similar.sort(key=lambda pair: pair[1], reverse=True)
for document_number, score in sorted_similar[:15]:
    print(document_number, netflix_data['title'].iloc[document_number], score)

In [None]:
# Print the descriptions of the top four entries to judge whether they resemble La casa de papel.
# La casa de papel itself scores 1.0 because it is equal to the query, so the other three are the
# highest-scored similar titles.
for document_number, score in sorted_similar[:4]:
    row = netflix_data.iloc[document_number]
    print(row.title)
    print(row.description + "\n")

In [None]:
# Doc2Vec model (the dataset is probably too small for this one to work well)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Use the Doc2Vec class imported on the line above instead of models.Doc2Vec, so this cell
# does not depend on an earlier `from gensim import models`.
# epochs is 50 to match the number of passes given to train(); infer_vector() falls back to
# the model's epochs when none is passed, so inference now runs as many passes as training.
doc_model = Doc2Vec(vector_size=50, min_count=2, epochs=50)

In [None]:
# Wrap each tokenized description in a TaggedDocument tagged with its list position,
# then build the model vocabulary from them
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(descriptions_tokenized)
]
doc_model.build_vocab(documents)

In [None]:
# Train the model on the tagged descriptions for 50 passes over the corpus.
# NOTE(review): the epochs value passed here is separate from the epochs given to the Doc2Vec
# constructor — keep the two in sync.
doc_model.train(documents, total_examples=doc_model.corpus_count, epochs=50)

In [None]:
# Infer a vector for the description at position 3488 (La casa de papel) and ask the model
# for the document vectors most similar to it.
# NOTE(review): newer gensim releases name the `docvecs` attribute `dv` — confirm against the installed version.
la_papel_vector = doc_model.infer_vector(descriptions_tokenized[3488])
doc_sims = doc_model.docvecs.most_similar(positive=[la_papel_vector])

In [None]:
# Print position, title and similarity score for each Doc2Vec match
for document_number, score in doc_sims:
    print(document_number, netflix_data['title'].iloc[document_number], score)

In [None]:
# Print title and description of the first four Doc2Vec matches
for document_number, score in doc_sims[:4]:
    row = netflix_data.iloc[document_number]
    print(row.title)
    print(row.description + "\n")