In [29]:
# Importing modules
import pandas as pd
import numpy as np
import os
import re
# LDA Model
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from pprint import pprint
from gensim.models import CoherenceModel
import spacy
nlp = spacy.load("en_core_web_lg")
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Import the wordcloud library
from wordcloud import WordCloud
# Visualize the topics
import pyLDAvis.gensim
import pickle
import pyLDAvis

[nltk_data] Downloading package stopwords to /home/dima/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Read the data
df = pd.read_csv("../datasets/netflix_movies_and_shows_1/netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [31]:
# Remove non-english words
df['description'] = df['description'].map(lambda x: re.sub("([^\x00-\x7F])+","", x))

In [40]:
# Convert in list of words, without punctuation/special characters
def convert_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))
data_words = list(convert_to_words(df['description']))
data_words

[['as',
  'her',
  'father',
  'nears',
  'the',
  'end',
  'of',
  'his',
  'life',
  'filmmaker',
  'kirsten',
  'johnson',
  'stages',
  'his',
  'death',
  'in',
  'inventive',
  'and',
  'comical',
  'ways',
  'to',
  'help',
  'them',
  'both',
  'face',
  'the',
  'inevitable'],
 ['after',
  'crossing',
  'paths',
  'at',
  'party',
  'cape',
  'town',
  'teen',
  'sets',
  'out',
  'to',
  'prove',
  'whether',
  'private',
  'school',
  'swimming',
  'star',
  'is',
  'her',
  'sister',
  'who',
  'was',
  'abducted',
  'at',
  'birth'],
 ['to',
  'protect',
  'his',
  'family',
  'from',
  'powerful',
  'drug',
  'lord',
  'skilled',
  'thief',
  'mehdi',
  'and',
  'his',
  'expert',
  'team',
  'of',
  'robbers',
  'are',
  'pulled',
  'into',
  'violent',
  'and',
  'deadly',
  'turf',
  'war'],
 ['feuds',
  'flirtations',
  'and',
  'toilet',
  'talk',
  'go',
  'down',
  'among',
  'the',
  'incarcerated',
  'women',
  'at',
  'the',
  'orleans',
  'justice',
  'center',

In [41]:
# Remove stop words
stop_words = stopwords.words('english')
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [43]:
without_stopwords = remove_stopwords(data_words)
without_stopwords

[['father',
  'nears',
  'end',
  'life',
  'filmmaker',
  'kirsten',
  'johnson',
  'stages',
  'death',
  'inventive',
  'comical',
  'ways',
  'help',
  'face',
  'inevitable'],
 ['crossing',
  'paths',
  'party',
  'cape',
  'town',
  'teen',
  'sets',
  'prove',
  'whether',
  'private',
  'school',
  'swimming',
  'star',
  'sister',
  'abducted',
  'birth'],
 ['protect',
  'family',
  'powerful',
  'drug',
  'lord',
  'skilled',
  'thief',
  'mehdi',
  'expert',
  'team',
  'robbers',
  'pulled',
  'violent',
  'deadly',
  'turf',
  'war'],
 ['feuds',
  'flirtations',
  'toilet',
  'talk',
  'go',
  'among',
  'incarcerated',
  'women',
  'orleans',
  'justice',
  'center',
  'new',
  'orleans',
  'gritty',
  'reality',
  'series'],
 ['city',
  'coaching',
  'centers',
  'known',
  'train',
  'indias',
  'finest',
  'collegiate',
  'minds',
  'earnest',
  'unexceptional',
  'student',
  'friends',
  'navigate',
  'campus',
  'life'],
 ['arrival',
  'charismatic',
  'young',
  'p

In [45]:
# Build a bigram (high_school instead of high and school)
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [47]:
as_bigrams = make_bigrams(without_stopwords)
as_bigrams

[['father',
  'nears',
  'end',
  'life',
  'filmmaker',
  'kirsten',
  'johnson',
  'stages',
  'death',
  'inventive',
  'comical',
  'ways',
  'help',
  'face',
  'inevitable'],
 ['crossing',
  'paths',
  'party',
  'cape',
  'town',
  'teen',
  'sets',
  'prove',
  'whether',
  'private_school',
  'swimming',
  'star',
  'sister',
  'abducted',
  'birth'],
 ['protect',
  'family',
  'powerful_drug',
  'lord',
  'skilled',
  'thief',
  'mehdi',
  'expert',
  'team',
  'robbers',
  'pulled',
  'violent',
  'deadly',
  'turf',
  'war'],
 ['feuds',
  'flirtations',
  'toilet',
  'talk',
  'go',
  'among',
  'incarcerated',
  'women',
  'orleans',
  'justice',
  'center',
  'new_orleans',
  'gritty',
  'reality_series'],
 ['city',
  'coaching',
  'centers',
  'known',
  'train',
  'indias',
  'finest',
  'collegiate',
  'minds',
  'earnest',
  'unexceptional',
  'student',
  'friends',
  'navigate',
  'campus',
  'life'],
 ['arrival',
  'charismatic',
  'young',
  'priest',
  'brings',


In [48]:
# Lemmatization -> convert words to root form
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sentence in texts:
        doc = nlp(" ".join(sentence)) # run 'python3 -m spacy download en_core_web_lg' before for this to work
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [50]:
data_lemmatized = lemmatization(as_bigrams)
data_lemmatized

[['near',
  'end',
  'life',
  'filmmaker',
  'stage',
  'death',
  'inventive',
  'comical',
  'way',
  'help',
  'face',
  'inevitable'],
 ['crossing',
  'path',
  'town',
  'teen',
  'set',
  'prove',
  'private_school',
  'swimming',
  'star',
  'sister',
  'abduct',
  'birth'],
 ['protect',
  'family',
  'skilled',
  'thief',
  'mehdi',
  'expert',
  'team',
  'robber',
  'pull',
  'violent',
  'deadly',
  'turf',
  'war'],
 ['feud', 'flirtation', 'toilet', 'talk', 'go', 'incarcerate', 'woman'],
 ['city',
  'coaching',
  'center',
  'know',
  'train',
  'fine',
  'collegiate',
  'mind',
  'earnest',
  'unexceptional',
  'student',
  'friend',
  'navigate',
  'campus',
  'life'],
 ['arrival',
  'charismatic',
  'young',
  'priest',
  'bring',
  'glorious',
  'miracle',
  'ominous',
  'mystery',
  'renew',
  'religious',
  'fervor',
  'die',
  'town',
  'desperate',
  'believe'],
 ['divide',
  'bright',
  'eyed',
  'hero',
  'believe',
  'earth',
  'pony',
  'pegasi',
  'unicorn',
 

In [52]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Filter out tokens that appear in only one document and appear in more than 90% of the documents
id2word.filter_extremes(no_below=2, no_above=0.9)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [53]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
 id2word=id2word,
 num_topics=15, 
 random_state=100,
 chunksize=100,
 passes=10,
 alpha=0.01,
 eta=0.9)

In [54]:
# Print the keywords in the topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.015*"life" + 0.006*"take" + 0.006*"murder" + 0.006*"family" + '
  '0.006*"woman" + 0.006*"man" + 0.005*"find" + 0.005*"story" + 0.005*"year" + '
  '0.004*"crime"'),
 (1,
  '0.014*"stand" + 0.011*"special" + 0.008*"comedian" + 0.007*"take" + '
  '0.006*"comedy" + 0.005*"show" + 0.005*"life" + 0.004*"celebrity" + '
  '0.004*"explore" + 0.003*"stage"'),
 (2,
  '0.004*"career" + 0.004*"player" + 0.003*"society" + 0.003*"soccer" + '
  '0.003*"documentary" + 0.003*"up" + 0.003*"athlete" + 0.003*"down" + '
  '0.002*"become" + 0.002*"rise"'),
 (3,
  '0.008*"save" + 0.007*"fight" + 0.006*"earth" + 0.006*"power" + 0.006*"evil" '
  '+ 0.005*"mission" + 0.005*"take" + 0.005*"team" + 0.005*"world" + '
  '0.005*"government"'),
 (4,
  '0.001*"matrix" + 0.001*"roam" + 0.001*"love_triangle" + 0.001*"notoriously" '
  '+ 0.001*"course" + 0.001*"trilogy" + 0.001*"opera" + 0.001*"unconscious" + '
  '0.001*"wing" + 0.001*"fierce"'),
 (5,
  '0.008*"adventure" + 0.007*"pal" + 0.006*"learn" + 0.005*"

In [61]:
def sort_Tuple(tup):  
    return(sorted(tup, key = lambda x: x[1], reverse = True))   

In [62]:
# Create document topic matrix
doc_num, topic_num, prob = [], [], []
print(lda_model.get_document_topics(corpus))
for n in range(len(df)):
    get_document_topics = lda_model.get_document_topics(corpus[n])
    doc_num.append(n)
    sorted_doc_topics = sort_Tuple(get_document_topics)
    topic_num.append(sorted_doc_topics[0][0])
    prob.append(sorted_doc_topics[0][1])
df['Doc'] = doc_num
df['Topic'] = topic_num
df['Probability'] = prob
df.to_csv("doc_topic_matrix.csv", index=False)

<gensim.interfaces.TransformedCorpus object at 0x7f9d3bfe3340>


In [69]:
def recommend_by_storyline(title, df):
    recommended = []
    top10_list = []

    title = title.lower()
    df['title'] = df['title'].str.lower()
    topic_num = df[df['title']==title].Topic.values
    doc_num = df[df['title']==title].Doc.values 

    output_df = df[df['Topic']==topic_num[0]].sort_values('Probability', ascending=False).reset_index(drop=True)
    index = output_df[output_df['Doc']==doc_num[0]].index[0]

    top10_list += list(output_df.iloc[index-5:index].index)
    top10_list += list(output_df.iloc[index+1:index+6].index)

    output_df['title'] = output_df['title'].str.title()

    for each in top10_list:
        recommended.append(output_df.iloc[each].title)

    return recommended
recommend_by_storyline('Solo: A Star Wars Story', df)

['The Matrix Reloaded',
 'Cursed',
 'Zenda',
 'Super Bheem Bana Vajraveer',
 'Solo: A Star Wars Story (Spanish Version)',
 'Thor: Ragnarok',
 'Arthdal Chronicles',
 'The Space Between Us',
 "Angel 'N' Devil",
 'Enter The Warriors Gate']