In [101]:
import pandas as pd
from rake_nltk import Rake
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import string

In [102]:
# Load the raw titles table; the path is relative to the notebook's working directory.
content = pd.read_csv('netflix_titles.csv')

In [103]:
# Keep only the title plus the four text columns that are later merged into keywords.
content = content[['title', 'director', 'cast', 'listed_in', 'description']]

In [104]:
# Dtypes and non-null counts; in this snapshot `director` and `cast` have missing values.
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        8807 non-null   object
 1   director     6173 non-null   object
 2   cast         7982 non-null   object
 3   listed_in    8807 non-null   object
 4   description  8807 non-null   object
dtypes: object(5)
memory usage: 344.1+ KB


In [105]:
# Per-column count, number of unique values, most frequent value and its frequency.
content.describe()

Unnamed: 0,title,director,cast,listed_in,description
count,8807,6173,7982,8807,8807
unique,8807,4528,7692,514,8775
top,Dick Johnson Is Dead,Rajiv Chilaka,David Attenborough,"Dramas, International Movies","Paranormal activity at a lush, abandoned prope..."
freq,1,19,19,362,4


In [106]:
def tag_process(content):
    """Turn a comma-separated string into a list of compact, lower-case tokens.

    Each comma-separated entry has its spaces removed and is lower-cased, so a
    multi-word name such as ``'Ama Qamata'`` becomes the single token
    ``'amaqamata'``.

    Parameters
    ----------
    content : object
        A cell value. Only ``str`` values are processed; anything else
        (for example the float NaN pandas uses for missing cells) is treated
        as "no tags".

    Returns
    -------
    list of str
        One token per non-empty entry, in the original order. Empty when
        ``content`` is not a string or contains no entries.
    """
    if not isinstance(content, str):
        return []

    # Split on the bare comma and strip spaces per entry, so "a,b" and "a, b"
    # are treated alike.
    tokens = (entry.replace(' ', '').lower() for entry in content.split(','))

    # Drop entries that were empty or whitespace-only (e.g. '' or a trailing
    # comma); they would otherwise become '' tokens in the keyword list.
    return [token for token in tokens if token]

In [107]:
def desc_process(desc):
    """Tokenise a description into lower-case words without punctuation or stop words.

    Parameters
    ----------
    desc : str
        Free-text description.

    Returns
    -------
    list of str
        Lower-cased words of ``desc`` in their original order, with every
        character in ``string.punctuation`` removed and every word found in
        NLTK's English stop-word list dropped.
    """
    # Read the stop-word list once per call and keep it as a set; calling
    # stopwords.words('english') inside the filter re-reads it for every word.
    stop_words = set(stopwords.words('english'))

    # Delete punctuation characters in a single pass.
    cleaned = desc.translate(str.maketrans('', '', string.punctuation))

    # Lower-case BEFORE the stop-word test: the NLTK list is lower-case, so
    # testing the raw word lets capitalised stop words ("The", "As") through.
    lowered = (word.lower() for word in cleaned.split())
    return [word for word in lowered if word not in stop_words]

In [108]:
# Replace each description string with the list of keywords extracted from it.
# NOTE(review): `extract_keywords` is defined in a later cell (In [110]); on a fresh
# kernel this cell raises NameError unless that cell has been run first.
content['description'] = content['description'].apply(extract_keywords)

In [109]:
# Spot-check: `description` now holds keyword lists; the tag columns are still raw strings.
content.head()

Unnamed: 0,title,director,cast,listed_in,description
0,Dick Johnson Is Dead,Kirsten Johnson,,Documentaries,"[father, nears, end, life, filmmaker, kirsten,..."
1,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","International TV Shows, TV Dramas, TV Mysteries","[crossing, paths, party, cape, town, teen, set..."
2,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...","Crime TV Shows, International TV Shows, TV Act...","[protect, family, powerful, drug, lord, skille..."
3,Jailbirds New Orleans,,,"Docuseries, Reality TV","[feuds, flirtations, toilet, talk, go, among, ..."
4,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...","International TV Shows, Romantic TV Shows, TV ...","[city, coaching, centers, known, train, india,..."


In [110]:
def extract_keywords(text):
    """Return the distinct keyword tokens RAKE finds in ``text``.

    Parameters
    ----------
    text : str
        Free text to analyse.

    Returns
    -------
    list of str
        The keys of RAKE's word-degree mapping, i.e. every word that appears
        in at least one extracted key phrase. The degree values themselves
        are discarded.
    """
    # A default Rake() relies on NLTK's English stop words and treats
    # punctuation as phrase boundaries.
    rake = Rake()
    rake.extract_keywords_from_text(text)

    # Iterating the word -> degree mapping yields just the words.
    return list(rake.get_word_degrees())

In [111]:
# Normalise the three comma-separated tag columns into lists of compact tokens.
# Values that are already lists are passed through unchanged so that re-running
# this cell does not wipe them: tag_process returns [] for anything that is not
# a str, which would silently empty columns that were already processed.
for tag in ['director', 'cast', 'listed_in']:
    content[tag] = content[tag].apply(
        lambda value: value if isinstance(value, list) else tag_process(value)
    )

In [112]:
def make_keywords(data):
    """Concatenate a row's token lists into one combined keyword list.

    Parameters
    ----------
    data : mapping
        A row-like object indexable by the keys ``'director'``, ``'cast'``,
        ``'listed_in'`` and ``'description'``, each holding a list.

    Returns
    -------
    list
        A new list: director tokens, then cast, then listed_in, then
        description tokens.
    """
    combined = []
    for column in ('director', 'cast', 'listed_in', 'description'):
        # `+` builds a fresh list each time, so the row's own lists are never mutated.
        combined = combined + data[column]
    return combined

In [113]:
# Build one combined token list per title (axis=1 hands each row to make_keywords).
content['keywords'] = content.apply(make_keywords, axis=1)

In [114]:
# From here on only the title and its keyword tokens are needed.
content = content[['title', 'keywords']]

# Flatten each token list into one space-separated string for the vectoriser.
# Values that are already strings are left untouched: joining a str puts a space
# between every character, which is what a second run of this cell would
# otherwise do to the whole column.
content['keywords'] = content['keywords'].apply(
    lambda x: x if isinstance(x, str) else ' '.join(str(i) for i in x)
)

In [115]:
# Spot-check: each title is now paired with a single space-separated keyword string.
content.head()

Unnamed: 0,title,keywords
0,Dick Johnson Is Dead,kirstenjohnson documentaries father nears end ...
1,Blood & Water,amaqamata khosingema gailmabalane thabangmolab...
2,Ganglands,julienleclercq samibouajila tracygotoas samuel...
3,Jailbirds New Orleans,docuseries realitytv feuds flirtations toilet ...
4,Kota Factory,mayurmore jitendrakumar ranjanraj alamkhan ahs...


In [116]:
# Token counts over the keyword strings: one row per title, in the row order of `content`.
count = CountVectorizer()
count_matrix = count.fit_transform(content['keywords'])

# Pairwise cosine similarity of every title against every title, so row i and
# column i both refer to row i of `content`.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [117]:
# Series holding the row labels of `content` (not the titles).
# NOTE(review): nothing in the visible cells reads `indices`; `recommendations`
# looks titles up in `content` directly — confirm whether this is still needed.
indices = pd.Series(content.index)

# Given a movie title, return the titles of the `top_n` most similar movies.
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up; it must equal an entry of ``content['title']`` exactly.
    cosine_sim : array-like
        Square similarity matrix whose row order matches the row order of
        ``content``. The default is the matrix that exists when this cell is
        run, so re-run this cell after rebuilding the matrix.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles in descending order of similarity, never
        including the queried row itself.

    Raises
    ------
    IndexError
        If no row of ``content`` has this title.
    """
    # Positional row of the first exact match. A position (not an index label)
    # is what addresses a row of cosine_sim, so this stays correct even if
    # `content` does not carry a default 0..n-1 index.
    matches = (content['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError('no title matching {!r} in content'.format(title))
    pos = int(matches[0])

    # Similarity of every row to the queried one, best first; the Series index
    # holds the positional row numbers.
    ranked = pd.Series(cosine_sim[pos]).sort_values(ascending = False)

    # Remove the queried row explicitly instead of assuming it sorted first:
    # another title with identical keywords also scores 1.0 and may precede it.
    neighbours = ranked.index[ranked.index != pos][:top_n].tolist()

    return content['title'].iloc[neighbours].tolist()

In [123]:
# Example query; the title must match an entry of content['title'] exactly.
recommendations('Jaws')

['Jaws 2',
 'Jaws: The Revenge',
 'Jaws 3',
 'Indiana Jones and the Last Crusade',
 'Pulang',
 'Veerappan',
 'Saladin',
 'A Wednesday',
 'Adrift',
 'Singham']