# Netflix Content Recommender
This notebook uses the <a href="https://www.kaggle.com/shivamb/netflix-shows">Kaggle dataset on Netflix shows and movies</a> together with scikit-learn's cosine similarity to recommend Netflix content similar to a title supplied by the user.

In [1]:
#Basic imports
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Imports for preprocessing
from rake_nltk import Rake
import nltk
from nltk.corpus import stopwords
import string

In [2]:
#Read the Netflix titles dataset; the relative path means the CSV must be in the working directory
content = pd.read_csv('netflix_titles.csv')

In [3]:
#Keep only the columns used to build keywords. The explicit .copy() makes this an
#independent frame, so the column assignments in later cells never operate on a
#slice of the raw data (the situation pandas flags with SettingWithCopyWarning).
content = content[['title', 'director', 'cast', 'listed_in', 'description']].copy()

In [4]:
#Check dtypes and non-null counts to see which columns contain missing values
content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        8807 non-null   object
 1   director     6173 non-null   object
 2   cast         7982 non-null   object
 3   listed_in    8807 non-null   object
 4   description  8807 non-null   object
dtypes: object(5)
memory usage: 344.1+ KB


In [5]:
#Summarize count, number of unique values, and most frequent entry for each column
content.describe()

Unnamed: 0,title,director,cast,listed_in,description
count,8807,6173,7982,8807,8807
unique,8807,4528,7692,514,8775
top,Dick Johnson Is Dead,Rajiv Chilaka,David Attenborough,"Dramas, International Movies","Paranormal activity at a lush, abandoned prope..."
freq,1,19,19,362,4


In [6]:
#Process director, cast, and listed_in columns by merging first and last names and genres titles. 
#This ensures uniqueness of genres/names.
def tag_process(content):
    """Turn a comma-separated cell into a list of compact, lowercase tags.

    The value is split on ', ' and every piece is lowercased with its spaces
    removed (for example 'Kirsten Johnson' becomes 'kirstenjohnson'), so a
    full name or genre title ends up as one token.

    Parameters
    ----------
    content : object
        Raw cell value. Anything that is not a string (such as the NaN used
        for a missing value) is treated as having no tags.

    Returns
    -------
    list of str
        One processed tag per piece; an empty list for non-string input.
    """
    #Non-string cells carry no tags
    if not isinstance(content, str):
        return []

    return [piece.lower().replace(' ', '') for piece in content.split(', ')]

In [7]:
def extract_keywords(text):
    """Return the distinct words that RAKE extracts from a piece of text.

    A Rake instance with default settings processes the text, and the keys of
    its word-degree mapping are returned. The degree values are discarded
    because each word is treated as equally important when calculating
    similarity.

    Parameters
    ----------
    text : str, None or NaN
        Text to analyze. A missing value (None or NaN) yields no keywords.

    Returns
    -------
    list of str
        The extracted words; an empty list when the value is missing.
    """
    #A missing value has no keywords; return early instead of handing a non-text value to Rake
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return []

    #Instantiate Rake
    rake = Rake()
    rake.extract_keywords_from_text(text)

    #get_word_degrees() returns a mapping whose keys are the words; only the keys are kept
    return list(rake.get_word_degrees().keys())

In [8]:
#Process director, cast, listed_in columns.
#Cells that are already lists are kept as they are: tag_process returns [] for any
#non-string, so without this check re-running the cell would silently erase every tag.
for tag in ['director', 'cast', 'listed_in']:
    content[tag] = content[tag].apply(
        lambda value: value if isinstance(value, list) else tag_process(value)
    )

In [9]:
#Process description.
#Keyword lists that were already extracted are passed through unchanged, so re-running
#this cell does not feed lists back into extract_keywords.
content['description'] = content['description'].apply(
    lambda text: text if isinstance(text, list) else extract_keywords(text)
)

In [10]:
#Preview the processed tag lists and description keywords
content.head()

Unnamed: 0,title,director,cast,listed_in,description
0,Dick Johnson Is Dead,[kirstenjohnson],[],[documentaries],"[father, nears, end, life, filmmaker, kirsten,..."
1,Blood & Water,[],"[amaqamata, khosingema, gailmabalane, thabangm...","[internationaltvshows, tvdramas, tvmysteries]","[crossing, paths, party, cape, town, teen, set..."
2,Ganglands,[julienleclercq],"[samibouajila, tracygotoas, samueljouy, nabiha...","[crimetvshows, internationaltvshows, tvaction&...","[protect, family, powerful, drug, lord, skille..."
3,Jailbirds New Orleans,[],[],"[docuseries, realitytv]","[feuds, flirtations, toilet, talk, go, among, ..."
4,Kota Factory,[],"[mayurmore, jitendrakumar, ranjanraj, alamkhan...","[internationaltvshows, romantictvshows, tvcome...","[city, coaching, centers, known, train, india,..."


In [11]:
#Combine director, cast, listed_in, and description into one keyword list
def make_keywords(data):
    """Concatenate the four processed columns of a row into a single list.

    Parameters
    ----------
    data : mapping
        A row (or dict) providing 'director', 'cast', 'listed_in' and
        'description' values that support concatenation with ``+``.

    Returns
    -------
    The concatenation in the order director, cast, listed_in, description.
    """
    combined = data['director']
    for column in ('cast', 'listed_in', 'description'):
        #Use + rather than += so the value stored under 'director' is never modified in place
        combined = combined + data[column]
    return combined

In [12]:
#Create 'keywords' column as an empty-string placeholder; the next cell overwrites it
#with the combined keyword lists
content['keywords'] = ''

In [13]:
#Build each row's keyword list by applying make_keywords row-wise (axis=1)
content['keywords'] = content.apply(make_keywords, axis=1)

In [14]:
#Confirm the 'keywords' column now holds the combined lists
content.head()

Unnamed: 0,title,director,cast,listed_in,description,keywords
0,Dick Johnson Is Dead,[kirstenjohnson],[],[documentaries],"[father, nears, end, life, filmmaker, kirsten,...","[kirstenjohnson, documentaries, father, nears,..."
1,Blood & Water,[],"[amaqamata, khosingema, gailmabalane, thabangm...","[internationaltvshows, tvdramas, tvmysteries]","[crossing, paths, party, cape, town, teen, set...","[amaqamata, khosingema, gailmabalane, thabangm..."
2,Ganglands,[julienleclercq],"[samibouajila, tracygotoas, samueljouy, nabiha...","[crimetvshows, internationaltvshows, tvaction&...","[protect, family, powerful, drug, lord, skille...","[julienleclercq, samibouajila, tracygotoas, sa..."
3,Jailbirds New Orleans,[],[],"[docuseries, realitytv]","[feuds, flirtations, toilet, talk, go, among, ...","[docuseries, realitytv, feuds, flirtations, to..."
4,Kota Factory,[],"[mayurmore, jitendrakumar, ranjanraj, alamkhan...","[internationaltvshows, romantictvshows, tvcome...","[city, coaching, centers, known, train, india,...","[mayurmore, jitendrakumar, ranjanraj, alamkhan..."


In [15]:
#Use only title and keywords. The explicit .copy() makes this an independent frame,
#so the assignment below does not operate on a slice of the previous frame.
content = content[['title', 'keywords']].copy()

#Join each keyword list into one space-separated string for the vectorizer.
#Values that are already strings are left alone: joining a string again would put a
#space between every character if this cell were re-run.
content['keywords'] = content['keywords'].apply(
    lambda words: words if isinstance(words, str) else ' '.join(str(word) for word in words)
)

In [16]:
#Preview the final title/keywords table that is passed to the vectorizer
content.head()

Unnamed: 0,title,keywords
0,Dick Johnson Is Dead,kirstenjohnson documentaries father nears end ...
1,Blood & Water,amaqamata khosingema gailmabalane thabangmolab...
2,Ganglands,julienleclercq samibouajila tracygotoas samuel...
3,Jailbirds New Orleans,docuseries realitytv feuds flirtations toilet ...
4,Kota Factory,mayurmore jitendrakumar ranjanraj alamkhan ahs...


In [17]:
#Turn every keywords string into a row of token counts
#NOTE(review): CountVectorizer's default token pattern treats punctuation as a separator,
#so merged tags that contain characters such as '&' are presumably split again here -
#confirm whether a custom token_pattern is wanted
count = CountVectorizer()
count_matrix = count.fit_transform(content['keywords'])

#Pairwise cosine similarity between all titles; called with a single argument,
#cosine_similarity compares the matrix against itself
cosine_sim = cosine_similarity(count_matrix)

In [18]:
#Series holding the row labels of content (the index values, not the titles themselves)
#NOTE(review): nothing in the visible notebook reads this variable - confirm before removing
indices = pd.Series(content.index)

#Defining recommendation function taking in title of movie, and outputting the most similar titles
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``content['title']``.
    cosine_sim : 2-D array-like
        Square similarity matrix whose row order matches the row order of
        ``content``. The default is bound when this cell is executed, so
        re-run the cell after recomputing the matrix.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first.

    Raises
    ------
    IndexError
        If ``title`` does not appear in ``content['title']``.
    """
    #Locate the row by position: rows of cosine_sim are positional and only coincide
    #with index labels while content keeps a default RangeIndex
    positions = (content['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Title {title!r} not found in content['title']")
    position = positions[0]

    #Similarity of the requested title to every title, keyed by row position
    scores = pd.Series(cosine_sim[position])

    #Remove the requested title explicitly instead of assuming it sorts first; a title
    #with an identical keyword set ties with it and could otherwise take its place
    top_scores = scores.drop(position).nlargest(top_n)

    #The positions in top_scores.index can be used here to find more information if needed,
    #like whether it's a TV Show/Movie or to retrieve description/genres/director/cast
    return content['title'].iloc[top_scores.index].tolist()

In [19]:
#Example: recommend titles similar to 'Jaws 2'
recommendations('Jaws 2')

['Jaws',
 'Jaws: The Revenge',
 'Jaws 3',
 'Sweetheart',
 'Grandmaster',
 'In The Deep',
 'Oxygen',
 'Rogue City',
 'Carriers',
 'Ugly']