In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the data
df = pd.read_csv("../datasets/netflix_movies_and_shows_1/netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# Build a column of combined values from the relevant columns
relevant_cols = ['type', 'title', 'director', 'cast', 'rating', 'listed_in', 'description']
df['combined'] = df[relevant_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [4]:
df['combined']

0       Movie Dick Johnson Is Dead Kirsten Johnson nan...
1       TV Show Blood & Water nan Ama Qamata, Khosi Ng...
2       TV Show Ganglands Julien Leclercq Sami Bouajil...
3       TV Show Jailbirds New Orleans nan nan TV-MA Do...
4       TV Show Kota Factory nan Mayur More, Jitendra ...
                              ...                        
8802    Movie Zodiac David Fincher Mark Ruffalo, Jake ...
8803    TV Show Zombie Dumb nan nan TV-Y7 Kids' TV, Ko...
8804    Movie Zombieland Ruben Fleischer Jesse Eisenbe...
8805    Movie Zoom Peter Hewitt Tim Allen, Courteney C...
8806    Movie Zubaan Mozez Singh Vicky Kaushal, Sarah-...
Name: combined, Length: 8807, dtype: object

In [5]:
# Remove non-ascii characters
#df['combined'] = df['combined'].map(lambda x: re.sub("([^\x00-\x7F])+", "", x))
#df['combined']

In [6]:
# Convert all words to lowercase and remove stop words
documents = df['combined']
count_vectorizer = CountVectorizer(stop_words='english')
sparse_matrix = count_vectorizer.fit_transform(documents)

In [7]:
# Compute similarity score between each document
similarity_scores = cosine_similarity(sparse_matrix, sparse_matrix)
similarity_scores = pd.DataFrame(similarity_scores)

In [8]:
# Recommender function
def recommend(input_title, scores_df, df):
    recommended = []
    input_title = input_title.lower()
    df['title'] = df['title'].str.lower()
    index = df[df['title'] == input_title].index[0]
    
    top10_list = list(scores_df.iloc[index].sort_values(ascending = False).iloc[1:11].index)
    for each in top10_list:
        recommended.append(df.iloc[each].title)
    return recommended

In [9]:
recommend('Solo: A Star Wars Story', similarity_scores, df)

['solo: a star wars story (spanish version)',
 '2012',
 'inkheart',
 'star wars: episode viii: the last jedi',
 'the karate kid part ii',
 'beowulf',
 'mirai',
 'hellboy',
 'the space between us',
 'real steel']