In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import squarify 

#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
import os
# Walk the Kaggle input directory lazily and print the full path of every
# file found, in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Load the Netflix titles dataset from the Kaggle input folder and preview
# the first rows.
DATA_PATH = '../input/d/shivamb/netflix-shows/netflix_titles.csv'
movie_df = pd.read_csv(DATA_PATH)
movie_df.head()

In [None]:
# Let's begin by looking at the structure of the data: info() prints a
# per-column summary (non-null counts and dtypes), which makes columns with
# missing values easy to spot.
movie_df.info()

Let's try to explore the data and answer these interesting questions:
1. Understanding what content is available in different countries
2. Identifying similar content by matching text-based features
3. Network analysis of Actors / Directors and find interesting insights
4. Has Netflix been increasingly focusing on TV rather than movies in recent years?


In [None]:
# Number of titles for each distinct value of the 'country' column.
# count() is used instead of sum(): summing a string column concatenates the
# titles, whereas a count gives the numeric sizes a treemap needs.
# Rows whose country is missing are left out by groupby.
titles_by_country = movie_df.groupby('country')[['title']].count()

# Group labels, in the same order as the rows of titles_by_country.
titles_by_country_label = titles_by_country.index

In [None]:
# Show the country labels taken from the grouped table's index
print(titles_by_country_label)

In [None]:
#squarify.plot(sizes=titles_by_country, label= titles_by_country_label, alpha=.8)
#plt.axis('off')
#plt.show()

Out of all the features that are mentioned above, we will only look at a few that we want to use for this analysis.
There are about 7787 titles, and we see that a lot of features have missing data. The features that we want to use for this project are: director, cast and listed_in.

In [None]:
# Missing cast/director entries become empty strings so that the later
# string clean-up and concatenation never encounter NaN.
for column in ('cast', 'director'):
    movie_df[column] = movie_df[column].fillna('')


First we want to try a simple Content-Based Recommender System which will recommend movies/series that are similar to each other. To achieve this, we will compute pairwise cosine similarity scores for all movies/series based on their genre, which is given by the listed_in feature, and recommend the movies/series with the highest similarity scores.

Since the data in question is natural language, we have to compute a word vector for each document and then measure the similarity between those vectors. We will use scikit-learn's built-in TfidfVectorizer class to compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each document. The TF-IDF score weights how often a word occurs in a document against how common that word is across all documents, which helps us evaluate the similarity between two documents.

In [None]:
# Define a TF-IDF vectorizer object. stop_words='english' removes common
# English stop words such as 'the', 'a' before the vocabulary is built.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Construct the TF-IDF matrix: fit the vocabulary on the genre text in
# 'listed_in' and transform every row into its TF-IDF vector in one step.
tfidf_matrix = tfidf.fit_transform(movie_df['listed_in'])

# Output the shape of tfidf_matrix (one row per title, one column per
# vocabulary term)
tfidf_matrix.shape

In [None]:
# Peek at a few vocabulary terms (feature integer index -> feature name).
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
feature_names[40:44]

Now, we will use Cosine Similarity to find the similarity between two movies.

In [None]:
# Compute the cosine similarity matrix between every pair of titles.
# NOTE(review): linear_kernel is a plain dot product; it equals cosine
# similarity only when the TF-IDF rows are L2-normalised, which is believed
# to be the TfidfVectorizer default — confirm against the installed version.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Both arguments are the same matrix, so the result is square
cosine_sim.shape

In [None]:
# Similarity scores of the title at row 1 against every title in the data
cosine_sim[1]

In [None]:
# Construct a reverse map from movie title to row position in movie_df.
# Series.drop_duplicates() compares the *values* (the row positions, which
# are already unique), so it never removes a repeated title. Deduplicate on
# the title index instead, keeping the first row for each title, so that
# indices[title] always yields a single position.
indices = pd.Series(movie_df.index, index=movie_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices[:10]

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles most similar to ``title``.

    Args:
        title: Exact title to look up in the module-level ``indices`` map.
        cosine_sim: Square pairwise similarity matrix whose rows/columns are
            aligned with the rows of ``movie_df``.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A pandas Series with the ``title`` values of the ``top_n`` most
        similar rows, best match first. The queried row itself is never
        included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # A title that occurs more than once yields several positions;
        # use the first occurrence rather than failing further down.
        idx = idx.iloc[0]

    # Pair every other row with its similarity score. The queried row is
    # excluded explicitly: relying on it sorting first is wrong when another
    # title ties with it at the maximum score.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort the movies by similarity, highest first (stable for ties)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top_n best matches
    movie_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    # Return the titles of the most similar movies
    return movie_df['title'].iloc[movie_indices]

Now that the function is written, let's try to get recommendations for a similar movie/series title.

In [None]:
# Genre-based recommendations for the title "Community" (uses the default
# TF-IDF similarity matrix)
get_recommendations("Community")

This system has given us some good recommendations based on the genre associated with the title but let us try to make it better. There is a high possibility that we would like a recommendation for a movie/series by the same actor or director or of similar genres. For the next model let's try to combine all of these features to make our recommendation algorithm better.

To combine these features, we need to concatenate them and create a "soup" feature. To do that we first need to clean the data. Since we are using the names of actors and directors, we need to make sure that people who merely share a first name are not treated as the same person. For example, John Travolta and John Krasinski are two separate actors; to keep them apart we remove the space within each name, so that each full name becomes a single token (johntravolta, johnkrasinski).

In [None]:
# Normalise text features: lower-case everything and remove all spaces
def clean_data(x):
    """Return ``x`` lower-cased with every space removed.

    A string is normalised directly; a list is normalised element by
    element and returned as a new list. Any other value (for example a
    missing entry) becomes the empty string.
    """
    def _squash(text):
        # Remove spaces first, then lower-case the result.
        return str.lower(text.replace(" ", ""))

    if isinstance(x, str):
        return _squash(x)
    if isinstance(x, list):
        return [_squash(item) for item in x]
    # Neither a string nor a list: treat the value as absent
    return ''
        
# Normalise, in place, every text column that will go into the soup
features = ['cast', 'director', 'listed_in']

for feature in features:
    movie_df[feature] = movie_df[feature].map(clean_data)

In [None]:
#Define a function to create a soup feature
def create_soup(x):
    """Build the combined text ("soup") for one row.

    Args:
        x: A row (or any mapping) providing 'cast', 'director' and
            'listed_in', each either a string or a list of strings.

    Returns:
        The three fields joined by commas, in the order cast, director,
        listed_in.
    """
    def _as_text(value):
        # A string is used unchanged. A list of names is comma-joined so
        # that each name stays a separate token instead of being fused
        # into one long word.
        if isinstance(value, str):
            return value
        return ','.join(value)

    return ','.join(_as_text(x[key]) for key in ('cast', 'director', 'listed_in'))

# Create a new soup feature: axis=1 passes each row to create_soup, and the
# returned string is stored in the new 'soup' column
movie_df['soup'] = movie_df.apply(create_soup, axis=1)
# Display the resulting column
movie_df['soup']

In the previous model we used TF-IDF; for this model we will use CountVectorizer(). The difference between these two objects is the inverse document frequency (IDF) factor. We do not want to down-weight any of the features that we are using for this analysis.

In [None]:
# Define a Count Vectorizer object, which produces raw term counts.
# stop_words='english' removes common English stop words such as 'the', 'a'.
count = CountVectorizer(stop_words='english')


In [None]:
# Fit the vocabulary on the soup text and turn every row into a vector of
# term counts
count_matrix = count.fit_transform(movie_df['soup'])
# Output the shape of count_matrix
count_matrix.shape

Next, we will use cosine_similarity to measure the similarity between the count vectors.

In [None]:
# Pairwise cosine similarity between the count vectors of all titles
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [None]:
# Reset the index of the main DataFrame and rebuild the title -> row map.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, which would otherwise add another column on every re-run
# of this cell.
movie_df = movie_df.reset_index(drop=True)
indices = pd.Series(movie_df.index, index=movie_df['title'])
# Keep only the first row for each title so that a lookup by title always
# yields a single row position.
indices = indices[~indices.index.duplicated(keep='first')]

Let's try to get recommendations for a similar movie/series title using this model.

In [None]:
# Recommendations for "Community" using the cast/director/genre similarity
# matrix instead of the default genre-only one
get_recommendations("Community", cosine_sim2 )

This time around we got some different recommendations. We do see that the recommendations include the cast/director of the title we provided.