In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go


%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

Load the Data Set.

In [None]:
# Read the Netflix titles CSV into a DataFrame and preview its first rows.
netflix_csv = "/kaggle/input/netflix-shows/netflix_titles.csv"
df = pd.read_csv(netflix_csv)
df.head()

In [None]:
# Column dtypes and non-null counts; a count below the row total means missing values.
df.info()

As displayed above, several columns contain missing values. We will have to examine those before we proceed.

The dataset has 7787 rows and 12 columns:<br>

show_id: unique id of each show (not of much use to us in this notebook)<br>
type: The category of a show, can be either a Movie or a TV Show<br>
title: Name of the show<br>
director: Name of the director(s) of the show<br>
cast: Name of actors and other cast of the show<br>
country: Name of countries the show is available to watch on Netflix<br>
date_added: Date when the show was added on Netflix<br>
release_year: Release year of the show<br>
rating: Show rating on Netflix<br>
duration: Time duration of the show<br>
listed_in: Genre of the show<br>
description: Some text describing the show<br>

In [None]:
# Visualise where values are missing with missingno's matrix plot.

msno.matrix(df)

The director, cast and country columns show a lot of variation, which indicates that there are missing values in these columns.

In [None]:
# Compare how complete each column is using missingno's bar chart.

msno.bar(df)

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Number of missing (NaN) values in each column.

df.isna().sum()

In [None]:
# Non-null count of every column, split by the 'type' column.
df.groupby('type').count()

In [None]:
# The description text is analysed below, so rows without one are of no use here.

# Drop (in place) every row whose description is missing.
df.dropna(subset=['description'],inplace=True)

Exploratory Data Analysis.

Let's produce a word cloud of the show descriptions.

In [None]:
#Let's plot what words are common from the description.

from wordcloud import WordCloud
def plot_wordcloud(msg):
    """Draw a word cloud for a collection of text snippets.

    ``msg`` is any iterable of strings. The snippets are joined with ", "
    and handed to ``WordCloud`` as a single document. The resulting image is
    shown on a new 8x8 inch figure with the axes hidden. Nothing is returned.
    """
    plt.figure(figsize=(8, 8))

    corpus = ", ".join(msg)
    cloud_builder = WordCloud(max_font_size=120, background_color='white')
    cloud_image = cloud_builder.generate(corpus)

    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.show()

    
# Word cloud over the text of the 'description' column.
plot_wordcloud(df['description'])


Observation: "Life" is the most commonly used word. "Family", "Find", "World" and "Two" are also popular in the Movies and TV Series word cloud. Together with "Woman" and "Love", these words give us a pretty good idea of the most popular themes present in movies.

In [None]:
# Build the text "soup" that is vectorized later: genres (listed_in), cast and director.
# The fields are joined with ", " so that the last word of one field is not fused with
# the first word of the next one (plain `+` would produce e.g. "...DramasJohn Doe"),
# which would create junk tokens and lose the real ones.
# A missing value in any of the three fields still makes the whole soup NaN.

df['soup'] = df['listed_in'] + ', ' + df['cast'] + ', ' + df['director']

In [None]:
# Preview the combined text column.
df['soup']

In [None]:
# Rows whose soup is NaN, i.e. at least one of listed_in, cast or director is missing.
df['soup'].isna().sum()

In [None]:
# Keep only fully populated rows.
# NOTE(review): with no `subset`, this drops rows that have a NaN in ANY column, not only
# in 'soup' - confirm that discarding titles for unrelated gaps is intended.
df = df.dropna()

In [None]:
# Preview the soup column again after the incomplete rows were removed.
df['soup']

In [None]:
# Re-check the row count and non-null counts after dropping incomplete rows.
df.info()

In [None]:
# Show one full soup string. Positional access is used because the index has not been
# reset yet: after dropna() a given label (such as 1) may no longer exist, and a
# label-based lookup would then raise KeyError.
df['soup'].iloc[0]

In [None]:
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [None]:
# Turn every soup string into token counts (unigrams and bigrams, English stop
# words removed) and collect the counts in a sparse matrix, one row per title.

count = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
)
count_matrix = count.fit_transform(df['soup'])
print(count_matrix)

In [None]:
# Pairwise cosine similarity between the token-count vectors of all titles.
# Called with a single matrix, cosine_similarity compares its rows against each other.

cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Sanity check: the similarity matrix should be square, one row and column per title.
print(cosine_sim.shape)

In [None]:
# Give the frame a clean 0..n-1 index so that a row's position matches its row/column
# in cosine_sim. drop=True avoids adding a stray 'index' column and keeps the cell safe
# to re-run (a plain reset_index() inserts one more column every time it is executed).

df = df.reset_index(drop=True)
titles = df['title']
print(titles)

# Map title -> row position. When a title occurs more than once only its first row is
# kept, so a lookup always yields a single integer instead of a Series.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]
print(indices)

In [None]:
def recommendations(userId, title, top_n=10):
    """Return the titles most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    userId : any
        Accepted for backward compatibility only. It is not used, so the
        result is the same for every user.
    title : str
        Title to look up in the module-level ``indices`` Series.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``director``, ``cast`` and ``rating`` columns of up to
        ``top_n`` rows of ``df``, ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Position of the requested title in df / cosine_sim.
    idx = indices[title]
    # A title that appears more than once yields a Series of positions; use the first.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx])
    # Stable descending sort so equally similar titles keep their catalogue order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the query title by position rather than assuming it sorts first:
    # another title with an identical soup also scores 1.0 and could take that slot.
    limit = max(int(top_n), 0)
    movie_indices = [int(i) for i in ranked if i != idx][:limit]

    # Keep the similarity ordering; re-sorting by title would throw the ranking away.
    return df.iloc[movie_indices][['title', 'director', 'cast', 'rating']]

In [None]:
# Example: titles similar to 'Zoom'. The first argument (a user id) is not used by the function.
recommendations(4805, 'Zoom')