# Netflix EDA and Recommender System

---
**Importing the required packages**

In [None]:
# Suppress library warnings to keep the notebook output readable
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('darkgrid')
cmap = sns.cm.mako_r

%matplotlib inline

**Importing the CSV file using pandas read_csv()**

In [None]:
netflix = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

**Calling head() and info() to get an initial idea of the data**

In [None]:
netflix.head()

In [None]:
netflix.info()

---
**Plotting a Bar graph for the Type of content**

In [None]:
# Count the rows for each value of 'type' once and reuse the result for both axes.
type_counts = netflix.value_counts('type')

fig, ax = plt.subplots(figsize=(4, 5))
sns.barplot(x=type_counts.index, y=type_counts.values, ax=ax)

ax.set_xlabel('Type of Content')
ax.set_ylabel('Count')
plt.show()

---
**Plotting a Bar graph for Top 10 Countries with most Content released**

In [None]:
# Ten most frequent values of the 'country' column, drawn as horizontal bars
# (counts on the x-axis, country labels on the y-axis).
top_countries = netflix.value_counts('country')[:10]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_countries.values, y=top_countries.index, ax=ax)

ax.set_xlabel('Count')
ax.set_ylabel('Country')
plt.show()

---
**Plotting a Bar graph for Top 10 Rating**

In [None]:
# Ten most frequent values of the 'rating' column, one vertical bar per rating.
top_ratings = netflix.value_counts('rating')[:10]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_ratings.index, y=top_ratings.values, ax=ax)

ax.set_xlabel('Rating Type')
ax.set_ylabel('Count')
plt.show()

---
**Year and Month where Most Contents were Released**

In [None]:
# Adding month as release_month column

def get_month(date):
    """Return the first space-separated token of a date string.

    For text such as "September 25, 2021" this is the month name.

    Parameters
    ----------
    date : str or any
        Raw date text. Non-string values (e.g. NaN for missing data, None)
        are returned unchanged.

    Returns
    -------
    str or the original value
        The leading token of the stripped string, or ``date`` itself when it
        is not a string.
    """
    # Explicit type check instead of a bare ``except`` so unrelated errors are not hidden
    if not isinstance(date, str):
        return date
    return date.strip().split(' ')[0]
    

# Derive the month token from the raw 'date_added' text and store it as a categorical column.
# NOTE: this must run before 'date_added' is converted below, because get_month expects strings.
netflix['release_month'] = netflix['date_added'].apply(get_month).astype('category')

# Finally convert date_added to a datetime dtype. get_month strips the raw text before
# splitting, which suggests stray whitespace in the column; strip here too so every
# value has the same layout when pandas infers the date format.
netflix['date_added'] = pd.to_datetime(netflix['date_added'].str.strip())

In [None]:
# Calendar order used for the heatmap rows.
month_order = ['January', 'February', 'March', 'April',
               'May', 'June', 'July', 'August', 'September',
               'October', 'November', 'December']

# Keep only titles with release_year >= 2000.
netflix_2000_plus = netflix[netflix['release_year'] >= 2000]

# Count titles per (release_month, release_year) pair.
# NOTE(review): release_month is derived from 'date_added' while the columns use
# 'release_year' - confirm this pairing is intended.
pivot_table = (
    netflix_2000_plus
    .pivot_table(values='show_id', index='release_month',
                 columns='release_year', aggfunc='count')
    .reindex(month_order)
    # fillna returns a new frame; the original call discarded it, so keep the
    # result here to make month/year pairs without titles plot as 0.
    .fillna(0)
)

plt.figure(figsize=(16, 8))

sns.heatmap(pivot_table, linewidths=1, cmap=cmap)

plt.xlabel('Year of Release')
plt.ylabel('Month')
plt.title("Heatmap for Contents released each Month")

plt.show()

Currently, I have focused only on content released in the year 2000 or later.

---
**Plotting a KDE for Duration of the Movies and TV Shows**

In [None]:
def _leading_int(text):
    """Parse the first space-separated token of ``text`` as an int; return 0 on failure."""
    # Missing values arrive as non-strings (e.g. NaN); treat them as 0
    if not isinstance(text, str):
        return 0
    try:
        return int(text.strip().split(' ')[0])
    except ValueError:
        # Empty string or a non-numeric leading token
        return 0


def get_minutes(duration):
    """Return the leading integer of a duration string such as "90 min".

    Parameters
    ----------
    duration : str or any
        Raw duration text; non-string values are treated as missing.

    Returns
    -------
    int
        The parsed number, or 0 when the value is missing or not numeric.
    """
    return _leading_int(duration)


def get_seasons(season):
    """Return the leading integer of a season string such as "2 Seasons".

    Parameters
    ----------
    season : str or any
        Raw season text; non-string values are treated as missing.

    Returns
    -------
    int
        The parsed number, or 0 when the value is missing or not numeric.
    """
    return _leading_int(season)
    
# Split the catalogue by content type.
netflix_movies = netflix[netflix['type'] == 'Movie']
netflix_tvshows = netflix[netflix['type'] == 'TV Show']

# Movies: leading number of each 'duration' value, parsed with get_minutes.
movies_time_distribution = [get_minutes(duration) for duration in netflix_movies['duration']]
# TV shows: parsed with get_seasons (previously defined but unused), matching the
# "Duration (in Seasons)" axis label used when plotting.
tvshows_time_distribution = [get_seasons(duration) for duration in netflix_tvshows['duration']]

In [None]:
# Two histograms with a KDE overlay, side by side: movies on the left, TV shows on the right.
fig, (movie_ax, tvshow_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

sns.histplot(movies_time_distribution, bins=50, kde=True, ax=movie_ax)
movie_ax.set_xlabel("Duration (in Minutes)")

sns.histplot(tvshows_time_distribution, bins=40, kde=True, ax=tvshow_ax)
tvshow_ax.set_xlabel("Duration (in Seasons)")

plt.show()

---
**Displaying the Top 20 Genres on Netflix using Squarify**

In [None]:
from collections import Counter

# Count how many titles list each genre in the comma-separated 'listed_in' column.
# A single pass replaces the earlier "for every genre, rescan every row" nested loop.
genre_counter = Counter()

for genres in netflix['listed_in']:
    # Missing values are not strings; skip them explicitly instead of swallowing all errors
    if not isinstance(genres, str):
        continue
    # dict.fromkeys de-duplicates within one row (a title counts once per genre)
    # while keeping first-seen order, so ties sort reproducibly
    genre_counter.update(list(dict.fromkeys(genre.strip() for genre in genres.split(','))))

# Same names and shapes as before: the set of genres and a plain genre -> count dict
genre_set = set(genre_counter)
genre_dictionary = dict(genre_counter)

# (genre, count) pairs, most frequent first
genre_list_with_counts = sorted(genre_dictionary.items(), key=lambda x: x[1], reverse=True)

In [None]:
!pip install squarify

In [None]:
import squarify

# Treemap of the 20 leading (genre, count) pairs; tile area is driven by the count.
top_genres = genre_list_with_counts[:20]
labels = [name for name, _ in top_genres]
sizes = [count for _, count in top_genres]

plt.figure(figsize=(18, 12))
squarify.plot(sizes=sizes, label=labels, color=["#63D1F4","#8FDEF7","#BBEBFA"])

plt.axis('off')
plt.show()

---
**WordCloud for the Description**

In [None]:
import re
from nltk import word_tokenize, corpus
from nltk.stem import PorterStemmer
from wordcloud import WordCloud, STOPWORDS

# Tokenize every description. fillna('') guards against missing values, which are
# not strings and would make word_tokenize fail.
word_tokens = [word_tokenize(text) for text in netflix['description'].fillna('')]

# Build the corpus with one join instead of growing a string inside nested loops
description_text = " ".join(word for word_list in word_tokens for word in word_list)

# Extend the default stopword list with tokens that add noise to this corpus
description_stopwords = set(STOPWORDS)
description_stopwords.update(["S", "new"])

my_word_cloud = WordCloud(background_color='white', stopwords=description_stopwords).generate(description_text)
plt.figure(figsize=(16,22))
plt.imshow(my_word_cloud, interpolation='bilinear')
plt.title("Word Cloud for Description", fontsize=30)
plt.axis('off')
plt.show()

---

**Recommender System which will display similar movies to the one we provide based on the description.**

I have no experience in designing a recommender system, so I have used the code from the notebook linked below. Please take a look at that notebook, as it contains a more accurate recommender.

<a href="https://www.kaggle.com/niharika41298/netflix-visualizations-recommendation-eda">Click here to navigate to Notebook from Niharika Pandit</a>

In [None]:
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer needs strings, so replace missing descriptions with ''
netflix['description'] = netflix['description'].fillna('')

# One TF-IDF row per title, with English stop words removed
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(netflix['description'])

# TfidfVectorizer L2-normalises rows by default, so the dot product computed by
# linear_kernel is the cosine similarity between descriptions
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Map title -> row position in cosine_sim. Positions (not index labels) are used
# because cosine_sim rows and the later .iloc lookup are both positional.
indices = pd.Series(range(len(netflix)), index=netflix['title'])
# Series.drop_duplicates() would compare the (always unique) positions, not the
# titles; de-duplicate on the index so each title maps to exactly one row.
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles whose descriptions are most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` mapping.
    cosine_sim : array-like
        Pairwise similarity matrix; row ``i`` holds the scores of title ``i``
        against every title.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles from ``netflix['title']``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(f"Title not found in dataset: {title!r}")

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence so the row lookup stays one-dimensional
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every other title's position with its similarity score. The queried title is
    # excluded explicitly rather than assumed to sort first (ties or an all-zero
    # row would otherwise drop the wrong entry).
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Highest similarity first; the sort is stable, so ties keep dataset order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the top_n most similar titles
    movie_indices = [i for i, _ in sim_scores[:top_n]]

    return netflix['title'].iloc[movie_indices]

In [None]:
get_recommendations('The Circle')

In [None]:
get_recommendations('The Invisible Guest')

In [None]:
get_recommendations("The Queen's Gambit")

# Thanks a lot for showing interest in My Notebook