In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np
import matplotlib.font_manager
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.metrics.pairwise import linear_kernel
%matplotlib inline
sns.set(style="whitegrid")

# Data exploration 

In [None]:
# Load the Netflix catalog and preview the first rows.
DATA_PATH = "../input/netflix-shows/netflix_titles.csv"
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
data.info()

Transform the `date_added` column from object type to datetime type for ease of use

In [None]:
# Parse `date_added` into datetime64.
# NOTE(review): the values are stripped first because pandas >= 2.0 infers one
# format from the first value and rejects strings that deviate from it (e.g. by
# stray leading/trailing spaces) — confirm against the CSV. Casting to the
# "string" dtype keeps this cell re-runnable after the column is already datetime.
date_added_text = data["date_added"].astype("string").str.strip()
data["date_added"] = pd.to_datetime(date_added_text)
data.info()

In [None]:
# (rows, columns) of the frame.
data.shape

Find missing data in the dataset

In [None]:
# Number of missing values per column (`isnull` is an alias of `isna`).
data.isnull().sum()

# Cleaning data
Remove unnecessary data for this analysis such as `show_id`, `director`, `cast`

In [None]:
# Columns that play no role in the analysis below.
unused_columns = ["show_id", "director", "cast"]
data = data.drop(columns=unused_columns)
data.head()

Impute the missing values in `country` with `United States`, because it is the most frequent value. Drop the rows with a missing `rating` or `date_added`, since only `7` and `10` records are missing, respectively.

In [None]:
# Fill missing countries with "United States", then discard the rows that
# still lack a rating or an added date.
data = (
    data
    .assign(country=data["country"].fillna("United States"))
    .dropna(subset=["rating", "date_added"])
)
data.isna().sum()

In [None]:
# Keep only the first entry of the comma-separated `listed_in` and `country`
# values. The vectorised `.str` accessor replaces the per-row lambda and leaves
# missing values as NaN instead of raising on them.
for column in ("listed_in", "country"):
    data[column] = data[column].str.split(",", n=1).str[0].str.strip()
data.head()

#  Data analysis

In [None]:
# Number of titles per content type, most frequent first.
total_catalog = data.value_counts(subset=["type"]).reset_index(name="count")
total_catalog

In [None]:
# Pie chart of the catalog split by content type; the first slice is pulled out.
color_palette_list = ["#76D7C4", "#D1F2EB"]
labels = total_catalog["type"].unique()

fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(
    total_catalog["count"],
    explode=(0.1, 0),
    labels=labels,
    colors=color_palette_list,
    autopct="%1.0f%%",
    shadow=True,
    startangle=0,
)
ax.axis("equal")
ax.set_title(
    "Distribution of the netflix catalog by movies and tv show",
    fontweight="bold",
    size=14,
)
ax.legend(frameon=False)
plt.show()

69% of the titles in Netflix's catalog are `movies`, while the remaining 31% are `tv shows`.

In [None]:
# Titles per release year, restricted to release years from 2010 onwards.
total_productions_per_year = (
    data[['release_year']]
    .query("release_year >= 2010")
    .value_counts()
    .reset_index(name='counts')
)

In [None]:
def vertical_show_values_on_bars(axs):
    """Write the integer height of every vertical bar above the bar.

    Parameters
    ----------
    axs : Axes or numpy.ndarray of Axes
        A single axes object, or an array of them; every element of an
        array is annotated.

    Notes
    -----
    Bars whose height is NaN or infinite are skipped, because ``int()``
    cannot convert those values.
    """
    def _show_on_single_plot(ax):
        for p in ax.patches:
            height = p.get_height()
            if not np.isfinite(height):
                continue
            # Centre the label horizontally on the bar, at the bar's top edge.
            _x = p.get_x() + p.get_width() / 2
            _y = p.get_y() + height
            ax.text(_x, _y, int(height), ha="center")

    if isinstance(axs, np.ndarray):
        for ax in axs.flat:
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)
        
def horizontal_show_values_on_bars(axs, offset=3):
    """Write the integer width of every horizontal bar next to the bar's end.

    Parameters
    ----------
    axs : Axes or numpy.ndarray of Axes
        A single axes object, or an array of them; every element of an
        array is annotated.
    offset : float, optional
        Gap, in x data units, between the end of the bar and the label.
        Defaults to 3, the distance that was previously hard-coded.

    Notes
    -----
    Bars whose width is NaN or infinite are skipped, because ``int()``
    cannot convert those values.
    """
    def _show_on_single_plot(ax):
        for p in ax.patches:
            width = p.get_width()
            if not np.isfinite(width):
                continue
            # The bar ends at its left edge plus its width; for bars that
            # start at 0 this is just the width.
            _x = p.get_x() + width + offset
            _y = p.get_y() + p.get_height() / 2
            ax.text(_x, _y, int(width), va="center")

    if isinstance(axs, np.ndarray):
        for ax in axs.flat:
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

In [None]:
# Titles per release year, restricted to release years from 2010 onwards.
recent_release = data['release_year'] >= 2010
total_content_peer_year = (
    data.loc[recent_release, ['release_year']]
    .value_counts()
    .reset_index(name='counts')
)

In [None]:
# Bar chart of the yearly counts (grouped by `release_year`) with the value
# written above each bar.
plt.figure(figsize=(10,6))  # no longer bound to `ax`: this is a Figure, not an Axes
ax = sns.barplot(data=total_content_peer_year,x='release_year',y = 'counts',alpha=0.8, palette="hls")
sns.despine()
vertical_show_values_on_bars(ax)
plt.title("Number total of content added",fontweight='bold',size=14)
plt.ylabel("Number of content added  in the year ",size=14)
plt.xticks(size=14)
plt.yticks(size=14)
plt.xlabel("Years (2010 - 2021)",size=14)
plt.show()  # was `plt.show` without parentheses, which never called the function

We can see that most of the content in the Netflix catalog falls in the years `2016 - 2020` (titles are grouped by `release_year`), with `2018` being the highest peak.

In [None]:
#Tv shows
tv_shows_peer_year = data[data["type"] == "TV Show"]
tv_shows_peer_year = tv_shows_peer_year[["release_year"]]
tv_shows_peer_year = tv_shows_peer_year[tv_shows_peer_year["release_year"] >= 2010].value_counts().reset_index(name="counts")
#Movies
movies_peer_year = data[data["type"] == "Movie"]
movies_peer_year = movies_peer_year[["release_year"]]
movies_peer_year = movies_peer_year[movies_peer_year["release_year"] >= 2010].value_counts().reset_index(name="counts")

In [None]:
# Side-by-side yearly bar charts: movies on the left, TV shows on the right.
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle('Number of content added peer type', fontweight='bold')

axes[0].set_title("Number of movies added", fontsize=14)
sns.barplot(data=movies_peer_year, x='release_year', y='counts',
            alpha=0.8, palette="hls", ax=axes[0])

axes[1].set_title("Number of tv show added", fontsize=14)
sns.barplot(data=tv_shows_peer_year, x='release_year', y='counts',
            alpha=0.8, ax=axes[1])

sns.despine()
for panel in axes:
    panel.set_ylabel('Number of content added')
    panel.set_xlabel('Years 2010 - 2021')
vertical_show_values_on_bars(axes)
plt.show()


The number of movies increased in the period `2016 to 2018` but decreased in the period `2019 to 2021`. In contrast, the number of tv shows has been growing since `2016` and has kept that trend. Note that both charts group titles by `release_year`. This suggests a shift of Netflix users towards tv shows.

In [None]:
# Horizontal bar chart of the number of titles per (first listed) category.
category = data.value_counts(subset=["listed_in"]).reset_index(name="count")

fig, ax = plt.subplots(figsize=(12, 8))
ax.set_title("Catalog of netflix by cateory", size=14, fontweight='bold')
sns.barplot(data=category, x="count", y="listed_in", alpha=0.6, palette="hls", ax=ax)
horizontal_show_values_on_bars(ax)
sns.despine()
ax.set_xlabel("Number of movies and TV shows by category", size=14)
ax.set_ylabel("Categories", size=14)
plt.show()

In [None]:
# Bar chart of the 15 countries with the most titles.
content_by_countries = (
    data.value_counts(subset=["country"])
    .reset_index(name="count")
    .head(15)
)

fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title("Top 15 countries creating content", size=14, fontweight='bold')
sns.barplot(data=content_by_countries, x="country", y="count", alpha=0.8, palette="hls", ax=ax)
vertical_show_values_on_bars(ax)
sns.despine()
ax.set_xlabel("Content-creating countries", size=14)
ax.set_ylabel("Number of content created", size=14)
plt.show()

In [None]:
# One long string made of every description, separated by single spaces.
text = " ".join(data["description"])

In [None]:
# Word cloud of all descriptions on a black figure background.
plt.figure(figsize=(20,6), facecolor='k')
wordcloud = WordCloud(width=1080, height=480, colormap="Oranges_r").generate(text)
# The default title colour is dark and unreadable on the black background,
# so draw the title in white.
plt.title("WordCloud of description", color="white")
plt.imshow(wordcloud,interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

# Recommendation system using tf-idf

Tf–idf, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval searches, text mining, and user modeling.

To do so I made use of the following resources:
* [tf–idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
* [Building a movie content based recommender using tf-idf](https://towardsdatascience.com/content-based-recommender-systems-28a1dbd858f5)
* [Working With Text Data](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)
* [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity)

In [None]:
# Turn every description into a TF-IDF vector, ignoring English stop words.
descriptions = data["description"]
tfidfVec = TfidfVectorizer(stop_words="english", use_idf=True)
tfid_matrix = tfidfVec.fit_transform(descriptions)
tfid_matrix.shape

In [None]:
# Pairwise dot products between all description vectors (rows of tfid_matrix).
cosine_sim = linear_kernel(tfid_matrix, tfid_matrix)

Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space. It is defined to equal the cosine of the angle between them, which is also the same as the inner product of the same vectors normalized to both have length 1.
<p align="center">
  <img width="400" height="100" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1d94e5903f7936d3c131e040ef2c51b473dd071d">
</p> 

In [None]:
# Map each title to the POSITION of its row in `data`. The rows of `cosine_sim`
# follow that positional order, whereas the index labels of `data` are no longer
# contiguous after the earlier `dropna`, so labels must not be used here.
indices = pd.Series(np.arange(len(data)), index=data['title'])
# Keep the first occurrence of a repeated title so a lookup returns one scalar.
indices = indices[~indices.index.duplicated(keep="first")]
indices.head()

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Print the ten titles whose descriptions are most similar to `title`.

    Parameters
    ----------
    title : str
        Exact value from ``data["title"]``. If the title occurs more than
        once, the first matching row is used.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows and columns follow the row
        order of ``data``.

    Returns
    -------
    None
        The recommendations are printed, not returned.

    Raises
    ------
    KeyError
        If `title` is not present in ``data["title"]``.
    """
    # Look the title up by row POSITION: the similarity matrix is positional,
    # while the index labels of `data` may have gaps after rows were dropped.
    matches = np.flatnonzero((data["title"] == title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending sort keeps tied scores in their original row order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query row explicitly instead of assuming it is ranked first.
    movie_index = ranked[ranked != idx][:10]
    print("The recommendations are: \n", data["title"].iloc[movie_index])

In [None]:
# Example query: titles similar to "Altered Carbon".
get_recommendations(title="Altered Carbon")

In [None]:
# Example query: titles similar to "Marco Polo".
get_recommendations(title="Marco Polo")

# <center> Thank you for reading the notebook. If it helped you, please upvote it! </center>