In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns #importing our visualization library
import matplotlib.pyplot as plt

In [None]:
# Read the Netflix titles CSV from the Kaggle input directory into a DataFrame.
NETFLIX_TITLES_CSV = '/kaggle/input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(NETFLIX_TITLES_CSV)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
 df.isna().sum()  # number of missing values in each column

In [None]:
# Heatmap of the boolean missing-value mask, to show where values are missing.
missing_mask = df.isna()
sns.heatmap(missing_mask, cmap='viridis')

We have null values in the director, cast, country, date_added and rating columns, so let's deal with them.

In [None]:
# Frequency of every rating label, with missing values counted as their own row.
# (Calling .unique() on the result of value_counts() only returns the distinct
# count values, which says nothing about which ratings exist or are missing.)
df['rating'].value_counts(dropna=False)

In the above data, we can choose to drop the director and cast columns completely, as they are not a major part of what we visualize and do not add significant value to this analysis. We are only focused on visualizing this data, so dropping these two columns won't cause any trouble. This should not be a regular practice, though: if we were building a recommender system, we could not drop the director and cast of a movie, as these are key features used to recommend movies to users.

In [None]:
# Remove the director and cast columns from the frame in place.
df.drop(columns=['director', 'cast'], inplace=True)

In [None]:
# Preview the first five rows after removing the director and cast columns.
df.head(n=5)

We replace all the NaN values in the country column with United States, as Netflix was created in the USA and every show is aired on Netflix US. So instead of dropping the whole column, we just fill in the missing values in order to keep our data.

In [None]:
# Fill missing countries with 'United States'.
# Assign the result back to the column: an inplace call on `df['country']` acts
# on a column selection (chained assignment), which pandas warns about and
# which leaves `df` unchanged when Copy-on-Write is enabled.
df['country'] = df['country'].fillna('United States')

We already have the release year for each title, so even without the date on which it was added to Netflix our analysis won't be affected much. Hence we can drop the date_added column.

In [None]:
# Remove the date_added column from the frame in place.
df.drop(columns=['date_added'], inplace=True)

In [None]:
# Preview the first five rows after removing the date_added column.
df.head(n=5)

In [None]:
# How often each rating label occurs (missing values are excluded by default).
rating_column = df['rating']
rating_column.value_counts()

In [None]:
# How often each distinct `listed_in` value occurs.
genre_column = df['listed_in']
genre_column.value_counts()

As we can see, we only have 10 missing values in our rating column; we can either drop them or replace them. TV-MA is the most common rating, so we can replace all these NaN values with TV-MA.

In [None]:
# Fill missing ratings with 'TV-MA'.
# Assign the result back to the column: an inplace call on `df['rating']` acts
# on a column selection (chained assignment), which pandas warns about and
# which leaves `df` unchanged when Copy-on-Write is enabled.
df['rating'] = df['rating'].fillna('TV-MA')

In [None]:
# Re-count the missing values per column after the cleaning steps.
df.isna().sum()

We have now dealt with all of our missing data, so let's get started with our data visualization.

In [None]:
# Preview the first five rows of the cleaned frame before plotting.
df.head(n=5)

In [None]:
# Bar per value of `type`, showing how many titles fall in each.
sns.countplot(data=df, x='type')

In [None]:
# Number of titles per rating, drawn on an explicitly created 12x8 axes.
rating_fig, rating_ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='rating', ax=rating_ax)

In [None]:
# Number of titles per release year; the very wide figure leaves room for one bar per year.
year_fig, year_ax = plt.subplots(figsize=(35, 6))
sns.countplot(data=df, x='release_year', ax=year_ax)

As we can see, most of the movies and TV shows on Netflix were released in the past decade, and very few were released earlier.

In [None]:
# Which rating labels occur for each `type` value: one marker per (rating, type) pair present.
scatter_fig, scatter_ax = plt.subplots(figsize=(16, 6))
sns.scatterplot(data=df, x='rating', y='type', ax=scatter_ax)

In [None]:
# Number of titles per rating, with separate bars for each `type` value.
split_fig, split_ax = plt.subplots(figsize=(12, 8))
sns.countplot(data=df, x='rating', hue='type', ax=split_ax)

In [None]:
import plotly.express as px

# Distribution of titles across the most frequent `country` values.
# The chart is built from per-country title counts rather than from the first
# ten rows of the file, which were only an arbitrary sample of titles.
TOP_N_COUNTRIES = 10
top_countries = (
    df['country']
    .value_counts()                      # sorted by frequency, descending
    .head(TOP_N_COUNTRIES)
    .rename_axis('country')
    .reset_index(name='title_count')
)
fig = px.sunburst(
    top_countries,
    path=['country'],
    values='title_count',                # wedge size is the number of titles
)
fig.show()

In [None]:
# Pie chart of the share of titles carrying each rating, labelled with percentages.
rating_share = df['rating'].value_counts()
rating_share.plot.pie(autopct='%1.1f%%', figsize=(20, 35))
plt.show()

In [None]:
# Countries ranked by number of titles.
# value_counts() already returns the counts sorted in descending order.
country_counter = df['country'].value_counts().to_frame()
# Keep the first 11 rows of the ranking (same slice size as before).
topcountry = country_counter.head(11)
topcountry

In [None]:
# The 15 titles with the earliest release years.
by_release_year = df.sort_values(by="release_year")
# Keep only rows whose duration is not the empty string.
old = by_release_year.loc[by_release_year['duration'] != ""]
old.loc[:, ['title', "release_year"]].head(15)

In [None]:
# Stand-up comedy titles whose country is exactly "United States".
tag = "Stand-Up Comedy"
needle = tag.lower()
# Flag rows (1/0) whose `listed_in` text contains the tag, ignoring case.
df["relevant"] = df['listed_in'].fillna("").map(lambda genres: int(needle in genres.lower()))
com = df[df["relevant"] == 1]
is_us = com["country"] == "United States"
com[is_us][["title", "country", "release_year"]].head(10)

In [None]:
# Kids' TV titles whose country is exactly "United States".
tag = "Kids' TV"
needle = tag.lower()
# Flag rows (1/0) whose `listed_in` text contains the tag, ignoring case.
df["relevant"] = df['listed_in'].fillna("").map(lambda genres: int(needle in genres.lower()))
com = df[df["relevant"] == 1]
is_us = com["country"] == "United States"
com[is_us][["title", "country", "release_year"]].head(10)

In [None]:
# Two-column table of `country` values and how many titles carry each one.
# Built without going through `.values`, which would turn the counts into
# generic Python objects instead of integers.
df_countries = (
    df['country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)
df_countries.head()

In [None]:
# World map shaded by the number of titles per country.
# Make sure the counts are numeric so they map to a continuous colour scale.
country_totals = df_countries.assign(count=pd.to_numeric(df_countries["count"]))
# NOTE(review): rows whose `country` text is not a single recognised country
# name will presumably not be drawn - confirm against the data.
fig = px.choropleth(
    country_totals,
    locations="country",
    locationmode="country names",
    color="count",                          # shade each country by its title count
    labels={"count": "Number of titles"},   # `labels` maps column names to display text
)
fig.show()

In [None]:
# Two-column table of release years and how many titles were released in each.
# Built without going through `.values`, which would turn both columns into
# generic Python objects instead of integers.
date = (
    df['release_year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Count')
)
date.head()

In [None]:
# Bar chart of the 20 release years with the most movies.
plt.figure(figsize=(12, 6))
movie_years = df.loc[df["type"] == "Movie", "release_year"]
movie_years.value_counts().head(20).plot(kind="bar", color="Red")
plt.title("Frequency of Movies which were released in different years and are available on Netflix")

In [None]:
# Bar chart of the 20 release years with the most TV shows.
plt.figure(figsize=(12, 6))
tv_years = df.loc[df["type"] == "TV Show", "release_year"]
tv_years.value_counts().head(20).plot(kind="bar", color="Blue")
plt.title("Frequency of TV shows which were released in different years and are available on Netflix")

In [None]:
# Horizontal bar chart of the 10 most frequent `listed_in` values among movies.
plt.figure(figsize=(12, 6))
movie_genres = df.loc[df["type"] == "Movie", "listed_in"]
movie_genres.value_counts().head(10).plot(kind="barh", color="black")
plt.title("Top 10 Genres of Movies", size=18)

In [None]:
# Horizontal bar chart of the 10 most frequent `listed_in` values among TV shows.
plt.figure(figsize=(12, 6))
tv_genres = df.loc[df["type"] == "TV Show", "listed_in"]
tv_genres.value_counts().head(10).plot(kind="barh", color="brown")
plt.title("Top 10 Genres of TV Shows", size=18)

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud built from every title joined into one space-separated string.
all_titles = " ".join(df['title'])
cloud_builder = WordCloud(background_color='Black', width=1920, height=1080)
wordcloud = cloud_builder.generate(all_titles)

# Draw the cloud without axes, save it to disk, then display it.
plt.subplots(figsize=(25, 15))
plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('cast.png')
plt.show()

And with this I conclude this notebook. Kindly upvote if you like it. I might add some more visualizations in the future and try to do much better feature engineering. Keep coding! Keep Kaggling!