In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import time
import re
import datetime as dt

import os
# Print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load the Netflix titles CSV into `df` and preview its first rows
netflix_csv = "/kaggle/input/netflix-shows/netflix_titles.csv"
df = pd.read_csv(netflix_csv)
df.head()

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage
df.info()

In [None]:
# Convert the date-like columns to proper types.

# release_year is parsed with an explicit "%Y" format and stored back as a
# plain year number.
df["release_year"] = pd.to_datetime(df["release_year"], format="%Y").dt.year

# date_added is parsed as a full date. Surrounding whitespace is stripped
# first: pandas >= 2 infers a single format from the first value and raises on
# any value that deviates from it (e.g. a leading space).
# NOTE(review): assumes the raw column holds strings such as "September 25, 2021" - confirm.
date_added = df["date_added"]
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Only raw text has a .str accessor; skipping this on an already-converted
    # column keeps the cell safe to re-run.
    date_added = date_added.str.strip()
df["date_added"] = pd.to_datetime(date_added)

df.head()

In [None]:
# Number of titles per rating, split into TV shows and movies
fig, ax = plt.subplots(figsize=(20, 4))
sns.countplot(data=df, x="rating", hue="type", palette="Set3", ax=ax);

In [None]:
# Number of titles per rating (all types together)
fig, ax = plt.subplots(figsize=(20, 4))
plt.xticks(rotation=45)  # tilt the x tick labels
sns.countplot(data=df, x="rating", ax=ax);

In [None]:
# Number of titles added after 2014-01-01, counted per (year added, type, rating)
added_after_2014 = df[df["date_added"] > "2014-01-01"]
df_data_added = (
    added_after_2014
    # The year is taken from the filtered frame, which has no missing dates
    # (a missing date never satisfies the comparison above). Taking it from the
    # unfiltered column can turn the years into floats when dates are missing.
    .groupby([added_after_2014["date_added"].dt.year, "type", "rating"])["show_id"]
    .count()
    .reset_index()
)
title = 'Number of content added in each year'

# One point-plot panel per content type, one line per rating.
# `title` is not a catplot/pointplot parameter (it is ignored or rejected
# depending on the seaborn version); the figure title is set via suptitle below.
g = sns.catplot(
    data=df_data_added,
    x="date_added",
    y="show_id",
    hue="rating",
    col="type",
    kind="point",
    height=4,
    aspect=12/8,
)
g.axes[0,0].set_ylabel('Number of videos added')
g.fig.subplots_adjust(top=0.85)  # leave room above the panels for the title
g.fig.suptitle(title);

In [None]:
# Number of movies vs. TV shows in the whole dataset
palette = ["#FFC300", "#C70039"]
type_counts_ax = sns.countplot(data=df, x="type", palette=palette)
type_counts_ax.set_title("Count of TV shows and Movies over all the years");

In [None]:
# Number of titles per release year
fig, ax = plt.subplots(figsize=(20, 4))
plt.xticks(rotation=45)  # tilt the x tick labels
sns.countplot(data=df, x="release_year", ax=ax);

In [None]:
# Count how many titles each country is credited on.
# A `country` cell can list several comma-separated names, so each name is
# counted on its own. Names are split on "," and then stripped, so stray
# whitespace or a leading/trailing comma does not create bogus entries such as
# "" or "Name,".
country_names = (
    df["country"]
    .dropna()
    .astype(str)
    .str.split(",")
    .explode()
    .str.strip()
)
country_names = country_names[country_names != ""]

# value_counts() already orders the names from most to least frequent
country_counts = country_names.value_counts()
countries = country_counts.to_dict()  # name -> count mapping

df_countries = country_counts.rename_axis("country").reset_index(name="count")
df_countries

In [None]:
# Bar chart of the 20 countries with the highest title counts
top_20_countries = df_countries.head(20)

plt.figure(figsize=(25, 5))
plt.xticks(rotation=90)  # vertical tick labels so the names fit
top_countries_ax = sns.barplot(x="country", y="count", data=top_20_countries)
top_countries_ax.set_title("Top 20 countries in terms of Netflix content");

In [None]:
# Titles released per year (2016-2020) for the three countries with the most content,
# drawn as three side-by-side panels sharing one y axis
title = 'Number of videos release in last 5 years in the top 3 countries'
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
fig.suptitle(title, fontsize=15, color='#B38A03')

released_2016_to_2020 = (df.release_year > 2015) & (df.release_year <= 2020)

# NOTE(review): the country filter is an exact match on the whole `country`
# cell, so titles crediting several countries are not included here even though
# they are counted in df_countries - confirm this is intended.
for country_ax, country in zip(ax, df_countries.head(3).country.values):
    df_country = df.loc[(df.country == country) & released_2016_to_2020]
    plots = sns.countplot(data=df_country, x='release_year', ax=country_ax)
    country_ax.set_title(country, color='#60B303', fontsize=10)

In [None]:
# Count how many titles each director is credited on.
# A `director` cell can list several comma-separated names, so each name is
# counted on its own. Names are split on "," and then stripped, so stray
# whitespace or a leading/trailing comma does not create bogus entries such as
# "" or "Name,".
director_names = (
    df["director"]
    .dropna()
    .astype(str)
    .str.split(",")
    .explode()
    .str.strip()
)
director_names = director_names[director_names != ""]

# value_counts() already orders the names from most to least frequent
director_counts = director_names.value_counts()
directors = director_counts.to_dict()  # name -> count mapping

df_directors = director_counts.rename_axis("director").reset_index(name="count")
df_directors

In [None]:
# Bar chart of the 10 directors with the highest title counts
top_10_directors = df_directors.head(10)

plt.figure(figsize=(25, 5))
plt.xticks(rotation=90)  # vertical tick labels so the names fit
top_directors_ax = sns.barplot(x="director", y="count", data=top_10_directors)
top_directors_ax.set_title("Top 10 directors in terms of Netflix content");

In [None]:
# Build a movie-only frame with the leading number of `duration` as a numeric
# `minute` column.
# Filtering first and copying afterwards makes df_movie an independent frame,
# so adding a column does not trigger SettingWithCopyWarning.
df_movie = df[df["type"] == "Movie"].copy()

# Vectorised extraction of the first run of digits. A missing duration becomes
# NaN instead of raising, and the raw-string pattern avoids the invalid-escape
# warning that '\d' triggers in a normal string literal.
df_movie["minute"] = pd.to_numeric(
    df_movie["duration"].str.extract(r"(\d+)", expand=False)
)
df_movie.head(3)

In [None]:
# Number of movies per release year
plt.figure(figsize=(20, 4))
movies_per_year_ax = sns.countplot(data=df_movie, x="release_year", palette="tab10")
plt.xticks(rotation=45);

In [None]:
# Average movie duration (minutes) per release year.
# Only `minute` is aggregated: calling mean() on the whole frame would include
# the text columns, which pandas >= 2 rejects with a TypeError.
df_movie_duration = (
    df_movie
    .groupby(["release_year"])[["minute"]]
    .mean()
    .sort_values("minute")
)

plt.figure(figsize = (15, 6))
# x is the release year (the group index), y the mean duration for that year
sns.lineplot(x = df_movie_duration.index, y = df_movie_duration["minute"].values);
plt.ylabel("Average duration of movie");
plt.xlabel("Release Year");
plt.title("Trends of Movie Duration");

In [None]:
# Plotting the 20 most common movie genres.
# Each distinct `listed_in` value is treated as one category. The 20 most
# frequent values are selected with value_counts(); taking .head(20) of the
# column would only look at the first 20 rows of the frame.
top_movie_genres = df_movie["listed_in"].value_counts().head(20).index

plt.figure(figsize = (20, 4))
plt.xticks(rotation=90)
# `order` restricts the plot to the selected categories, most frequent first
ax = sns.countplot(x = "listed_in", data = df_movie, order = top_movie_genres);
plt.xlabel("Movie Geners")
plt.title("20 Most common movie geners");

In [None]:
# Keep only the rows whose type is "TV Show" and preview them
is_tv_show = df["type"] == "TV Show"
df_tv = df[is_tv_show]
df_tv.head()

In [None]:
# Number of TV shows per `duration` value, split by rating
palette=['#CCCCFF',"#FAAE7B", "#DFFF00", "#FFBF00", "#FF7F50", "#DE3163", "#9FE2BF", "#40E0D0", "#6495ED"]

plt.figure(figsize=(15, 4))
seasons_by_rating_ax = sns.countplot(data=df_tv, x="duration", hue="rating", palette=palette)
seasons_by_rating_ax.legend(loc='upper right')
plt.xticks(rotation=45);

In [None]:
# Number of TV shows per `duration` value
plt.figure(figsize=(15, 4))
plt.xticks(rotation=45)  # tilt the x tick labels
season_counts_ax = sns.countplot(data=df_tv, x="duration")
season_counts_ax.set_title("Total count of each season");