In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import plotly.express as px
import matplotlib.pyplot as plt
import re
import plotly.graph_objects as go
import seaborn as sb

In [None]:
# Load the Netflix titles CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

In [None]:
# Preview the first rows to see which columns the dataset provides.
data.head()

In [None]:
# Summarise column dtypes and non-null counts of the raw data.
data.info()

In [None]:
# Number of missing values in each column of the raw data.
data.isna().sum()

In [None]:
# Back-fill missing values: each NaN takes the next valid value further down the same column.
# `DataFrame.bfill()` replaces `fillna(method="bfill")`, whose `method` argument is deprecated
# in recent pandas releases.
# NOTE(review): for text columns such as `director` or `country` this copies a value from an
# unrelated title, and NaNs at the very end of a column stay unfilled — confirm this is intended.
data2 = data.bfill()

In [None]:
# Number of missing values left in each column after the back-fill.
data2.isna().sum()

In [None]:
# Summarise column dtypes and non-null counts of the filled data.
data2.info()

In [None]:
# Share of each content type (values of the `type` column) drawn as a pie chart.
# `colors` is the shared palette reused by the later plotly figures.
colors = px.colors.qualitative.D3
vs_count = data2['type'].value_counts()
fig = px.pie(
    names=vs_count.index,
    values=vs_count.values,
    title='Movies vs Tv Shows',
    color_discrete_sequence=colors,
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [None]:
# Number of titles per content type and release year, for release years from 2000 onwards.
released_since_2000 = data2[data2['release_year'] >= 2000]
count_by_year = (
    released_since_2000
    .groupby(['type', 'release_year'], as_index=False)['show_id']
    .count()
)

# Applies to every matplotlib figure created after this point.
plt.rcParams['figure.figsize'] = (20, 10)

# One row per year, one column per content type; years with no titles of a type show 0.
yearly_by_type = count_by_year.pivot_table(
    index='release_year',
    columns='type',
    values='show_id',
    aggfunc='sum',
    fill_value=0,
)
yearly_by_type.plot.bar(stacked=False)

In [None]:
# Pie chart of the 25 most frequent values of the `country` column.
country_count = data2['country'].value_counts().head(25)
fig = px.pie(
    names=country_count.index,
    values=country_count.values,
    title='Content share of countries',
    color_discrete_sequence=colors,
)
fig.show()

In [None]:
# Duration of TV shows: strip the "Season"/"Seasons" suffix and convert the rest to a number.
season_suffix = re.compile('Seasons?')
tv_show_duration = data2.loc[data2['type'] == 'TV Show', 'duration']
tv_show_duration = tv_show_duration.map(
    lambda duration_text: float(season_suffix.sub('', duration_text))
)

# Histogram with one bin per season count.
t1 = go.Histogram(x=tv_show_duration, xbins={'size': 1}, marker={'color': colors})
layout1 = go.Layout(title='Distribution of tv-show duration', xaxis={'title': 'Seasons'})
fig = go.Figure(data=[t1], layout=layout1)
fig.show()

In [None]:
# Duration of movies: strip the "min" suffix and convert the rest to a number.
movie_duration = data2.loc[data2['type'] == 'Movie', 'duration']
movie_duration = movie_duration.map(
    lambda duration_text: float(duration_text.replace('min', ''))
)

# Histogram with one bin per minute.
t2 = go.Histogram(x=movie_duration, xbins={'size': 1}, marker={'color': colors})
layout2 = go.Layout(title='Distribution of movie duration', xaxis={'title': 'Minutes'})
fig = go.Figure(data=[t2], layout=layout2)
fig.show()

In [None]:
# Number of titles per rating, limited to the 14 most frequent ratings (most frequent first).
top_ratings = data2['rating'].value_counts().index[:14]
plt.figure(figsize=(12, 9))
sb.countplot(data=data2, x='rating', order=top_ratings)
plt.show()

In [None]:
from wordcloud import WordCloud

# Build the word cloud from the full text of every description.
# `str(series)` only yields pandas' truncated repr (a handful of rows, the index labels and the
# "Name: ..., Length: ..., dtype: ..." footer), so the actual strings are joined instead.
description_text = ' '.join(data2['description'].dropna().astype(str))

wc = WordCloud(background_color='black', max_words=500)
wc.generate(description_text)

fig = plt.figure(figsize=(10, 6))
plt.imshow(wc, interpolation='bilinear')
plt.title('Words used to describe the movies or tv-shows')
plt.axis('off')
plt.show()

In [None]:
# Bar chart of the 15 `director` values with the most non-null descriptions.
# Uses the raw `data` frame (not the back-filled `data2`).
count_data = data.groupby('director').count().sort_values('description', ascending=False)
plot_data = count_data.iloc[:15]

sb.barplot(x=plot_data.index, y=plot_data['description'])
plt.title("Top-15 Most-Producing Directors", size=18)
plt.xlabel("Director")
plt.ylabel("Counts", size=18)
plt.xticks(rotation=45, ha="right", size=15)
# Even tick marks 2, 4, ..., 20 on the count axis.
plt.yticks(np.arange(2, 21, 2, dtype=np.int32), size=15)
plt.grid()
plt.show()

In [None]:
# Collect every distinct genre label found in the ', '-separated `listed_in` column.
# Sorting keeps the list (and anything derived from its order) identical on every run;
# the order of a bare `list(set(...))` of strings can change between kernel sessions.
genres = sorted(
    {label for entry in data2['listed_in'] for label in entry.split(', ')}
)

print(genres)
print('\nTotal number of genres: {}'.format(len(genres)))

In [None]:
# One-hot encode the genres: add one 0/1 column per genre to `data2`.
# `str.get_dummies` builds every indicator in a single vectorised pass, instead of a `.loc`
# lookup and a fresh split of `listed_in` for each (genre, row) pair.
genre_flags = (
    data2['listed_in']
    .str.get_dummies(sep=', ')
    # Guarantee one column per entry of `genres`, in that order.
    .reindex(columns=genres, fill_value=0)
)
for genre in genres:
    data2[genre] = genre_flags[genre]

data2.head()

In [None]:
# Total number of titles per genre (sum of each 0/1 genre column), most common first,
# plotted as a horizontal bar chart.
data_genre = (
    data2[genres]
    .sum(axis=0)
    .to_frame(name='Counts')
    .sort_values('Counts', ascending=False)
)

sb.barplot(x=data_genre['Counts'], y=data_genre.index)
plt.xticks(size=15)
plt.yticks(size=13)
# Call the label setters. Assigning to `plt.xlabel` / `plt.ylabel` would replace the pyplot
# functions with strings, set no label, and break later `plt.xlabel(...)` calls in the session.
# Counts run along the x-axis and genre names along the y-axis.
plt.xlabel('Counts')
plt.ylabel('Genres')
plt.title('Genres Count', size=20)
plt.grid()
plt.show()