In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import the libraries used for data handling and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px 
import plotly.figure_factory as ff
from plotly.subplots import make_subplots 
from pandas_profiling import ProfileReport 

In [None]:
# Read the Netflix titles CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

In [None]:
# Show the first five rows of the data
df.head()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Build a profiling report for the raw DataFrame and display it as the cell output
report = ProfileReport(df)
report

# **<font color = Darkred>Data Cleaning</font>**

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Missing values per column, before any cleaning
df.isna().sum()

In [None]:
# Inspect the rows that have no rating
df.loc[df['rating'].isna()]

In [None]:
# Label missing ratings as "Unknown", then confirm that no nulls remain in the column
df['rating'] = df['rating'].where(df['rating'].notna(), "Unknown")
df['rating'].isna().sum()

In [None]:
# Drop the `director` and `cast` columns.
# errors='ignore' keeps the cell re-runnable: running it a second time no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=['director', 'cast'], errors='ignore')
df.columns

In [None]:
# Inspect the rows that have no `date_added` value
df.loc[df['date_added'].isna()]

In [None]:
# Keep only the rows that have a `date_added` value.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not trigger SettingWithCopyWarning.
df = df[df['date_added'].notna()].copy()

In [None]:
# Most frequent value of the `country` column (first entry of the mode)
df['country'].mode()[0]

In [None]:
# Fill missing countries with the most frequent `country` value
df['country'] = df['country'].where(df['country'].notna(), df['country'].mode().iloc[0])

In [None]:
# Missing values per column after the cleaning steps above
df.isna().sum()

In [None]:
# Year the title was added: the last whitespace-separated token of `date_added`.
# Strip first and split on any run of whitespace, so stray leading/trailing or
# doubled spaces cannot produce an empty token.
df['year'] = df['date_added'].str.strip().str.split().str[-1]
df['year'].head()

In [None]:
# Month the title was added: the first whitespace-separated token of `date_added`.
# Strip first: a value with a leading space would otherwise split into an
# empty first token and yield '' as the month.
df['month'] = df['date_added'].str.strip().str.split().str[0]
df['month'].head()

In [None]:
# Count rows whose month is missing or blank. A blank month is an empty string
# rather than NaN, so isnull() alone would not report it.
(df['month'].isnull() | df['month'].str.strip().eq('')).sum()

In [None]:
# Map every content rating onto the audience group it targets.
# Written audience-first so each group's ratings can be read at a glance.
_ratings_by_audience = {
    'Kids': ['TV-Y', 'TV-G', 'G'],
    'Older Kids': ['TV-PG', 'TV-Y7-FV', 'TV-Y7', 'PG'],
    'Teens': ['TV-14', 'PG-13'],
    'Adults': ['TV-MA', 'R', 'NR', 'UR', 'NC-17'],
}
ratings_ages = {
    rating: audience
    for audience, ratings in _ratings_by_audience.items()
    for rating in ratings
}

In [None]:
# Translate each rating into its audience group; ratings without an entry in
# `ratings_ages` are carried over unchanged.
df['target_ages'] = df['rating'].map(lambda rating: ratings_ages.get(rating, rating))
df['target_ages'].unique()

In [None]:
# The principal country is the first entry of the comma-separated `country` value
df['principal_country'] = df['country'].str.split(",").str[0]
df['principal_country']

In [None]:
# Column names after adding the derived columns
df.columns

In [None]:
# Column dtypes before the type conversions in the next cell
df.dtypes

In [None]:
# Store `type` and `target_ages` as categoricals and the added year as a number.
df['type'] = df['type'].astype('category')

# Values outside the four listed audience groups become NaN.
df['target_ages'] = df['target_ages'].astype(
    pd.CategoricalDtype(categories=['Kids', 'Older Kids', 'Teens', 'Adults'])
)
df['year'] = pd.to_numeric(df['year'])

In [None]:
# Column dtypes after the type conversions
df.dtypes

# **<font color = Darkred>Data Visualization</font>**

In [None]:
# Ten most frequent `country` values and the number of titles for each.
# Both columns are named explicitly because the default column names produced
# by value_counts().reset_index() differ between pandas 1.x ('index', 'country')
# and pandas 2.x ('country', 'count'). value_counts() already sorts by
# descending count, so no extra sort is needed.
dfc = (
    df['country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
    .head(10)
)

fig = px.pie(dfc, values='Count', names='Country',
             color_discrete_sequence=px.colors.sequential.Jet)
fig.show()

In [None]:
# Number of titles per original release year.
dfy = df['release_year'].value_counts()
ax = plt.figure(figsize=(10, 7)).add_subplot(111)
# `palette` only takes effect together with a `hue` variable (seaborn warns and
# ignores it otherwise), so it is not passed here.
sns.lineplot(x=dfy.index, y=dfy.values, ax=ax)
# Label the figure so it can be read on its own.
ax.set(title='Titles by release year', xlabel='Release year', ylabel='Number of titles');

In [None]:
# Number of titles per `year` value (the year parsed from `date_added`).
dfy = df['year'].value_counts()
ax = plt.figure(figsize=(10, 7)).add_subplot(111)
sns.barplot(x=dfy.index, y=dfy.to_numpy(), palette='terrain', ax=ax)

In [None]:
# Share of titles per rating.
x = df['rating'].value_counts()
fig = px.pie(
    names=x.index,
    values=x.to_numpy(),
    color_discrete_sequence=px.colors.qualitative.Safe,
)
fig.show()

In [None]:
# Share of movies versus TV shows.
# Both columns are named explicitly because the default column names produced
# by value_counts().reset_index() differ between pandas 1.x and 2.x.
dfty = df['type'].value_counts().rename_axis('Type').reset_index(name='Count')
fig = px.pie(dfty, values='Count', names='Type',
             color_discrete_sequence=px.colors.sequential.Aggrnyl)

# Draw the percentage and the label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Titles per rating, split by content type.
ax = plt.figure(figsize=(10, 7)).add_subplot(111)
sns.countplot(x='rating', hue='type', data=df, palette='winter', ax=ax)

In [None]:
# Titles per target audience group.
ax = plt.figure(figsize=(10, 7)).add_subplot(111)
sns.countplot(x='target_ages', data=df, palette='crest', ax=ax)

In [None]:
# Titles per target audience group, split by content type.
ax = plt.figure(figsize=(10, 7)).add_subplot(111)
sns.countplot(x='target_ages', hue='type', data=df, palette='crest', ax=ax)

In [None]:
# Titles per rating, coloured by the audience group the rating maps to.
ax = plt.figure(figsize=(10, 7)).add_subplot(111)
sns.countplot(x='rating', hue='target_ages', data=df, ax=ax)

In [None]:
# Titles per year and content type for years after 2010, counted once by
# `release_year` and once by `year` (the year parsed from `date_added`).
released_year_df = (
    df.loc[df['release_year'] > 2010]
    .groupby(['release_year', 'type'])['show_id']
    .count()
    .reset_index()
)
added_year_df = (
    df.loc[df['year'] > 2010]
    .groupby(['year', 'type'])['show_id']
    .count()
    .reset_index()
)

# One entry per line in the chart:
# (yearly table, year column, content type, legend name, marker colour).
trace_specs = [
    (released_year_df, 'release_year', 'Movie', 'Movie: Released Year', 'rgb(205, 92, 92)'),
    (released_year_df, 'release_year', 'TV Show', 'TV Show: Released Year', 'rgb(255, 127, 80)'),
    (added_year_df, 'year', 'Movie', 'Movie: Year', 'RebeccaPurple'),
    (added_year_df, 'year', 'TV Show', 'TV Show: Year', 'LightSeaGreen'),
]

fig = go.Figure()
for yearly_counts, year_column, content_type, trace_name, colour in trace_specs:
    rows = yearly_counts.loc[yearly_counts['type'] == content_type]
    fig.add_trace(go.Scatter(x=rows[year_column],
                             y=rows['show_id'],
                             mode='lines+markers',
                             name=trace_name,
                             marker=dict(color=colour)))

fig.update_xaxes(categoryorder='total descending')
fig.show()

In [None]:
# Side-by-side donut charts of the target audience groups for movies and TV shows.
movie_df = df[df['type'] == 'Movie']
show_df = df[df['type'] == 'TV Show']

night_colors = ['rgb(219, 112, 147)', 'rgb(18, 36, 37)', 'rgb(147, 112, 219)',
                'rgb(36, 55, 57)', 'rgb(6, 4, 4)']
sunflowers_colors = ['rgb(177, 127, 38)', 'rgb(205, 152, 36)', 'rgb(99, 79, 37)',
                     'rgb(129, 180, 179)', 'rgb(255, 140, 0)']

# Count titles per audience group. Calling value_counts() on the whole frame
# counts unique rows instead (and drops rows containing NaN), which leaves the
# values misaligned with the per-row labels.
movie_age_counts = movie_df['target_ages'].value_counts()
show_age_counts = show_df['target_ages'].value_counts()

fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])

# `name` feeds the 'name' part of the hover text configured below.
fig.add_trace(go.Pie(values=movie_age_counts.tolist(),
                     labels=movie_age_counts.index.tolist(),
                     name='Movies',
                     marker_colors=night_colors),
              row=1, col=1)

fig.add_trace(go.Pie(values=show_age_counts.tolist(),
                     labels=show_age_counts.index.tolist(),
                     name='TV Shows',
                     marker_colors=sunflowers_colors),
              row=1, col=2)

fig.update_traces(textposition='inside', hole=0.5, hoverinfo='label+percent+name')
# The two annotations put a caption in the hole of each donut.
fig.update_layout(title_text='Rating distribution by Type of content',
                  title_x=0.5,
                  title_font=dict(size=20, color='MidnightBlue'),
                  annotations=[dict(text='Movies', x=0.19, y=0.5, font_size=16, showarrow=False,
                                    font_color='MidnightBlue'),
                               dict(text='TV Shows', x=0.82, y=0.5, font_size=16,
                                    showarrow=False, font_color='MidnightBlue')])

fig.show()

In [None]:
# Twenty most frequent `listed_in` values among movies.
dfm = df[df['type']=='Movie']
# Both columns are named explicitly because the default column names produced
# by value_counts().reset_index() differ between pandas 1.x and 2.x. The x axis
# shows genres, so it is labelled "Genre".
dfmo = (
    dfm['listed_in']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
    .head(20)
)

fig = px.bar(dfmo, x='Genre', y='Count', color='Count', color_continuous_scale='rdbu')

fig.update_layout(title='Top 20 Movie Genre',
                  title_x=0.5,
                  title_font=dict(size=16, color='Darkblue'),
                  xaxis=dict(tickangle=45))

fig.show()

In [None]:
# Twenty most frequent `listed_in` values among TV shows.
dft = df[df['type']=='TV Show']

# Both columns are named explicitly because the default column names produced
# by value_counts().reset_index() differ between pandas 1.x and 2.x. The x axis
# shows genres, so it is labelled "Genre" instead of the copy-pasted "Movie".
dftv = (
    dft['listed_in']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
    .head(20)
)

fig = px.bar(dftv, x='Genre', y='Count', color='Count', color_continuous_scale='rdpu')

fig.update_layout(title='Top 20 TV Show Genre',
                  title_x=0.5,
                  title_font=dict(size=16, color='Darkblue'),
                  xaxis=dict(tickangle=45))

fig.show()

In [None]:
from wordcloud import WordCloud
from matplotlib import colors 

# Word cloud of the `listed_in` text, drawn with a custom colour map.
color_list = [
    'DarkBlue', 'LightBlue', 'MediumAquamarine', 'Plum',
    'OrangeRed', 'DarkRed', 'Pink', 'LightGoldenrodYellow',
]
colormap = colors.ListedColormap(color_list)

# Turn the whole column into one string (the repr of the list) and remove the
# single quotes from it.
text = str(df['listed_in'].tolist()).replace("'", "")
plt.rcParams['figure.figsize'] = (15, 15)

wordcloud = WordCloud(
    width=1200,
    height=800,
    max_words=120,
    background_color='black',
    colormap=colormap,
).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Number of TV shows per `duration` value.
df1 = df[df['type']=='TV Show']
# Both columns are named explicitly because the default column names produced
# by value_counts().reset_index() differ between pandas 1.x and 2.x.
df1d = df1['duration'].value_counts().rename_axis('Duration').reset_index(name='Count')

fig = px.bar(df1d, x='Duration', y='Count', color='Count', color_continuous_scale='ylgn')

fig.update_layout(title="Distribution of duration(TV Show)",
                  title_x=0.5,
                  title_font=dict(size=16, color='Darkgreen'))

fig.show()

# **<font color = Darkred>Thank You for Reading!</font>**

**Please consider upvoting & checking out more of my work if you found this interesting & valuable! Thanks so much!**

[Link to My Kaggle](http://www.kaggle.com/carriech)