In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Netflix Data Analysis
Netflix is one of the most popular OTT platforms. Here we analyse Netflix data to look at movies and TV shows by region, duration, etc.

# Read Data & Explore data
Let us read the data and understand the details of its columns.

In [None]:
# Location of the Netflix titles CSV, relative to the notebook's working directory.
NETFLIX_TITLES_CSV = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(NETFLIX_TITLES_CSV)

In [None]:
df.head()

In [None]:
df.columns

# Column Details
1. show_id - id of the show as tracked in Netflix.
2. type - type of the title: Movie or TV show.
3. title - title of the show.
4. director - director of the show or movie.
5. cast - cast of the show or movie.
6. country - country in which the show/movie was produced.
7. date_added - date the title was added to Netflix.
8. release_year - original release year of the movie.
9. rating - rating of the movie, e.g. PG, R, etc.
10. duration - duration of the movie/show.
11. listed_in - category of the movie/show.
12. description - description of the movie.

From the column details above, show_id and description can be ignored for the analysis. The remaining columns contribute significantly to our analysis.

In [None]:
# Drop the identifier and free-text description columns; neither is used in
# the analysis below. errors='ignore' keeps this cell safe to re-run: without
# it a second execution raises KeyError because the columns are already gone.
df = df.drop(columns=['show_id', 'description'], errors='ignore')

In [None]:
df.head()

In [None]:
#import ploting libraries
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# check the null or na values in the data

In [None]:
# Heatmap of the boolean missing-value mask (True where a cell is null).
missing_mask = df.isnull()
sns.heatmap(missing_mask)

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df['rating'].value_counts()

In [None]:
df[df['rating'].isnull()]

In [None]:
# Hand-picked ratings for the rows whose 'rating' is missing, keyed by the
# row's index label in the CSV version this notebook was written against.
# NOTE(review): 'TV-13' does not look like a standard rating label (possibly
# 'TV-14' or 'PG-13' was meant) — confirm the intended value.
rating={67:'PG',2359:'TV-13',3660:'TV-13',3736:'UR',3737:'UR',3738:'UR',4323:'PG'}
for i, rate in rating.items():
    # Only patch rows that exist and are still missing a rating. An unguarded
    # df.loc[i, 'rating'] = rate would overwrite a valid rating if the row
    # labels differ in another dataset version, and would silently append a
    # new row when the label is absent.
    if i in df.index and pd.isna(df.at[i, 'rating']):
        df.loc[i,'rating']=rate

In [None]:
df['country'].value_counts()

In [None]:
# Most titles come from the United States, so the missing 'country' values
# (507 when this was written) are filled with 'United States'.
DEFAULT_COUNTRY = 'United States'
df['country'] = df['country'].fillna(DEFAULT_COUNTRY)

In [None]:
# Keep only the first listed country as the origin when several are given.
# The .str accessor replaces the per-row lambda: it propagates missing values
# instead of raising AttributeError, and .str.strip() removes stray whitespace
# around the name so that e.g. ' France' and 'France' are counted together.
df['country'] = df['country'].str.split(',').str[0].str.strip()

In [None]:
df['country'].value_counts()

# Exploratory Data Analysis

In [None]:
# Count titles per type. The output columns are named explicitly ('index' for
# the type label, 'type' for the count) because value_counts().reset_index()
# names its columns differently from pandas 2.0 onward, which breaks the
# values='type' / names='index' lookups below.
df_types = df['type'].value_counts().rename_axis('index').reset_index(name='type')
px.pie(df_types, values='type', names='index', title='Netflix shows by Movie & TV shows')

Around 69% of Netflix titles are Movies; TV shows make up the remaining 31%.

In [None]:
# Number of titles for every (country, type) pair, largest first.
country_type_columns = ['country', 'type', 'title']
df_country = (
    df[country_type_columns]
    .groupby(by=['country', 'type'])
    .count()
    .sort_values(by='title', ascending=False)
    .reset_index()
)

In [None]:
df['country'].value_counts()

In [None]:
# Pick the ten countries with the most titles overall and keep every type row
# for them. Taking df_country.head(10) instead selects the ten largest
# (country, type) pairs, which drops the smaller type for some countries and
# makes them look as if they produce only Movies or only TV shows.
top_countries = df_country.groupby('country')['title'].sum().nlargest(10).index
px.bar(df_country[df_country['country'].isin(top_countries)], x='country', y='title', color='type', title='Netflix shows/movies count from countries by type')

From the above graph, we see that the United States, India and the United Kingdom are the top contributors to Netflix.
The United States has about 70% of its productions in Movies and 30% in TV shows.
India's productions are overwhelmingly Movies.
The United Kingdom has about 70% in Movies and 30% in TV shows.
Canada, France and Spain follow, predominantly with Movies.
Japan and South Korea are mostly TV shows.

In [None]:
# One point per title: release year on the x-axis, coloured by country;
# hovering a point shows the title.
px.scatter(
    df,
    x='release_year',
    color='country',
    hover_data=['title'],
)

The oldest title on Netflix is 'Pioneers: First Women Filmmakers', released in 1925.
Most of the titles on Netflix were released between the later part of the 2000s and 2020.

In [None]:
# Number of titles per rating. Column names are set explicitly ('index' for
# the rating label, 'rating' for the count) because
# value_counts().reset_index() names its output columns differently from
# pandas 2.0 onward, which breaks the x='index' / y='rating' lookups.
df_rating = df['rating'].value_counts().rename_axis('index').reset_index(name='rating')
px.bar(df_rating, x='index', y='rating', color='index')

The top 3 ratings are all TV ratings, and TV-MA has the highest count.

In [None]:
df.head()

In [None]:
# Where the date a title was added is unknown, fall back to its release year.
# The result is assigned back rather than using fillna(inplace=True) on
# df['date_added']: that call operates on a temporary Series, which pandas
# warns about and which leaves df unchanged under Copy-on-Write.
df['date_added'] = df['date_added'].fillna(df['release_year'])

In [None]:
def convert(x):
    """Extract the year from a ``date_added`` value.

    Parameters
    ----------
    x : str or number
        Either a date written like ``'January 1, 2019'`` (the year follows the
        comma) or a bare year such as ``'2019'``, ``'2019.0'`` or ``2019``.

    Returns
    -------
    int
        The year as an integer.

    Raises
    ------
    ValueError
        If no year can be parsed from ``x`` (for example ``'nan'``).
    """
    text = str(x).strip()
    if ',' in text:
        # 'Month D, YYYY' -> the year is whatever follows the last comma.
        return int(text.rsplit(',', 1)[1].strip())
    # Bare year. Going through float() also accepts '2019.0', which is how a
    # year reads once it has been stored as a float.
    return int(float(text))
    

In [None]:
# Derive the year each title was added: stringify every date_added value and
# hand it to convert().
date_added_text = df['date_added'].map(str)
df['year_added'] = date_added_text.map(convert)

In [None]:
df.info()

In [None]:
# Titles per (country, type, year added). After count() every remaining column
# holds a non-null count; 'title' is used as the number of titles.
df_country_year = df.groupby(by=['country','type','year_added']).count().reset_index()

# Running total of titles per country up to and including each year, with
# both types combined and years in ascending order. A plain
# groupby('country')['title'].cumsum() over the rows above would accumulate
# through every Movie year and then continue from the earliest TV Show year,
# so the value attached to a year would not be the total up to that year.
cumulative_by_year = (
    df_country_year.groupby(['country', 'year_added'])['title'].sum()
    .groupby(level='country').cumsum()
    .rename('aggregate')
    .reset_index()
)
df_country_year = df_country_year.merge(cumulative_by_year, on=['country', 'year_added'], how='left')

In [None]:
# Animated world map: one frame per year_added, coloured by the running
# per-country total in 'aggregate'; the colour scale is capped at 500.
# The title says "added" because the frames follow year_added (the year taken
# from date_added), not the release year.
px.choropleth(
    df_country_year.sort_values(by='year_added'),
    locations='country',
    locationmode='country names',
    color='aggregate',
    animation_frame='year_added',
    range_color=[0, 500],
    title='Cumulative Movies/TV shows added to Netflix by year by country',
)

In [None]:
df.head()

In [None]:
# The first whitespace-separated token of 'duration' is taken as a number.
# NOTE(review): presumably minutes for movies and seasons for TV shows, so the
# two are not directly comparable — confirm against the data.
leading_token = df['duration'].map(lambda text: text.split()[0])
df['durations_time'] = leading_token.astype(float)
df['release_year'] = df['release_year'].astype(int)

In [None]:
# x: duration values for movies only; y: release year of every title.
is_movie = df['type'] == 'Movie'
x = df.loc[is_movie, 'durations_time']
y = df['release_year']

In [None]:
y

In [None]:
import plotly.figure_factory as ff

# Distribution plot of movie durations with a fitted normal curve.
movie_duration_samples = [x]
series_labels = ['duration_time']
ff.create_distplot(movie_duration_samples, series_labels, curve_type='normal', bin_size=0.7)

In [None]:
# Use the first category in 'listed_in' (the text before the first comma) as the title's genre.
df['genere'] = df['listed_in'].map(lambda categories: categories.partition(',')[0])

In [None]:
# Number of titles for every (release_year, genere) combination.
genre_year_columns = ['genere', 'release_year', 'title']
df_genere = (
    df[genre_year_columns]
    .groupby(by=['release_year', 'genere'])
    .count()
    .reset_index()
)

In [None]:
# Names of the ten genres with the most titles overall, largest first.
# head(10) rather than head(5), so the selection matches the variable name
# and the 'Top 10 genere by year' chart that consumes it.
to_10_genere = (
    df_genere[['genere', 'title']]
    .groupby(by='genere')
    .sum()
    .sort_values(by='title', ascending=False)
    .reset_index()['genere']
    .head(10)
)

In [None]:
# Sunburst of title counts: inner ring is release year, outer ring is genre,
# restricted to the genres listed in to_10_genere.
in_top_genres = df_genere['genere'].isin(to_10_genere)
px.sunburst(
    df_genere[in_top_genres],
    path=['release_year', 'genere'],
    values='title',
    title='Top 10 genere by year',
)