## Netflix Show Data Exploratory Analysis

In [3]:
import pandas as pd 
import numpy as np
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import plotly.express as px

In [4]:
# Load the Netflix titles CSV into a DataFrame; every later cell works from `netflix`.
netflix = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

### Data Exploration  

In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame.
netflix.info()

In [6]:
# Parse 'date_added' into datetimes.
# Stray whitespace around a value can make pandas' inferred date format reject
# that value, so string values are stripped before parsing.
# The dtype guard keeps this cell safe to re-run: once the column has been
# converted, the .str accessor is no longer available on it.
# NOTE(review): assumes the raw column holds strings (plus missing values) — confirm.
date_added_values = netflix['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added_values):
    date_added_values = date_added_values.str.strip()
netflix['date_added'] = pd.to_datetime(date_added_values)
netflix.info()

In [7]:
# Preview the first rows of the frame.
netflix.head()

In [8]:
# Per-column tally of missing entries.
missing_mask = netflix.isna()
missing_value_count = missing_mask.sum()
missing_value_count

## 1. Number of Shows

In [9]:
# Tally titles per 'type' and present the counts as a two-column table.
type_counts = netflix['type'].value_counts()
show_number = type_counts.rename_axis('type').reset_index(name='number')
show_number

## 2.Rating


The values in the 'rating' column are quite confusing, so I add an 'ages' column that groups each rating into 'Kids', 'Teens' or 'Adults' (anything else becomes 'Not Rated').

In [10]:
# Rating codes grouped into three audience bands.
kids = ['TV-Y', 'G', 'TV-Y7', 'TV-Y7-FV', 'TV-G', 'PG', 'TV-PG']
teens = ['PG-13', 'TV-14']
adults = ['R', 'TV-MA', 'NC-17']

netflix_rating = pd.DataFrame(netflix['rating'], columns=['rating'])


def age(x):
    """Return the audience band for rating code ``x``.

    Gives 'Kids', 'Teens' or 'Adults' when ``x`` appears in the matching
    list defined above, and 'Not Rated' for any other value.
    """
    # Checked in the same order as the bands are declared; first match wins.
    for label, codes in (('Kids', kids), ('Teens', teens), ('Adults', adults)):
        if x in codes:
            return label
    return 'Not Rated'


netflix_rating['ages'] = netflix['rating'].apply(age)

netflix_rating

Count the number of shows for each rating.

In [11]:
# Count the 'rating' entries in each (rating, ages) pair, ordered by audience band.
rating_groups = netflix_rating.groupby(['rating', 'ages'])
nr = (
    rating_groups
    .agg(number=('rating', 'count'))
    .sort_values(by='ages')
    .reset_index()
)
nr

In [12]:
# Two-level sunburst: audience band on the inner ring, rating code on the outer ring,
# with wedge sizes taken from the 'number' column.
fig = px.sunburst(
    data_frame=nr,
    path=['ages', 'rating'],
    values='number',
)
fig.show()

## 3.Genre

Have a look at the data.

In [13]:
# Show the first three 'listed_in' values to see how genres are stored.
netflix['listed_in'].head(3)

Shows can have multiple genres, so I will split them into multiple rows (one genre per row).

In [14]:
df = netflix
# 'listed_in' is split on commas and each genre is placed on its own row.
# explode() does this directly, without first building a wide padded frame,
# stacking it and then discarding two throwaway index columns in place.
# dropna() keeps missing entries out of the result explicitly, so the output
# does not depend on how stack() treats padding values.
genre_values = (
    df['listed_in']
    .str.split(',')
    .explode()
    .dropna()
    .str.strip()  # remove the space that follows each comma
)
# Single-column frame with a fresh 0..n-1 index, one row per (title, genre) pair.
new_df = genre_values.reset_index(drop=True).to_frame('genre')
new_df[:5]

In [15]:
# Frequency table of genres, most frequent first, as columns 'genre' and 'number'.
genre_counts = new_df[['genre']].value_counts()
genre = genre_counts.reset_index(name='number')
genre[:10]

In [16]:
# Bar chart of the first ten rows of the genre frequency table.
top_genres = genre.head(10)
fig = px.bar(data_frame=top_genres, x='genre', y='number')
fig.show()

## 4.Country (making shows) 

The country column has the same issue as the genre one, so I do the same thing with a slightly different method.

In [17]:
# Titles without a country get the placeholder string "null" so they can be
# split like every other row. This fill on `netflix` itself is kept as-is.
netflix['country'] = netflix['country'].fillna("null")

# One row per (country, date_added) pair: split the comma-separated country
# string and explode it, instead of building a Series per row with iterrows()
# and concatenating them all. Column order and row order are the same as before:
# 'country' first, then 'date_added', with a fresh 0..n-1 index.
netflix_country = (
    netflix[['country', 'date_added']]
    .assign(country=lambda frame: frame['country'].str.split(','))
    .explode('country')
    .reset_index(drop=True)
)
netflix_country

In [18]:
# Trim whitespace left over from the comma split, then tally rows per country.
netflix_country['country'] = netflix_country['country'].str.strip()
country_counts = netflix_country['country'].value_counts()
shows_by_country = country_counts.rename_axis('country').reset_index(name='shows')
shows_by_country

Delete the row for the 'null' placeholder country (index 3 in the table above).

In [19]:
# Remove the placeholder row for titles that have no country.
# The previous call discarded the result of drop() and relied on the row
# sitting at index 3; filtering on the placeholder value works wherever the
# row sits, and assigning back makes the removal persist for later cells.
shows_by_country = shows_by_country[shows_by_country['country'] != 'null']
shows_by_country

In [20]:
# Give every (country, date_added) row a weight of 1, add the weights up per
# country and date, then accumulate them within each country to get a running total.
netflix_country = (
    netflix_country
    .sort_values(by='date_added', ascending=False)
    .assign(n=1)
)
per_country_and_date = netflix_country.groupby(['country', 'date_added']).sum()
date_country = (
    per_country_and_date
    .groupby(level='country')
    .cumsum()
    .reset_index()
)
date_country.sort_values(by='n')


The number of shows in the last row (index=5005) for 'United States' is supposed to be 3297. There might be a problem in the 'date_added' column.

In [21]:
# Inspect every row whose country is 'United States'.
is_united_states = netflix_country['country'].eq('United States')
netflix_country.loc[is_united_states]

Rows whose 'date_added' is NaT look very suspicious.

In [22]:
# Rows of the cumulative table that have no 'date_added'.
missing_in_cumulative = date_country['date_added'].isna()
date_country.loc[missing_in_cumulative]

In [23]:
# Rows of the per-country table that have no 'date_added'.
missing_date_added = netflix_country['date_added'].isna()
netflix_country.loc[missing_date_added]

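Replace NaT with a fixed placeholder date so that those rows are counted. The intended date was 22/10/2009 (when Netflix Originals was launched); note that the code below currently fills with 2014-01-01.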

In [24]:
# Fill missing 'date_added' values with a fixed placeholder date.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which newer pandas warns about and which
# does not update the parent frame under copy-on-write.
# NOTE(review): the accompanying text cites 22/10/2009, but the value used
# here is 2014-01-01 — confirm which date is intended.
netflix_country['date_added'] = netflix_country['date_added'].fillna(pd.Timestamp('20140101'))

# Rebuild the running per-country totals now that no dates are missing.
date_country = netflix_country.groupby(['country','date_added']).sum().groupby(level=0).cumsum().reset_index()

# Largest running total per country, highest first, to check the totals.
date_country.groupby(['country'])['n'].max().to_frame().sort_values('n',ascending = False)

It worked.

Make visualization.

In [25]:
# Pick the ten countries with the most shows. The "null" placeholder (titles
# with no country) is excluded here explicitly so it cannot take one of the
# ten slots, whether or not it was removed from `shows_by_country` earlier.
named_countries = shows_by_country[shows_by_country['country'] != 'null']
top10_country = named_countries[:10].set_index('country')

# Keep only the running totals of those ten countries and plot one line each.
dc = date_country[date_country['country'].isin(top10_country.index)]
fig = px.line(dc,x='date_added',y='n',color = 'country')
fig.show()
