In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# data preview

In [None]:
# Read the Netflix titles CSV into `data` and show the first rows with a
# dark table theme (background, text and border colours set via Styler).
data = pd.read_csv('../input/netflix-shows/netflix_titles.csv')
data.head().style.set_properties(
    **{
        'background-color': '#161717',
        'color': '#30c7e6',
        'border-color': '#8b8c8c',
    }
)

In [None]:
# Print the column labels to stdout, then leave the row index as the
# cell's displayed value.
print(data.columns)
data.index

# Performing Exploratory Data Analysis
- In this part we look for null/NaN values in the dataset.
- Where possible, we fill those null values with appropriate values.
- If a column contains too many null values, we drop the column instead.


In [None]:
# Number of missing (NaN) entries in each column.
data.isna().sum()

In [None]:
# Frequency of each director value (missing values are not counted).
data.director.value_counts()

# From the null counts above we can observe that 65% of the director values are missing
- Imputing director values is not easy and would not be accurate enough in practice, and the director column is not that important for this analysis.
- So it is better to drop the `director` column (i.e. along axis=1).

In [None]:
# Drop the mostly-empty `director` column.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
data = data.drop(columns=['director'], errors='ignore')

In [None]:
# Show the first rows of the current frame with the dark table theme.
data.head().style.set_properties(
    **{
        'background-color': '#161717',
        'color': '#30c7e6',
        'border-color': '#8b8c8c',
    }
)

# The cast column also has 570 null values
- Because cast contains so many distinct values, it cannot easily be categorised or predicted, so it is better to remove the cast column.

In [None]:
# Drop the `cast` column and preview the result.
# Uses the same `columns=` form as the director drop, and errors='ignore'
# plus rebinding keeps the cell re-runnable (an in-place drop raises KeyError
# on the second execution because the column no longer exists).
data = data.drop(columns=['cast'], errors='ignore')
data.head().style.set_properties(
    **{
        'background-color': '#161717',
        'color': '#30c7e6',
        'border-color': '#8b8c8c',
    }
)

In [None]:
# Keep the country column as its own Series and display it.
countries = data.country
countries

In [None]:
# Array of the distinct values found in the country column.
unique_country = pd.unique(data['country'])
unique_country

In [None]:
# Number of rows that actually carry a country value.
# np.count_nonzero is the wrong tool here: NaN and every non-empty string are
# truthy, so it returned the total row count with missing values included.
# notna().sum() counts only the populated entries.
occurences = countries.notna().sum()
occurences

In [None]:
# How many rows have country exactly equal to 'United States' (True) vs. not (False).
data['country'].eq('United States').value_counts()

In [None]:
# How many rows have country exactly equal to 'United Kingdom' (True) vs. not (False).
data['country'].eq('United Kingdom').value_counts()

In [None]:
# How many rows have country exactly equal to 'India' (True) vs. not (False).
data['country'].eq('India').value_counts()

In [None]:
# Frequency of every distinct country value, most common first.
data.country.value_counts()

In [None]:
# Display the dtype of each remaining column.
data.dtypes

# As there are many object-typed columns in the dataset, we will try to convert them to categorical values and then explore the data

In [None]:
# Distinct values present in the `type` column.
pd.unique(data['type'])

In [None]:
# One-hot encode the `type` column for display only; the result is not
# stored, so `data` is left unchanged.
data['type'].pipe(pd.get_dummies)

In [None]:
# Show the first rows of the current frame with the dark table theme.
data.head().style.set_properties(
    **{
        'background-color': '#161717',
        'color': '#30c7e6',
        'border-color': '#8b8c8c',
    }
)

In [None]:
# Heatmap of the boolean missing-value mask: one cell per (row, column).
sns.heatmap(data.isna())

# From the heatmap we can see that there are null values in country and rating. The data in these columns are discrete values that are neither numerical nor encoded as categories, so choosing fill values for the nulls is difficult
- So we will fill them from neighbouring rows using back-fill/forward-fill (bfill()/ffill())

In [None]:
# Rows whose type is 'TV Show'.
data.loc[data['type'].eq('TV Show')]

In [None]:
# Among the 'TV Show' rows, count how often each distinct full row occurs.
data.loc[data['type'].eq('TV Show')].value_counts()

In [None]:
# Titles per country, split by type.
# The country column holds a very large number of distinct strings, so
# plotting every level produces an unreadable axis. Restrict the plot to the
# most frequent countries via `order=` and give the figure enough height.
top_countries = data['country'].value_counts().index[:15]
plt.figure(figsize=(10, 8))
sns.countplot(data=data, y='country', hue='type', order=top_countries)

In [None]:
# Fill missing values from neighbouring rows.
# fillna('bfill') does NOT back-fill: it writes the literal string "bfill"
# into every missing cell. bfill() copies the next valid value upwards, and
# the trailing ffill() covers any NaN left at the end of a column where there
# is no later value to copy from.
data = data.bfill().ffill()
data.head().style.set_properties(
    **{
        'background-color': '#161717',
        'color': '#30c7e6',
        'border-color': '#8b8c8c',
    }
)

In [None]:
# Re-draw the missing-value heatmap to check what is still null.
sns.heatmap(data.isna())

In [None]:
# Per-column count of values that are still missing.
data.isna().sum()

# From the above observation we can see that all null/NaN values were resolved and filled with some appropriate values

# Visualisation

In [None]:
# Pairwise relationship grid over the frame's numeric columns.
sns.pairplot(data=data)

In [None]:
# Draw histograms for the frame's numeric columns on a 20x15 inch figure.
data.hist(figsize=(20,15))

In [None]:
# Distribution of release years: histogram with a KDE curve on top.
# sns.distplot is deprecated and scheduled for removal from seaborn;
# histplot with kde=True and stat='density' is the documented replacement
# and draws the same kind of figure.
sns.histplot(data['release_year'], kde=True, stat='density')

In [None]:
# Kernel density estimate of the release years.
sns.kdeplot(data.release_year)

In [None]:
# Number of titles per release year, with one bar per type.
sns.countplot(x='release_year', hue='type', data=data)

In [None]:
# Horizontal bars: one per type, bar length aggregated from release_year.
sns.barplot(y='type', x='release_year', data=data)

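# From the given data, TV shows appear to have been released more than Movies
- Note: the bar plot above shows the mean `release_year` per type, not the number of titles, so a longer TV Show bar means TV shows are more recent on average. The count plot above is the one that compares how many titles of each type were released.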

In [None]:
# Store the `type` column with pandas' categorical dtype.
data['type'] = data['type'].astype(pd.CategoricalDtype())

In [None]:
# Display the column labels that remain after the earlier drops.
data.columns

In [None]:
# Joint distribution of release_year vs show_id with a regression fit.
sns.jointplot(x='release_year', y='show_id', kind='reg', data=data)

In [None]:
# Joint distribution of release_year vs show_id as a 2-D density estimate.
sns.jointplot(x='release_year', y='show_id', kind='kde', data=data)

# finding the correlation

In [None]:
# Pairwise correlation of the numeric columns.
# The frame still contains object/category columns; on pandas >= 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so
# select the numeric columns explicitly. On older pandas the result is the
# same as before.
data.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr()
# raises on object/category columns on pandas >= 2.0.
sns.heatmap(data.select_dtypes(include='number').corr())

In [None]:
# Interactive horizontal bar chart of release_year against country (plotly),
# after switching seaborn to its "whitegrid" style.
import plotly.express as px

sns.set_style("whitegrid")
fig = px.bar(
    data,
    x='release_year',
    y='country',
    orientation='h',
    height=900,
    title='Country Vs release_year',
)
fig.show()