# Netflix Data Analysis

In [None]:

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings emitted by the libraries used in the cells below.
warnings.filterwarnings('ignore')
                        
import os
# Walk the Kaggle input tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from collections import Counter
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
%matplotlib inline

# Load and check the Data

In [None]:
# Read the Netflix titles CSV from the Kaggle input directory into `data`.
data = pd.read_csv("/kaggle/input/netflix-shows/netflix_titles.csv")

In [None]:
# Preview five random rows; the fixed seed keeps the preview identical on
# every "Restart & Run All" so the displayed output is reproducible.
data.sample(5, random_state=42)

In [None]:
# (rows, columns) of the raw frame as loaded from the CSV.
data.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
data.info()

# Deal with Missing Values

In [None]:

def missing_values_table(df):
    """Summarise the missing values of each column of ``df``.

    Prints a two-line summary (total number of columns and how many of them
    contain missing values) and returns the detail table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name, with the columns ``'Missing Values'`` (count) and
        ``'% of Total Values'`` (share of rows, rounded to one decimal),
        sorted by percentage in descending order. Empty when nothing is
        missing, including when ``df`` has no rows.
    """
    # Count the nulls once and derive the percentage from that count.
    missing_count = df.isnull().sum()
    missing_percent = 100 * missing_count / len(df)

    summary = pd.concat(
        [missing_count, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Filter on the count rather than the percentage: for a frame with zero
    # rows the percentage is NaN, and ``NaN != 0`` would wrongly keep every
    # column in the report.
    summary = (
        summary[summary['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")
    return summary

In [None]:
# Tabulate the count and percentage of missing values per column of the raw data.
missing_values_table(data)

**missingno** provides a small toolset of flexible and easy-to-use missing data visualizations and utilities that allows you to get a quick visual summary of the completeness (or lack thereof) of your dataset. Install:

'pip install missingno'
https://github.com/ResidentMario/missingno

In [None]:
# missingno bar chart: one bar per column, giving a quick view of how complete each column is.
msno.bar(data,color = 'indianred')
plt.show()

In [None]:
# missingno heatmap of how strongly missingness in one column goes together with missingness in another.
msno.heatmap(data)

In [None]:
# Drop every row that contains at least one missing value (dropna's default)
# and rebind `data`: all later cells work on this reduced frame, not the raw one.
data = data.dropna()
# Shape after dropping, to compare with the raw shape shown earlier.
data.shape

In [None]:
# Report how many distinct values (NaN counted as one) each column holds
for column_name in data.columns:
    distinct_total = data[column_name].nunique(dropna=False)
    print("Feature '{col}' has {unique_cat} unique categories".format(col=column_name, unique_cat=distinct_total))

In [None]:
# Number of titles per value of the `type` column, shown as a one-column table.
data['type'].value_counts().to_frame()

As we can see, the most common type of title on Netflix is Movie.

In [None]:
# Bar chart of the (up to ten) most frequent values of `type`.
type_counts = data['type'].value_counts()
type_counts.head(10).plot.bar()

The busiest directors are Raúl Campos, Jan Suter:

In [None]:
# Keep only the director entries whose title count equals the maximum count
# (more than one row is returned if there is a tie).
director_counts = data['director'].value_counts()
director_counts[director_counts == director_counts.max()]

In [None]:
# Cross-check: the most frequent value(s) of the `director` column.
data.director.mode()

In [None]:
# All titles credited to the director entry 'Raúl Campos, Jan Suter'.
is_campos_suter = data['director'].eq('Raúl Campos, Jan Suter')
data.loc[is_campos_suter]

The top 10 busiest (most frequent) directors:

In [None]:
# Names of the n director entries with the most titles, most frequent first.
n = 10
director_ranking = data['director'].value_counts()
director_ranking.index[:n].tolist()

In [None]:
# Titles directed by Martin Scorsese on Netflix (one of my favorite directors)
is_scorsese = data['director'].eq('Martin Scorsese')
data.loc[is_scorsese]

In [None]:
# Bar chart of the 25 release years with the most titles.
plt.figure(figsize=(16, 8))
titles_per_year = data['release_year'].value_counts()
titles_per_year.head(25).plot.bar()


In [None]:
# Earliest and latest release year present in the cleaned data.
release_years = data['release_year']
print('Released year from: ', release_years.min(), 'to', release_years.max())

In [None]:
# Oldest title(s) on Netflix: look the earliest release year up from the data
# instead of hard-coding it, so the cell stays correct if the dataset or the
# cleaning step changes.
oldest_year = data['release_year'].min()
data[data['release_year'] == oldest_year]

# Top 5 Genres 

In [None]:
# Number of titles per distinct `listed_in` value, shown as a one-column table.
data['listed_in'].value_counts().to_frame()

In [None]:
# Five most frequent `listed_in` values, computed once and reused for both the
# printed table and the plot order.
top_genres = data["listed_in"].value_counts().head(5)
print(top_genres)

plt.figure(figsize=(12, 6))
sns.countplot(y='listed_in', data=data, order=top_genres.index, palette="magma")
# `data` is not filtered by `type`, so the chart covers movies as well as TV
# shows; the title says so.
plt.title("Top 5 Genres of Movies and TV Shows", size=18)
plt.show()

In [None]:
# Number of titles per value of the `rating` column, shown as a one-column table.
data.rating.value_counts().to_frame()

In [None]:
# Count of titles per rating, limited to the 15 most frequent ratings and
# ordered from most to least common.
plt.figure(figsize=(16, 6))
rating_order = data['rating'].value_counts().index[:15]
# Pass the column by keyword: seaborn deprecated positional data vectors and
# newer releases treat the first positional argument as `data`, not `x`.
sns.countplot(x='rating', data=data, order=rating_order, palette="magma")
plt.title("Ratings for Movies And Shows")
plt.xlabel("Rating")
plt.ylabel("Total Count")
plt.show()