In [171]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [172]:
# Read the titles CSV into a DataFrame. The path is relative, so the file must
# be present in the kernel's working directory.
dataset = pd.read_csv("netflix_titles_nov_2019.csv")

In [173]:
# Preview the first rows to sanity-check that the file loaded as expected.
dataset.head()

Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type
0,81193313,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...",South Korea,"November 30, 2019",2019,TV-14,1 Season,"International TV Shows, Korean TV Shows, Roman...",Brought together by meaningful meals in the pa...,TV Show
1,81197050,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,,"November 30, 2019",2019,TV-G,67 min,"Documentaries, International Movies","From Sierra de las Minas to Esquipulas, explor...",Movie
2,81213894,The Zoya Factor,Abhishek Sharma,"Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, ...",India,"November 30, 2019",2019,TV-14,135 min,"Comedies, Dramas, International Movies",A goofy copywriter unwittingly convinces the I...,Movie
3,81082007,Atlantics,Mati Diop,"Mama Sane, Amadou Mbow, Ibrahima Traore, Nicol...","France, Senegal, Belgium","November 29, 2019",2019,TV-14,106 min,"Dramas, Independent Movies, International Movies","Arranged to marry a rich man, young Ada is cru...",Movie
4,80213643,Chip and Potato,,"Abigail Oliver, Andrea Libman, Briana Buckmast...","Canada, United Kingdom",,2019,TV-Y,2 Seasons,Kids' TV,"Lovable pug Chip starts kindergarten, makes ne...",TV Show


In [174]:
def data_details(df):
    """Print a structural overview of a DataFrame.

    The report is written to stdout in four sections separated by a
    100-character dashed rule: row and column counts, the column labels,
    the dtype of each column, and the null count of every column that has
    at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
    """
    row_total, column_total = df.shape
    divider = "-" * 100

    # Section 1: overall size.
    print("The total movies and shows is :", row_total)
    print("Total numbers of variables is: ", column_total)
    print(divider)

    # Section 2: column labels.
    print("The dataset columns are listed below: ")
    print(df.columns)
    print(divider)

    # Section 3: dtypes.
    print("The data type of each columns are: ")
    print(df.dtypes)
    print(divider)

    # Section 4: only columns that actually contain nulls are listed.
    print("The Missing rows in each columns are: \n")
    null_counts = df.isnull().sum()
    print(null_counts.loc[null_counts.gt(0)])
    
# Print the summary for the loaded frame.
data_details(dataset)

The total movies and shows is : 5837
Total numbers of variables is:  12
----------------------------------------------------------------------------------------------------
The dataset columns are listed below: 
Index(['show_id', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'type'],
      dtype='object')
----------------------------------------------------------------------------------------------------
The data type of each columns are: 
show_id          int64
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
type            object
dtype: object
----------------------------------------------------------------------------------------------------
The Missing rows in each columns are: 

director      1901
cast           556

## Data Cleaning Process

In [175]:
# Flag rows that repeat an earlier row on the (title, country, type,
# release_year) combination, then display only the flagged rows. The first
# occurrence of each combination is not flagged.
duplicated_data = dataset.duplicated(
    subset=["title", "country", "type", "release_year"]
)
dataset.loc[duplicated_data]

Unnamed: 0,show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type
1134,80175351,Kakegurui,,"Saori Hayami, Minami Tanaka, Tatsuya Tokutake,...",Japan,,2019,TV-14,2 Seasons,"Anime Series, International TV Shows, TV Thril...",High roller Yumeko Jabami plans to clean house...,TV Show
1741,81072516,Sarkar,A.R. Murugadoss,"Vijay, Varalakshmi Sarathkumar, Keerthi Suresh...",India,"March 2, 2019",2018,TV-MA,162 min,"Action & Adventure, Dramas, International Movies",A ruthless businessman’s mission to expose ele...,Movie


In [176]:
# Drop the id column.
# Equivalent spellings: drop("show_id", axis="columns") or drop("show_id", axis=1).
dataset = dataset.drop(columns="show_id")


In [177]:
# Create a new column holding the number of cast members in each row.
# Step 1: temporarily swap missing cast entries for the "unknown" sentinel so
# the counting function only ever receives strings.
dataset["cast"] = dataset["cast"].fillna("unknown")

def cast_count(cast):
    """Return the number of cast members named in a comma-separated string.

    Parameters
    ----------
    cast : str or missing value
        Cast listing such as ``"Sonam Kapoor, Dulquer Salmaan"``. The sentinel
        ``"unknown"`` and any non-string value (``None``, ``NaN``) mean that no
        cast information is available.

    Returns
    -------
    int
        Count of non-empty names; ``0`` when the cast is unknown or missing.
    """
    # Non-string input (NaN/None) is treated like the "unknown" sentinel so the
    # function is safe to apply to a column that still contains missing values.
    if not isinstance(cast, str) or cast == "unknown":
        return 0

    # Names are separated by commas. Splitting on whitespace would count words
    # (first and last names separately) instead of people.
    return sum(1 for name in cast.split(",") if name.strip())

# Step 2: count the cast members per title.
dataset["number_of_cast"] = dataset["cast"].map(cast_count)
# Step 3: turn the "unknown" sentinel back into a missing value.
dataset["cast"] = dataset["cast"].mask(dataset["cast"] == "unknown")

In [178]:
# NOTE(review): reset_index() without drop=True keeps the old index as a new
# `index` column (it appears in the isna() summary further down). No rows are
# removed before this point in the visible cells, so that column looks
# redundant. Consider reset_index(drop=True) after confirming that nothing
# later in the notebook reads `index`.
dataset = dataset.reset_index()

In [179]:
# `rating` is non-numerical, so missing entries are imputed with the most
# frequent label (the mode) rather than a mean or median.
dataset["rating"] = dataset["rating"].fillna(
    value=dataset["rating"].mode()[0]
)

In [180]:
# Replace missing `date_added` values with January 1 of the most common
# release year. The placeholder uses the same "Month D, YYYY" layout as the
# real values (e.g. "November 30, 2019") so that pd.to_datetime can parse the
# whole column with one inferred format. A differently spaced or cased string
# is rejected by the strict format inference in recent pandas versions.
dataset["date_added"] = dataset["date_added"].fillna(
    "January 1, {}".format(dataset["release_year"].mode()[0])
)

In [181]:
# Titles with no country that are tagged as anime are assumed to be Japanese.
# Missing values must be detected with isna(): an equality test against np.nan
# is always False (NaN never compares equal to anything), so a loop guarded by
# `value == np.nan` never updates any row.
# "[Aa]nime" matches either capitalisation of the genre tag; na=False treats a
# missing `listed_in` as "not anime" instead of raising.
dataset.loc[
    dataset["country"].isna()
    & dataset["listed_in"].str.contains("[Aa]nime", na=False),
    "country",
] = "Japan"

In [182]:
# Convert `date_added` from strings (object dtype) to pandas datetime values.
dataset['date_added'] = pd.to_datetime(dataset["date_added"])

In [183]:
# Count the remaining missing values per column after the cleaning steps.
dataset.isna().sum()


index                0
title                0
director          1901
cast               556
country            427
date_added           0
release_year         0
rating               0
duration             0
listed_in            0
description          0
type                 0
number_of_cast       0
dtype: int64

##  Exploratory Data Analysis

In [199]:
# List the distinct rating labels present in the data.
dataset["rating"].unique()

array(['TV-14', 'TV-G', 'TV-Y', 'TV-MA', 'TV-PG', 'R', 'TV-Y7', 'PG', 'G',
       'PG-13', 'TV-Y7-FV', 'NR', 'UR', 'NC-17'], dtype=object)

In [None]:
def no_rating(rating):
    if rating == "UR":
        