## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Import the dataset

In [8]:
# Read the Netflix titles CSV from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
data.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Convert 'date_added' to datetime

In [16]:
# Convert 'date_added' to datetime
# Strip surrounding whitespace from string values before parsing: a padded
# value such as ' August 4, 2017' does not match the format inferred from the
# other rows and would otherwise fail to parse. Non-string values (NaN, or
# dates produced by a previous run of this cell) pass through untouched, so
# the cell can be re-run safely.
date_added_clean = data['date_added'].map(
    lambda value: value.strip() if isinstance(value, str) else value
)
# `.dt.date` keeps only the calendar date; the column then holds
# `datetime.date` objects (object dtype), with missing entries left as NaT.
data['date_added'] = pd.to_datetime(date_added_clean).dt.date
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,dick johnson is dead,kirsten johnson,,united states,2021-09-25,2020,PG-13,90 min,documentaries,"as her father nears the end of his life, filmm..."
1,s2,TV Show,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,TV-MA,2 Seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t..."
2,s3,TV Show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",,2021-09-24,2021,TV-MA,1 Season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...
3,s4,TV Show,jailbirds new orleans,,,,2021-09-24,2021,TV-MA,1 Season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,TV-MA,2 Seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...


## Standardize text data: convert to lowercase and strip whitespace

In [26]:
# Normalise the free-text columns: lower-case every value, then trim
# leading/trailing whitespace. Missing values are left as they are.
text_columns = ['type','title', 'director', 'cast', 'country', 'listed_in', 'description']
data[text_columns] = data[text_columns].apply(
    lambda column: column.str.lower().str.strip()
)
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,dick johnson is dead,kirsten johnson,unknown,united states,2021-09-25,2020,PG-13,90 min,documentaries,"as her father nears the end of his life, filmm..."
1,s2,tv show,blood & water,unknown,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,TV-MA,2 Seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t..."
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",unknown,2021-09-24,2021,TV-MA,1 Season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...
3,s4,tv show,jailbirds new orleans,unknown,unknown,unknown,2021-09-24,2021,TV-MA,1 Season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,kota factory,unknown,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,TV-MA,2 Seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...


## Null values

#### Check for null values

In [19]:
# Count missing entries per column and show the five columns with the most gaps.
(
    data.isna()
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

director      2634
country        831
cast           825
date_added      98
rating           4
dtype: int64

#### Fill missing values

In [24]:
# Fill missing values with placeholders
# The date placeholder must have the same type as the values already stored in
# 'date_added'. When the column holds plain `datetime.date` objects (object
# dtype, e.g. after `.dt.date`), inserting a `pd.Timestamp` would create a
# mixed-type column whose ordering comparisons and sorts fail, so use a
# `datetime.date` there and a Timestamp only for a real datetime64 column.
placeholder_date = pd.Timestamp('1970-01-01')  # Placeholder date
if not pd.api.types.is_datetime64_any_dtype(data['date_added']):
    placeholder_date = placeholder_date.date()

# NOTE(review): the rating values visible in the previews are upper-case
# ('PG-13', 'TV-MA'), while this placeholder is lower-case — confirm that is intended.
fill_values = {
    'director': 'unknown',
    'cast': 'unknown',
    'country': 'unknown',
    'date_added': placeholder_date,
    'rating': 'nr',
    'duration': 'unknown'
}

# Apply the fill values. Reassign instead of using `inplace=True` so the step
# is an explicit, re-runnable transformation of `data`.
data = data.fillna(fill_values)

In [25]:
# Re-count missing entries per column to confirm the placeholders were applied.
data.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

## Check for duplicate rows

In [23]:
# Number of rows that exactly repeat an earlier row across all columns.
data.duplicated(subset=None, keep='first').sum()

0