In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Suppress all warnings for the rest of the session: with no category or message
# argument, the 'ignore' filter matches every warning that is raised.
# NOTE(review): this also hides pandas FutureWarning/DeprecationWarning messages —
# consider narrowing the filter to a specific category once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the raw catalogue into the working DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
RAW_CSV = "netflix_titles.csv"
df = pd.read_csv(RAW_CSV)

In [3]:
# Report how many missing values each column contains (inspection only;
# nothing is modified in this cell).
missing_counts = df.isna().sum()
print("Missing values per column:\n", missing_counts)


Missing values per column:
 show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [4]:
# Fill missing values with placeholders (the alternative is to drop those rows).
#
# The result is assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`: that form mutates an intermediate Series,
# emits a FutureWarning on recent pandas 2.x releases and silently leaves `df`
# unchanged once copy-on-write is enabled.
placeholder_by_column = {
    'director': "unknown",
    'cast': "not available",
    'country': "unknown",
    'rating': "unknown",
}
df = df.fillna(value=placeholder_by_column)

# Forward fill: a missing 'date_added' takes the value of the previous row.
# `Series.ffill()` replaces the deprecated `fillna(method='ffill')`.
# A gap in the very first row has no previous value and therefore stays missing.
df['date_added'] = df['date_added'].ffill()

# NOTE(review): 'duration' also has missing values (3 in the report printed by the
# previous cell) and is intentionally left untouched here — decide whether to fill
# or drop them.

In [5]:
# Drop rows that are exact duplicates of an earlier row (all columns compared,
# first occurrence kept). The surviving rows keep their original index labels.
df = df.drop_duplicates()


In [6]:
# Normalise the free-text columns: cast to str, lower-case, trim surrounding
# whitespace.
# NOTE(review): astype(str) may turn any still-missing value into the literal
# text 'nan' (version-dependent) — run this after the missing-value fill step.
text_cols = ['type', 'title', 'director', 'cast', 'country', 'rating']
df = df.assign(**{
    name: df[name].astype(str).str.lower().str.strip()
    for name in text_cols
})

In [7]:
# Parse 'date_added' (text such as "September 25, 2021") into datetime64 values.
# Note: this yields real datetimes, not "dd-mm-yyyy" strings; apply
# `.dt.strftime(...)` afterwards if a specific text layout is required.
#
# Surrounding whitespace is stripped first: the format below is matched strictly,
# so a value like " August 4, 2017" would otherwise fail to match and be silently
# coerced to NaT by errors='coerce'.
raw_dates = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(raw_dates):
    # Only text needs trimming; skipping this on an already-parsed column keeps
    # the cell safe to re-run.
    raw_dates = raw_dates.str.strip()
df['date_added'] = pd.to_datetime(raw_dates, format='%B %d, %Y', errors='coerce')


In [8]:
# Make the column headers uniform: lower-case, with spaces replaced by underscores.
df = df.rename(columns=lambda header: header.lower().replace(" ", "_"))


In [9]:
# Enforce column dtypes: 'release_year' must be an integer column.
# astype(int) raises if the column still holds missing or non-numeric values.
df = df.astype({'release_year': int})


In [10]:
# Write the cleaned frame next to the notebook; index=False keeps the row index
# out of the file.
CLEANED_CSV = "netflix_titles_cleaned.csv"
df.to_csv(CLEANED_CSV, index=False)