In [None]:
# Install the packages listed in requirements.txt. The %pip line magic
# installs into the environment of the running kernel.
%pip install -r requirements.txt

In [None]:
import warnings

import numpy as np
import pandas as pd
from dateutil import parser

In [None]:
# Load the raw CSV and remember its original row count so that later cells
# can report how many rows each cleaning step removed.
df = pd.read_csv('netflix_titles.csv')
initial_count = len(df)
print(df.shape)
print(initial_count)
df.head(10)

In [None]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence. The index is not reset at this point.
df = df.drop_duplicates(keep='first')
df.head(10)

In [None]:
# Report how many rows the de-duplication step removed, then show the number
# of missing values in each column.
count_after_drop_duplicates = len(df)
count_drop = initial_count - count_after_drop_duplicates
print("Number of rows dropped: ", count_drop)
print(df.shape)
df.isna().sum()

In [None]:
# Only rows lacking a title or a type are discarded. A blanket dropna() is
# deliberately avoided: gaps in the other columns are not a reason to throw
# a row away.
df = df.dropna(subset=['title', 'type'])
df = df.reset_index(drop=True)

# Row count after the NA filter, compared against the post-dedup count.
count_after_dropping_na = len(df)
count_drop_na = count_after_drop_duplicates - count_after_dropping_na
print(df.shape)
print("Number of rows dropped (NA): ", count_drop_na)
df.head(10)

In [None]:
# Free-text columns to normalise: surrounding whitespace is trimmed and the
# text is lower-cased. Columns absent from the frame are skipped.
text_cols = ['country', 'type', 'director', 'cast', 'listed_in']
for col in text_cols:
    if col not in df.columns:
        continue
    # astype(str) turns missing values into the literal 'nan'; the replace
    # step maps those back to real NaN so they stay missing.
    trimmed = df[col].astype(str).str.strip().replace('nan', np.nan)
    # Lower-case only the non-missing entries; NaN is passed through as-is.
    df[col] = trimmed.where(trimmed.isna(), trimmed.str.lower())
df.head(10)

In [None]:
def parse_date_flexible(date_str):
    """Parse one raw ``date_added`` value into a datetime-like object.

    The value is first handed to dateutil's free-form parser; if that fails,
    a strict ``'%B %d, %Y'`` parse is attempted as a fallback.

    Parameters
    ----------
    date_str : object
        Raw cell value. Missing values (anything ``pd.isna`` accepts) are
        allowed; other values are converted with ``str()`` and stripped of
        surrounding whitespace before parsing.

    Returns
    -------
    datetime.datetime, pandas.Timestamp or None
        The parsed date, or ``None`` when the value is missing or cannot be
        parsed by either strategy.
    """
    if pd.isna(date_str):
        return None
    text = str(date_str).strip()
    try:
        return parser.parse(text)
    except (ValueError, OverflowError):
        # dateutil signals unparseable input with ValueError (ParserError is
        # a subclass) or OverflowError; fall through to the strict format.
        pass
    try:
        return pd.to_datetime(text, format='%B %d, %Y')
    except (ValueError, TypeError):
        return None


if 'date_added' in df.columns:
    # Silence parser warnings only while the dates are being converted,
    # instead of disabling every warning for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        parsed_dates = df['date_added'].apply(parse_date_flexible)
        # The column is stored as 'dd-mm-YYYY' text; unparseable values
        # become missing.
        # NOTE(review): this cell is not safe to re-run on its own output.
        # dateutil reads ambiguous strings month-first by default, so a
        # stored '04-08-2017' would be re-parsed as April 8. Re-run from the
        # load cell instead.
        df['date_added'] = (
            pd.to_datetime(parsed_dates, errors='coerce').dt.strftime('%d-%m-%Y')
        )

if 'release_year' in df.columns:
    # Nullable integer dtype so that unparseable years can stay missing.
    df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce').astype('Int64')

# Columns stored as pandas categoricals.
for col in ('rating', 'type'):
    if col in df.columns:
        df[col] = df[col].astype('category')

df.head(10)

In [None]:
# Standardise the column labels: trim whitespace, lower-case, and replace
# spaces with underscores.
# NOTE(review): the earlier cells already address columns by lowercase names
# ('title', 'type', 'date_added', ...), so this step presumably only tidies
# labels that were already in that form - confirm against the raw header.
df.columns = df.columns.map(lambda label: label.strip().lower().replace(' ', '_'))
df.head(10)

In [None]:
# Final summary of the cleaned frame: its shape, the remaining missing
# values per column, and the resulting dtypes.
print('Final shape:', df.shape)
print('\nNull counts after cleaning:\n', df.isna().sum(), '\n')
print(df.dtypes)

# Write the cleaned data next to the notebook, without the index column.
df.to_csv('cleaned_netflix_titles.csv', index=False)
print('Saved cleaned dataset to cleaned_netflix_titles.csv')
df.head(10)