In [1]:
# Notebook 02 — data cleaning and preprocessing.
# Print a one-line banner stating the goal of this notebook and the file it produces.
print("Clean dataset and produce `outputs/cleaned_netflix.csv`.")


Clean dataset and produce `outputs/cleaned_netflix.csv`.


In [14]:
# Cell 1
import warnings
warnings.filterwarnings("ignore")

import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

# Project layout.
# The original machine-specific location is kept as the default so existing runs behave
# exactly as before; set the NETFLIX_PROJECT_ROOT environment variable to point the
# notebook at a checkout living somewhere else.
DEFAULT_PROJECT_ROOT = "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix"
# `or` (rather than a .get default) also falls back when the variable is set but empty.
PROJECT_ROOT = Path(os.environ.get("NETFLIX_PROJECT_ROOT") or DEFAULT_PROJECT_ROOT)
RAW_CSV = PROJECT_ROOT / "data" / "raw" / "NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv"
CLEANED_OUT = PROJECT_ROOT / "outputs" / "cleaned_netflix.csv"

# Make sure the output folder exists before any later cell tries to write into it.
CLEANED_OUT.parent.mkdir(parents=True, exist_ok=True)

# Fail early with an actionable message instead of a bare path error from pandas.
if not RAW_CSV.is_file():
    raise FileNotFoundError(
        f"Raw dataset not found at {RAW_CSV}. "
        "Set the NETFLIX_PROJECT_ROOT environment variable to the project folder."
    )

# Load the raw dataset; later cells add columns to and mutate this `df`.
df = pd.read_csv(RAW_CSV)
print("Loaded raw data. Shape:", df.shape)


Loaded raw data. Shape: (7787, 12)


In [8]:
# Cell 2 — de-duplicate titles.
# Rows sharing the same (title, release_year) pair count as duplicates. `df` is modified
# in place, so this cell has to run after the cell that loads the raw data.
dedupe_keys = ["title", "release_year"]
rows_before = len(df)
df.drop_duplicates(subset=dedupe_keys, inplace=True)
rows_removed = rows_before - len(df)
print(f"Removed {rows_removed} duplicate rows. New shape: {df.shape}")


Removed 0 duplicate rows. New shape: (7787, 12)


In [15]:
# Cell 3 — normalise the key text fields.
# Missing values become empty strings and every value is cast to str, so the string
# operations in later cells never meet a NaN. Columns absent from `df` are skipped.
text_columns = [name for name in ("description", "listed_in", "duration") if name in df.columns]
for name in text_columns:
    df[name] = df[name].fillna("").astype(str)
print("Filled missing values for description, listed_in, duration.")


Filled missing values for description, listed_in, duration.


In [16]:
# Cell 4
def parse_duration_to_num(val):
    """Extract the leading number from a duration label.

    Parameters
    ----------
    val : object
        A duration value such as "93 min" or "4 Seasons"; may also be missing
        (None / NaN) or an empty string.

    Returns
    -------
    int or float
        The first run of digits in the text as an ``int``. ``np.nan`` when the
        value is missing, empty, or contains no digits.

    Notes
    -----
    Only the number is kept; the unit ("min" vs "Season(s)") is discarded, so
    the result mixes minutes and season counts.
    """
    is_missing = pd.isna(val) or val == ""
    if is_missing:
        return np.nan

    text = str(val).lower()
    digits = re.search(r"(\d+)", text)
    # No digits anywhere in the text -> treat as missing.
    return int(digits.group(1)) if digits else np.nan

# Derive the numeric duration column from the raw text column, one value at a time.
duration_values = df["duration"].apply(parse_duration_to_num)
df["duration_num"] = duration_values

# Show the raw and parsed values side by side as a quick sanity check.
print("Parsed duration -> duration_num. Example:")
preview_columns = ["duration", "duration_num"]
display(df[preview_columns].head(6))


Parsed duration -> duration_num. Example:


Unnamed: 0,duration,duration_num
0,4 Seasons,4
1,93 min,93
2,78 min,78
3,80 min,80
4,123 min,123
5,1 Season,1


In [17]:
# Cell 5 — turn the comma-separated `listed_in` text into a list column `genres_list`.
def _split_genres(raw):
    """Split a comma-separated genre string into trimmed, non-empty labels."""
    trimmed = (piece.strip() for piece in raw.split(","))
    return [piece for piece in trimmed if piece]


if "listed_in" not in df.columns:
    # No genre column available: give every row its own empty list.
    df["genres_list"] = [[] for _ in range(len(df))]
else:
    # Expects `listed_in` to already hold strings (the text-fill cell ensures this).
    df["genres_list"] = df["listed_in"].apply(_split_genres)

print("Converted 'listed_in' to genres_list (list). Sample:")
sample_columns = ["listed_in", "genres_list"]
display(df[sample_columns].head(4))


Converted 'listed_in' to genres_list (list). Sample:


Unnamed: 0,listed_in,genres_list
0,"International TV Shows, TV Dramas, TV Sci-Fi &...","[International TV Shows, TV Dramas, TV Sci-Fi ..."
1,"Dramas, International Movies","[Dramas, International Movies]"
2,"Horror Movies, International Movies","[Horror Movies, International Movies]"
3,"Action & Adventure, Independent Movies, Sci-Fi...","[Action & Adventure, Independent Movies, Sci-F..."


In [18]:
# Cell 6 — persist the cleaned frame for the downstream notebooks.
# index=False keeps the row index out of the file.
# NOTE(review): `genres_list` holds Python lists, which CSV can only store as text —
# confirm that whatever reads this file parses that column back into lists.
df.to_csv(path_or_buf=CLEANED_OUT, index=False)
print("Saved cleaned CSV to:", CLEANED_OUT)


Saved cleaned CSV to: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs\cleaned_netflix.csv
