In [1]:
# 02_data_cleaning_and_preprocessing
# Banner cell: announce what this notebook produces.
banner = "Clean dataset and produce `outputs/cleaned_netflix.csv`."
print(banner)


Clean dataset and produce `outputs/cleaned_netflix.csv`.


In [2]:
# Cell 2: imports & load file
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import pandas as pd
import numpy as np

# Locations searched for the raw CSV, in priority order. The first entry is
# the original machine-specific absolute path (kept for backward
# compatibility); the remaining entries are relative to the current working
# directory so the notebook also runs from a fresh checkout.
CSV_NAME = "NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv"
CANDIDATES = [
    Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/data/raw") / CSV_NAME,
    Path("data/raw") / CSV_NAME,
    Path("data") / CSV_NAME,
    Path(CSV_NAME),
]

# is_file() rather than exists(): a directory with the same name must not be
# selected, because read_csv would then fail with a less helpful error.
DATA_PATH = next((p for p in CANDIDATES if p.is_file()), None)
if DATA_PATH is None:
    # Explicit raise instead of `assert`, which is stripped under `python -O`.
    searched = "\n  ".join(str(p) for p in CANDIDATES)
    raise FileNotFoundError(
        "CSV not found. Place the CSV in repo root or data/.\n"
        f"Searched:\n  {searched}"
    )

df = pd.read_csv(DATA_PATH)
print("Loaded shape:", df.shape)


Loaded shape: (7787, 12)


In [3]:
# Cell 3: cleaning helpers & initial fixes
def safe_str(x):
    """Return ``str(x)``, or the empty string when ``x`` is missing.

    "Missing" is whatever ``pd.isna`` reports as missing for a scalar
    (for example ``None`` or ``NaN``).
    """
    return "" if pd.isna(x) else str(x)

# Ensure description/listed_in exist and hold plain strings.
# `df.get(col, "")` cannot be used here: when the column is missing it returns
# the default (a Python str), which has no `.fillna`, so the fallback itself
# would raise AttributeError. Branch on column presence instead.
for text_col in ('description', 'listed_in'):
    if text_col in df.columns:
        df[text_col] = df[text_col].fillna("").astype(str)
    else:
        df[text_col] = ""

# Parse genres to list: split the comma-separated `listed_in` string, trim
# whitespace around each entry, and drop empty entries.
df['genres_list'] = df['listed_in'].apply(
    lambda s: [g.strip() for g in safe_str(s).split(',') if g.strip()]
)

# Parse duration to numeric (minutes or seasons as int)
def parse_duration(x):
    """Extract the leading integer from a duration string.

    The value is lower-cased and stripped; if it mentions ``min`` or
    ``season`` its first whitespace-separated token is converted to ``int``.
    The unit itself is discarded, so minutes and seasons come back on the
    same numeric scale and the caller must tell them apart some other way.

    Parameters
    ----------
    x : object
        Raw duration value, e.g. ``"93 min"`` or ``"4 Seasons"``.

    Returns
    -------
    int or float
        The parsed integer, or ``np.nan`` when ``x`` is missing, mentions
        neither unit, or does not start with an integer token.
    """
    if pd.isna(x):
        return np.nan
    s = str(x).lower().strip()
    if 'min' not in s and 'season' not in s:
        return np.nan
    try:
        return int(s.split()[0])
    except (IndexError, ValueError):
        # Only parsing failures are expected here; a bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit and hide real bugs.
        return np.nan

# Numeric duration. The unit is not kept: the number is minutes for rows whose
# `duration` mentions "min" and a season count for rows mentioning "season".
if 'duration' in df.columns:
    df['duration_num'] = df['duration'].apply(parse_duration)
else:
    df['duration_num'] = np.nan

# Clean release_year to numeric (unparseable values become NaN)
if 'release_year' in df.columns:
    df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')
else:
    df['release_year'] = np.nan

# Normalize 'type'
if 'type' in df.columns:
    df['type'] = df['type'].fillna('').astype(str)

# Remove duplicates. `release_year` is guaranteed to exist at this point (see
# above), so only the presence of `title` decides which key is used.
before = df.shape[0]
if 'title' in df.columns:
    df = df.drop_duplicates(subset=['title', 'release_year'])
    print(f"Dropped {before - df.shape[0]} duplicate rows (by title+release_year).")
else:
    # `genres_list` holds Python lists, which are unhashable, so a plain
    # df.drop_duplicates() raises TypeError. It is derived from `listed_in`,
    # so leaving it out of the comparison loses no information.
    dedupe_cols = [c for c in df.columns if c != 'genres_list']
    df = df.drop_duplicates(subset=dedupe_cols)
    print(f"Dropped {before - df.shape[0]} duplicate rows (by all columns).")


Dropped 0 duplicate rows (by title+release_year).


In [13]:
# Cell 4: Save cleaned CSV
# NOTE(review): this is a machine-specific absolute path, kept so that anything
# already reading from this location keeps working; consider deriving it from
# a configurable project root.
output_dir = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs")
# parents=True: without it mkdir raises FileNotFoundError when the project
# folder itself does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)
clean_path = output_dir / "cleaned_netflix.csv"

# CSV has no list type: `genres_list` is written as the text form of each
# list, so readers of this file must parse it back (e.g. ast.literal_eval)
# or re-split `listed_in`.
df.to_csv(clean_path, index=False)
print("Saved cleaned CSV to:", clean_path)
display(df.head(6))


Saved cleaned CSV to: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs\cleaned_netflix.csv


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,genres_list,duration_num
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...,"[International TV Shows, TV Dramas, TV Sci-Fi ...",4
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...,"[Dramas, International Movies]",93
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow...","[Horror Movies, International Movies]",78
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi...","[Action & Adventure, Independent Movies, Sci-F...",80
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...,[Dramas],123
5,s6,TV Show,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"July 1, 2017",2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...,"[International TV Shows, TV Dramas, TV Mysteries]",1


In [5]:
# Closing notes for whoever runs or reads this notebook next.
notes = (
    "Notes:",
    "- cleaned CSV contains new columns: `genres_list`, `duration_num`.",
    "- If you change cleaning parameters, re-run downstream notebooks.",
)
print("\n".join(notes))


Notes:
- cleaned CSV contains new columns: `genres_list`, `duration_num`.
- If you change cleaning parameters, re-run downstream notebooks.
