In [2]:
# Cell 1 — Preflight: set paths, add project root to sys.path, list scripts, optionally install missing packages
import sys, subprocess, importlib, os, warnings
from pathlib import Path

warnings.filterwarnings("ignore")

# EDIT THIS IF YOUR PROJECT ROOT IS DIFFERENT
PROJECT_ROOT = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix").resolve()
SCRIPTS_DIR = PROJECT_ROOT / "scripts"
RAW_CSV = PROJECT_ROOT / "data" / "raw" / "NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv"
OUT_DIR = PROJECT_ROOT / "outputs"

# ensure outputs exists
OUT_DIR.mkdir(parents=True, exist_ok=True)

# add project root to sys.path so "import scripts.utils" works
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("Project root:", PROJECT_ROOT)
print("Scripts dir exists:", SCRIPTS_DIR.exists(), SCRIPTS_DIR)
print("Raw CSV exists:", RAW_CSV.exists(), RAW_CSV)
print("Outputs dir:", OUT_DIR)

# Create __init__.py in scripts/ if missing (so 'scripts' is importable)
if SCRIPTS_DIR.exists():
    init_py = SCRIPTS_DIR / "__init__.py"
    if not init_py.exists():
        init_py.write_text("# package marker\n")
        print("Created scripts/__init__.py")
    else:
        print("scripts/__init__.py already present")
else:
    print("WARNING: scripts/ directory not found. If your scripts are elsewhere, update PROJECT_ROOT or create scripts/")

# List python files in scripts/
py_files = []
if SCRIPTS_DIR.exists():
    py_files = sorted([p.name for p in SCRIPTS_DIR.glob("*.py")])
print("Python files found in scripts/:", py_files)

# Check common required packages and optionally install missing ones
required = ["pandas","numpy","scikit-learn","scipy","joblib","tqdm","matplotlib","seaborn"]
missing = [pkg for pkg in required if importlib.util.find_spec(pkg) is None]

if missing:
    print("Missing packages detected:", missing)
    # If you want to auto-install, set this flag to True
    AUTO_INSTALL = True
    if AUTO_INSTALL:
        print("Attempting to install missing packages with pip. This may take several minutes.")
        subprocess.check_call([sys.executable, "-m", "pip", "install"] + missing)
        print("Install attempt finished. If binaries were installed, restart the kernel before running scripts.")
    else:
        print("AUTO_INSTALL is False. To auto-install set AUTO_INSTALL=True in this cell and re-run it.")
else:
    print("All common packages appear installed.")


Project root: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix
Scripts dir exists: True C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\scripts
Raw CSV exists: True C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\data\raw\NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv
Outputs dir: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs
scripts/__init__.py already present
Python files found in scripts/: ['__init__.py', 'build_features.py', 'clean_data.py', 'run_pipeline.py', 'utils.py']
Missing packages detected: ['scikit-learn']
Attempting to install missing packages with pip. This may take several minutes.
Install attempt finished. If binaries were installed, restart the kernel before running scripts.


In [4]:
# Cell 2 — Helper: run a script (simulate CLI args), capture exceptions and log results
import runpy, sys, traceback, io, os
from pathlib import Path
from datetime import datetime

PROJECT_ROOT = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix").resolve()
SCRIPTS_DIR = PROJECT_ROOT / "scripts"
OUT_DIR = PROJECT_ROOT / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)
LOG_PATH = OUT_DIR / "pipeline_run_log.txt"

def run_script(script_name, argv_list=None, cwd=None, capture_output=False):
    """
    Run a script located in scripts/ by name (e.g., 'clean_data.py').
    - argv_list: list of strings to pass as argv after script name
    - cwd: working directory to temporarily switch to (default scripts dir)
    - capture_output: not used for now (prints to notebook)
    Returns: dict with keys: status ('OK' or 'ERROR'), exc (exception string or None)
    """
    script_path = (SCRIPTS_DIR / script_name).resolve()
    if not script_path.exists():
        msg = f"Script not found: {script_path}"
        print(msg)
        return {"status":"NOT_FOUND", "exc": msg}
    argv_backup = sys.argv.copy()
    sys.argv = [str(script_path)] + (argv_list if argv_list else [])
    old_cwd = os.getcwd()
    if cwd is None:
        cwd = SCRIPTS_DIR
    os.chdir(str(cwd))
    status = "OK"
    exc_text = None
    print("\n" + "="*70)
    print(f"RUNNING: {script_path.name}")
    print("Simulated argv:", sys.argv)
    try:
        runpy.run_path(str(script_path), run_name="__main__")
        print(f"Finished: {script_path.name} (status=OK)")
    except Exception:
        status = "ERROR"
        exc_buf = io.StringIO()
        traceback.print_exc(file=exc_buf)
        exc_text = exc_buf.getvalue()
        print(f"ERROR while running {script_path.name}:\n", exc_text)
    finally:
        # restore
        sys.argv = argv_backup
        os.chdir(old_cwd)
        with open(LOG_PATH, "a", encoding="utf-8") as f:
            f.write(f"{datetime.now().isoformat()} - {script_path.name} - STATUS={status}\n")
            if exc_text:
                f.write(exc_text + "\n")
    return {"status": status, "exc": exc_text}


In [5]:
# Cell 3 — Run clean_data.py (if your scripts include it)
from pathlib import Path
PROJECT_ROOT = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix").resolve()
SCRIPTS_DIR = PROJECT_ROOT / "scripts"
RAW_CSV = PROJECT_ROOT / "data/raw/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv"
OUT_DIR = PROJECT_ROOT / "outputs"

script_name = "clean_data.py"
if (SCRIPTS_DIR / script_name).exists():
    result = run_script(script_name, argv_list=[
        "--csv", str(RAW_CSV),
        "--out", str(OUT_DIR / "cleaned_netflix.csv")
    ])
    print("Result:", result)
else:
    print(f"{script_name} not found in {SCRIPTS_DIR}. Skipping.")



RUNNING: clean_data.py
Simulated argv: ['C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\scripts\\clean_data.py', '--csv', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\data\\raw\\NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv', '--out', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\outputs\\cleaned_netflix.csv']


2025-10-06 15:41:29 — INFO — Saved cleaned csv to C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs\cleaned_netflix.csv


Finished: clean_data.py (status=OK)
Result: {'status': 'OK', 'exc': None}


In [6]:
# Cell 4 — Run run_pipeline.py (end-to-end pipeline)
from pathlib import Path
PROJECT_ROOT = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix").resolve()
SCRIPTS_DIR = PROJECT_ROOT / "scripts"
RAW_CSV = PROJECT_ROOT / "data/raw/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv"
OUT_DIR = PROJECT_ROOT / "outputs"

script_name = "run_pipeline.py"
if (SCRIPTS_DIR / script_name).exists():
    # Example args: adjust --n_clusters and other flags as your script expects
    argv = [
        "--csv", str(RAW_CSV),
        "--out", str(OUT_DIR),
        "--n_clusters", "5",
        "--svd_n_components", "40"
    ]
    result = run_script(script_name, argv_list=argv)
    print("Result:", result)
else:
    print(f"{script_name} not found. Check scripts/ directory for pipeline script name.")



RUNNING: run_pipeline.py
Simulated argv: ['C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\scripts\\run_pipeline.py', '--csv', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\data\\raw\\NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv', '--out', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\outputs', '--n_clusters', '5', '--svd_n_components', '40']


2025-10-06 15:41:49 — INFO — Loading CSV...
2025-10-06 15:41:49 — INFO — Cleaning data...
2025-10-06 15:41:49 — INFO — Saved cleaned CSV to outputs/cleaned_netflix.csv (rows=7787)
2025-10-06 15:41:49 — INFO — Building features (TF-IDF + SVD + genres)...
2025-10-06 15:41:50 — INFO — Saved feature artifacts (tfidf, svd, mlb) and X_combined.npy
2025-10-06 15:41:50 — INFO — Clustering with KMeans k=6 ...
2025-10-06 15:41:55 — INFO — Saved final CSV to outputs/outputs (silhouette=0.0342)
2025-10-06 15:41:55 — INFO — Cluster distribution:
2025-10-06 15:41:55 — INFO — Pipeline finished. Output saved at: outputs\outputs


cluster
0    1104
1     329
2     787
3    1096
4    3654
5     817
Finished: run_pipeline.py (status=OK)
Result: {'status': 'OK', 'exc': None}


In [7]:
# Cell 5 — Master pipeline runner: edit steps below to match your scripts & CLI args
from pathlib import Path
PROJECT_ROOT = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix").resolve()
SCRIPTS_DIR = PROJECT_ROOT / "scripts"
OUT_DIR = PROJECT_ROOT / "outputs"
RAW_CSV = PROJECT_ROOT / "data/raw/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv"

pipeline_steps = [
    # Example step: Clean data
    {"script":"clean_data.py", "args":["--csv", str(RAW_CSV), "--out", str(OUT_DIR / "cleaned_netflix.csv")]},
    # Example step: feature creation (if you have such a script)
    # {"script":"create_features.py", "args":["--in", str(OUT_DIR/"cleaned_netflix.csv"), "--out", str(OUT_DIR/"X_combined.npy")]},
    # Main pipeline
    {"script":"run_pipeline.py", "args":["--csv", str(RAW_CSV), "--out", str(OUT_DIR), "--n_clusters", "6", "--svd_n_components", "40"]}
]

for step in pipeline_steps:
    script_name = step["script"]
    args = step.get("args", [])
    script_path = SCRIPTS_DIR / script_name
    if script_path.exists():
        print("\nLaunching:", script_name, "with args:", args)
        res = run_script(script_name, argv_list=args)
        if res["status"] != "OK":
            print(f"Step {script_name} failed. Check logs at {OUT_DIR/'pipeline_run_log.txt'} and the error above. Aborting remaining steps.")
            break
    else:
        print(f"Skipping {script_name} (not found).")


2025-10-06 15:42:15 — INFO — Saved cleaned csv to C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs\cleaned_netflix.csv
2025-10-06 15:42:15 — INFO — Loading CSV...



Launching: clean_data.py with args: ['--csv', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\data\\raw\\NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv', '--out', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\outputs\\cleaned_netflix.csv']

RUNNING: clean_data.py
Simulated argv: ['C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\scripts\\clean_data.py', '--csv', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\data\\raw\\NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv', '--out', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\outputs\\cleaned_netflix.csv']
Finished: clean_data.py (status=OK)

Launching: run_pipeline.py with args: ['--csv', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\data\\raw\\NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv', '--out', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\outputs', '--n_clusters', '6', '--svd_n_components', '40']

RUNNING: run_pipeline.py
Simulated argv: ['C:\\Users\\KIIT\\One

2025-10-06 15:42:15 — INFO — Cleaning data...
2025-10-06 15:42:15 — INFO — Saved cleaned CSV to outputs/cleaned_netflix.csv (rows=7787)
2025-10-06 15:42:15 — INFO — Building features (TF-IDF + SVD + genres)...
2025-10-06 15:42:16 — INFO — Saved feature artifacts (tfidf, svd, mlb) and X_combined.npy
2025-10-06 15:42:16 — INFO — Clustering with KMeans k=6 ...
2025-10-06 15:42:17 — INFO — Saved final CSV to outputs/outputs (silhouette=0.0342)
2025-10-06 15:42:17 — INFO — Cluster distribution:
2025-10-06 15:42:17 — INFO — Pipeline finished. Output saved at: outputs\outputs


cluster
0    1104
1     329
2     787
3    1096
4    3654
5     817
Finished: run_pipeline.py (status=OK)


In [8]:
# Cell 6 — Verify outputs: list recent files and preview CSVs
from pathlib import Path
import os
import pandas as pd

PROJECT_ROOT = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix").resolve()
OUT_DIR = PROJECT_ROOT / "outputs"

print("Recent files in outputs/ (most recent first):")
files = sorted(OUT_DIR.glob("*"), key=lambda p: p.stat().st_mtime, reverse=True)[:40]
for f in files:
    try:
        size_kb = f.stat().st_size / 1024
    except Exception:
        size_kb = 0
    print("-", f.name, f"({size_kb:.1f} KB)")

# Preview common outputs (if they exist)
for fname in ["cleaned_netflix.csv", "netflix_with_clusters.csv", "cluster_summary_table.csv", "cluster_summary_table_enhanced.csv"]:
    p = OUT_DIR / fname
    if p.exists():
        print(f"\nPreview {fname}:")
        display(pd.read_csv(p).head(6))
    else:
        print(f"{fname} not found in outputs/")


Recent files in outputs/ (most recent first):
- pipeline_run_log.txt (0.2 KB)
- cleaned_netflix.csv (3275.8 KB)
- netflix_with_clusters_summary.csv (1993.3 KB)
- pca_clusters_sample.png (232.3 KB)
- cluster_counts.png (20.6 KB)
- cluster_summary_table_enhanced.csv (2.1 KB)
- cluster_interpretation_snippets.md (3.7 KB)
- cluster_summary_table.csv (1.0 KB)
- netflix_with_clusters.csv (3291.0 KB)
- mlb_encoder.joblib (1.2 KB)
- scaler.joblib (0.9 KB)
- svd_transformer.joblib (939.2 KB)
- tfidf_vectorizer.joblib (108.9 KB)
- X_combined.npy (5110.3 KB)

Preview cleaned_netflix.csv:


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,genres_list,duration_num
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...,"['International TV Shows', 'TV Dramas', 'TV Sc...",4
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...,"['Dramas', 'International Movies']",93
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow...","['Horror Movies', 'International Movies']",78
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi...","['Action & Adventure', 'Independent Movies', '...",80
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...,['Dramas'],123
5,s6,TV Show,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"July 1, 2017",2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...,"['International TV Shows', 'TV Dramas', 'TV My...",1



Preview netflix_with_clusters.csv:


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_num,genres_list,cluster
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...,4,"['International TV Shows', 'TV Dramas', 'TV Sc...",1
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...,93,"['Dramas', 'International Movies']",0
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow...",78,"['Horror Movies', 'International Movies']",0
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi...",80,"['Action & Adventure', 'Independent Movies', '...",0
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...,123,['Dramas'],0
5,s6,TV Show,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"July 1, 2017",2016,TV-MA,1 Season,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...,1,"['International TV Shows', 'TV Dramas', 'TV My...",1



Preview cluster_summary_table.csv:


Unnamed: 0,cluster,n_titles,top_genres,top_terms,sample_titles
0,0,4682,International Movies (2253); Dramas (1886); Co...,"life, young, man, new, family, woman",7:19 | 23:59 | 9 | 21 | 122 | 706 | 1920 | 1922
1,1,2639,International TV Shows (1197); TV Dramas (700)...,"series, world, life, friends, new, family",3% | 46 | 1983 | 1994 | Feb-09 | ​SAINT SEIYA:...
2,2,466,Dramas (208); International Movies (166); Come...,"young, family, man, love, woman, life",187 | 28 Days | 300 Miles to Heaven | A Bridge...



Preview cluster_summary_table_enhanced.csv:


Unnamed: 0,cluster,n_titles,top_genres,top_terms,sample_titles
0,0,4682,"International Movies, Dramas, Comedies","life, young, man, new, family",7:19 (2016) — After a devastating earthquake h...
1,1,2639,"International TV Shows, TV Dramas, TV Comedies","series, world, life, friends, new",3% (2020) — In a future where the elite inhabi...
2,2,466,"Dramas, International Movies, Comedies","young, family, man, woman, love",187 (1997) — After one of his high school stud...
