In [7]:
import os
import sys
from pathlib import Path

# Report the runtime environment: working directory, interpreter binary,
# and the first five entries of the module search path.
for label, value in (
    ("cwd:", os.getcwd()),
    ("python:", sys.executable),
    ("sys.path[0:5]:", sys.path[:5]),
):
    print(label, value)

# Verify that the scripts folder and each pipeline script exist on disk.
base = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix")
scripts_dir = base / "scripts"
print("scripts folder exists:", scripts_dir.exists())
for script_name in ("run_pipeline.py", "clean_data.py", "build_features.py"):
    print(f"{script_name} exists:", (scripts_dir / script_name).exists())


cwd: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix
python: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\venv\Scripts\python.exe
sys.path[0:5]: ['C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\python310.zip', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\DLLs', 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib', 'C:\\Users\\KIIT\\AppData\\Local\\Microsoft\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0', 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\venv']
scripts folder exists: True
run_pipeline.py exists: True
clean_data.py exists: True
build_features.py exists: True


In [8]:
import importlib.util
from pathlib import Path

# Load scripts/utils.py directly from its file location under the module
# name "utils_local", without relying on sys.path.
utils_path = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/scripts/utils.py")
spec = importlib.util.spec_from_file_location(name="utils_local", location=str(utils_path))
utils_local = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utils_local)  # runs the module body so its functions become available

# Use the loaded module's CSV helper on the raw dataset and report the row count.
raw_csv_path = "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/data/raw/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv"
df = utils_local.safe_load_csv(raw_csv_path)
print("Loaded rows:", len(df))


Loaded rows: 7787


In [10]:
# Option B: execute clean_data.py inside the current kernel, after making
# sure the project root is importable via sys.path.
import runpy
import sys
from pathlib import Path

proj_root = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix").resolve()
script = proj_root.joinpath("scripts", "clean_data.py")
csv_in = proj_root.joinpath("data", "raw", "NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv")
out_csv = proj_root.joinpath("outputs", "cleaned_netflix.csv")

# Fail fast when either required input is missing.
for label, required_path in (("Script", script), ("CSV", csv_in)):
    assert required_path.exists(), f"{label} not found: {required_path}"

# Put the project root at the front of sys.path unless it is already listed.
root_entry = str(proj_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)
    print("Inserted project root into sys.path")

# The script reads its options from sys.argv, so swap in the expected
# command line and put the original back afterwards, even on failure.
orig_argv = list(sys.argv)
sys.argv = [str(script), "--csv", str(csv_in), "--out", str(out_csv)]
try:
    runpy.run_path(str(script), run_name="__main__")
finally:
    sys.argv = orig_argv

print("Finished. Check:", out_csv)


Inserted project root into sys.path


2025-10-02 01:44:18 — INFO — Saved cleaned csv to C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs\cleaned_netflix.csv


Finished. Check: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs\cleaned_netflix.csv


In [11]:
import runpy, sys
from pathlib import Path

# Run build_features.py in-process, exactly as if it had been started from the
# command line with --cleaned / --outx.
script = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/scripts/build_features.py").resolve()
assert script.exists(), f"Script not found: {script}"

# The script parses sys.argv, so temporarily replace it with the intended
# command line. The restore lives in `finally` so that a failing script
# (including a SystemExit raised during argument parsing) cannot leave the
# kernel's sys.argv pointing at this script's arguments.
sys_argv_backup = sys.argv.copy()
sys.argv = [str(script),
            "--cleaned", "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/cleaned_netflix.csv",
            "--outx", "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/X_combined.npy"]
try:
    runpy.run_path(str(script), run_name="__main__")
finally:
    sys.argv = sys_argv_backup
print("build_features.py finished (check outputs/X_combined.npy).")


2025-10-02 01:44:59 — INFO — Saved X to C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/X_combined.npy and vectorizers to outputs/


build_features.py finished (check outputs/X_combined.npy).


In [15]:
import sys
# Show which interpreter the kernel is running on, so the install target below is unambiguous.
print("Python executable:", sys.executable)
# Install tqdm into the same interpreter the notebook is using:
# `!` passes the line to the system shell, and `{sys.executable}` is interpolated first,
# so pip is invoked through the kernel's own Python rather than whichever `pip` is on PATH.
# NOTE(review): both installs are unpinned; consider pinning versions for reproducibility.
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install tqdm


Python executable: C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\venv\Scripts\python.exe
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [16]:
import runpy, sys
from pathlib import Path

# Run the full pipeline script in-process on a small sample, exactly as if it
# had been started from the command line with the options listed below.
script = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/scripts/run_pipeline.py").resolve()
assert script.exists(), f"Script not found: {script}"

# The script parses sys.argv, so temporarily replace it with the intended
# command line. The restore lives in `finally` so that a failing script
# (including a SystemExit raised during argument parsing) cannot leave the
# kernel's sys.argv pointing at this script's arguments.
sys_argv_backup = sys.argv.copy()
sys.argv = [str(script),
            "--csv", "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/data/raw/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv",
            "--out", "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/netflix_with_clusters.csv",
            "--sample", "1000",
            "--k", "6",
            "--tfidf_max_features", "2000",
            "--svd_n_components", "40"
           ]
try:
    runpy.run_path(str(script), run_name="__main__")
finally:
    sys.argv = sys_argv_backup
print("run_pipeline.py finished (sample run). Check outputs/ for artifacts.")


2025-10-02 01:47:47 — INFO — Loading CSV...
2025-10-02 01:47:47 — INFO — Sampling 1000 rows (seed=42) for quick run...
2025-10-02 01:47:47 — INFO — Cleaning data...
2025-10-02 01:47:47 — INFO — Saved cleaned CSV to outputs/cleaned_netflix.csv (rows=1000)
2025-10-02 01:47:47 — INFO — Building features (TF-IDF + SVD + genres)...
2025-10-02 01:47:47 — INFO — Saved feature artifacts (tfidf, svd, mlb) and X_combined.npy
2025-10-02 01:47:47 — INFO — Clustering with KMeans k=6 ...
2025-10-02 01:47:50 — INFO — Saved final CSV to outputs/netflix_with_clusters.csv (silhouette=0.0240)
2025-10-02 01:47:50 — INFO — Cluster distribution:
2025-10-02 01:47:50 — INFO — Pipeline finished. Output saved at: outputs\netflix_with_clusters.csv


cluster
0     85
1    193
2     99
3    170
4    109
5    344
run_pipeline.py finished (sample run). Check outputs/ for artifacts.


In [17]:
from pathlib import Path
import pandas as pd

# Report whether the outputs folder and each expected pipeline artifact exist.
out_dir = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs")
print("Outputs exist:", out_dir.exists())
expected_artifacts = (
    "cleaned_netflix.csv",
    "X_combined.npy",
    "netflix_with_clusters.csv",
    "kmeans_final.joblib",
)
for artifact_name in expected_artifacts:
    artifact_present = out_dir.joinpath(artifact_name).exists()
    print(artifact_name, "->", artifact_present)

# Preview the clustered CSV (shape plus first eight rows) when it is available.
final = out_dir.joinpath("netflix_with_clusters.csv")
if final.exists():
    df_final = pd.read_csv(final)
    print("Final shape:", df_final.shape)
    display(df_final.head(8))


Outputs exist: True
cleaned_netflix.csv -> True
X_combined.npy -> True
netflix_with_clusters.csv -> True
kmeans_final.joblib -> True
Final shape: (1000, 15)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,genres_list,duration_num,cluster
0,s7325,Movie,Unchained: The Untold Story of Freestyle Motoc...,"Paul Taublieb, Jon Freeman",Josh Brolin,United States,"October 1, 2016",2016,TV-MA,92 min,"Documentaries, Sports Movies",This adrenaline-rush documentary traces the ri...,"['Documentaries', 'Sports Movies']",92,2
1,s4695,Movie,Our Idiot Brother,Jesse Peretz,"Paul Rudd, Elizabeth Banks, Zooey Deschanel, E...",United States,"February 26, 2019",2011,R,90 min,"Comedies, Dramas, Independent Movies",A seemingly clueless idealist relies on his ex...,"['Comedies', 'Dramas', 'Independent Movies']",90,5
2,s1323,Movie,Chhota Bheem Kungfu Dhamaka,"Rajiv Chilaka, Binayak Das","Sonal Kaushal, Rupa Bhimani, Jigna Bharadhwaj,...",India,"August 15, 2019",2019,TV-Y7,112 min,Children & Family Movies,Chhota Bheem is in China for a martial arts co...,['Children & Family Movies'],112,0
3,s5107,Movie,Ravenous,Robin Aubert,"Marc-André Grondin, Monia Chokri, Charlotte St...",Canada,"March 2, 2018",2017,TV-MA,104 min,"Horror Movies, International Movies",As a zombie plague ravages their rural Quebec ...,"['Horror Movies', 'International Movies']",104,3
4,s102,Movie,3 Seconds Divorce,Shazia Javed,,Canada,"June 15, 2019",2018,TV-PG,53 min,"Documentaries, Faith & Spirituality",A Muslim women's activist group in India prote...,"['Documentaries', 'Faith & Spirituality']",53,2
5,s3654,Movie,Listen,Philippe Aractingi,"Hadi Bou Ayash, Ruba Zarour, Yara Bou Nassar, ...",Lebanon,"October 19, 2020",2017,TV-MA,103 min,"Dramas, Independent Movies, International Movies",A sound engineer falls for a model but after a...,"['Dramas', 'Independent Movies', 'Internationa...",103,5
6,s3520,TV Show,"Lady, la vendedora de rosas",,"Natalia Reyes, Michelle Orozco, Majida Issa, E...",Colombia,"April 1, 2018",2015,TV-14,1 Season,"International TV Shows, Spanish-Language TV Sh...",This series tells the life story of Colombian ...,"['International TV Shows', 'Spanish-Language T...",1,1
7,s488,Movie,An American Tail: The Mystery of the Night Mon...,Larry Latham,"Thomas Dekker, Lacey Chabert, Jane Singer, Neh...",United States,"April 1, 2018",1999,G,75 min,Children & Family Movies,When a monster goes on a mouse-napping spree i...,['Children & Family Movies'],75,0


In [18]:
# Usage notes on running scripts with runpy from a notebook, printed one
# paragraph at a time with a blank line between paragraphs.
print(
    "Why runpy.run_path? It executes the script with __name__ == '__main__', so the script's argument parsing executes exactly as if you used python script.py ..., but within the same Python process. That is convenient for notebooks but modifies the notebook runtime state (e.g., variables, logging). Use subprocess.run(...) to execute in a separate process if you want isolation.",
    "When using runpy.run_path, always save and restore sys.argv as shown — otherwise the notebook's sys.argv gets changed.",
    "If a script uses relative paths (no absolute paths), ensure the current working directory (os.getcwd()) matches what the script expects, or pass absolute paths in arguments (I used absolute paths above).",
    "If a script is long-running, prefer subprocess.run so the notebook remains responsive.",
    "If you get errors, copy the full traceback and the exact snippet you ran and paste here — I’ll debug line-by-line.",
    sep="\n\n",
)

Why runpy.run_path? It executes the script with __name__ == '__main__', so the script's argument parsing executes exactly as if you used python script.py ..., but within the same Python process. That is convenient for notebooks but modifies the notebook runtime state (e.g., variables, logging). Use subprocess.run(...) to execute in a separate process if you want isolation.

When using runpy.run_path, always save and restore sys.argv as shown — otherwise the notebook's sys.argv gets changed.

If a script uses relative paths (no absolute paths), ensure the current working directory (os.getcwd()) matches what the script expects, or pass absolute paths in arguments (I used absolute paths above).

If a script is long-running, prefer subprocess.run so the notebook remains responsive.

If you get errors, copy the full traceback and the exact snippet you ran and paste here — I’ll debug line-by-line.
