
# MovieLens 1M — **NMF + K‑Means** (factor/k sweep) vs **SVD baselines** (K‑Means++, DBSCAN)

This notebook:
- Loads **MovieLens 1M** robustly
- Builds **user × movie** implicit matrix (≥4 stars → 1)
- **Baseline (SVD)** → standardized SVD features → **K‑Means++** & **DBSCAN** (with suitability verdicts)
- **NMF sweep** over factors × k to pick the best, with a suitability verdict
- **Cluster profiling**: top factors & sample movies, plus top genres per cluster
- Exports cluster labels to CSV


In [2]:

# ---- Configuration ----
# Every tunable knob of the notebook is defined in this cell; later cells read these names.
# 1) Set this to the folder that contains ratings.dat, users.dat, movies.dat
DATA_DIR = "datasets/movielens-1m/"  

# Signals & weighting (used for SVD baselines)
# NOTE(review): IMPLICIT_THRESHOLD also defines the matrix that the NMF sweep factorizes,
# so it is not specific to the SVD baselines.
USE_IMPLICIT = True            # ≥ IMPLICIT_THRESHOLD → 1 (else 0); NOTE(review): no visible cell reads this flag — binarization is always applied
IMPLICIT_THRESHOLD = 4
HEAD_PRUNE_PCT = 0.005         # drop top 0.5% most-rated movies (falsy value disables pruning)
TFIDF_ALPHA = 1.2              # exponent applied to the IDF weights (1.0 = plain IDF); higher = stronger down-weighting. Original note suggested 1.3–2.0; the current value sits just below that range

# Preprocessing (user filter + SVD dims for baselines)
MIN_RATINGS_PER_USER = 100     # stronger raters → clearer clusters (try 120–150)
ROW_NORMALIZE = True           # L2 normalize rows before SVD
N_COMPONENTS = 140             # SVD embedding size

# K sweeps (K_LIST is shared by the SVD K-Means baseline and the NMF sweep)
K_LIST = [10, 12, 14, 16, 18]
KMEANS_N_INIT = 20
KMEANS_MAX_ITER = 300
RANDOM_STATE = 42              # passed as random_state to SVD, NMF and K-Means, and used to seed NumPy

# DBSCAN
DBSCAN_MIN_SAMPLES = 10
DBSCAN_EPS_LIST = None         # None → eps candidates come from percentiles of the k-distance distribution; set a list to override (e.g., [0.1, 0.12, 0.14])

# NMF sweep
N_FACTORS_LIST = [48, 64, 96]  # try a few; 64 is a good default

# Profiling
TOP_GENRES_N = 10              # default number of genres returned per cluster
TOP_MOVIES_PER_FACTOR = 12     # for factor interpretation (default top_n; the profiling loop passes top_n=10 explicitly)

# Quick iteration
MAX_USERS = None              # optional cap on users for faster runs, None = no cap; NOTE(review): confirm a downstream cell actually applies it

In [None]:

import os, numpy as np, pandas as pd
from scipy.sparse import csr_matrix, csc_matrix, diags
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.preprocessing import StandardScaler, normalize as l2norm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

# Seed NumPy's global RNG; the SVD, NMF and K-Means estimators below additionally
# receive random_state=RANDOM_STATE explicitly.
np.random.seed(RANDOM_STATE)

def suitability_label(method, sil_cos=None, db=None, n_clusters=None,
                      min_silhouette=0.05, max_davies_bouldin=8.0):
    """Print and return a coarse SUITABLE / NOT SUITABLE verdict for a clustering.

    Parameters
    ----------
    method : str
        Label printed in front of the verdict.
    sil_cos : float-like or None
        Silhouette score. Missing, NaN, infinite or non-numeric values count
        as "silhouette undefined" and make the verdict negative.
    db : float-like or None
        Davies–Bouldin index. Only checked when it is a finite number.
    n_clusters : int or None
        Number of clusters found; fewer than 2 makes the verdict negative.
    min_silhouette : float
        Silhouette values strictly below this are rejected (default 0.05).
    max_davies_bouldin : float
        Davies–Bouldin values strictly above this are rejected (default 8.0).

    Returns
    -------
    bool
        True when no rejection reason was collected.
    """
    def _finite_or_none(value):
        # Normalise any numeric scalar (Python float, np.float32, np.float64, int, ...)
        # to a finite Python float. np.float32 is not a subclass of float, so an
        # isinstance(..., float) test would let a float32 NaN slip through.
        if value is None:
            return None
        try:
            as_float = float(value)
        except (TypeError, ValueError):
            return None
        return as_float if np.isfinite(as_float) else None

    reasons = []

    if n_clusters is not None and n_clusters < 2:
        reasons.append("only one cluster")

    sil = _finite_or_none(sil_cos)
    if sil is None:
        reasons.append("silhouette undefined")
    elif sil < min_silhouette:
        reasons.append(f"low silhouette ({sil:.3f})")

    dbi = _finite_or_none(db)
    if dbi is not None and dbi > max_davies_bouldin:
        reasons.append(f"high Davies–Bouldin ({dbi:.2f})")

    ok = not reasons
    verdict = "SUITABLE" if ok else "NOT SUITABLE"
    reason_txt = "; ".join(reasons) if reasons else "metrics look OK"
    print(f"[{method}] {verdict} — {reason_txt}")
    return ok


## 1) Robust load (MovieLens 1M)

In [None]:

# Load the three MovieLens 1M tables; movies.dat gets a tolerant read plus a manual fallback
ratings_path, users_path, movies_path = (
    os.path.join(DATA_DIR, fname) for fname in ("ratings.dat", "users.dat", "movies.dat")
)

# Options shared by all three reads
_dat_opts = dict(sep="::", engine="python", encoding="latin-1")
_movie_cols = ["movie_id", "title", "genres"]

ratings = pd.read_csv(ratings_path, names=["user_id", "movie_id", "rating", "timestamp"], **_dat_opts)
users = pd.read_csv(users_path, names=["user_id", "gender", "age", "occupation", "zip"], **_dat_opts)

try:
    movies = pd.read_csv(movies_path, names=_movie_cols, on_bad_lines="skip", **_dat_opts)
except TypeError:
    # Retry without on_bad_lines when read_csv rejects that keyword
    movies = pd.read_csv(movies_path, names=_movie_cols, **_dat_opts)

# If the parse did not yield three columns with a fully populated movie_id,
# split each line by hand; any extra '::' pieces are glued back into the genres field
parse_failed = movies.shape[1] != 3 or movies["movie_id"].isna().any()
if parse_failed:
    with open(movies_path, "r", encoding="latin-1", errors="replace") as fh:
        pieces_per_line = (raw.rstrip("\r\n").split("::") for raw in fh)
        rows = [
            (int(pieces[0]), pieces[1], "::".join(pieces[2:]))
            for pieces in pieces_per_line
            if len(pieces) >= 3
        ]
    movies = pd.DataFrame(rows, columns=_movie_cols)

print("Counts → ratings:", len(ratings), "| users:", users["user_id"].nunique(),
      "| movies:", len(movies), "| unique movie_ids:", movies["movie_id"].nunique())

# Sanity checks against the known MovieLens 1M sizes
n_users = users["user_id"].nunique()
assert len(ratings) == 1000209, "ratings.dat doesn't look like MovieLens 1M"
assert n_users == 6040, "users.dat doesn't look like MovieLens 1M"

# One row per movie, keyed by movie_id for the label-based lookups in later cells
movies = movies.drop_duplicates(subset="movie_id").set_index("movie_id")


Counts → ratings: 1000209 | users: 6040 | movies: 3883 | unique movie_ids: 3883


## 2) Build user×movie implicit matrix (≥4 stars → 1)

In [None]:

# Sorted unique ids fix the row (user) and column (movie) order of the matrix
user_ids = np.sort(ratings["user_id"].unique())
movie_ids = np.sort(ratings["movie_id"].unique())
u_index = dict(zip(user_ids, range(len(user_ids))))
m_index = dict(zip(movie_ids, range(len(movie_ids))))

# One (row, col, value) triple per rating: value is 1.0 when rating ≥ IMPLICIT_THRESHOLD, else 0.0.
# NOTE(review): sub-threshold ratings are passed in as explicit 0.0 entries, so structural
# counts taken from indptr presumably count ratings rather than likes — confirm if that matters.
rows = ratings["user_id"].map(u_index).to_numpy()
cols = ratings["movie_id"].map(m_index).to_numpy()
vals = ratings["rating"].ge(IMPLICIT_THRESHOLD).to_numpy(dtype=np.float32)

matrix_shape = (len(user_ids), len(movie_ids))
X_implicit = csr_matrix((vals, (rows, cols)), shape=matrix_shape, dtype=np.float32)
X_implicit


NameError: name 'np' is not defined

## 3) User filtering (by number of ratings per user in the unpruned matrix)

In [None]:

# Stored entries per row of X_implicit, compared against MIN_RATINGS_PER_USER.
# NOTE(review): X_implicit is built with one entry per rating (0.0 or 1.0), so despite the
# variable name this presumably counts ratings, not likes — confirm.
user_nlikes = np.diff(X_implicit.indptr)
keep_users_idx = np.where(user_nlikes >= MIN_RATINGS_PER_USER)[0]

# Apply the MAX_USERS quick-iteration knob from the config cell (no-op when it is None/0
# or not smaller than the number of qualifying users). A seeded sample keeps runs
# reproducible; sorting preserves the original row order.
if MAX_USERS and 0 < int(MAX_USERS) < len(keep_users_idx):
    _subset_rng = np.random.RandomState(RANDOM_STATE)
    keep_users_idx = np.sort(
        _subset_rng.choice(keep_users_idx, size=int(MAX_USERS), replace=False)
    )
    print(f"MAX_USERS={int(MAX_USERS)} → using a random subset of the qualifying users")

user_ids_kept = user_ids[keep_users_idx]
print(f"Users kept: {len(keep_users_idx)} / {len(user_ids)} (min_ratings={MIN_RATINGS_PER_USER})")


Users kept: 2945 / 6040 (min_ratings=100)


## 4) Baseline SVD features (for K‑Means++ & DBSCAN)

In [None]:

# Start from implicit matrix, optionally head-prune & TF-IDF^alpha across movies
Xb = X_implicit.copy()
# Column → movie id lookup for the baseline matrix. Defined unconditionally so it
# exists (and is never stale) when head-pruning is switched off.
movie_ids_b = movie_ids

# Head-prune very popular movies (optional)
if HEAD_PRUNE_PCT and HEAD_PRUNE_PCT > 0:
    # Stored entries per column. NOTE(review): presumably the number of ratings per movie,
    # since X_implicit keeps an entry for every rating — confirm.
    df_movies = np.diff(csc_matrix(Xb).indptr)
    cut = np.quantile(df_movies, 1 - HEAD_PRUNE_PCT)
    keep_mask = df_movies <= cut
    Xb = Xb[:, keep_mask]
    movie_ids_b = movie_ids[keep_mask]
    print("Baseline head-prune → kept movies:", Xb.shape[1], "| dropped:", (~keep_mask).sum())

# TF-IDF^alpha (down-weight popular movies); computed over all users, before the user filter
df = np.diff(csc_matrix(Xb).indptr).astype(np.float32)
idf = np.log((1.0 + Xb.shape[0]) / (1.0 + df)) + 1.0
idf = idf ** float(TFIDF_ALPHA) if TFIDF_ALPHA != 1.0 else idf
Xb = Xb @ diags(idf)

# Apply user filter (based on original counts)
Xb = Xb[keep_users_idx]
print("Baseline matrix shape after filtering:", Xb.shape)

# L2 row-norm → SVD → standardize → L2 (spherical)
# copy=False lets the normalisation reuse Xb's storage; Xb is not read again in this cell
Xbn = l2norm(Xb, norm="l2", axis=1, copy=False) if ROW_NORMALIZE else Xb
svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=RANDOM_STATE)
Xs = svd.fit_transform(Xbn)
print(f"SVD shape: {Xs.shape} | explained variance sum ~{svd.explained_variance_ratio_.sum():.3f}")
Z = StandardScaler().fit_transform(Xs)
Zs = l2norm(Z, norm="l2", axis=1)
Zs.shape


Baseline head-prune → kept movies: 3687 | dropped: 19
Baseline matrix shape after filtering: (2945, 3687)
SVD shape: (2945, 140) | explained variance sum ~0.333


(2945, 140)

## 5) Baselines — K‑Means++ sweep (on SVD features)

In [None]:

# Fit K-Means++ once per k in K_LIST on the spherical SVD features and record
# (k, cosine silhouette, Davies–Bouldin, inertia) for each fit
kmeans_opts = dict(init="k-means++", n_init=KMEANS_N_INIT, max_iter=KMEANS_MAX_ITER,
                   random_state=RANDOM_STATE)
rows = []
for k in K_LIST:
    km = KMeans(n_clusters=k, **kmeans_opts)
    labels = km.fit_predict(Zs)
    sil = silhouette_score(Zs, labels, metric="cosine")
    db = davies_bouldin_score(Zs, labels)
    rows.append((k, sil, db, km.inertia_))

# Best silhouette first
df_km = pd.DataFrame(rows, columns=["k", "sil_cosine", "davies_bouldin", "inertia"])
df_km = df_km.sort_values(by="sil_cosine", ascending=False)

print("=== Baseline: K-Means++ on SVD features ===")
print(df_km.to_string(index=False, float_format=lambda x: f"{x:.3f}"))

# Verdict for the top row; the returned bool is the cell's displayed value
k0, sil0, db0 = df_km.iloc[0][["k", "sil_cosine", "davies_bouldin"]]
suitability_label(f"K-Means++ baseline (k={int(k0)})", sil_cos=float(sil0), db=float(db0), n_clusters=int(k0))


=== Baseline: K-Means++ on SVD features ===
 k  sil_cosine  davies_bouldin  inertia
18       0.029           6.113 2744.900
16       0.028           6.420 2760.398
14       0.026           6.690 2779.363
12       0.024           6.896 2795.615
10       0.021           7.288 2818.201
[K-Means++ baseline (k=18)] NOT SUITABLE — low silhouette (0.029)


False

## 6) Baselines — DBSCAN (cosine) on SVD features

In [None]:

# DBSCAN (cosine) sweep over candidate eps values on the SVD features.
metric = "cosine"
k = max(2, DBSCAN_MIN_SAMPLES)
if DBSCAN_EPS_LIST is None:
    # Derive eps candidates from upper percentiles of the k-th neighbour distance.
    # The query points are the fitted points themselves, so presumably each point
    # appears as its own nearest neighbour — verify against the sklearn docs.
    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(Zs)
    dists, _nbr_idx = nbrs.kneighbors(Zs)
    k_d = np.sort(dists[:, k-1])
    perc = [80, 85, 90, 92, 94, 96, 98]
    eps_list = list(np.unique(np.round(np.percentile(k_d, perc), 4)))
else:
    eps_list = DBSCAN_EPS_LIST

rows = []
labels_by_eps = {}   # eps → label array, kept so the winning fit need not be recomputed
for eps in eps_list:
    eps = float(eps)
    labels = DBSCAN(eps=eps, min_samples=DBSCAN_MIN_SAMPLES, metric=metric, n_jobs=-1).fit_predict(Zs)
    labels_by_eps[eps] = labels
    n_noise = int(np.sum(labels == -1))
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    if n_clusters >= 2:
        # Score only the clustered points; noise (-1) is excluded
        m = labels != -1
        sil = silhouette_score(Zs[m], labels[m], metric="cosine")
        db  = davies_bouldin_score(Zs[m], labels[m])
    else:
        sil = np.nan; db = np.nan
    rows.append((eps, n_clusters, n_noise, sil, db))

# Rank: most clusters first, ties broken by higher silhouette
df_db = pd.DataFrame(rows, columns=["eps","n_clusters","noise","sil_cosine","davies_bouldin"]).sort_values(["n_clusters","sil_cosine"], ascending=[False, False])
print("=== Baseline: DBSCAN (cosine) on SVD features ===")
print(df_db.to_string(index=False, float_format=lambda x: f"{x:.3f}" if isinstance(x,float) else str(x)))

best_row = df_db.iloc[0] if len(df_db) else None
if best_row is None or int(best_row["n_clusters"]) < 2 or pd.isna(best_row["sil_cosine"]):
    suitability_label("DBSCAN baseline", sil_cos=np.nan, db=np.nan, n_clusters=1)
else:
    # Reuse the labels and metrics already computed in the sweep instead of
    # refitting DBSCAN with the same parameters on the same data.
    eps_best = float(best_row["eps"])
    labels_db = labels_by_eps[eps_best]
    suitability_label(
        f"DBSCAN baseline (eps={eps_best:.4f})",
        sil_cos=float(best_row["sil_cosine"]),
        db=float(best_row["davies_bouldin"]),
        n_clusters=int(best_row["n_clusters"]),
    )


=== Baseline: DBSCAN (cosine) on SVD features ===
  eps  n_clusters  noise  sil_cosine  davies_bouldin
0.747           1      1         NaN             NaN
0.751           1      0         NaN             NaN
0.755           1      0         NaN             NaN
0.756           1      0         NaN             NaN
0.759           1      0         NaN             NaN
0.761           1      0         NaN             NaN
0.766           1      0         NaN             NaN
[DBSCAN baseline] NOT SUITABLE — only one cluster; silhouette undefined


## 7) NMF factor × k sweep (pick best)

In [None]:

# NMF is fitted on the un-weighted implicit matrix restricted to the kept users (no TF-IDF here)
Xn = X_implicit[keep_users_idx]

kmeans_opts = dict(init="k-means++", n_init=KMEANS_N_INIT, max_iter=KMEANS_MAX_ITER,
                   random_state=RANDOM_STATE)
nmf_results = []
best = {"sil": -np.inf}   # running winner, compared on cosine silhouette

for nf in N_FACTORS_LIST:
    # One factorization per factor count, reused for every k
    nmf = NMF(n_components=nf, init="nndsvda", max_iter=400, l1_ratio=0.0, random_state=RANDOM_STATE)
    W = nmf.fit_transform(Xn)             # users × factors (non-negative)
    Wn = l2norm(W, norm="l2", axis=1)     # unit-length rows for cosine geometry
    for k in K_LIST:
        km = KMeans(n_clusters=k, **kmeans_opts)
        labels = km.fit_predict(Wn)
        sil = silhouette_score(Wn, labels, metric="cosine")
        db = davies_bouldin_score(Wn, labels)
        nmf_results.append((nf, k, sil, db))
        # Strict '>' keeps the earliest configuration on ties
        if sil > best["sil"]:
            best = dict(nf=nf, k=k, sil=sil, db=db, labels=labels, W=W, H=nmf.components_)

df_nmf = pd.DataFrame(nmf_results, columns=["n_factors", "k", "sil_cosine", "davies_bouldin"])
df_nmf = df_nmf.sort_values(by="sil_cosine", ascending=False)
print("=== NMF + K-Means (factor × k sweep) ===")
print(df_nmf.head(10).to_string(index=False, float_format=lambda x: f"{x:.3f}"))

suitability_label(f"NMF + K-Means (nf={best['nf']}, k={best['k']})", sil_cos=float(best["sil"]), db=float(best["db"]), n_clusters=int(best["k"]))

# Expose the winning configuration under stable names for the profiling and export cells
best_nf, best_k = int(best["nf"]), int(best["k"])
labels_best, W_best, H_best = best["labels"], best["W"], best["H"]
print(f"Best — nf={best_nf}, k={best_k} | silhouette={best['sil']:.3f} | DB={best['db']:.3f}")




=== NMF + K-Means (factor × k sweep) ===
 n_factors  k  sil_cosine  davies_bouldin
        48 18       0.209           2.062
        48 16       0.208           2.065
        48 14       0.203           2.086
        48 12       0.191           2.245
        64 12       0.189           2.274
        64 14       0.188           2.223
        64 16       0.186           2.246
        64 18       0.185           2.281
        48 10       0.182           2.296
        96 10       0.180           2.579
[NMF + K-Means (nf=48, k=18)] SUITABLE — metrics look OK
Best — nf=48, k=18 | silhouette=0.209 | DB=2.062


## 8) Cluster profiling (for best NMF + K‑Means)

In [None]:

# Per-cluster mean of the (un-normalized) NMF user factors; rows = clusters, columns = factors
kept_user_ids = user_ids_kept  # alias
clusters = pd.Series(data=labels_best, index=kept_user_ids).rename("cluster")
user_factors = pd.DataFrame(W_best, index=kept_user_ids)
centroids = user_factors.groupby(clusters).mean()

def top_movies_for_factor(H, factor_idx, top_n=TOP_MOVIES_PER_FACTOR):
    """Return the movies with the largest loadings on one NMF factor.

    Parameters
    ----------
    H : 2-D array indexed as H[factor_idx]; its columns are looked up in the
        module-level ``movie_ids`` array.
    factor_idx : row of ``H`` to inspect.
    top_n : number of movies to return.

    Returns
    -------
    pandas.DataFrame with the movie id column followed by ``title`` and
    ``genres``, ordered from highest to lowest loading. Movies whose id is
    absent from ``movies`` are kept as rows with NaN title/genres.
    """
    idx = np.argsort(H[factor_idx])[::-1][:top_n]
    mids = movie_ids[idx]
    # reindex (unlike .loc with a list) does not raise KeyError on ids missing
    # from `movies`; it yields NaN rows instead, so the ranking stays intact.
    df = movies.reindex(mids)[["title", "genres"]].reset_index()
    return df

# For the first three clusters, list the three factors with the highest mean loading
# and display ten top movies for each of them
for c, mean_loadings in centroids.iloc[:3].iterrows():
    top_f = np.argsort(mean_loadings.to_numpy())[::-1][:3]
    print(f"\nCluster {c} — top factors: {top_f.tolist()}")
    for f in top_f:
        factor_id = int(f)
        print(f"  Factor {factor_id} — sample movies:")
        display(top_movies_for_factor(H_best, factor_id, top_n=10))



Cluster 0 — top factors: [32, 37, 24]
  Factor 32 — sample movies:


Unnamed: 0,movie_id,title,genres
0,3256,Patriot Games (1992),Action|Thriller
1,1610,"Hunt for Red October, The (1990)",Action|Thriller
2,457,"Fugitive, The (1993)",Action|Thriller
3,349,Clear and Present Danger (1994),Action|Adventure|Thriller
4,733,"Rock, The (1996)",Action|Adventure|Thriller
5,1608,Air Force One (1997),Action|Thriller
6,377,Speed (1994),Action|Romance|Thriller
7,2353,Enemy of the State (1998),Action|Thriller
8,1370,Die Hard 2 (1990),Action|Thriller
9,474,In the Line of Fire (1993),Action|Thriller


  Factor 37 — sample movies:


Unnamed: 0,movie_id,title,genres
0,3578,Gladiator (2000),Action|Drama
1,3753,"Patriot, The (2000)",Action|Drama|War
2,3793,X-Men (2000),Action|Sci-Fi
3,3555,U-571 (2000),Action|Thriller
4,3510,Frequency (2000),Drama|Thriller
5,3623,Mission: Impossible 2 (2000),Action|Thriller
6,3408,Erin Brockovich (2000),Drama
7,3755,"Perfect Storm, The (2000)",Action|Adventure|Thriller
8,3624,Shanghai Noon (2000),Action
9,3147,"Green Mile, The (1999)",Drama|Thriller


  Factor 24 — sample movies:


Unnamed: 0,movie_id,title,genres
0,480,Jurassic Park (1993),Action|Adventure|Sci-Fi
1,1580,Men in Black (1997),Action|Adventure|Comedy|Sci-Fi
2,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
3,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
4,780,Independence Day (ID4) (1996),Action|Sci-Fi|War
5,2916,Total Recall (1990),Action|Adventure|Sci-Fi|Thriller
6,1527,"Fifth Element, The (1997)",Action|Sci-Fi
7,2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi
8,316,Stargate (1994),Action|Adventure|Sci-Fi
9,1917,Armageddon (1998),Action|Adventure|Sci-Fi|Thriller



Cluster 1 — top factors: [40, 33, 35]
  Factor 40 — sample movies:


Unnamed: 0,movie_id,title,genres
0,3114,Toy Story 2 (1999),Animation|Children's|Comedy
1,1,Toy Story (1995),Animation|Children's|Comedy
2,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy
3,34,Babe (1995),Children's|Comedy|Drama
4,588,Aladdin (1992),Animation|Children's|Comedy|Musical
5,364,"Lion King, The (1994)",Animation|Children's|Musical
6,595,Beauty and the Beast (1991),Animation|Children's|Musical
7,2761,"Iron Giant, The (1999)",Animation|Children's
8,3751,Chicken Run (2000),Animation|Children's|Comedy
9,2384,Babe: Pig in the City (1998),Children's|Comedy


  Factor 33 — sample movies:


Unnamed: 0,movie_id,title,genres
0,1148,"Wrong Trousers, The (1993)",Animation|Comedy
1,745,"Close Shave, A (1995)",Animation|Comedy|Thriller
2,1223,"Grand Day Out, A (1992)",Animation|Comedy
3,720,Wallace & Gromit: The Best of Aardman Animatio...,Animation
4,3751,Chicken Run (2000),Animation|Children's|Comedy
5,3429,Creature Comforts (1990),Animation|Comedy
6,551,"Nightmare Before Christmas, The (1993)",Children's|Comedy|Musical
7,2291,Edward Scissorhands (1990),Drama|Romance
8,1136,Monty Python and the Holy Grail (1974),Comedy
9,1080,Monty Python's Life of Brian (1979),Comedy


  Factor 35 — sample movies:


Unnamed: 0,movie_id,title,genres
0,1270,Back to the Future (1985),Comedy|Sci-Fi
1,2716,Ghostbusters (1984),Comedy|Horror
2,2797,Big (1988),Comedy|Fantasy
3,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
4,1265,Groundhog Day (1993),Comedy|Romance
5,1307,When Harry Met Sally... (1989),Comedy|Romance
6,2762,"Sixth Sense, The (1999)",Thriller
7,356,Forrest Gump (1994),Comedy|Romance|War
8,2918,Ferris Bueller's Day Off (1986),Comedy
9,2174,Beetlejuice (1988),Comedy|Fantasy



Cluster 2 — top factors: [38, 42, 43]
  Factor 38 — sample movies:


Unnamed: 0,movie_id,title,genres
0,1201,"Good, The Bad and The Ugly, The (1966)",Action|Western
1,2951,"Fistful of Dollars, A (1964)",Action|Western
2,1266,Unforgiven (1992),Western
3,3508,"Outlaw Josey Wales, The (1976)",Western
4,3681,For a Few Dollars More (1965),Western
5,2921,High Plains Drifter (1972),Western
6,2922,Hang 'em High (1967),Western
7,2401,Pale Rider (1985),Western
8,553,Tombstone (1993),Western
9,3671,Blazing Saddles (1974),Comedy|Western


  Factor 42 — sample movies:


Unnamed: 0,movie_id,title,genres
0,858,"Godfather, The (1972)",Action|Crime|Drama
1,1221,"Godfather: Part II, The (1974)",Action|Crime|Drama
2,111,Taxi Driver (1976),Drama|Thriller
3,2023,"Godfather: Part III, The (1990)",Action|Crime|Drama
4,1953,"French Connection, The (1971)",Action|Crime|Drama|Thriller
5,1213,GoodFellas (1990),Crime|Drama
6,3362,Dog Day Afternoon (1975),Comedy|Crime|Drama
7,1193,One Flew Over the Cuckoo's Nest (1975),Drama
8,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi
9,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War


  Factor 43 — sample movies:


Unnamed: 0,movie_id,title,genres
0,1036,Die Hard (1988),Action|Thriller
1,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
2,1200,Aliens (1986),Action|Sci-Fi|Thriller|War
3,1214,Alien (1979),Action|Horror|Sci-Fi|Thriller
4,3527,Predator (1987),Action|Sci-Fi|Thriller
5,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
6,2000,Lethal Weapon (1987),Action|Comedy|Crime|Drama
7,1954,Rocky (1976),Action|Drama
8,1387,Jaws (1975),Action|Horror
9,2194,"Untouchables, The (1987)",Action|Crime|Drama


### Top genres per cluster

In [None]:

# Tag every (user_id, movie_id) rating of a kept user with that user's cluster label;
# the per-cluster genre tally below works from this table
is_kept_user = ratings["user_id"].isin(kept_user_ids)
ratings_kept = ratings.loc[is_kept_user, ["user_id", "movie_id"]].merge(
    clusters.rename("cluster"), how="left", left_on="user_id", right_index=True
)

def top_genres_for_cluster(c, top_n=TOP_GENRES_N):
    """Tally genres over the distinct movies rated by users of cluster ``c``.

    Parameters
    ----------
    c : cluster label, matched against the ``cluster`` column of ``ratings_kept``.
    top_n : number of most frequent genres to return.

    Returns
    -------
    pandas.Series mapping genre → number of distinct movies carrying it,
    sorted descending and truncated to ``top_n``.

    Notes
    -----
    Each movie counts once however many cluster members rated it.
    NOTE(review): the displayed counts for clusters 0–2 are nearly identical,
    so this union-based tally may mostly reflect catalogue composition;
    consider weighting by number of ratings.
    """
    mids = ratings_kept.loc[ratings_kept["cluster"] == c, "movie_id"].unique()
    # reindex yields NaN for ids absent from `movies` (where .loc would raise
    # KeyError); the dropna() below then discards them as intended.
    m = movies["genres"].reindex(mids).dropna().astype(str)
    genre_counts = m.str.get_dummies(sep="|").sum().sort_values(ascending=False).head(top_n)
    return genre_counts

# Display the genre tally for the three smallest cluster labels
first_clusters = sorted(clusters.unique())[:3]
for c in first_clusters:
    print(f"\nTop genres for cluster {c}:")
    genre_table = top_genres_for_cluster(c)
    display(genre_table)



Top genres for cluster 0:


Drama         1019
Comedy         935
Action         468
Thriller       416
Romance        358
Horror         267
Adventure      257
Sci-Fi         247
Children's     230
Crime          162
dtype: int64


Top genres for cluster 1:


Drama         1043
Comedy         964
Action         457
Thriller       421
Romance        365
Horror         296
Sci-Fi         263
Adventure      262
Children's     242
Crime          148
dtype: int64


Top genres for cluster 2:


Drama         1160
Comedy         983
Action         475
Thriller       450
Romance        385
Horror         317
Sci-Fi         265
Adventure      261
Children's     233
Crime          183
dtype: int64

## 9) Save outputs

In [None]:

out_dir = "./outputs_ml1m"
os.makedirs(out_dir, exist_ok=True)

# Cluster assignment of the best NMF + K-Means run, one row per kept user
user_clusters = pd.DataFrame({"user_id": kept_user_ids, "cluster_nmf_kmeans": labels_best})

# File name → table, written in this order (labels first, then the three sweep summaries)
exports = {
    "user_clusters_nmf_kmeans.csv": user_clusters,
    "baseline_kmeans_svd.csv": df_km,
    "baseline_dbscan_svd.csv": df_db,
    "nmf_kmeans_sweep.csv": df_nmf,
}
for fname, table in exports.items():
    table.to_csv(os.path.join(out_dir, fname), index=False)

print("Saved outputs to:", os.path.abspath(out_dir))


Saved outputs to: c:\Users\shuang\Documents\outputs_ml1m
