In [1]:
# Notebook: 03_feature_engineering_and_baselines
# Title cell: echoes a one-line summary of what this notebook does.
print(
    "Build TF-IDF text features, reduce with TruncatedSVD, encode genres, run baseline clustering experiments (genres-only, text-only, combined)."
)


Build TF-IDF text features, reduce with TruncatedSVD, encode genres, run baseline clustering experiments (genres-only, text-only, combined).


In [26]:
# Cell 1
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import joblib

# Project locations. The cleaned CSV is read from outputs/, and the same folder
# is (re)created so later cells can write their artifacts into it.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
PROJECT_ROOT = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix")
OUT_DIR = PROJECT_ROOT / "outputs"
CLEANED_IN = OUT_DIR / "cleaned_netflix.csv"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Load the cleaned table and report its dimensions as a quick sanity check
df = pd.read_csv(CLEANED_IN)
print("Loaded cleaned data:", df.shape)


Loaded cleaned data: (7787, 14)


In [27]:
# Cell 2 - Fit TF-IDF on the description text
# Missing descriptions become empty strings so the vectorizer only ever sees str values
corpus = df["description"].fillna("").astype(str).tolist()

# Unigrams and bigrams, English stop words removed, vocabulary capped at 3000 terms
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_features=3000,
)
X_tfidf = tfidf.fit_transform(corpus)
print("TF-IDF matrix shape:", X_tfidf.shape)


TF-IDF matrix shape: (7787, 3000)


In [28]:
# Cell 3 - Reduce the TF-IDF matrix to 40 components with TruncatedSVD
svd = TruncatedSVD(
    n_components=40,
    random_state=42,  # fixed seed so the decomposition is repeatable
)
X_svd = svd.fit_transform(X_tfidf)
print("SVD reduced text shape:", X_svd.shape)


SVD reduced text shape: (7787, 40)


In [29]:
# Cell 4 - Encode genres as a multi-hot matrix (defensive version)
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import chain

# Guarantee that a 'genres_list' column exists: derive it from the comma-separated
# 'listed_in' column when that is available, otherwise use an empty list per row.
if 'genres_list' not in df.columns:
    if 'listed_in' not in df.columns:
        df['genres_list'] = [[] for _ in range(len(df))]
    else:
        listed_in_text = df['listed_in'].fillna("").astype(str)
        df['genres_list'] = listed_in_text.apply(
            lambda s: [x.strip() for x in s.split(",") if x.strip()]
        )

# make sure each entry is a list
def ensure_list_item(x):
    """Coerce one ``genres_list`` cell into a list of genre strings.

    Parameters
    ----------
    x : list, tuple, str or missing value
        A cell value. Lists are returned as-is and tuples are converted to
        lists. Strings are parsed either as a stringified Python list
        (``"['A', 'B']"``) or as a plain comma-separated list (``"A, B"``).
        Missing values (``None``/NaN) and the empty string give ``[]``.

    Returns
    -------
    list
        The genre names, stripped of surrounding whitespace and quotes.
    """
    # Containers must be recognised BEFORE calling pd.isna(): on a list pd.isna()
    # returns an element-wise boolean array, and using that array in an `if`
    # raises ValueError as soon as the list holds two or more items.
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return list(x)
    if pd.isna(x) or x == "":
        return []
    s = str(x).strip()
    if s.startswith('[') and s.endswith(']'):
        # Stringified list (e.g. after a CSV round-trip): drop the brackets, then the quotes
        s = s.strip('[]')
        return [p.strip().strip("'\"") for p in s.split(',') if p.strip()]
    return [p.strip() for p in s.split(',') if p.strip()]

# Normalise every cell of 'genres_list' to a real Python list
df['genres_list'] = df['genres_list'].apply(ensure_list_item)

# Collect the distinct genre labels that occur anywhere in the column
all_genres = sorted({genre for row_genres in df['genres_list'] for genre in row_genres})
print("Unique genres found:", len(all_genres))
if all_genres:
    # One indicator column per genre, returned as a dense array
    mlb = MultiLabelBinarizer(sparse_output=False)
    X_genres = mlb.fit_transform(df['genres_list'])
    print("Genre multi-hot shape:", X_genres.shape)
    # optional: persist the fitted encoder (joblib is imported in Cell 1)
    joblib.dump(mlb, OUT_DIR / "mlb_encoder.joblib")
    print("Saved MultiLabelBinarizer to outputs/mlb_encoder.joblib")
else:
    # Nothing to encode: keep an (n_samples, 0) placeholder so later cells can still stack it
    X_genres = np.zeros((df.shape[0], 0), dtype=float)
    print("Warning: no genres found. X_genres will have shape", X_genres.shape)


Unique genres found: 42
Genre multi-hot shape: (7787, 42)
Saved MultiLabelBinarizer to outputs/mlb_encoder.joblib


In [30]:
# Cell 5 - Standardise the numeric columns
num_cols = ["duration_num", "release_year"]

# Fill missing values with each column's median before scaling
column_medians = df[num_cols].median()
num_df = df[num_cols].fillna(column_medians)

scaler = StandardScaler()
X_num = scaler.fit_transform(num_df)
print("Numeric features scaled shape:", X_num.shape)


Numeric features scaled shape: (7787, 2)


In [37]:
# Cell 6 (robust: combine features and save X_combined.npy safely)
import numpy as np
from pathlib import Path
import os
import scipy.sparse as sp

# Destination for the combined feature matrix (the folder is created if missing)
OUT_DIR = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)
save_path = OUT_DIR / "X_combined.npy"

# X_svd, X_genres and X_num are expected to exist from earlier cells.
# Report the type and shape of each one before stacking them.
print("Types/shapes before combine:")
for label, feature_block in (
    (" - X_svd:", X_svd),
    (" - X_genres:", X_genres),
    (" - X_num:", X_num),
):
    print(label, type(feature_block), getattr(feature_block, "shape", None))

# Convert sparse inputs to dense if needed (but check memory)
def to_dense_if_sparse(arr, max_bytes=1_000_000_000):
    """Return ``arr`` as a dense NumPy array, refusing oversized sparse conversions.

    Parameters
    ----------
    arr : scipy sparse matrix or array-like
        Input to densify. Non-sparse inputs are passed through ``np.asarray``.
    max_bytes : int, default 1_000_000_000
        Largest dense size (in bytes) a sparse input may expand to.

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    MemoryError
        If the dense form of a sparse input would need more than ``max_bytes``.
    """
    if not sp.issparse(arr):
        return np.asarray(arr)

    # Size the dense result from the matrix's own dtype instead of assuming float64,
    # so float32 / integer matrices are not rejected for memory they would never use.
    n_bytes = int(np.prod(arr.shape, dtype=np.int64)) * arr.dtype.itemsize
    print(f" - Detected sparse matrix, approx bytes if dense: {n_bytes:,}")
    if n_bytes > max_bytes:
        raise MemoryError(f"Converting sparse -> dense would use ~{n_bytes:,} bytes which exceeds threshold.")
    return arr.toarray()

# Densify each feature block; if that would need too much memory, print guidance and re-raise
try:
    X_svd_arr = to_dense_if_sparse(X_svd)
    X_genres_arr = to_dense_if_sparse(X_genres)
    X_num_arr = to_dense_if_sparse(X_num)
except MemoryError as mem_err:
    print("MemoryError:", mem_err)
    print("Try reducing feature sizes (e.g. fewer SVD components / fewer TF-IDF features) or save to disk incrementally.")
    raise

# Bring every block to 2-D and check that its row count matches X_svd's
checked_blocks = {}
for name, arr in (("X_svd", X_svd_arr), ("X_genres", X_genres_arr), ("X_num", X_num_arr)):
    arr = np.asarray(arr)
    # a 1-D vector is treated as a single feature column
    arr = arr.reshape(-1, 1) if arr.ndim == 1 else arr
    if arr.shape[0] != X_svd_arr.shape[0]:
        raise ValueError(f"Row count mismatch: {name}.rows={arr.shape[0]} vs X_svd.rows={X_svd_arr.shape[0]}")
    checked_blocks[name] = arr
    if name == "X_svd":
        # keep the reference block in its 2-D form for the remaining comparisons
        X_svd_arr = arr
X_genres_arr = checked_blocks["X_genres"]
X_num_arr = checked_blocks["X_num"]

# Stack the blocks side by side: text components, then genres, then numeric features
X_combined = np.concatenate([X_svd_arr, X_genres_arr, X_num_arr], axis=1)
print("✅ Combined feature matrix shape:", X_combined.shape)

# The on-disk copy uses float32 to keep the file small; X_combined itself is left unchanged
if not np.issubdtype(X_combined.dtype, np.floating):
    # non-float data: attempt the cast, and fall back to the existing dtype with a warning
    try:
        X_to_save = X_combined.astype(np.float32)
    except Exception:
        X_to_save = X_combined
        print("Warning: X_combined could not be cast to float32; it will be saved with existing dtype:", X_combined.dtype)
else:
    X_to_save = X_combined.astype(np.float32, copy=False)

# --- Save robustly using a string path and file handle ---
def safe_save_numpy(arr, path_obj):
    """Save ``arr`` to ``path_obj`` in .npy format without leaving a corrupt file behind.

    The array is first written to a sibling ``<path>.tmp`` file and only moved
    onto the destination once the write has fully succeeded. A failed save
    therefore never truncates or corrupts a file that already exists at
    ``path_obj``, and the temporary file is removed on failure.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to save (pickling is disabled, so object arrays are rejected).
    path_obj : str or pathlib.Path
        Destination file path.

    Returns
    -------
    bool
        True if the file was written, False if an OSError occurred (the error
        is printed so the caller can try another location).

    Raises
    ------
    Exception
        Any non-OSError raised by ``np.save`` (e.g. ValueError for object
        arrays) is propagated after the temporary file has been cleaned up.
    """
    path_str = str(path_obj)
    tmp_str = path_str + ".tmp"

    def _discard_tmp():
        # Best-effort cleanup: the temp file may never have been created
        try:
            os.remove(tmp_str)
        except OSError:
            pass

    try:
        # use a file handle — more robust on Windows/OneDrive
        with open(tmp_str, "wb") as f:
            np.save(f, arr, allow_pickle=False)
        # os.replace overwrites an existing destination in a single step
        os.replace(tmp_str, path_str)
    except OSError as e:
        _discard_tmp()
        print("OSError while saving to", path_str, ":", e)
        return False
    except Exception:
        _discard_tmp()
        raise
    print("✅ Saved numpy array to:", path_str)
    return True

# First attempt: the project outputs folder
success = safe_save_numpy(X_to_save, save_path)

# If that failed (OneDrive path issue), retry once in a local temp folder
if not success:
    fallback_dir = Path("C:/temp")
    fallback_dir.mkdir(parents=True, exist_ok=True)
    fallback_path = fallback_dir / "X_combined.npy"
    print("Attempting fallback save to:", fallback_path)
    success2 = safe_save_numpy(X_to_save, fallback_path)
    if not success2:
        # neither location is writable: stop here instead of continuing without the file
        raise OSError("Failed to save X_combined.npy to both primary and fallback locations.")
    print("Saved to fallback path. Consider using a local path instead of OneDrive to avoid sync/lock issues.")


Types/shapes before combine:
 - X_svd: <class 'numpy.ndarray'> (7787, 40)
 - X_genres: <class 'numpy.ndarray'> (7787, 42)
 - X_num: <class 'numpy.ndarray'> (7787, 2)
✅ Combined feature matrix shape: (7787, 84)
OSError while saving to C:\Users\KIIT\OneDrive\Documents\Labmentix\netflix\outputs\X_combined.npy : [Errno 22] Invalid argument: 'C:\\Users\\KIIT\\OneDrive\\Documents\\Labmentix\\netflix\\outputs\\X_combined.npy'
Attempting fallback save to: C:\temp\X_combined.npy
✅ Saved numpy array to: C:\temp\X_combined.npy
Saved to fallback path. Consider using a local path instead of OneDrive to avoid sync/lock issues.


In [36]:
# Cell 7 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Baseline clustering: fit KMeans on each candidate feature set and record its
# silhouette score in `baseline_results` (None when a baseline had to be skipped).
import scipy.sparse as sp  # hoisted out of the loop; only used for the sparse check

BASELINE_K = 5                  # number of clusters tried for every baseline
MAX_DENSE_ENTRIES = 5_000_000   # sparse inputs are densified only below this many entries

baseline_results = {}
# Prepare candidate feature sets
candidates = [
    ("text", X_svd),
    ("genres", X_genres),
    ("combined", X_combined)
]

for name, X in candidates:
    # Anything without a .shape (e.g. a nested list) is converted to an array first
    if not hasattr(X, "shape"):
        X = np.asarray(X)
    n_features = X.shape[1]
    n_samples = X.shape[0]

    if n_features == 0:
        print(f"SKIP baseline '{name}': feature matrix has 0 columns (shape={X.shape}).")
        baseline_results[name] = None
        continue

    # silhouette_score needs between 2 and n_samples - 1 clusters, so fewer than
    # 3 samples can never produce a valid score; skip instead of crashing in KMeans
    if n_samples < 3:
        print(f"SKIP baseline '{name}': need at least 3 samples, got {n_samples}.")
        baseline_results[name] = None
        continue

    # Small sparse inputs are densified; large ones are passed through as-is
    if sp.issparse(X) and X.shape[0] * X.shape[1] <= MAX_DENSE_ENTRIES:
        X_dense = X.toarray()
    else:
        X_dense = X

    # Fit KMeans with k=BASELINE_K (ensure k < n_samples)
    k_try = BASELINE_K
    if n_samples <= k_try:
        k_try = max(2, n_samples // 2)
        print(f"Adjusted k for '{name}' to {k_try} because n_samples={n_samples}")

    # n_init is pinned because its default differs between scikit-learn releases;
    # an explicit value keeps the baselines comparable across environments
    km = KMeans(n_clusters=k_try, n_init=10, random_state=42)
    labels = km.fit_predict(X_dense)

    # Validate that we have at least 2 unique clusters for silhouette
    unique_labels = np.unique(labels)
    if unique_labels.size < 2:
        print(f"SKIP silhouette for '{name}': only {unique_labels.size} unique label(s) found.")
        baseline_results[name] = None
        continue

    try:
        score = silhouette_score(X_dense, labels)
        baseline_results[name] = float(score)
        print(f"Silhouette [{name}] (k={k_try}): {score:.4f}")
    except Exception as e:
        # Best effort: report the failure for this baseline and carry on with the others
        print(f"Could not compute silhouette for '{name}': {e}")
        baseline_results[name] = None

print("\nBaseline results summary:", baseline_results)


Silhouette [text] (k=5): 0.1540
Silhouette [genres] (k=5): 0.1863
Silhouette [combined] (k=5): 0.1855

Baseline results summary: {'text': 0.1539922302062514, 'genres': 0.186302179232272, 'combined': 0.18551902271399665}


In [35]:
# Cell 8 - Persist the fitted transformers to OUT_DIR
joblib.dump(tfidf, OUT_DIR / "tfidf_vectorizer.joblib")
joblib.dump(svd, OUT_DIR / "svd_transformer.joblib")
# Cell 4 fits `mlb` only when at least one genre was found; in the "no genres"
# case X_genres has zero columns and `mlb` does not exist, so dumping it
# unconditionally would raise NameError.
mlb_available = X_genres.shape[1] > 0
if mlb_available:
    joblib.dump(mlb, OUT_DIR / "mlb_encoder.joblib")
joblib.dump(scaler, OUT_DIR / "scaler.joblib")
if mlb_available:
    print("Saved TF-IDF, SVD, MLBin, and scaler to outputs/")
else:
    print("Saved TF-IDF, SVD, and scaler to outputs/ (no genres found, so no MultiLabelBinarizer was saved)")


Saved TF-IDF, SVD, MLBin, and scaler to outputs/
