In [1]:
# 03_feature_engineering_and_baselines
print("Build TF-IDF text features, reduce with TruncatedSVD, encode genres, run baseline clustering experiments (genres-only, text-only, combined).")


Build TF-IDF text features, reduce with TruncatedSVD, encode genres, run baseline clustering experiments (genres-only, text-only, combined).


In [2]:
# Cell 2: imports and load cleaned csv (with optional sampling)
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import joblib

RANDOM_STATE = 42
CLEANED_CSV = Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/cleaned_netflix.csv")
assert CLEANED_CSV.exists(), "Run Notebook 02 first to produce outputs/cleaned_netflix.csv"

df = pd.read_csv(CLEANED_CSV)

# If genres_list is stored as string (rare), convert safely
def ensure_list(x):
    if pd.isna(x) or x == "":
        return []
    if isinstance(x, list):
        return x
    s = str(x)
    if s.startswith('[') and s.endswith(']'):
        s2 = s.strip('[]')
        return [p.strip().strip("'\"") for p in s2.split(',') if p.strip()]
    return [p.strip() for p in s.split(',') if p.strip()]

df['genres_list'] = df.get('genres_list', "").apply(ensure_list)
print("Working rows:", len(df))

# SAMPLE_SIZE: set integer to sample for speed (None uses full dataset)
SAMPLE_SIZE = None
if SAMPLE_SIZE is not None and len(df) > SAMPLE_SIZE:
    df = df.sample(n=SAMPLE_SIZE, random_state=RANDOM_STATE).reset_index(drop=True)
    print("Sampled to", len(df))


Working rows: 7787


In [4]:
# Cell 3: TF-IDF + TruncatedSVD
descriptions = df['description'].fillna("").astype(str)

tfidf = TfidfVectorizer(max_df=0.80, min_df=5, max_features=3000, ngram_range=(1,2))
X_text = tfidf.fit_transform(descriptions)
print("TF-IDF shape:", X_text.shape)

n_svd = 50 if X_text.shape[0] > 200 else min(20, X_text.shape[1])
svd = TruncatedSVD(n_components=n_svd, random_state=RANDOM_STATE)
X_text_red = svd.fit_transform(X_text)
print("Reduced text shape:", X_text_red.shape)

# Save vectorizers
Path("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs").mkdir(exist_ok=True)
joblib.dump(tfidf, "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/tfidf.joblib")
joblib.dump(svd, "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/svd.joblib")


TF-IDF shape: (7787, 3000)
Reduced text shape: (7787, 50)


['C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/svd.joblib']

In [5]:
# Cell 4: genres matrix & numeric features
mlb = MultiLabelBinarizer(sparse_output=False)
X_genres = mlb.fit_transform(df['genres_list'])
print("Genres shape:", X_genres.shape)
joblib.dump(mlb, "C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/mlb.joblib")

duration = df.get('duration_num', pd.Series([np.nan]*len(df))).fillna(-1).values.reshape(-1,1)
release_year = pd.to_numeric(df.get('release_year', pd.Series([np.nan]*len(df))), errors='coerce').fillna(-1).values.reshape(-1,1)

# Combine: text + genres + numeric
X_combined = np.hstack([X_text_red, X_genres, duration, release_year])
print("Combined feature shape:", X_combined.shape)


Genres shape: (7787, 42)
Combined feature shape: (7787, 94)


In [6]:
# Cell 5: baseline clustering (k=5) for genres-only, text-only, combined
def run_kmeans(X, k=5):
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels = km.fit_predict(X)
    s = silhouette_score(X, labels) if len(set(labels))>1 else float('nan')
    return labels, s

results = {}
if X_genres.shape[1] > 0:
    lab_g, s_g = run_kmeans(X_genres, k=5)
    results['genres_only'] = s_g
else:
    results['genres_only'] = None

lab_t, s_t = run_kmeans(X_text_red, k=5)
lab_c, s_c = run_kmeans(X_combined, k=5)

results['text_only'] = s_t
results['combined'] = s_c
print("Baseline silhouette scores:", results)

# attach baseline labels to df for inspection
df['cluster_genres_baseline'] = lab_g if 'lab_g' in locals() else -1
df['cluster_text_baseline'] = lab_t
df['cluster_combined_baseline'] = lab_c


Baseline silhouette scores: {'genres_only': 0.20229962160557366, 'text_only': 0.04889988392435852, 'combined': 0.5844546759233296}


In [7]:
# Cell 6: inspect a few examples per combined baseline cluster
print("Combined baseline samples per cluster:")
for c in sorted(df['cluster_combined_baseline'].unique()):
    print("Cluster", c, df[df['cluster_combined_baseline']==c]['title'].dropna().head(6).tolist())


Combined baseline samples per cluster:
Cluster 0 ['3%', '46', '1983', '1994', 'Feb-09', '\u200bSAINT SEIYA: Knights of the Zodiac']
Cluster 1 ['21', '187', '706', '15-Aug', '\u200b\u200bKuch Bheege Alfaaz', '\u200bGoli Soda 2']
Cluster 2 ['23:59', '#Rucker50', '100 Things to do Before High School', "100 Years: One Woman's Fight for Justice", '13TH: A Conversation with Oprah Winfrey & Ava DuVernay', '27: Gone Too Soon']
Cluster 3 ['1920', 'Oct-01', '22-Jul', '2 States', '3 Idiots', '7 Khoon Maaf']
Cluster 4 ['7:19', '9', '122', '1922', '2,215', '3022']


In [8]:
# Cell 7: save features & labels for downstream notebooks
import numpy as np
np.save("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/X_combined.npy", X_combined)
df.to_csv("C:/Users/KIIT/OneDrive/Documents/Labmentix/netflix/outputs/with_baseline_labels.csv", index=False)
print("Saved outputs/X_combined.npy and outputs/with_baseline_labels.csv")


Saved outputs/X_combined.npy and outputs/with_baseline_labels.csv
