# 02 – TF-IDF + Clustering
Vectorizes the cleaned text with TF-IDF, reduces dimensionality with PCA, clusters the
result with KMeans and DBSCAN, and prints the silhouette, Davies–Bouldin, and Dunn index for each.

In [3]:
# Ensure we can import from project/src no matter where the notebook is opened
import sys, os
from pathlib import Path

# climb up until we find the project root (folder that contains "src/smclust")
ROOT = Path.cwd()
while ROOT != ROOT.parent and not (ROOT / "src" / "smclust").exists():
    ROOT = ROOT.parent

sys.path.insert(0, str(ROOT / "src"))
print("Project root:", ROOT)
print("In sys.path:", ROOT / "src")

# optional: auto-reload modules so edits in src/ show up without restarting kernel
%load_ext autoreload
%autoreload 2


Project root: /Users/muhammadsalmanmalik/Library/CloudStorage/OneDrive-UniversitätMünster/ml_projects/social-media-clustering
In sys.path: /Users/muhammadsalmanmalik/Library/CloudStorage/OneDrive-UniversitätMünster/ml_projects/social-media-clustering/src


In [4]:
import numpy as np
from smclust.data_loader import load_messages
from smclust.preprocessing import preprocess_df
from smclust.vectorize import tfidf_features
from smclust.reduce import pca_reduce
from smclust.cluster import kmeans_cluster, dbscan_cluster
from smclust.metrics import silhouette, davies_bouldin, dunn_index

In [5]:
# Load the messages and run the project's preprocessing step; the
# "text_clean" column read below is expected to exist afterwards.
df = load_messages()
df = preprocess_df(df)

# TF-IDF over unigrams and bigrams, capped at 5000 features.
X_sparse, vec = tfidf_features(
    df["text_clean"],
    max_features=5000,
    ngram_range=(1, 2),
)
# Convert to a dense array before PCA (X_sparse is presumably a SciPy sparse
# matrix -- confirm against smclust.vectorize).
X = X_sparse.toarray()

# Project onto 50 components; the seed is fixed for reproducibility.
Xp, pca = pca_reduce(X, n_components=50, random_state=42)

# Cluster the reduced features with both algorithms. The second return value
# of each helper is not needed here.
labels_km, _ = kmeans_cluster(Xp, n_clusters=8, random_state=42)
labels_db, _ = dbscan_cluster(Xp, eps=0.7, min_samples=5)

# Report the same three metrics for each labelling, KMeans first.
# NOTE(review): if dbscan_cluster follows the scikit-learn convention, noise
# points carry the label -1 and would be scored as a cluster of their own --
# confirm how smclust.metrics treats them.
for algo_name, algo_labels in (("KMeans", labels_km), ("DBSCAN", labels_db)):
    print(f"{algo_name}  -> sil:", silhouette(Xp, algo_labels),
          " db:", davies_bouldin(Xp, algo_labels),
          " dunn:", dunn_index(Xp, algo_labels))

: 