# 03 – Word2Vec + Clustering
Trains Word2Vec on lemmas, builds document vectors, reduces them with PCA,
clusters them with KMeans and DBSCAN, and evaluates both clusterings
(Silhouette, Davies–Bouldin, Dunn).

In [None]:
import numpy as np
from smclust.data_loader import load_messages
from smclust.preprocessing import preprocess_df
from smclust.embeddings import train_word2vec, doc_vector
from smclust.reduce import pca_reduce
from smclust.cluster import kmeans_cluster, dbscan_cluster
from smclust.metrics import silhouette, davies_bouldin, dunn_index

In [None]:
# One seed shared by the embedding, PCA and KMeans steps.
SEED = 42

# Load the messages and run them through preprocessing.
df = preprocess_df(load_messages())

# Token lists come from the "lemmas" column (presumably one list per message).
tokenized = df["lemmas"].tolist()

# NOTE(review): if train_word2vec forwards these arguments to gensim, a fixed
# seed together with workers=4 does not by itself give reproducible vectors
# (thread scheduling affects training order) -- confirm against the wrapper.
w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4, seed=SEED)
w2v = train_word2vec(tokenized, **w2v_params)

# One vector per token list, stacked row-wise into a single matrix.
doc_vectors = [doc_vector(doc_tokens, w2v) for doc_tokens in tokenized]
X = np.vstack(doc_vectors)

# PCA down to the requested 50 components before clustering.
Xp, pca = pca_reduce(X, n_components=50, random_state=SEED)

# Cluster the reduced vectors two ways; only the first item of each returned
# pair (the labels) is used further down.
labels_km, _ = kmeans_cluster(Xp, n_clusters=8, random_state=SEED)
labels_db, _ = dbscan_cluster(Xp, eps=0.7, min_samples=5)

def _report_scores(name, X, labels):
    """Print the Silhouette, Davies–Bouldin and Dunn scores for one clustering.

    Points labelled -1 are left out before scoring. This assumes the
    clustering helpers follow the scikit-learn convention in which DBSCAN
    marks noise with -1 (TODO confirm against smclust.cluster); scoring noise
    as if it were one ordinary cluster would distort all three metrics.

    If fewer than two clusters remain, or every remaining point is its own
    cluster, the metrics are not computed and a short explanation is printed
    instead, because cluster-validity scores of this kind are not defined
    (and the scikit-learn implementations raise) in that situation.

    Args:
        name: Label printed at the start of the line (e.g. "KMeans").
        X: Feature matrix with one row per point.
        labels: Cluster label per row of ``X``.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    keep = labels != -1
    n_noise = int((~keep).sum())
    n_kept = int(keep.sum())
    n_clusters = int(np.unique(labels[keep]).size)

    if not 2 <= n_clusters < n_kept:
        print(f"{name}  -> not scored: {n_clusters} cluster(s) over "
              f"{n_kept} non-noise point(s), {n_noise} noise point(s)")
        return

    if n_noise:
        print(f"{name}  -> {n_noise} noise point(s) excluded from the scores")

    X_kept, labels_kept = X[keep], labels[keep]
    print(f"{name}  -> sil:", silhouette(X_kept, labels_kept),
          " db:", davies_bouldin(X_kept, labels_kept),
          " dunn:", dunn_index(X_kept, labels_kept))


_report_scores("KMeans", Xp, labels_km)
_report_scores("DBSCAN", Xp, labels_db)