# Global Cyberattack Pattern Analysis — Data Mining (Anaconda Edition)

**Notebook:** 05_unsupervised_clustering.ipynb — KMeans + PCA visualization

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Directory that receives the figure written at the end of this cell.
os.makedirs("reports/unsupervised", exist_ok=True)

# Load without `parse_dates`: read_csv raises ValueError when a column named in
# `parse_dates` is missing, which would make the column check below pointless.
df = pd.read_csv("data/cisa_kev.csv")
if "DateAdded" in df.columns:
    # Coerce unparseable dates to NaT so the derived calendar features become
    # NaN (filled later by the median imputer) instead of `.dt` failing on an
    # object column.
    df["DateAdded"] = pd.to_datetime(df["DateAdded"], errors="coerce", dayfirst=False)
    df["year_added"] = df["DateAdded"].dt.year
    df["month_added"] = df["DateAdded"].dt.month

# Keep only the candidate features that actually exist in this extract.
features = [c for c in ["VendorProject","Product","CWE_ID","year_added","month_added"] if c in df.columns]
if not features:
    raise ValueError(
        "None of the expected feature columns were found in data/cisa_kev.csv"
    )
X = df[features]

# Split by "is numeric" rather than `dtype == "object"` so that pandas string
# dtypes are still routed to the categorical branch.
num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
cat_cols = [c for c in X.columns if c not in num_cols]
prep = ColumnTransformer([
    # Categorical: fill gaps with the mode, then one-hot encode.
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat_cols),
    # Numeric: fill gaps with the median, then standardize. KMeans is
    # Euclidean-distance based, so unscaled year/month values would otherwise
    # outweigh the 0/1 one-hot indicators and drive the clustering alone.
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")),
                      ("scale", StandardScaler())]), num_cols),
])

# Z may be a scipy sparse matrix when the one-hot block dominates.
Z = prep.fit_transform(X)
kmeans = KMeans(n_clusters=3, random_state=42, n_init="auto")
clusters = kmeans.fit_predict(Z)
sil = silhouette_score(Z, clusters)
print("Silhouette:", sil)

# PCA needs a dense array, so densify only when the transformer returned sparse.
pca = PCA(n_components=2, random_state=42)
Z2 = pca.fit_transform(Z.toarray() if hasattr(Z, "toarray") else Z)

# 2-D projection of the samples, colored by assigned cluster.
plt.figure()
plt.scatter(Z2[:,0], Z2[:,1], c=clusters)
plt.title("KMeans Clusters (PCA)")
plt.tight_layout()
plt.savefig("reports/unsupervised/kmeans_pca.png", dpi=200)