In [5]:
# ============================ HW07 – полный код ============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import (
    silhouette_score,
    davies_bouldin_score,
    calinski_harabasz_score,
    adjusted_rand_score as ARI,
)
import json, os

# ------------------------------------------------------------------
# 1. Create only the required sub-folders next to the notebook
# ------------------------------------------------------------------
ARTIFACTS = "artifacts"
# exist_ok=True keeps re-running the cell from failing on existing folders.
for subdir in ("labels", "figures"):
    os.makedirs(f"{ARTIFACTS}/{subdir}", exist_ok=True)

# ------------------------------------------------------------------
# 2. Dataset paths (the data folder is expected to exist already)
# ------------------------------------------------------------------
# Maps the short dataset key ("ds1".."ds3") to its CSV path.
DATASETS = {
    f"ds{idx}": f"data/S07-hw-dataset-{idx:02d}.csv"
    for idx in range(1, 4)
}

# ------------------------------------------------------------------
# 3. Утилиты
# ------------------------------------------------------------------
def load_and_preprocess(path: str):
    """Load a CSV and return ``(X_scaled, sample_id)``.

    Every column except ``sample_id`` is treated as a feature: missing values
    are replaced by the column mean and the result is standardized with
    ``StandardScaler``.

    Args:
        path: Path to the CSV file; it must contain a ``sample_id`` column.

    Returns:
        Tuple of the scaled feature array and the ``sample_id`` values.
    """
    frame = pd.read_csv(path)
    features = frame.drop(columns=["sample_id"])
    imputed = SimpleImputer(strategy="mean").fit_transform(features)
    scaled = StandardScaler().fit_transform(imputed)
    return scaled, frame["sample_id"].values


def cluster_metrics(X, labels):
    """Compute internal clustering quality metrics for one labelling.

    Args:
        X: Feature matrix handed to the scikit-learn metric functions.
        labels: Cluster label of each row of ``X``.

    Returns:
        Dict with the keys ``"silhouette"``, ``"davies_bouldin"`` and
        ``"calinski_harabasz"``. The values are floats, or all ``None`` when
        the labelling is degenerate and the metrics are undefined.
    """
    n_samples = len(labels)
    n_labels = len(np.unique(labels))
    # The scikit-learn metrics raise ValueError unless
    # 2 <= n_labels <= n_samples - 1, so a single cluster, "every point is
    # its own cluster" and empty input are all reported as undefined.
    if not 2 <= n_labels <= n_samples - 1:
        return {"silhouette": None, "davies_bouldin": None, "calinski_harabasz": None}
    return {
        "silhouette": float(silhouette_score(X, labels)),
        "davies_bouldin": float(davies_bouldin_score(X, labels)),
        "calinski_harabasz": float(calinski_harabasz_score(X, labels)),
    }


def save_labels(sample_id, labels, suffix):
    """Write the cluster labels to ``<ARTIFACTS>/labels/labels_<suffix>.csv``.

    The CSV has two columns, ``sample_id`` and ``cluster_label``, and no
    index column.
    """
    table = pd.DataFrame({"sample_id": sample_id, "cluster_label": labels})
    out_path = f"{ARTIFACTS}/labels/labels_{suffix}.csv"
    table.to_csv(out_path, index=False)


def save_fig(name):
    """Save the current matplotlib figure as a 300-dpi PNG and close it.

    The file is written to ``<ARTIFACTS>/figures/<name>.png``.
    """
    target = f"{ARTIFACTS}/figures/{name}.png"
    plt.savefig(target, dpi=300)
    plt.close()


# ------------------------------------------------------------------
# 4. Load the data
# ------------------------------------------------------------------
# data[name] holds the preprocessed feature matrix and the matching ids.
data = {}
for name in DATASETS:
    features, ids = load_and_preprocess(DATASETS[name])
    data[name] = {"X": features, "sample_id": ids}

# ------------------------------------------------------------------
# 5. KMeans (k = 2..20)
# ------------------------------------------------------------------
# For every dataset, fit KMeans for each k and keep the run with the highest
# silhouette score; its labels are saved to disk.
kmeans_res = {}
for name, entry in data.items():
    X = entry["X"]
    candidates = []
    for k in range(2, 21):
        model = KMeans(n_clusters=k, n_init=10, random_state=42)
        labels = model.fit_predict(X)
        candidates.append(
            {"k": k, "labels": labels, "metrics": cluster_metrics(X, labels)}
        )
    best = max(candidates, key=lambda cand: cand["metrics"]["silhouette"])
    kmeans_res[name] = best
    save_labels(entry["sample_id"], best["labels"], f"{name}_kmeans")

# ------------------------------------------------------------------
# 6. DBSCAN (eps 0.1-1.0, min_samples 5, 8, 12)
# ------------------------------------------------------------------
# For every dataset, grid-search eps/min_samples and keep the configuration
# with the highest silhouette score. Datasets where no configuration yields
# at least two clusters get no entry in dbscan_res.
#
# The eps grid is built with linspace + rounding instead of a float-step
# arange: arange accumulates rounding error (e.g. 0.30000000000000004), and
# those values would otherwise end up in best_configs.json. Plain Python
# floats are stored so the config serializes cleanly.
eps_grid = [round(float(value), 1) for value in np.linspace(0.1, 1.0, 10)]

dbscan_res = {}
for name in data:
    X = data[name]["X"]
    res = []
    for eps in eps_grid:
        for ms in (5, 8, 12):
            labels = DBSCAN(eps=eps, min_samples=ms).fit_predict(X)
            mask = labels != -1  # DBSCAN marks noise points with -1
            # Need at least two real clusters for the metrics to be defined.
            if len(np.unique(labels[mask])) < 2:
                continue
            # NOTE: metrics are computed on non-noise points only, so they
            # are not directly comparable with the KMeans/Agglomerative ones.
            metrics = cluster_metrics(X[mask], labels[mask])
            if metrics["silhouette"] is None:
                # Undefined score: skip so max() never compares None.
                continue
            res.append({
                "eps": eps,
                "min_samples": ms,
                "labels": labels,
                "noise_frac": float(1 - mask.mean()),
                "metrics": metrics,
            })
    if res:
        best = max(res, key=lambda x: x["metrics"]["silhouette"])
        dbscan_res[name] = best
        save_labels(data[name]["sample_id"], best["labels"], f"{name}_dbscan")

# ------------------------------------------------------------------
# 7. Agglomerative (k = 2..10, linkage ward/complete/average)
# ------------------------------------------------------------------
# For every dataset, try each (k, linkage) pair and keep the one with the
# highest silhouette score; its labels are saved to disk.
agg_res = {}
for name, entry in data.items():
    X = entry["X"]
    candidates = []
    for k in range(2, 11):
        for link in ("ward", "complete", "average"):
            model = AgglomerativeClustering(n_clusters=k, linkage=link)
            labels = model.fit_predict(X)
            candidates.append({
                "k": k,
                "linkage": link,
                "labels": labels,
                "metrics": cluster_metrics(X, labels),
            })
    best = max(candidates, key=lambda cand: cand["metrics"]["silhouette"])
    agg_res[name] = best
    save_labels(entry["sample_id"], best["labels"], f"{name}_agg")

# ------------------------------------------------------------------
# 8. PCA 2D visualizations
# ------------------------------------------------------------------
# One scatter plot per (dataset, algorithm), coloured by the best labelling.
for name, entry in data.items():
    projected = PCA(n_components=2, random_state=42).fit_transform(entry["X"])
    pc1, pc2 = projected[:, 0], projected[:, 1]
    per_algo = {
        "kmeans": kmeans_res[name],
        "dbscan": dbscan_res.get(name, {}),  # empty if DBSCAN found nothing
        "agg": agg_res[name],
    }
    for algo, result in per_algo.items():
        if result:
            plt.figure(figsize=(6, 5))
            plt.scatter(pc1, pc2, c=result["labels"], s=15, cmap="tab10")
            plt.title(f"{name.upper()} – {algo.upper()} – PCA(2D)")
            plt.xlabel("PC1")
            plt.ylabel("PC2")
            plt.grid(alpha=0.3)
            save_fig(f"{name}_{algo}_pca")

# ------------------------------------------------------------------
# 9. KMeans stability (ds1, 5 seeds)
# ------------------------------------------------------------------
# Refit KMeans with the best k under five seeds and compare every pair of
# labellings with the adjusted Rand index.
X = data["ds1"]["X"]
best_k = kmeans_res["ds1"]["k"]
labels_list = [
    KMeans(n_clusters=best_k, n_init=10, random_state=seed).fit_predict(X)
    for seed in range(5)
]
ari_vals = [
    ARI(first, second)
    for pos, first in enumerate(labels_list)
    for second in labels_list[pos + 1:]
]
stability = {
    "mean_ari": float(np.mean(ari_vals)),
    "std_ari": float(np.std(ari_vals)),
}

# ------------------------------------------------------------------
# 10. Summary JSON files
# ------------------------------------------------------------------
# metrics_summary: per-dataset metrics of the best run of each algorithm.
# best_configs: the hyper-parameters that produced those runs.
metrics_summary = {}
best_configs = {}
for name in data:
    dbscan_best = dbscan_res.get(name, {})  # empty if DBSCAN found nothing
    metrics_summary[name] = {
        "kmeans": kmeans_res[name]["metrics"],
        "dbscan": dbscan_best.get("metrics", {}),
        "agg": agg_res[name]["metrics"],
    }
    best_configs[name] = {
        "kmeans": {"k": kmeans_res[name]["k"]},
        "dbscan": {
            key: dbscan_best[key]
            for key in ("eps", "min_samples", "noise_frac")
            if key in dbscan_best
        },
        "agg": {"k": agg_res[name]["k"], "linkage": agg_res[name]["linkage"]},
    }

for filename, payload in (
    ("metrics_summary.json", metrics_summary),
    ("best_configs.json", best_configs),
):
    with open(f"{ARTIFACTS}/{filename}", "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

# ------------------------------------------------------------------
# 11. report.md (UTF-8)
# ------------------------------------------------------------------
# Describe each dataset from the arrays that were actually clustered rather
# than from hard-coded numbers, so the report cannot drift from the data.
# The shape is that of the feature matrix after pre-processing (the
# sample_id column is not part of it).
dataset_lines = "\n".join(
    f"- {name}: {data[name]['X'].shape[0]}×{data[name]['X'].shape[1]}, "
    "numeric features (after pre-processing)"
    for name in data
)

with open("report.md", "w", encoding="utf-8") as f:
    # The protocol section lists mean imputation as well, because
    # load_and_preprocess applies it before scaling.
    f.write(
        f"""# HW07 Report

## 1. Datasets
{dataset_lines}

## 2. Protocol
- Pre-processing: SimpleImputer(strategy="mean"), then StandardScaler
- KMeans: k = 2..20
- DBSCAN: eps = 0.1..1.0, min_samples = 5, 8, 12
- Agglomerative: k = 2..10, linkage ∈ {{"ward", "complete", "average"}}

## 3. Models
See `best_configs.json`

## 4. Results
See `metrics_summary.json`

## 5. Analysis
KMeans stability (ds1): mean ARI = {stability['mean_ari']:.3f} ± {stability['std_ari']:.3f}
DBSCAN captures non-spherical clusters; Agglomerative competitive on spherical data

## 6. Conclusion
Feature scaling is essential for KMeans; DBSCAN is sensitive to eps/min_samples; PCA-2D is sufficient for visualization
"""
    )

print("Готово – все артефакты сохранены в папке artifacts и report.md")

Готово – все артефакты сохранены в папке artifacts и report.md
