In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer  # ← Это правильный путь для новых версий
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score,
    adjusted_rand_score
)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings("ignore")

# Установим seed для воспроизводимости
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Пути
DATA_DIR = "data"
ARTIFACTS_DIR = "artifacts"
FIGURES_DIR = os.path.join(ARTIFACTS_DIR, "figures")
LABELS_DIR = os.path.join(ARTIFACTS_DIR, "labels")

os.makedirs(FIGURES_DIR, exist_ok=True)
os.makedirs(LABELS_DIR, exist_ok=True)

# Список выбранных датасетов
datasets = {
    "ds1": "S07-hw-dataset-01.csv",
    "ds2": "S07-hw-dataset-02.csv",
    "ds3": "S07-hw-dataset-03.csv",
}

# Для артефактов
metrics_summary = {}
best_configs = {}
all_labels = {}

# Вспомогательные функции
def save_plot(path):
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

def plot_pca(X, labels, title, path):
    pca = PCA(n_components=2, random_state=RANDOM_STATE)
    X_pca = pca.fit_transform(X)
    plt.figure(figsize=(6, 5))
    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=labels, palette="tab10", s=30)
    plt.title(title)
    save_plot(path)

def evaluate_clustering(X, labels, name, dataset_key, noise_ratio=None):
    if len(np.unique(labels)) < 2:
        print(f"[{name}] Только один кластер — метрики не применимы.")
        return None
    
    # Учитываем шум только для DBSCAN
    mask = labels != -1 if noise_ratio is not None else np.ones(len(labels), dtype=bool)
    X_clean = X[mask]
    labels_clean = labels[mask]

    sil = silhouette_score(X_clean, labels_clean) if len(np.unique(labels_clean)) > 1 else -1
    ch = calinski_harabasz_score(X_clean, labels_clean) if len(np.unique(labels_clean)) > 1 else -1
    db = davies_bouldin_score(X_clean, labels_clean) if len(np.unique(labels_clean)) > 1 else np.inf

    metrics = {
        "silhouette": float(sil),
        "calinski_harabasz": float(ch),
        "davies_bouldin": float(db),
    }
    if noise_ratio is not None:
        metrics["noise_ratio"] = float(noise_ratio)

    print(f"{name} → Sil: {sil:.3f}, CH: {ch:.1f}, DB: {db:.3f}" + 
          (f", Noise: {noise_ratio:.2%}" if noise_ratio is not None else ""))
    
    return metrics

# Основной цикл по датасетам
for ds_key, filename in datasets.items():
    print(f"\n{'='*60}\nОбработка {ds_key}: {filename}\n{'='*60}")
    
    # --- 2.3.1: Загрузка и EDA ---
    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    sample_id = df["sample_id"]
    X_raw = df.drop(columns=["sample_id"])
    
    print("→ head():")
    display(df.head(3))
    print("\n→ info():")
    df.info()
    print("\n→ describe():")
    display(df.describe())
    print("\n→ Пропуски:")
    print(df.isnull().sum())
    
    # Определяем типы признаков
    numeric_features = X_raw.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X_raw.select_dtypes(exclude=[np.number]).columns.tolist()
    print(f"\nЧисловые: {len(numeric_features)}, Категориальные: {len(categorical_features)}")
    
    # --- 2.3.2: Препроцессинг ---
    transformers = []
    if numeric_features:
        num_pipe = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])
        transformers.append(("num", num_pipe, numeric_features))
    
    if categorical_features:
        cat_pipe = Pipeline([
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ])
        transformers.append(("cat", cat_pipe, categorical_features))
    
    preprocessor = ColumnTransformer(transformers, remainder="drop")
    X_processed = preprocessor.fit_transform(X_raw)
    
    # --- 2.3.3–2.3.5: Модели и метрики ---
    results = {}
    best_method = None
    best_score = -np.inf
    best_labels = None
    best_name = ""
    
    # === KMeans ===
    k_range = range(2, 21)
    sil_scores = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
        labels = kmeans.fit_predict(X_processed)
        sil = silhouette_score(X_processed, labels)
        sil_scores.append(sil)
    
    # График silhouette vs k
    plt.figure(figsize=(6, 4))
    plt.plot(k_range, sil_scores, marker="o")
    plt.xlabel("k")
    plt.ylabel("Silhouette Score")
    plt.title(f"{ds_key}: Silhouette vs k (KMeans)")
    sil_k_path = os.path.join(FIGURES_DIR, f"sil_k_{ds_key}.png")
    save_plot(sil_k_path)
    
    # Лучший k по silhouette
    best_k = k_range[np.argmax(sil_scores)]
    kmeans_final = KMeans(n_clusters=best_k, random_state=RANDOM_STATE, n_init=10)
    labels_kmeans = kmeans_final.fit_predict(X_processed)
    metrics_kmeans = evaluate_clustering(X_processed, labels_kmeans, "KMeans", ds_key)
    results["KMeans"] = {
        "labels": labels_kmeans,
        "metrics": metrics_kmeans,
        "params": {"k": int(best_k)}
    }
    if metrics_kmeans and metrics_kmeans["silhouette"] > best_score:
        best_score = metrics_kmeans["silhouette"]
        best_labels = labels_kmeans
        best_method = "KMeans"
        best_name = f"KMeans (k={best_k})"

    # === DBSCAN ===
    eps_values = np.linspace(0.1, 2.0, 20)
    min_samples_values = [5, 10, 15]
    best_eps, best_min_samples = None, None
    best_sil_dbscan = -1
    best_labels_dbscan = None

    for eps in eps_values:
        for min_samples in min_samples_values:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            labels = dbscan.fit_predict(X_processed)
            n_noise = np.sum(labels == -1)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            if n_clusters < 2:
                continue
            sil = silhouette_score(X_processed[labels != -1], labels[labels != -1])
            if sil > best_sil_dbscan:
                best_sil_dbscan = sil
                best_eps, best_min_samples = eps, min_samples
                best_labels_dbscan = labels

    if best_labels_dbscan is not None:
        noise_ratio = np.mean(best_labels_dbscan == -1)
        metrics_dbscan = evaluate_clustering(
            X_processed, best_labels_dbscan, "DBSCAN", ds_key, noise_ratio=noise_ratio
        )
        results["DBSCAN"] = {
            "labels": best_labels_dbscan,
            "metrics": metrics_dbscan,
            "params": {"eps": float(best_eps), "min_samples": int(best_min_samples)}
        }
        if metrics_dbscan and metrics_dbscan["silhouette"] > best_score:
            best_score = metrics_dbscan["silhouette"]
            best_labels = best_labels_dbscan
            best_method = "DBSCAN"
            best_name = f"DBSCAN (eps={best_eps:.2f}, ms={best_min_samples})"

        # График silhouette vs eps (фикс min_samples=10)
        sil_eps = []
        for eps in eps_values:
            dbscan = DBSCAN(eps=eps, min_samples=10)
            labels = dbscan.fit_predict(X_processed)
            if len(set(labels)) - (1 if -1 in labels else 0) < 2:
                sil_eps.append(-1)
            else:
                sil = silhouette_score(X_processed[labels != -1], labels[labels != -1])
                sil_eps.append(sil)
        plt.figure(figsize=(6, 4))
        plt.plot(eps_values, sil_eps, marker="o")
        plt.xlabel("eps")
        plt.ylabel("Silhouette Score")
        plt.title(f"{ds_key}: Silhouette vs eps (DBSCAN, min_samples=10)")
        sil_eps_path = os.path.join(FIGURES_DIR, f"sil_eps_{ds_key}.png")
        save_plot(sil_eps_path)

    # === Agglomerative Clustering (альтернатива, если DBSCAN не сработал) ===
    if "DBSCAN" not in results:
        linkages = ["ward", "average", "complete"]
        agg_metrics = {}
        for linkage in linkages:
            try:
                agg = AgglomerativeClustering(n_clusters=best_k, linkage=linkage)
                labels_agg = agg.fit_predict(X_processed)
                metrics_agg = evaluate_clustering(X_processed, labels_agg, f"Agg ({linkage})", ds_key)
                agg_metrics[linkage] = metrics_agg
                if metrics_agg and metrics_agg["silhouette"] > best_score:
                    best_score = metrics_agg["silhouette"]
                    best_labels = labels_agg
                    best_method = f"Agg_{linkage}"
                    best_name = f"Agg (k={best_k}, {linkage})"
            except ValueError as e:
                print(f"Agg {linkage} failed: {e}")
        
        # Сохраняем лучший Agg
        if agg_metrics:
            best_link = max(agg_metrics, key=lambda x: agg_metrics[x]["silhouette"] if agg_metrics[x] else -1)
            results[f"Agg_{best_link}"] = {
                "labels": AgglomerativeClustering(n_clusters=best_k, linkage=best_link).fit_predict(X_processed),
                "metrics": agg_metrics[best_link],
                "params": {"k": int(best_k), "linkage": best_link}
            }

    # --- 2.3.5: PCA визуализация лучшего решения ---
    plot_pca(
        X_processed, best_labels,
        f"{ds_key}: Best clustering ({best_name})",
        os.path.join(FIGURES_DIR, f"pca_{ds_key}.png")
    )

    # --- 2.3.6: Устойчивость (только для первого датасета) ---
    if ds_key == "ds1":
        print("\n→ Проверка устойчивости KMeans (8 запусков)...")
        aris = []
        for i in range(8):
            kmeans_tmp = KMeans(n_clusters=best_k, random_state=i, n_init=10)
            labels_tmp = kmeans_tmp.fit_predict(X_processed)
            ari = adjusted_rand_score(labels_kmeans, labels_tmp)
            aris.append(ari)
        print(f"ARI между запусками: {aris}, среднее: {np.mean(aris):.3f}, std: {np.std(aris):.3f}")

    # --- 2.3.7: Итог по датасету ---
    print(f"\n→ Лучший метод для {ds_key}: {best_name}")

    # Сохраняем для артефактов
    metrics_summary[ds_key] = {k: v["metrics"] for k, v in results.items() if v["metrics"]}
    best_configs[ds_key] = {
        "method": best_method,
        "params": results[best_method]["params"],
        "criterion": "silhouette"
    }
    all_labels[ds_key] = pd.DataFrame({
        "sample_id": sample_id,
        "cluster_label": best_labels
    })

# --- 2.4: Сохранение артефактов ---
# metrics_summary.json
with open(os.path.join(ARTIFACTS_DIR, "metrics_summary.json"), "w") as f:
    json.dump(metrics_summary, f, indent=2)

# best_configs.json
with open(os.path.join(ARTIFACTS_DIR, "best_configs.json"), "w") as f:
    json.dump(best_configs, f, indent=2)

# labels/*.csv
for ds_key, df_labels in all_labels.items():
    df_labels.to_csv(os.path.join(LABELS_DIR, f"labels_hw07_{ds_key}.csv"), index=False)



Обработка ds1: S07-hw-dataset-01.csv
→ head():


Unnamed: 0,sample_id,f01,f02,f03,f04,f05,f06,f07,f08
0,0,-0.536647,-69.8129,-0.002657,71.743147,-11.396498,-12.291287,-6.836847,-0.504094
1,1,15.230731,52.727216,-1.273634,-104.123302,11.589643,34.316967,-49.468873,0.390356
2,2,18.542693,77.31715,-1.321686,-111.946636,10.254346,25.892951,44.59525,0.325893



→ info():
<class 'pandas.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  12000 non-null  int64  
 1   f01        12000 non-null  float64
 2   f02        12000 non-null  float64
 3   f03        12000 non-null  float64
 4   f04        12000 non-null  float64
 5   f05        12000 non-null  float64
 6   f06        12000 non-null  float64
 7   f07        12000 non-null  float64
 8   f08        12000 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 843.9 KB

→ describe():


Unnamed: 0,sample_id,f01,f02,f03,f04,f05,f06,f07,f08
count,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0,12000.0
mean,5999.5,-2.424716,19.107804,-0.222063,-8.284501,-0.190717,0.962972,0.033724,0.007638
std,3464.24595,11.014315,60.790338,0.50063,59.269838,7.026435,14.794713,59.541782,0.607053
min,0.0,-19.912573,-92.892652,-1.590979,-134.303679,-11.869169,-20.521164,-215.098834,-2.633469
25%,2999.75,-9.472623,-40.282955,-0.125145,-48.345007,-5.132473,-8.807706,-39.90052,-0.401483
50%,5999.5,-6.869404,54.069335,-0.031753,16.211728,0.44473,-6.134169,-0.578494,0.005306
75%,8999.25,0.523841,70.280739,0.05498,28.067178,3.942368,2.334426,39.719821,0.410132
max,11999.0,24.403381,112.229523,0.512277,75.088604,13.717091,41.452857,213.381767,2.490745



→ Пропуски:
sample_id    0
f01          0
f02          0
f03          0
f04          0
f05          0
f06          0
f07          0
f08          0
dtype: int64

Числовые: 8, Категориальные: 0
KMeans → Sil: 0.522, CH: 11787.0, DB: 0.685
DBSCAN → Sil: 0.876, CH: 1091.7, DB: 0.170, Noise: 99.69%

→ Проверка устойчивости KMeans (8 запусков)...
ARI между запусками: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], среднее: 1.000, std: 0.000

→ Лучший метод для ds1: DBSCAN (eps=0.20, ms=15)

Обработка ds2: S07-hw-dataset-02.csv
→ head():


Unnamed: 0,sample_id,x1,x2,z_noise
0,0,0.098849,-1.846034,21.288122
1,1,-1.024516,1.829616,6.072952
2,2,-1.094178,-0.158545,-18.938342



→ info():
<class 'pandas.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  8000 non-null   int64  
 1   x1         8000 non-null   float64
 2   x2         8000 non-null   float64
 3   z_noise    8000 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 250.1 KB

→ describe():


Unnamed: 0,sample_id,x1,x2,z_noise
count,8000.0,8000.0,8000.0,8000.0
mean,3999.5,0.478867,0.241112,0.110454
std,2309.54541,0.955138,0.663195,8.097716
min,0.0,-2.487352,-2.499237,-34.056074
25%,1999.75,-0.116516,-0.242357,-5.39221
50%,3999.5,0.490658,0.241092,0.13247
75%,5999.25,1.085263,0.726526,5.655605
max,7999.0,2.987555,2.995553,29.460076



→ Пропуски:
sample_id    0
x1           0
x2           0
z_noise      0
dtype: int64

Числовые: 3, Категориальные: 0
KMeans → Sil: 0.307, CH: 3573.4, DB: 1.323
DBSCAN → Sil: 0.581, CH: 2758.1, DB: 0.578, Noise: 91.85%

→ Лучший метод для ds2: DBSCAN (eps=0.10, ms=10)

Обработка ds3: S07-hw-dataset-03.csv
→ head():


Unnamed: 0,sample_id,x1,x2,f_corr,f_noise
0,0,-2.71047,4.997107,-1.015703,0.718508
1,1,8.730238,-8.787416,3.953063,-1.105349
2,2,-1.0796,-2.558708,0.976628,-3.605776



→ info():
<class 'pandas.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sample_id  15000 non-null  int64  
 1   x1         15000 non-null  float64
 2   x2         15000 non-null  float64
 3   f_corr     15000 non-null  float64
 4   f_noise    15000 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 586.1 KB

→ describe():


Unnamed: 0,sample_id,x1,x2,f_corr,f_noise
count,15000.0,15000.0,15000.0,15000.0,15000.0
mean,7499.5,1.246296,1.033764,0.212776,-0.027067
std,4330.271354,4.592421,4.710791,1.530017,2.506375
min,0.0,-9.995585,-9.980853,-5.212038,-8.785884
25%,3749.75,-1.782144,-2.666393,-0.966224,-1.731128
50%,7499.5,0.664226,1.831257,0.296508,-0.052391
75%,11249.25,4.435671,4.96963,1.390273,1.673831
max,14999.0,16.207863,14.271153,5.795876,11.266865



→ Пропуски:
sample_id    0
x1           0
x2           0
f_corr       0
f_noise      0
dtype: int64

Числовые: 4, Категориальные: 0
KMeans → Sil: 0.316, CH: 6957.2, DB: 1.158
DBSCAN → Sil: 0.812, CH: 3564.0, DB: 0.245, Noise: 99.59%

→ Лучший метод для ds3: DBSCAN (eps=0.10, ms=10)
