In [16]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [17]:
iris = datasets.load_iris()
X = iris.data

In [18]:

def no_processing(X):
    """Baseline pipeline: return ``X`` itself, unchanged and without copying."""
    return X
def normalization(X):
    """Fit a fresh default ``MinMaxScaler`` on ``X`` and return the transformed data."""
    scaler = MinMaxScaler()
    return scaler.fit_transform(X)
def transform(X):
    """Fit a fresh default ``StandardScaler`` on ``X`` and return the transformed data."""
    scaler = StandardScaler()
    return scaler.fit_transform(X)
def pca(X):
    """Fit a two-component ``PCA`` on ``X`` and return the projected data."""
    reducer = PCA(n_components=2)
    return reducer.fit_transform(X)
def t_n(X):
    """Apply ``MinMaxScaler`` first, then ``StandardScaler``, each freshly fitted.

    NOTE(review): standardizing the output of a per-feature min-max rescale looks
    equivalent (up to floating point) to standardizing ``X`` directly -- confirm
    that this combination is really what the "T+N" column is meant to measure.
    """
    rescaled = MinMaxScaler().fit_transform(X)
    return StandardScaler().fit_transform(rescaled)
def t_n_pca(X):
    """Chain ``MinMaxScaler`` -> ``StandardScaler`` -> two-component ``PCA`` on ``X``.

    Every estimator is created and fitted inside this call; the projected data
    from the final PCA step is returned.
    """
    rescaled = MinMaxScaler().fit_transform(X)
    standardized = StandardScaler().fit_transform(rescaled)
    return PCA(n_components=2).fit_transform(standardized)

# Display label -> preprocessing callable. Insertion order matters: it is the
# order in which the pipelines are evaluated and in which their columns are
# laid out in the result tables.
preprocessing_methods = dict([
    ("No Data Processing", no_processing),
    ("Using Normalization", normalization),
    ("Using Transform", transform),
    ("Using PCA", pca),
    ("Using T+N", t_n),
    ("T+N+PCA", t_n_pca),
])

In [19]:

# Cluster counts to evaluate for every preprocessing pipeline.
cluster_nums = [3, 4, 5]

# One independent, initially empty mapping per clustering algorithm; the
# evaluation cell fills each with {preprocessing label: [score tuples]}.
results = {
    algorithm_name: {}
    for algorithm_name in ("KMeans", "Agglomerative", "MeanShift")
}

In [20]:
# KMeans, Agglomerative and MeanShift: score every (preprocessing, cluster count) pair.
def _internal_scores(data, labels):
    """Return ``(silhouette, calinski_harabasz, davies_bouldin)`` for ``labels`` on ``data``.

    Exceptions raised by the three sklearn metric functions propagate to the caller.
    """
    return (
        silhouette_score(data, labels),
        calinski_harabasz_score(data, labels),
        davies_bouldin_score(data, labels),
    )


for prep_name, prep_func in preprocessing_methods.items():
    X_prep = prep_func(X)

    # Score tuples are collected per preprocessing and stored with a plain
    # assignment after the inner loop, so re-running this cell replaces earlier
    # results instead of appending a second copy behind them.
    km_scores, agg_scores, ms_scores = [], [], []

    for c in cluster_nums:
        # KMeans
        try:
            km = KMeans(n_clusters=c, n_init=10, random_state=42)
            labels_km = km.fit_predict(X_prep)
            sil_km, cal_km, dav_km = _internal_scores(X_prep, labels_km)
        except Exception as exc:
            # Still best-effort, but say what went wrong instead of hiding it.
            print(f"KMeans failed for {prep_name!r} with c={c}: {exc!r}")
            sil_km = cal_km = dav_km = "NA"
        km_scores.append((sil_km, cal_km, dav_km))

        # Agglomerative
        try:
            agg = AgglomerativeClustering(n_clusters=c)
            labels_agg = agg.fit_predict(X_prep)
            sil_agg, cal_agg, dav_agg = _internal_scores(X_prep, labels_agg)
        except Exception as exc:
            print(f"Agglomerative failed for {prep_name!r} with c={c}: {exc!r}")
            sil_agg = cal_agg = dav_agg = "NA"
        agg_scores.append((sil_agg, cal_agg, dav_agg))

        # MeanShift takes no n_clusters argument, so it is run only on the c == 3
        # pass and its scores are kept only when it happens to find exactly three
        # clusters; every other slot is filled with the "NA" placeholder so the
        # list stays aligned with cluster_nums.
        if c == 3:
            try:
                ms = MeanShift()
                labels_ms = ms.fit_predict(X_prep)
                n_clusters_ms = len(np.unique(labels_ms))
                if n_clusters_ms == 3:  # Only record if 3 clusters found
                    sil_ms, cal_ms, dav_ms = _internal_scores(X_prep, labels_ms)
                else:
                    # Make the reason for the NA visible; otherwise the MeanShift
                    # table can end up all-NaN with no hint as to why.
                    print(
                        f"MeanShift found {n_clusters_ms} cluster(s) for "
                        f"{prep_name!r}, not 3; recording NA"
                    )
                    sil_ms = cal_ms = dav_ms = "NA"
            except Exception as exc:
                print(f"MeanShift failed for {prep_name!r}: {exc!r}")
                sil_ms = cal_ms = dav_ms = "NA"
            ms_scores.append((sil_ms, cal_ms, dav_ms))
        else:
            ms_scores.append(("NA", "NA", "NA"))

    results["KMeans"][prep_name] = km_scores
    results["Agglomerative"][prep_name] = agg_scores
    results["MeanShift"][prep_name] = ms_scores

In [24]:
def create_results_dataframe(algorithm, preprocessing_methods, cluster_nums, results):
    """Build the score table for one clustering algorithm.

    Parameters
    ----------
    algorithm : key into ``results`` selecting the algorithm to tabulate.
    preprocessing_methods : iterable of preprocessing labels (keys of
        ``results[algorithm]``), in the desired column order.
    cluster_nums : cluster counts; the i-th entry labels the i-th score tuple
        stored for each preprocessing label.
    results : nested mapping ``{algorithm: {label: [(sil, ch, db), ...]}}``.

    Returns
    -------
    pandas.DataFrame with one row per metric (named in the "Parameters" column)
    and one column per (label, cluster count) pair named ``"<label>\\nc=<c>"``.
    Values are rounded to three decimals; the ``"NA"`` placeholder becomes NaN.
    """
    metric_labels = ("Silhouette", "Calinski-Harabasz", "Davies-Bouldin")

    def _cell(label, position, metric_position):
        # Look the score up lazily so nothing is indexed when there is nothing to tabulate.
        raw = results[algorithm][label][position][metric_position]
        if raw != "NA":
            return np.round(raw, 3)
        return np.nan

    table = []
    for metric_position, metric_label in enumerate(metric_labels):
        record = {"Parameters": metric_label}
        record.update(
            (f"{label}\nc={count}", _cell(label, position, metric_position))
            for label in preprocessing_methods
            for position, count in enumerate(cluster_nums)
        )
        table.append(record)
    return pd.DataFrame(table)

# One score table per algorithm, built in the same order as before:
# KMeans, then Agglomerative, then MeanShift.
df_kmeans, df_agg, df_meanshift = (
    create_results_dataframe(
        algorithm_name, list(preprocessing_methods.keys()), cluster_nums, results
    )
    for algorithm_name in ("KMeans", "Agglomerative", "MeanShift")
)



In [25]:
# Plain-text dump of the KMeans score table built by create_results_dataframe.
print(df_kmeans)

          Parameters  No Data Processing\nc=3  No Data Processing\nc=4  \
0         Silhouette                    0.553                    0.498   
1  Calinski-Harabasz                  561.628                  530.766   
2     Davies-Bouldin                    0.662                    0.780   

   No Data Processing\nc=5  Using Normalization\nc=3  \
0                    0.491                     0.505   
1                  495.370                   359.845   
2                    0.816                     0.760   

   Using Normalization\nc=4  Using Normalization\nc=5  Using Transform\nc=3  \
0                     0.445                     0.353                 0.460   
1                   314.473                   289.506               241.904   
2                     0.900                     0.957                 0.834   

   Using Transform\nc=4  Using Transform\nc=5  Using PCA\nc=3  Using PCA\nc=4  \
0                 0.387                 0.346           0.598           0.558   

In [26]:
# Plain-text dump of the Agglomerative score table built by create_results_dataframe.
print(df_agg)

          Parameters  No Data Processing\nc=3  No Data Processing\nc=4  \
0         Silhouette                    0.554                    0.489   
1  Calinski-Harabasz                  558.058                  515.079   
2     Davies-Bouldin                    0.656                    0.795   

   No Data Processing\nc=5  Using Normalization\nc=3  \
0                    0.484                     0.505   
1                  488.485                   349.254   
2                    0.820                     0.748   

   Using Normalization\nc=4  Using Normalization\nc=5  Using Transform\nc=3  \
0                     0.433                     0.349                 0.447   
1                   301.104                   272.024               222.719   
2                     0.849                     0.906                 0.803   

   Using Transform\nc=4  Using Transform\nc=5  Using PCA\nc=3  Using PCA\nc=4  \
0                 0.401                 0.331           0.598           0.541   

In [27]:
# Plain-text dump of the MeanShift score table; slots that hold the "NA"
# placeholder in `results` show up here as NaN.
print(df_meanshift)

          Parameters  No Data Processing\nc=3  No Data Processing\nc=4  \
0         Silhouette                      NaN                      NaN   
1  Calinski-Harabasz                      NaN                      NaN   
2     Davies-Bouldin                      NaN                      NaN   

   No Data Processing\nc=5  Using Normalization\nc=3  \
0                      NaN                       NaN   
1                      NaN                       NaN   
2                      NaN                       NaN   

   Using Normalization\nc=4  Using Normalization\nc=5  Using Transform\nc=3  \
0                       NaN                       NaN                   NaN   
1                       NaN                       NaN                   NaN   
2                       NaN                       NaN                   NaN   

   Using Transform\nc=4  Using Transform\nc=5  Using PCA\nc=3  Using PCA\nc=4  \
0                   NaN                   NaN             NaN             NaN   