In [6]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [7]:
iris = datasets.load_iris()
X = iris.data

In [8]:

def no_processing(X): return X
def normalization(X): return MinMaxScaler().fit_transform(X)
def transform(X): return StandardScaler().fit_transform(X)
def pca(X): return PCA(n_components=2).fit_transform(X)
def t_n(X): return StandardScaler().fit_transform(MinMaxScaler().fit_transform(X))
def t_n_pca(X): return PCA(n_components=2).fit_transform(StandardScaler().fit_transform(MinMaxScaler().fit_transform(X)))

preprocessing_methods = {
    "No Data Processing": no_processing,
    "Using Normalization": normalization,
    "Using Transform": transform,
    "Using PCA": pca,
    "Using T+N": t_n,
    "T+N+PCA": t_n_pca
}

In [9]:

cluster_nums = [2, 4, 5]
results = {
    "KMeans": {},
    "Agglomerative": {},
    "MeanShift": {}
}

In [10]:
def evaluate_clustering(model, data):
    try:
        labels = model.fit_predict(data)
        sil = silhouette_score(data, labels)
        cal = calinski_harabasz_score(data, labels)
        dav = davies_bouldin_score(data, labels)
        return sil, cal, dav
    except Exception:
        return "NA", "NA", "NA"

def process_meanshift(data, expected_clusters=3):
    try:
        ms_model = MeanShift()
        labels = ms_model.fit_predict(data)
        if len(np.unique(labels)) == expected_clusters:
            sil = silhouette_score(data, labels)
            cal = calinski_harabasz_score(data, labels)
            dav = davies_bouldin_score(data, labels)
            return sil, cal, dav
    except Exception:
        pass
    return "NA", "NA", "NA"

for method_name, method_func in preprocessing_methods.items():
    processed_data = method_func(X)

    for num_clusters in cluster_nums:
        kmeans_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
        kmeans_scores = evaluate_clustering(kmeans_model, processed_data)
        results["KMeans"].setdefault(method_name, []).append(kmeans_scores)

        agg_model = AgglomerativeClustering(n_clusters=num_clusters)
        agg_scores = evaluate_clustering(agg_model, processed_data)
        results["Agglomerative"].setdefault(method_name, []).append(agg_scores)

        if num_clusters == 3:
            ms_scores = process_meanshift(processed_data)
        else:
            ms_scores = ("NA", "NA", "NA")
        results["MeanShift"].setdefault(method_name, []).append(ms_scores)


In [15]:
def generate_results_table(algorithm, preprocessing_methods, clusters, result_data):
    metrics = ["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]
    data = []

    for prep in preprocessing_methods:
        for i, c in enumerate(clusters):
            row = {
                "Preprocessing": prep,
                "Clusters": c
            }
            try:
                values = result_data[algorithm][prep][i]
                for j, metric in enumerate(metrics):
                    row[metric] = np.round(values[j], 3) if values[j] != "NA" else np.nan
            except Exception:
                for metric in metrics:
                    row[metric] = np.nan
            data.append(row)

    return pd.DataFrame(data)

methods_list = list(preprocessing_methods)
df_kmeans = generate_results_table("KMeans", methods_list, cluster_nums, results)
df_agg = generate_results_table("Agglomerative", methods_list, cluster_nums, results)
df_meanshift = generate_results_table("MeanShift", methods_list, cluster_nums, results)


In [16]:
print(df_kmeans)

          Preprocessing  Clusters  Silhouette  Calinski-Harabasz  \
0    No Data Processing         2       0.681            513.925   
1    No Data Processing         4       0.498            530.766   
2    No Data Processing         5       0.491            495.370   
3   Using Normalization         2       0.630            354.366   
4   Using Normalization         4       0.445            314.473   
5   Using Normalization         5       0.353            289.506   
6       Using Transform         2       0.582            251.349   
7       Using Transform         4       0.387            207.266   
8       Using Transform         5       0.346            202.952   
9             Using PCA         2       0.706            570.839   
10            Using PCA         4       0.558            719.124   
11            Using PCA         5       0.552            685.027   
12            Using T+N         2       0.582            251.349   
13            Using T+N         4       0.387   

In [17]:
print(df_agg)

          Preprocessing  Clusters  Silhouette  Calinski-Harabasz  \
0    No Data Processing         2       0.687            502.822   
1    No Data Processing         4       0.489            515.079   
2    No Data Processing         5       0.484            488.485   
3   Using Normalization         2       0.630            354.366   
4   Using Normalization         4       0.433            301.104   
5   Using Normalization         5       0.349            272.024   
6       Using Transform         2       0.577            240.246   
7       Using Transform         4       0.401            201.251   
8       Using Transform         5       0.331            192.681   
9             Using PCA         2       0.711            557.461   
10            Using PCA         4       0.541            673.946   
11            Using PCA         5       0.549            665.883   
12            Using T+N         2       0.577            240.246   
13            Using T+N         4       0.401   

In [18]:
print(df_meanshift)

          Preprocessing  Clusters  Silhouette  Calinski-Harabasz  \
0    No Data Processing         2         NaN                NaN   
1    No Data Processing         4         NaN                NaN   
2    No Data Processing         5         NaN                NaN   
3   Using Normalization         2         NaN                NaN   
4   Using Normalization         4         NaN                NaN   
5   Using Normalization         5         NaN                NaN   
6       Using Transform         2         NaN                NaN   
7       Using Transform         4         NaN                NaN   
8       Using Transform         5         NaN                NaN   
9             Using PCA         2         NaN                NaN   
10            Using PCA         4         NaN                NaN   
11            Using PCA         5         NaN                NaN   
12            Using T+N         2         NaN                NaN   
13            Using T+N         4         NaN   