In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

X = pd.DataFrame(X, columns=iris.feature_names)

X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA



def preprocess_data(X, method):
    if method == 'Normalization':
        scaler = StandardScaler()
        xfit = scaler.fit_transform(X)
        X_preprocessed = pd.DataFrame(xfit, columns=X.columns)
    elif method == 'Transformation':
        # Apply any transformation technique if needed
        X_preprocessed = X 
    elif method == 'PCA':
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        X_preprocessed = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    elif method == 'T+N':
        scaler = StandardScaler()  # Use StandardScaler for normalization
        X_normalized = scaler.fit_transform(X)
        # Apply logarithmic transformation
        X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
    elif method == 'T+N+PCA':
        scaler = StandardScaler()  # Use StandardScaler for normalization
        X_normalized = scaler.fit_transform(X)
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_normalized)
        X_preprocessed = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    else:
        X_preprocessed = X
    return X_preprocessed




def perform_clustering(X, n_clusters):
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clustering.fit_predict(X)
    return labels



In [7]:
preprocessing_methods = ['No Data Processing', 'Normalization', 'Transformation', 'PCA', 'T+N', 'T+N+PCA']
cluster_numbers = [3, 4, 5]

In [8]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

results = {}
for method in preprocessing_methods:
    for n_clusters in cluster_numbers:
        X_processed = preprocess_data(X, method)
        
        # Drop rows with missing values
        X_processed = X_processed.dropna()
        
        labels = perform_clustering(X_processed, n_clusters)
        silhouette = silhouette_score(X_processed, labels)
        calinski_harabasz = calinski_harabasz_score(X_processed, labels)
        davies_bouldin = davies_bouldin_score(X_processed, labels)
        results[(method, n_clusters)] = {'Silhouette': silhouette, 'Calinski-Harabasz': calinski_harabasz, 'Davies-Bouldin': davies_bouldin}
        
new = pd.DataFrame.from_dict(results)


  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)


In [9]:
new

Unnamed: 0_level_0,No Data Processing,No Data Processing,No Data Processing,Normalization,Normalization,Normalization,Transformation,Transformation,Transformation,PCA,PCA,PCA,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA
Unnamed: 0_level_1,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.554324,0.488967,0.484383,0.446689,0.400636,0.330587,0.554324,0.488967,0.484383,0.598475,0.540977,0.548784,0.296722,0.273166,0.259903,0.51106,0.448735,0.404169
Calinski-Harabasz,558.058041,515.078906,488.484904,222.719164,201.251454,192.681283,558.058041,515.078906,488.484904,688.617548,673.946264,665.883112,50.378841,43.311413,39.706208,286.328664,254.090094,254.996196
Davies-Bouldin,0.656256,0.795264,0.820417,0.803467,0.978821,0.974249,0.656256,0.795264,0.820417,0.560496,0.654624,0.652573,1.111812,1.188096,1.165642,0.70543,0.722612,0.79125


In [10]:
new.to_csv('hierarchical.csv')