In [9]:
import pandas as pd
import numpy as np

In [10]:
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

X = pd.DataFrame(X, columns=iris.feature_names)

X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA



def preprocess_data(X, method):
    if method == 'Normalization':
        scaler = StandardScaler()
        X_preprocessed = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    elif method == 'Transformation':
        # Apply any transformation technique if needed
        X_preprocessed = X 
    elif method == 'PCA':
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        X_preprocessed = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    elif method == 'T+N':
        scaler = StandardScaler()  # Use StandardScaler for normalization
        X_normalized = scaler.fit_transform(X)
        # Apply logarithmic transformation
        X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
    elif method == 'T+N+PCA':
        scaler = StandardScaler()  # Use StandardScaler for normalization
        X_normalized = scaler.fit_transform(X)
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_normalized)
        X_preprocessed = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    else:
        X_preprocessed = X
    return X_preprocessed


def perform_clustering(X, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    return labels



In [12]:
preprocessing_methods = ['No Data Processing', 'Normalization', 'Transformation', 'PCA', 'T+N', 'T+N+PCA']
cluster_numbers = [3, 4, 5]

In [13]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

results = {}
for method in preprocessing_methods:
    for n_clusters in cluster_numbers:
        X_processed = preprocess_data(X, method)
        
        # Drop rows with missing values
        X_processed = X_processed.dropna()
        
        labels = perform_clustering(X_processed, n_clusters)
        silhouette = silhouette_score(X_processed, labels)
        calinski_harabasz = calinski_harabasz_score(X_processed, labels)
        davies_bouldin = davies_bouldin_score(X_processed, labels)
        results[(method, n_clusters)] = {'Silhouette': silhouette, 'Calinski-Harabasz': calinski_harabasz, 'Davies-Bouldin': davies_bouldin}
        
new = pd.DataFrame.from_dict(results)



  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)


In [14]:
new

Unnamed: 0_level_0,No Data Processing,No Data Processing,No Data Processing,Normalization,Normalization,Normalization,Transformation,Transformation,Transformation,PCA,PCA,PCA,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA
Unnamed: 0_level_1,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.551192,0.497643,0.49308,0.479881,0.385045,0.345033,0.551192,0.497643,0.49308,0.597676,0.557741,0.510041,0.35364,0.304965,0.258262,0.522791,0.445187,0.411085
Calinski-Harabasz,561.593732,529.529095,495.243414,157.360153,206.680603,202.63585,561.593732,529.529095,495.243414,693.708433,719.123544,642.060666,55.559954,47.396572,43.52821,180.97546,263.884045,278.451395
Davies-Bouldin,0.666039,0.75414,0.819384,0.789363,0.869779,0.943894,0.666039,0.75414,0.819384,0.564816,0.615069,0.7525,1.027606,1.10905,1.14048,0.739126,0.744669,0.780293


In [15]:
new.to_csv('KMeans.csv')