In [6]:
import pandas as pd
import numpy as np

In [7]:
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

X = pd.DataFrame(X, columns=iris.feature_names)

X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MeanShift
from sklearn.decomposition import PCA



def preprocess_data(X, method):
    if method == 'Normalization':
        scaler = StandardScaler()
        X_preprocessed = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    elif method == 'Transformation':
        # Apply any transformation technique if needed
        X_preprocessed = X 
    elif method == 'PCA':
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        X_preprocessed = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    elif method == 'T+N':
        scaler = StandardScaler()  # Use StandardScaler for normalization
        X_normalized = scaler.fit_transform(X)
        # Apply logarithmic transformation
        X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
    elif method == 'T+N+PCA':
        scaler = StandardScaler()  # Use StandardScaler for normalization
        X_normalized = scaler.fit_transform(X)
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_normalized)
        X_preprocessed = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    else:
        X_preprocessed = X
    return X_preprocessed


def perform_clustering(X):
    ms = MeanShift()
    labels = ms.fit_predict(X)
    return labels


In [12]:
preprocessing_methods = ['No Data Processing', 'Normalization', 'Transformation', 'PCA', 'T+N', 'T+N+PCA']
cluster_numbers = [3, 4, 5]

In [13]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

results = {}
for method in preprocessing_methods:
    for n_clusters in cluster_numbers:
        X_processed = preprocess_data(X, method)
        
        # Drop rows with missing values
        X_processed = X_processed.dropna()
        
        labels = perform_clustering(X_processed)
        silhouette = silhouette_score(X_processed, labels)
        calinski_harabasz = calinski_harabasz_score(X_processed, labels)
        davies_bouldin = davies_bouldin_score(X_processed, labels)
        results[(method, n_clusters)] = {'Silhouette': silhouette, 'Calinski-Harabasz': calinski_harabasz, 'Davies-Bouldin': davies_bouldin}
        
new = pd.DataFrame.from_dict(results)


  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)
  X_preprocessed = pd.DataFrame(np.log1p(X_normalized), columns=X.columns)


In [14]:
new

Unnamed: 0_level_0,No Data Processing,No Data Processing,No Data Processing,Normalization,Normalization,Normalization,Transformation,Transformation,Transformation,PCA,PCA,PCA,T+N,T+N,T+N,T+N+PCA,T+N+PCA,T+N+PCA
Unnamed: 0_level_1,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5,3,4,5
Silhouette,0.685788,0.685788,0.685788,0.58175,0.58175,0.58175,0.685788,0.685788,0.685788,0.710311,0.710311,0.710311,0.365451,0.365451,0.365451,0.61452,0.61452,0.61452
Calinski-Harabasz,509.703427,509.703427,509.703427,251.349339,251.349339,251.349339,509.703427,509.703427,509.703427,565.734052,565.734052,565.734052,29.745157,29.745157,29.745157,283.005488,283.005488,283.005488
Davies-Bouldin,0.388552,0.388552,0.388552,0.593313,0.593313,0.593313,0.388552,0.388552,0.388552,0.355059,0.355059,0.355059,0.741617,0.741617,0.741617,0.543999,0.543999,0.543999


In [15]:
new.to_csv('kmeans_shift.csv')