<a href="https://colab.research.google.com/github/Archit-29/Clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#IMPORT LIBRARIES
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
#LOAD DATASET
# Pull the Iris data bundled with scikit-learn and keep its feature matrix and column names.
iris = datasets.load_iris()
X, feature_names = iris.data, iris.feature_names
# Same features as a labelled DataFrame, which is easier to inspect than the raw array.
iris_df = pd.DataFrame(data=X, columns=feature_names)

In [None]:
#PREPROCESSING
def preprocess_data(X, method, n_components=2):
    """Apply the named preprocessing variant to ``X`` and return the result.

    Parameters
    ----------
    X : array-like
        Feature matrix to preprocess. The "Transform" variants call
        ``np.log1p``, so they need values greater than -1.
    method : str
        One of:
        - "No Data Processing": ``X`` is returned as-is (same object).
        - "Using Normalization": ``StandardScaler`` fit on ``X``.
        - "Using Transform": ``np.log1p(X)``.
        - "Using PCA": ``PCA`` fit on ``X``.
        - "Using T+N": ``np.log1p`` followed by ``StandardScaler``.
        - "Using T+N+PCA": ``np.log1p``, then ``StandardScaler``, then ``PCA``.
    n_components : int, default 2
        Number of components kept by the PCA-based variants.

    Returns
    -------
    The preprocessed data (``X`` itself for "No Data Processing").

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above.
    """
    if method == "No Data Processing":
        return X
    if method == "Using Normalization":
        return StandardScaler().fit_transform(X)
    if method == "Using Transform":
        # log1p rather than log so that zero-valued features stay finite.
        return np.log1p(X)
    if method == "Using PCA":
        return PCA(n_components=n_components).fit_transform(X)
    if method == "Using T+N":
        # "T+N" = Transform then Normalization. Skipping the transform would
        # make this variant indistinguishable from "Using Normalization".
        return StandardScaler().fit_transform(np.log1p(X))
    if method == "Using T+N+PCA":
        # Transform, normalize, then project onto the leading components.
        X_scaled = StandardScaler().fit_transform(np.log1p(X))
        return PCA(n_components=n_components).fit_transform(X_scaled)
    raise ValueError("Unknown preprocessing method.")


In [None]:
#CLUSTERING METRICS
def evaluate_clustering(X, n_clusters, algorithm):
    """Cluster ``X`` with the chosen algorithm and score the resulting labels.

    Parameters
    ----------
    X : array-like
        Data to cluster; also passed to the three scoring functions.
    n_clusters : int or None
        Number of clusters for 'kmeans' and 'hierarchical'. Not used for
        'meanshift', which is constructed without a cluster count.
    algorithm : str
        'kmeans', 'hierarchical' or 'meanshift'.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin)``. If fitting or
        scoring raises ``ValueError`` all three entries are the string "N/A".

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=n_clusters, random_state=42)
    elif algorithm == 'hierarchical':
        model = AgglomerativeClustering(n_clusters=n_clusters)
    elif algorithm == 'meanshift':
        model = MeanShift()
    else:
        # A misspelled algorithm name is a caller bug; do not hide it behind "N/A".
        raise ValueError("Unknown clustering algorithm.")

    try:
        # Fit exactly once, whichever algorithm was selected.
        labels = model.fit_predict(X)
        silhouette = silhouette_score(X, labels)
        calinski = calinski_harabasz_score(X, labels)
        davies = davies_bouldin_score(X, labels)
    except ValueError:
        # Only ValueError is treated as "no score available"; anything else
        # (including KeyboardInterrupt) propagates to the caller.
        # NOTE(review): presumably this covers the single-cluster case the
        # metrics reject - confirm against the scikit-learn docs.
        return "N/A", "N/A", "N/A"

    return silhouette, calinski, davies


In [None]:
#PREPROCESSING METHODS
# Names of the preprocessing variants to compare. Each entry is one of the
# method strings that preprocess_data accepts.
preprocessing_methods = [
    "No Data Processing", "Using Normalization", "Using Transform",
    "Using PCA", "Using T+N", "Using T+N+PCA",
]

In [None]:
# One results list per clustering algorithm; each will receive one row of
# metric values per preprocessing method.
kmeans_results, hierarchical_results, meanshift_results = [], [], []

# Cluster counts (the "c" values) evaluated for the algorithms that take one.
cluster_nums = [3, 4, 5]

In [None]:
#CALCULATIONS
# Empty the accumulators first so that re-running this cell replaces the
# previous results instead of appending a second copy of every row.
kmeans_results.clear()
hierarchical_results.clear()
meanshift_results.clear()

# Evaluate every preprocessing method with every clustering algorithm.
for method in preprocessing_methods:
    # Preprocessing does not depend on the cluster count or the algorithm,
    # so it is done once per method and the result is shared.
    X_processed = preprocess_data(X, method)

    # Rows are built cluster-major: [silhouette, calinski, davies] for the
    # first entry of cluster_nums, then the same triple for the next entry,
    # and so on.
    kmeans_row = []
    hierarchical_row = []

    for c in cluster_nums:
        # K-Means
        silhouette, calinski, davies = evaluate_clustering(X_processed, c, 'kmeans')
        kmeans_row.extend([silhouette, calinski, davies])

        # Hierarchical Clustering
        silhouette, calinski, davies = evaluate_clustering(X_processed, c, 'hierarchical')
        hierarchical_row.extend([silhouette, calinski, davies])

    kmeans_results.append(kmeans_row)
    hierarchical_results.append(hierarchical_row)

    # Mean Shift takes no cluster count, so it contributes a single triple per method.
    silhouette, calinski, davies = evaluate_clustering(X_processed, None, 'meanshift')
    meanshift_results.append([silhouette, calinski, davies])

In [None]:
# Convert the results into DataFrames
metric_names = ["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]
cluster_labels = [f"c={c}" for c in cluster_nums]

# Each K-Means / hierarchical row was filled cluster-major: the three metrics
# for the first cluster count, then the three metrics for the next one, etc.
# The values must therefore be labelled in (cluster, metric) order first;
# labelling them metric-major directly would attach the wrong header to most
# of the numbers.
stored_columns = pd.MultiIndex.from_product([cluster_labels, metric_names])

# Presentation layout: one top-level column per metric, cluster counts as subcolumns.
columns = pd.MultiIndex.from_product([metric_names, cluster_labels])


def _metric_major_table(rows):
    """Build a results table from cluster-major rows, presented metric-major."""
    table = pd.DataFrame(rows, index=preprocessing_methods, columns=stored_columns)
    # Swap the two header levels to (metric, cluster), then put the columns
    # in the presentation order.
    return table.swaplevel(axis=1).reindex(columns=columns)


kmeans_df = _metric_major_table(kmeans_results)
hierarchical_df = _metric_major_table(hierarchical_results)

# Mean Shift table (no cluster numbers, just metrics)
meanshift_df = pd.DataFrame(meanshift_results, index=preprocessing_methods, columns=metric_names)


In [None]:
# DISPLAY RESULT
# Print each results table under its own heading, one algorithm after another.
for heading, table in (
    ("\nK-Means Clustering Results:", kmeans_df),
    ("\nHierarchical Clustering Results:", hierarchical_df),
    ("\nMean Shift Clustering Results:", meanshift_df),
):
    print(heading)
    print(table)


K-Means Clustering Results:
                    Silhouette                       Calinski-Harabasz  \
                           c=3         c=4       c=5               c=3   
No Data Processing    0.551192  561.593732  0.666039          0.497643   
Using Normalization   0.479881  157.360153  0.789363          0.385045   
Using Transform       0.571878  502.487430  0.937543          0.392813   
Using PCA             0.597676  693.708433  0.564816          0.557741   
Using T+N             0.479881  157.360153  0.789363          0.385045   
Using T+N+PCA         0.522791  180.975460  0.739126          0.445187   

                                          Davies-Bouldin                        
                            c=4       c=5            c=3         c=4       c=5  
No Data Processing   529.529095  0.754140       0.493080  495.243414  0.819384  
Using Normalization  206.680603  0.869779       0.345033  202.635850  0.943894  
Using Transform      721.754299  1.028285       0.3267