<a href="https://colab.research.google.com/github/shubhaan06/Clustering/blob/main/Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Clustering Assignment - Iris Dataset (KMeans, Hierarchical and GMM under several preprocessing scenarios)
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [23]:
# Load the Iris dataset: keep the raw feature matrix in `X` and a labelled
# DataFrame view of the same values in `df`.
data = load_iris()
X = data["data"]
df = pd.DataFrame(data=X, columns=list(data["feature_names"]))

In [24]:
# Define Preprocessing Functions
def normalize(X):
    """Min-max scale ``X`` with a freshly fitted ``MinMaxScaler``.

    The scaler is created with default settings, fitted on ``X`` itself and
    discarded afterwards; only the transformed data is returned.
    """
    scaler = MinMaxScaler()
    return scaler.fit_transform(X)

def standardize(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    The scaler is created with default settings, fitted on ``X`` itself and
    discarded afterwards; only the transformed data is returned.
    """
    scaler = StandardScaler()
    return scaler.fit_transform(X)

def apply_pca(X, n_components=2):
    """Fit a PCA on ``X`` and return ``X`` projected onto its components.

    Parameters
    ----------
    X : array-like
        Data to fit the PCA on and to transform.
    n_components : int, default 2
        Forwarded unchanged to ``PCA(n_components=...)``.
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(X)

def log_transform(X):
    """Return the element-wise ``log(1 + x)`` of ``X`` computed by ``np.log1p``."""
    transformed = np.log1p(X)
    return transformed

In [25]:
# Define evaluation function
def evaluate_clustering(X, labels):
    """Score a clustering with three internal validity metrics.

    Parameters
    ----------
    X : array-like
        The data the labels were assigned on; passed to each metric as-is.
    labels : sequence
        One cluster label per sample.

    Returns
    -------
    dict
        Keys ``'Silhouette'``, ``'Calinski-Harabasz'`` and ``'Davies-Bouldin'``,
        each rounded to two decimals. When the labelling is degenerate (at
        most one distinct label, or every sample carrying its own label) the
        metrics are not computed and the placeholder values 0, 0 and 999 are
        returned instead.
    """
    distinct_count = len(set(labels))

    # Guard: skip the metric calls entirely for degenerate labellings.
    if distinct_count <= 1 or distinct_count == len(labels):
        return {'Silhouette': 0, 'Calinski-Harabasz': 0, 'Davies-Bouldin': 999}

    # (result key, metric function) pairs, evaluated in this order.
    metric_table = (
        ('Silhouette', silhouette_score),
        ('Calinski-Harabasz', calinski_harabasz_score),
        ('Davies-Bouldin', davies_bouldin_score),
    )
    return {key: round(metric(X, labels), 2) for key, metric in metric_table}

In [26]:
# Preprocessing scenarios: display name -> callable applied to the feature matrix.
# In the combined names, "T" is the log transform and "N" is min-max normalization.
scenarios = dict([
    ("Raw", lambda X: X),
    ("Normalized", normalize),
    ("Log Transformed", log_transform),
    ("PCA", lambda X: apply_pca(X)),
    ("T+N", lambda X: normalize(log_transform(X))),
    ("T+N+PCA", lambda X: apply_pca(normalize(log_transform(X)))),
])

In [27]:
# Estimator factories keyed by display name. Each takes the requested number
# of clusters/components `k` and returns a new, unfitted estimator.
clustering_algorithms = dict(
    KMeans=lambda k: KMeans(n_clusters=k, random_state=42),
    Hierarchical=lambda k: AgglomerativeClustering(n_clusters=k),
    GMM=lambda k: GaussianMixture(n_components=k, random_state=42),
)

In [28]:
# Accumulator for one score dict per (preprocessing, algorithm, k) run
results = list()

In [29]:
# Cluster counts tried for every algorithm (loop-invariant, so defined once).
cluster_range = [3, 4, 5]

# Start from an empty list inside this cell so that re-running it does not
# append a second copy of every row to `results`.
results = []

for method_name, preprocess_func in scenarios.items():
    # Each scenario is applied to the untouched feature values.
    X_proc = preprocess_func(df.values)
    for algo_name, algo_func in clustering_algorithms.items():
        for k in cluster_range:
            model = algo_func(k)
            # Every estimator here is driven through the same fit_predict
            # call, so no per-algorithm branching is needed.
            labels = model.fit_predict(X_proc)
            # Metrics are computed in the preprocessed feature space that the
            # model was fitted on.
            scores = evaluate_clustering(X_proc, labels)
            scores.update({
                "Preprocessing": method_name,
                "Algorithm": algo_name,
                "Clusters": k
            })
            results.append(scores)

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Pivot Table View: one row per (Algorithm, Preprocessing, Clusters)
pivot_table = pd.pivot_table(results_df,
                              values=['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'],
                              index=['Algorithm', 'Preprocessing', 'Clusters'])
print("\n=== Evaluation Results ===")
display(pivot_table)


=== Evaluation Results ===


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Calinski-Harabasz,Davies-Bouldin,Silhouette
Algorithm,Preprocessing,Clusters,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GMM,Log Transformed,3,500.82,0.99,0.57
GMM,Log Transformed,4,655.57,1.11,0.37
GMM,Log Transformed,5,634.48,1.26,0.29
GMM,Normalized,3,308.79,0.87,0.45
GMM,Normalized,4,243.69,1.18,0.36
GMM,Normalized,5,213.87,0.99,0.37
GMM,PCA,3,563.92,0.67,0.53
GMM,PCA,4,485.75,0.8,0.44
GMM,PCA,5,544.28,0.75,0.48
GMM,Raw,3,481.78,0.75,0.5


In [30]:
# Normalize and Find Best Configuration
def _min_max_scale(column):
    """Rescale a numeric Series to the range [0, 1].

    A column whose maximum equals its minimum has no spread to rescale; it is
    mapped to all zeros instead of dividing by zero.
    """
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        return column * 0.0
    return (column - lowest) / span


normalized = results_df.copy()
normalized['Silhouette_N'] = _min_max_scale(normalized['Silhouette'])
normalized['CH_N'] = _min_max_scale(normalized['Calinski-Harabasz'])
# Inverted with `1 - ...` so that a lower raw Davies-Bouldin value yields a
# higher normalized score, matching the direction of the other two columns.
normalized['DB_N'] = 1 - _min_max_scale(normalized['Davies-Bouldin'])
# Unweighted mean of the three normalized metrics.
normalized['Overall_Score'] = (normalized['Silhouette_N'] + normalized['CH_N'] + normalized['DB_N']) / 3
# NOTE(review): if any row carries evaluate_clustering's placeholder scores
# (Davies-Bouldin 999), that value stretches the Davies-Bouldin range and
# compresses DB_N for all other rows - confirm no such rows are present.

best_row = normalized.sort_values(by='Overall_Score', ascending=False).iloc[0]
print("\n=== 🔍 Best Clustering Configuration ===")
print(f"Algorithm     : {best_row['Algorithm']}")
print(f"Preprocessing : {best_row['Preprocessing']}")
print(f"Clusters      : {best_row['Clusters']}")
print(f"Silhouette    : {best_row['Silhouette']}")
print(f"Calinski-Harabasz : {best_row['Calinski-Harabasz']}")
print(f"Davies-Bouldin    : {best_row['Davies-Bouldin']}")
print(f"Overall Score     : {round(best_row['Overall_Score'], 3)}")


=== 🔍 Best Clustering Configuration ===
Algorithm     : Hierarchical
Preprocessing : Log Transformed
Clusters      : 3
Silhouette    : 0.57
Calinski-Harabasz : 974.18
Davies-Bouldin    : 0.63
Overall Score     : 0.954
