<a href="https://colab.research.google.com/github/shivanshg29/clustering/blob/main/ClusteringAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.datasets import load_wine

In [11]:
# Load the UCI Wine dataset through scikit-learn's `load_wine` helper.
wine = load_wine()
# X: feature matrix fed to every clustering run; y: the dataset's target labels.
X, y = wine.data, wine.target

In [12]:
# Sanity check of the feature matrix dimensions: (n_samples, n_features).
X.shape

(178, 13)

In [13]:
# Names accepted by evaluate_clustering; anything else is rejected up front
# instead of failing later with an unrelated error.
_CLUSTERING_METHODS = ('kmeans', 'hierarchical', 'meanshift')
_PREPROCESSING_METHODS = ('normalization', 'transform', 'pca', 't+n', 't+n+pca')

# Placeholder triple returned whenever the metrics cannot be computed.
_NO_SCORES = ('NA', 'NA', 'NA')


def _preprocess_features(X, preprocessing):
    """Return a transformed copy of ``X`` for the named preprocessing pipeline."""
    X_processed = X.copy()
    if not preprocessing:
        return X_processed

    # PCA cannot extract more components than min(n_samples, n_features).
    n_components = min(3, *X_processed.shape)

    if preprocessing == 'normalization':
        return StandardScaler().fit_transform(X_processed)
    if preprocessing == 'transform':
        return PowerTransformer().fit_transform(X_processed)
    if preprocessing == 'pca':
        return PCA(n_components=n_components).fit_transform(X_processed)

    # Remaining pipelines ('t+n', 't+n+pca') share the transform -> scale prefix.
    X_processed = StandardScaler().fit_transform(
        PowerTransformer().fit_transform(X_processed)
    )
    if preprocessing == 't+n+pca':
        X_processed = PCA(n_components=n_components).fit_transform(X_processed)
    return X_processed


def _build_model(clustering_method, n_clusters, X_processed, preprocessing):
    """Instantiate the (unfitted) clustering estimator for ``clustering_method``."""
    if clustering_method == 'kmeans':
        return KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    if clustering_method == 'hierarchical':
        return AgglomerativeClustering(n_clusters=n_clusters)

    # MeanShift: the number of clusters follows from the bandwidth, so
    # n_clusters is not passed to the estimator.
    if preprocessing:
        bandwidth = estimate_bandwidth(X_processed, quantile=0.2)
        if bandwidth <= 0:
            # Fall back to a fixed bandwidth when the estimate is unusable.
            bandwidth = 2.0
    else:
        # Fixed bandwidth for the raw (unprocessed) feature space.
        bandwidth = 10.0
    return MeanShift(bandwidth=bandwidth, bin_seeding=True)


def _score_or_na(metric, X_processed, labels, ndigits=None):
    """Compute ``metric`` rounded to ``ndigits``; return 'NA' if it rejects the labelling."""
    try:
        value = metric(X_processed, labels)
    except ValueError:
        # Raised by the metric when the labelling is not scorable (for example
        # an invalid number of distinct labels); other errors propagate.
        return 'NA'
    return round(value) if ndigits is None else round(value, ndigits)


def evaluate_clustering(X, clustering_method, n_clusters, preprocessing=None):
    """Cluster ``X`` and score the result with three internal validity metrics.

    Parameters
    ----------
    X : array-like with ``.copy()`` and ``.shape``
        Feature matrix; it is copied, never modified in place.
    clustering_method : {'kmeans', 'hierarchical', 'meanshift'}
        Algorithm to run. For 'meanshift' the ``n_clusters`` argument is not
        forwarded to the estimator; the cluster count follows from the bandwidth.
    n_clusters : int
        Requested number of clusters. ``1`` short-circuits to 'NA' scores
        because the metrics need at least two clusters.
    preprocessing : {None, 'normalization', 'transform', 'pca', 't+n', 't+n+pca'}
        Optional preprocessing pipeline applied before clustering.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin)``. Silhouette and
        Davies-Bouldin are rounded to 2 decimals, Calinski-Harabasz to an
        integer. Any score that cannot be computed is the string ``'NA'``.

    Raises
    ------
    ValueError
        If ``clustering_method`` or ``preprocessing`` is not a recognised name.
    """
    if clustering_method not in _CLUSTERING_METHODS:
        raise ValueError(
            f"Unknown clustering_method {clustering_method!r}; "
            f"expected one of {_CLUSTERING_METHODS}"
        )
    if preprocessing and preprocessing not in _PREPROCESSING_METHODS:
        raise ValueError(
            f"Unknown preprocessing {preprocessing!r}; "
            f"expected None or one of {_PREPROCESSING_METHODS}"
        )

    # A single cluster cannot be scored, so skip preprocessing and fitting entirely.
    if n_clusters == 1:
        return _NO_SCORES

    X_processed = _preprocess_features(X, preprocessing)
    model = _build_model(clustering_method, n_clusters, X_processed, preprocessing)
    labels = model.fit_predict(X_processed)

    # The fitted model may still collapse everything into one cluster
    # (possible with MeanShift); the metrics are undefined in that case.
    if len(np.unique(labels)) < 2:
        return _NO_SCORES

    silhouette = _score_or_na(silhouette_score, X_processed, labels, 2)
    calinski = _score_or_na(calinski_harabasz_score, X_processed, labels)
    davies = _score_or_na(davies_bouldin_score, X_processed, labels, 2)
    return silhouette, calinski, davies

In [14]:
# Experiment grid: every algorithm is run on every preprocessing pipeline
# and every requested cluster count.
preprocessing_methods = ['none', 'normalization', 'transform', 'pca', 't+n', 't+n+pca']
cluster_counts = [1, 2, 3]  # c=1, c=2, c=3
clustering_methods = ['kmeans', 'hierarchical', 'meanshift']

# results[algorithm][metric][(preprocessing, n_clusters)] -> score or 'NA'
results = {}
# Running maximum of the silhouette score and the configuration that produced it.
best_silhouette = -1
best_config = None

for method in clustering_methods:
    method_results = {metric: {} for metric in ['silhouette', 'calinski', 'davies']}

    for preprocessing in preprocessing_methods:
        # The grid labels raw data as 'none'; evaluate_clustering is given None instead.
        pipeline = None if preprocessing == 'none' else preprocessing

        for n_clusters in cluster_counts:
            print(f"Processing {method}, {preprocessing}, c={n_clusters}")
            grid_key = (preprocessing, n_clusters)

            try:
                silhouette, calinski, davies = evaluate_clustering(X, method, n_clusters, pipeline)
            except Exception as e:
                print(f"Error with {method}, {preprocessing}, {n_clusters}: {e}")
                # A failed run is recorded as 'NA' for all three metrics.
                silhouette = calinski = davies = 'NA'

            method_results['silhouette'][grid_key] = silhouette
            method_results['calinski'][grid_key] = calinski
            method_results['davies'][grid_key] = davies

            # Only numeric silhouette scores compete for the best configuration.
            if silhouette != 'NA':
                silhouette_val = float(silhouette)
                if silhouette_val > best_silhouette:
                    best_silhouette = silhouette_val
                    best_config = dict(
                        algorithm=method,
                        preprocessing=preprocessing,
                        n_clusters=n_clusters,
                        silhouette=silhouette_val,
                    )

    results[method] = method_results

Processing kmeans, none, c=1
Processing kmeans, none, c=2
Processing kmeans, none, c=3
Processing kmeans, normalization, c=1
Processing kmeans, normalization, c=2
Processing kmeans, normalization, c=3
Processing kmeans, transform, c=1
Processing kmeans, transform, c=2
Processing kmeans, transform, c=3
Processing kmeans, pca, c=1
Processing kmeans, pca, c=2
Processing kmeans, pca, c=3
Processing kmeans, t+n, c=1
Processing kmeans, t+n, c=2
Processing kmeans, t+n, c=3
Processing kmeans, t+n+pca, c=1
Processing kmeans, t+n+pca, c=2
Processing kmeans, t+n+pca, c=3
Processing hierarchical, none, c=1
Processing hierarchical, none, c=2
Processing hierarchical, none, c=3
Processing hierarchical, normalization, c=1
Processing hierarchical, normalization, c=2
Processing hierarchical, normalization, c=3
Processing hierarchical, transform, c=1
Processing hierarchical, transform, c=2
Processing hierarchical, transform, c=3
Processing hierarchical, pca, c=1
Processing hierarchical, pca, c=2
Processi

In [15]:
def create_table(method_name, method_results):
    """Arrange one algorithm's scores into a preprocessing x cluster-count table.

    Parameters
    ----------
    method_name : str
        Human-readable algorithm name. Currently not used when building the
        table; kept so existing call sites remain valid.
    method_results : dict
        Maps 'silhouette', 'calinski' and 'davies' to dictionaries keyed by
        ``(preprocessing, n_clusters)`` tuples.

    Returns
    -------
    pandas.DataFrame
        Rows: 'Parameters', 'Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin'.
        Columns: a two-level MultiIndex of (preprocessing display name,
        'c=<n_clusters>'). The layout is taken from the result keys in their
        insertion order, so the function does not rely on notebook globals.
        Combinations missing from a metric are filled with 'NA'.
    """
    display_names = {
        'none': 'No Data Processing',
        'normalization': 'Using Normalization',
        'transform': 'Using Transform',
        'pca': 'Using PCA',
        't+n': 'Using T+N',
        't+n+pca': 'T+N+PCA',
    }
    # (row label in the table, key in method_results)
    metric_rows = [
        ('Silhouette', 'silhouette'),
        ('Calinski-Harabasz', 'calinski'),
        ('Davies-Bouldin', 'davies'),
    ]

    # Recover the experiment grid from the keys that were actually recorded.
    # dict.fromkeys de-duplicates while preserving first-seen order.
    recorded_keys = [
        key
        for _, metric in metric_rows
        for key in method_results.get(metric, {})
    ]
    preprocessing_order = list(dict.fromkeys(p for p, _ in recorded_keys))
    cluster_order = list(dict.fromkeys(c for _, c in recorded_keys))
    grid = [(p, c) for p in preprocessing_order for c in cluster_order]

    columns = pd.MultiIndex.from_arrays([
        [display_names.get(p, p) for p, _ in grid],
        [f'c={c}' for _, c in grid],
    ])

    # First row is a fixed placeholder (just for reference), then one row per metric.
    rows = [['Parameters'] * len(grid)]
    for _, metric in metric_rows:
        scores = method_results.get(metric, {})
        rows.append([scores.get(key, 'NA') for key in grid])

    return pd.DataFrame(
        rows,
        index=['Parameters'] + [label for label, _ in metric_rows],
        columns=columns,
        dtype=object,  # cells mix numbers with the 'NA' / 'Parameters' strings
    )

In [16]:
# Create one results table per clustering algorithm
kmeans_table = create_table('K-Mean Clustering', results['kmeans'])
hierarchical_table = create_table('Hierarchical Clustering', results['hierarchical'])
meanshift_table = create_table('K-mean Shift Clustering', results['meanshift'])

# Save individual tables
kmeans_table.to_csv("kmeans_wine_clustering_results.csv")
hierarchical_table.to_csv("hierarchical_wine_clustering_results.csv")
meanshift_table.to_csv("meanshift_wine_clustering_results.csv")

# Save combined report: the three tables followed by the best configuration.
# newline="" is what pandas asks for when to_csv is handed an open file object,
# so that line endings are not translated a second time by the text layer.
with open("combined_wine_clustering_results.csv", "w", newline="") as f:
    f.write("Performance using different clustering techniques on various parameters\n\n")
    f.write("Using K-Mean Clustering\n")
    kmeans_table.to_csv(f)
    f.write("\nUsing Hierarchical Clustering\n")
    hierarchical_table.to_csv(f)
    f.write("\nUsing K-mean Shift Clustering\n")
    meanshift_table.to_csv(f)

    # Add best algorithm information
    f.write("\n\nBest Clustering Configuration:\n")
    if best_config is None:
        # Every run failed or returned 'NA', so there is nothing to report.
        f.write("No configuration produced a valid silhouette score\n")
    else:
        f.write(f"Best Clustering Algorithm: {best_config['algorithm']}\n")
        f.write(f"Best Number of Clusters: {best_config['n_clusters']}\n")
        f.write(f"Best Silhouette Score: {best_config['silhouette']:.2f}\n")
        f.write(f"Best Preprocessing Method: {best_config['preprocessing']}\n")

# Create a formatted version for display
def format_table_for_display(table, title):
    """Return a copy of ``table`` with its metric cells formatted for printing.

    Parameters
    ----------
    table : pandas.DataFrame
        Results table whose first row is a placeholder and whose remaining
        rows hold metric values (numbers or the string 'NA').
    title : str
        Currently unused; accepted so existing call sites remain valid.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``table`` itself is not modified. In the 'Calinski-Harabasz'
        row numeric cells become ``int``; in every other metric row they become
        strings with two decimals. 'NA' cells and cells that cannot be
        converted to a number are left unchanged.
    """
    display_table = table.copy()
    # The first row is the 'Parameters' placeholder and is never reformatted.
    metric_rows = display_table.index[1:]

    for col in display_table.columns:
        for idx in metric_rows:
            cell = display_table.loc[idx, col]
            if isinstance(cell, str) and cell == 'NA':
                continue

            try:
                val = float(cell)
                formatted = int(val) if idx == 'Calinski-Harabasz' else f"{val:.2f}"
            except (TypeError, ValueError, OverflowError):
                # Not a finite number (e.g. text, None, NaN/inf for the integer
                # row): keep the cell exactly as it was.
                continue

            display_table.loc[idx, col] = formatted

    return display_table

In [17]:
# Print each algorithm's formatted table under its heading.
report_sections = [
    ("Using K-Mean Clustering", kmeans_table, "K-Mean Clustering"),
    ("\nUsing Hierarchical Clustering", hierarchical_table, "Hierarchical Clustering"),
    ("\nUsing K-mean Shift Clustering", meanshift_table, "K-mean Shift Clustering"),
]
for heading, table, title in report_sections:
    print(heading)
    print(format_table_for_display(table, title))

# Display best algorithm information
print("\nBest Clustering Configuration:")
if best_config is None:
    # Every run failed or returned 'NA', so there is no winner to report.
    print("No configuration produced a valid silhouette score")
else:
    print(f"Best Clustering Algorithm: {best_config['algorithm']}")
    print(f"Best Number of Clusters: {best_config['n_clusters']}")
    print(f"Best Silhouette Score: {best_config['silhouette']:.2f}")
    print(f"Best Preprocessing Method: {best_config['preprocessing']}")

print("\nClustering analysis completed and results saved to CSV files.")

Using K-Mean Clustering
                  No Data Processing                          \
                                 c=1         c=2         c=3   
Parameters                Parameters  Parameters  Parameters   
Silhouette                        NA        0.66        0.57   
Calinski-Harabasz                 NA         505         562   
Davies-Bouldin                    NA        0.48        0.53   

                  Using Normalization                         Using Transform  \
                                  c=1         c=2         c=3             c=1   
Parameters                 Parameters  Parameters  Parameters      Parameters   
Silhouette                         NA        0.26        0.28              NA   
Calinski-Harabasz                  NA          70          71              NA   
Davies-Bouldin                     NA        1.53        1.39              NA   

                                            Using PCA                          \
                       