<a href="https://colab.research.google.com/github/shaurya-bhatia-sb/Wine-Dataset-Clustering-Analysis/blob/main/Clustering_wine_MLProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import seaborn as sns

Data Pre-Processing

In [None]:
def apply_scaling(data, method='none'):
    """Scale a feature matrix with the requested strategy.

    Parameters
    ----------
    data : array-like
        Feature matrix, passed straight to the scaler's ``fit_transform``.
    method : {'none', 'normalize', 'standardize'} or None, default 'none'
        * ``'none'`` / ``None`` -- return ``data`` untouched.
        * ``'normalize'``       -- ``MinMaxScaler().fit_transform(data)``.
        * ``'standardize'``     -- ``StandardScaler().fit_transform(data)``.

    Returns
    -------
    The very same ``data`` object when no scaling is requested, otherwise
    whatever the scaler's ``fit_transform`` returns.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. A misspelled method
        used to fall through and return unscaled data, which mislabelled the
        experiment instead of failing.
    """
    if method is None or method == 'none':
        return data
    if method == 'normalize':
        return MinMaxScaler().fit_transform(data)
    if method == 'standardize':
        return StandardScaler().fit_transform(data)
    raise ValueError(
        f"Unknown scaling method {method!r}; "
        "expected 'none', 'normalize' or 'standardize'."
    )

def apply_pca(data, n_components=2):
    """Fit a PCA on ``data`` and return its transformed representation.

    Parameters
    ----------
    data : array-like
        Feature matrix handed to ``PCA.fit_transform``.
    n_components : default 2
        Forwarded unchanged to the ``PCA`` constructor.

    Returns
    -------
    The result of ``PCA(n_components=n_components).fit_transform(data)``.
    """
    return PCA(n_components=n_components).fit_transform(data)

Load Dataset

In [None]:
# The file is read with header=None, so the column names are supplied here.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
columns = [
    'Class',
    'Alcohol', 'Malic_acid', 'Ash', 'Alcalinity', 'Magnesium', 'Phenols',
    'Flavanoids', 'Nonflavanoid', 'Proanthocyanins', 'Color_intensity', 'Hue',
    'OD280_OD315', 'Proline',
]
data = pd.read_csv(url, names=columns, header=None)

# Separate the 'Class' column (y) from the remaining columns (X).
y = data['Class']
X = data.drop(columns=['Class'])

Clustering and Evaluation Function

In [None]:
def cluster_and_evaluate(X, algorithm, clusters=3, **model_kwargs):
    """Fit a clustering model on ``X`` and score the resulting labels.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``fit_predict`` and to each metric.
    algorithm : callable
        Estimator class or factory. It is called as
        ``algorithm(n_clusters=clusters, **model_kwargs)`` and the result must
        provide ``fit_predict``.
    clusters : int, default 3
        Number of clusters requested from the estimator.
    **model_kwargs
        Extra keyword arguments forwarded to ``algorithm``, e.g.
        ``random_state=42`` for a reproducible KMeans run. Only pass options
        the chosen estimator accepts.

    Returns
    -------
    tuple
        ``(silhouette, calinski_harabasz, davies_bouldin)`` computed from
        ``X`` and the predicted labels.
    """
    model = algorithm(n_clusters=clusters, **model_kwargs)
    labels = model.fit_predict(X)
    return (
        silhouette_score(X, labels),
        calinski_harabasz_score(X, labels),
        davies_bouldin_score(X, labels),
    )

Run Every Configuration (scaling × PCA on/off × number of clusters × algorithm)

In [None]:
# Fixed seed so that Restart & Run All reproduces the same KMeans results.
SEED = 42


def seeded_kmeans(n_clusters):
    """Build a KMeans estimator with a fixed seed and an explicit restart count."""
    # n_init is pinned because its default is not the same across
    # scikit-learn releases; leaving it implicit makes scores version-dependent.
    return KMeans(n_clusters=n_clusters, random_state=SEED, n_init=10)


# One row per (scaling, PCA on/off, algorithm, cluster count) combination.
results = []
for scaling in ['none', 'normalize', 'standardize']:
    X_scaled = apply_scaling(X, scaling)
    for use_pca in [True, False]:
        X_final = apply_pca(X_scaled) if use_pca else X_scaled
        for clusters in [3, 4, 5]:
            # seeded_kmeans is used in place of the bare KMeans class; it is
            # called with n_clusters=... exactly like an estimator class.
            for algorithm, name in [(seeded_kmeans, 'KMeans'), (AgglomerativeClustering, 'Hierarchical')]:
                silhouette, ch_score, db_score = cluster_and_evaluate(X_final, algorithm, clusters)
                results.append([scaling, use_pca, name, clusters, silhouette, ch_score, db_score])


Collect the Results into a DataFrame

In [None]:
# Column labels, listed in the order the values are stored in each row of `results`.
result_columns = [
    'Scaling', 'PCA', 'Algorithm', 'Clusters',
    'Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin',
]
results_df = pd.DataFrame(data=results, columns=result_columns)
display(results_df)

Results

In [None]:
# Explicit figure/axes pair; seaborn draws onto `ax`.
# Rows sharing the same (Clusters, Algorithm) pair are aggregated into one bar.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=results_df, x='Clusters', y='Silhouette', hue='Algorithm', ax=ax)
ax.set_title("Silhouette Score by Clustering Method")
plt.show()