## K-means on wine dataset & yellowbrick & cluster characterization

In [None]:
# sklearn imports
from sklearn.datasets import load_wine
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# pandas and numpy imports
import pandas as pd
import numpy as np

# plotting imports
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# yellowbrick libs
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.cluster import SilhouetteVisualizer

from ch_7_consts import WINE_FEATURES

# apply the default seaborn theme to every plot created below
# NOTE(review): no pandas display options are configured here — add
# pd.set_option(...) calls if showing all rows/columns is actually wanted
sns.set_theme()

### Load wine dataset

The Wine dataset is a classic multivariate dataset used for classification tasks in machine learning. 
- It consists of 178 samples of wine from three different cultivars (classes) in the same region in Italy. 
- Each sample has 13 continuous attributes (features) that are the result of a chemical analysis of the wines.
- The goal of using this dataset is usually to build a classifier that can predict the cultivar of a wine based on its chemical composition.

The dataset contains the following features:

- **Alcohol**: The alcohol content in the wine, measured in percentage.
- **Malic Acid**: The amount of malic acid in the wine.
- **Ash**: The measure of the non-aqueous residue remaining after heating.
- **Alcalinity of Ash**: A measure of the alkalinity of the ash formed post-combustion.
- **Magnesium**: The amount of magnesium in the wine.
- **Total Phenols**: The total amount of phenolic compounds.
- **Flavanoids**: Indicates the flavanoid phenolic content.
- **Nonflavanoid Phenols**: Measures the non-flavanoid phenolic content.
- **Proanthocyanins**: Indicates the proanthocyanin content.
- **Color Intensity**: The intensity of the wine's color, measured optically.
- **Hue**: The color attribute that describes a pure color, usually measured via spectrophotometry.
- **OD280/OD315 of diluted wines**: Measures the antioxidant content using absorbance ratio.
- **Proline**: The amount of the amino acid proline.

Citation: Aeberhard, Stefan and Forina, M. (1991). Wine. UCI Machine Learning Repository. https://doi.org/10.24432/C5PC7J.

In [None]:
# Fetch the wine data bundle that ships with scikit-learn
wine_data = load_wine()

# Raw feature table: one column per entry of wine_data.feature_names
wine_df_numeric = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)

# Unscaled copy of the features that additionally carries the class label
wine_df = wine_df_numeric.assign(label=wine_data.target)

# Standardize every feature column with a fitted StandardScaler
standard_scaler = StandardScaler()
wine_df_numeric_scaled = standard_scaler.fit_transform(wine_df_numeric)

# From here on wine_df_numeric refers to the scaled values (same column names)
wine_df_numeric = pd.DataFrame(wine_df_numeric_scaled, columns=wine_df_numeric.columns)

# Box plot of the scaled features, with the tick labels turned vertical
g = sns.boxplot(data=wine_df_numeric)
_ = g.set_xticklabels(g.get_xticklabels(), rotation=90)

### Run k-means with yellowbrick

In [None]:
# Elbow search over k = 2..7 using KElbowVisualizer's default scoring metric
model = KMeans(random_state=42, n_init='auto')
visualizer = KElbowVisualizer(model, k=tuple(range(2, 8))).fit(wine_df_numeric)
visualizer.show()

In [None]:
# Same sweep over k = 2..7, scored by silhouette and without elbow detection
model = KMeans(random_state=42, n_init='auto')

visualizer = KElbowVisualizer(
    model,
    k=tuple(range(2, 8)),
    metric='silhouette',
    locate_elbow=False,
).fit(wine_df_numeric)  # fit() returns the visualizer itself
visualizer.show()

In [None]:
# Perform PCA with all components kept (no n_components given)
pca = PCA().fit(wine_df_numeric)
pca_transformed_data = pca.transform(wine_df_numeric)

print('Number of PCA components {}'.format(pca_transformed_data.shape[1]))

# Running total of the explained-variance ratio, expressed as a percentage
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_) * 100

# Entry i of the cumulative sum covers the first i + 1 components, so the
# x axis must start at 1 to agree with its 'Number of Components' label
# (plotting against range(len(...)) showed one component at x = 0).
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)

plt.plot(component_counts, cumulative_explained_variance)
plt.scatter(component_counts, cumulative_explained_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

In [None]:
# Cluster counts to compare side by side: 2, 3, 4 and 5
candidate_k = list(range(2, 6))

In [None]:
# Colors of matplotlib's 'tab10' colormap, copied into a plain list
tab10 = [*matplotlib.colormaps['tab10'].colors]

In [None]:
# Perform clustering for multiple resolutions
n_rows = len(candidate_k)

# squeeze=False keeps axs two-dimensional even when candidate_k holds a single
# value (otherwise axs[i, 0] fails on a 1-D array). The figure height scales
# with the row count: 6 per row, i.e. 24 for four candidates.
fig, axs = plt.subplots(
    n_rows, 2,
    figsize=(12, 6 * n_rows),
    squeeze=False
)

# The feature matrix is identical for every k, so convert it only once
feature_matrix = wine_df_numeric.to_numpy()

# Maps number of clusters -> label array assigned by k-means
cluster_label_dict = {}

for i, num_clust in enumerate(candidate_k):

    # Perform clustering for current number of clusters
    kmeans = KMeans(n_clusters=num_clust, n_init='auto', random_state=2)
    kmeans.fit(feature_matrix)

    # Left column: first two PCA coordinates, colored by cluster label
    sns.scatterplot(
        x=pca_transformed_data[:, 0],
        y=pca_transformed_data[:, 1],
        hue=kmeans.labels_,
        ax=axs[i, 0],
        s=14, palette='tab10'
    )

    # Right column: silhouette plot of the already-fitted model
    visualizer = SilhouetteVisualizer(
        kmeans,
        is_fitted=True,
        colors=tab10,
        ax=axs[i, 1]
    )

    # NOTE(review): neither show() nor finalize() is called on the visualizer,
    # so yellowbrick's title / axis labels / legend are presumably not drawn
    # on this axis — confirm that is intended.
    visualizer = visualizer.fit(feature_matrix)

    # Keep the labels so they can be looked up later by cluster count
    cluster_label_dict[num_clust] = kmeans.labels_


### Cluster characterization

In [None]:
from IPython.display import display, Markdown

In [None]:
# Plot numerical features
# Attach the cluster assignment obtained with 3 clusters to every sample
wine_df['cluster'] = cluster_label_dict[3]

# Rescale selected features before plotting.
# The values are recomputed from the loader output instead of from wine_df
# itself, so running this cell more than once does not compound the factors
# (the first run yields exactly the same numbers as in-place scaling).
# NOTE(review): the factors presumably align the units with the descriptions
# in WINE_FEATURES — confirm against ch_7_consts.
raw_features = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
wine_df['malic_acid'] = raw_features['malic_acid'] / 10
for col in ('total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins'):
    wine_df[col] = raw_features[col] * 100

# Every column except the two grouping columns gets its own box plot
feature_columns = [name for name in wine_df.columns if name not in ('label', 'cluster')]

for col in feature_columns:

    # Heading: column name followed by its description from WINE_FEATURES
    display(Markdown(col + ' - ' + WINE_FEATURES[col]))

    # One figure per feature, values grouped by cluster
    plt.figure()
    sns.boxplot(wine_df, x='cluster', y=col)
    plt.title(col)
    plt.show()
        