# Unsupervised ML - Clustering wines according to physicochemical properties

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.neural_network import MLPClassifier

from sklearn import metrics


# Exploratory Data Analysis

In [None]:
# Read the comma-separated red-wine table into a DataFrame and display it.
redwine_data = pd.read_csv(
    '../input/wine-quality-selection/winequality-red.csv',
    sep=',',
)
redwine_data

In [None]:
# Read the comma-separated white-wine table into a DataFrame and display it.
whitewine_data = pd.read_csv(
    '../input/wine-quality-selection/winequality-white.csv',
    sep=',',
)
whitewine_data

In [None]:
# Heatmap of the absolute pairwise correlations between all red-wine columns.
# Taking the absolute value means the sign (direction) of each relationship is not shown.
corr_matrix = np.abs(redwine_data.corr())
plt.figure(figsize=(14, 10))
sns.heatmap(data=corr_matrix, cmap='Reds', annot=True)

In [None]:
# Heatmap of the absolute pairwise correlations between all white-wine columns.
# Note: this rebinds `corr_matrix`, replacing the red-wine matrix computed earlier.
corr_matrix = np.abs(whitewine_data.corr())
plt.figure(figsize=(14, 10))
sns.heatmap(data=corr_matrix, cmap='Blues', annot=True)

- From the heatmaps (which show absolute correlation values), wine quality appears to be most strongly correlated with alcohol, i.e.:
    - In descending order, red wine quality is most strongly correlated with alcohol (0.48), volatile acidity (0.39), sulphates (0.25) and citric acid (0.23)
    - In descending order, white wine quality is most strongly correlated with alcohol (0.44), density (0.31) and chlorides (0.21)

- There are also noticeably strong correlations between some pairs of properties, shown as the saturated cells of the heatmaps for both red and white wines.

# Checking the data for nulls

In [None]:
# Count of missing values in each white-wine column.
whitewine_data.isna().sum()

In [None]:
# Count of missing values in each red-wine column.
redwine_data.isna().sum()

In [None]:
# Column dtypes of the white-wine table.
whitewine_data.dtypes

In [None]:
# Column dtypes of the red-wine table.
redwine_data.dtypes

In [None]:
# Distinct quality scores that occur in the white-wine data.
whitewine_data['quality'].unique()

In [None]:
# Distinct quality scores that occur in the red-wine data.
redwine_data['quality'].unique()

# Preprocessing

- For now we'll focus on red wines

In [None]:
# Feature matrix for red wines: every column except the quality score.
X = redwine_data.drop(columns=['quality'])
X.head()

In [None]:
# Quality scores of the red wines, kept separately from the features.
y = redwine_data.loc[:, 'quality']
y.head()

- We'll scale our X using RobustScaler, which removes the median and scales the data according to the interquartile range (IQR) by default. The IQR is the range between the 1st quartile (25th percentile) and the 3rd quartile (75th percentile).

In [None]:
# Author's observation: RobustScaler gave better accuracy and more clearly separated
# clusters on this data than StandardScaler().
scaler = RobustScaler()
scaler

In [None]:
# Fit the scaler on the red-wine features and keep the scaled values in a DataFrame
# that carries the original column names.
X_scaled = pd.DataFrame(
    data=scaler.fit(X).transform(X),
    columns=X.columns,
)
X_scaled

# Clustering
- For clustering we'll use KMeans since it is a simple, general-purpose algorithm that can give good results and is very scalable.

In [None]:
# Elbow search for the red wines: fit KMeans for k = 1..10 and record each fit's inertia.
# random_state is fixed (matching the final red-wine KMeans) so the curve is identical
# on every Restart & Run All; KMeans initialisation is otherwise random.
inertias = []

for i in range(1, 11):
    km = KMeans(n_clusters=i, random_state=1).fit(X_scaled)
    inertias.append(km.inertia_)

# Plot inertia against k; the "elbow" marks where extra clusters stop paying off.
fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(x=list(range(1, 11)), y=inertias, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')

plt.show()

- Possible number of clusters = 2, 3 or 4
- Let's use 3 clusters for our KMeans

In [None]:
# Fit the final 3-cluster KMeans on the scaled red-wine features (seeded for reproducibility).
# `fit` returns the estimator itself, so construction and fitting are chained.
kmeans = KMeans(random_state=1, n_clusters=3).fit(X_scaled)
kmeans

In [None]:
# Cluster label assigned by the fitted KMeans to each scaled red-wine row.
clusters = kmeans.predict(X_scaled)
clusters

# Visualisation
- In order to visualise our results, we first have to reduce the 11 columns of X to 2 dimensions. We can use Principal Component Analysis (PCA), which transforms the 11 columns into 2 columns of principal components

In [None]:
# PCA reducer that keeps the first two principal components (used only for plotting).
pca = PCA(n_components=2)
pca

In [None]:
# Project the scaled red-wine features onto two principal components and attach each
# row's KMeans label, so the scatter plot can colour points by cluster.
X_pca = pd.DataFrame(
    pca.fit_transform(X_scaled),
    columns=['PC1', 'PC2'],
).assign(cluster=clusters)
X_pca

In [None]:
# Cluster centres in the scaled feature space. They are plotted later, so the next
# cell applies the fitted PCA to these centres to bring them into the same 2-D space.
kmeans.cluster_centers_

In [None]:
# Map the KMeans centres into the already-fitted PCA space (transform only, no refit),
# so they share coordinates with the points in X_pca.
centers_pca = pd.DataFrame(
    data=pca.transform(kmeans.cluster_centers_),
    columns=['PC1', 'PC2'],
)
centers_pca

In [None]:
# Scatter the red wines in PCA space, one colour per KMeans cluster,
# then overlay the projected cluster centres as black crosses.
plt.figure(figsize=(14, 10))
for cluster_id, colour in enumerate(['slateblue', 'springgreen', 'red']):
    members = X_pca.loc[X_pca['cluster'] == cluster_id]
    plt.scatter(members['PC1'], members['PC2'], color=colour)

plt.scatter(centers_pca['PC1'], centers_pca['PC2'], marker='x', color='black', s=300)
plt.xlabel('PC1')
plt.ylabel('PC2')

# Training
- We'll use Multi-Layer Perceptron Classifier as our model

In [None]:
# Seeded 80/20 split of the scaled red-wine features. The targets are the KMeans cluster
# labels (`clusters`), not the quality scores.
# Author's rationale for 0.8: the dataset is considered large (about 1598 rows).
X_train, X_test, y_train, y_test = train_test_split(X_scaled, clusters, train_size=0.8, random_state=1)

In [None]:
# MLP with two hidden layers of 256 units each; random_state is fixed for reproducibility.
model = MLPClassifier(hidden_layer_sizes=(256, 256), max_iter=500, random_state=1)
# Author's notes: training converges at roughly 400 iterations, and a smaller max_iter
# triggers the "convergence not yet reached" warning. The hidden layers are deliberately
# large; the author expects small layers to perform poorly on this red-wine data.
model

In [None]:
# Train the classifier to reproduce the KMeans cluster labels from the scaled features.
model.fit(X_train, y_train) # don't need to transform, just fit the model

In [None]:
# Score the classifier on the held-out red wines and report it.
test_accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {test_accuracy}")

# Explain model prediction using LIME

In [None]:
import lime
from lime import lime_tabular

# LIME explainer for the red-wine classifier. It is built from the training features,
# and the class names follow the KMeans cluster ids 0, 1 and 2.
explainer = lime_tabular.LimeTabularExplainer(
    np.array(X_train),
    mode='classification',
    feature_names=X_train.columns,
    class_names=['Cluster_0', 'Cluster_1', 'Cluster_2'],
    random_state=1,
)

- Explanations - Cluster 0

In [None]:
# Scaled feature values of the second red-wine test sample (position 1).
X_test.iloc[1]

In [None]:
# Cluster labels of all red-wine test samples, in the same order as the rows of X_test.
y_test

In [None]:
# Cluster label of the second red-wine test sample (position 1).
y_test[1]

In [None]:
# LIME explanation of the classifier's cluster-0 probability for one red-wine test sample,
# allowing up to 11 features in the explanation.
# NOTE(review): the two preceding cells inspect test row 1, whereas this explains test
# row 0 — confirm which row is intended.
exp = explainer.explain_instance(
    X_test.iloc[0],
    model.predict_proba,
    num_features=11,
    labels=(0,),
)
exp.show_in_notebook(show_table=True)

- Explanations - Cluster 1

In [None]:
# Scaled feature values of the red-wine test sample at position 7.
X_test.iloc[7]

In [None]:
# Cluster label of the red-wine test sample at position 7.
y_test[7]

In [None]:
# LIME explanation of the classifier's cluster-1 probability for red-wine test row 7,
# allowing up to 11 features in the explanation.
exp = explainer.explain_instance(
    X_test.iloc[7],
    model.predict_proba,
    num_features=11,
    labels=(1,),
)
exp.show_in_notebook(show_table=True)

- Explanations - Cluster 2

In [None]:
# Scaled feature values of the red-wine test sample at position 8.
X_test.iloc[8]

In [None]:
# Cluster label of the red-wine test sample at position 8.
y_test[8]

In [None]:
# LIME explanation of the classifier's cluster-2 probability for red-wine test row 8,
# allowing up to 11 features in the explanation.
exp = explainer.explain_instance(
    X_test.iloc[8],
    model.predict_proba,
    num_features=11,
    labels=(2,),
)
exp.show_in_notebook(show_table=True)

# Discussion
- For our red wines dataset, we have achieved a model accuracy of 0.9875

# Now let's investigate the white wines

# Preprocessing

In [None]:
# Feature matrix for white wines: every column except the quality score.
X2 = whitewine_data.drop(columns='quality')
X2.head()

In [None]:
# Quality scores of the white wines, kept separately from the features.
y2 = whitewine_data.loc[:, 'quality']
y2.head()

In [None]:
# Scale the white-wine features. This refits the shared `scaler` object on the white
# wines, so from here on it no longer holds the red-wine statistics.
X2_scaled = pd.DataFrame(
    data=scaler.fit(X2).transform(X2),
    columns=X2.columns,
)
X2_scaled

# Clustering

In [None]:
# Distinct quality scores that occur in the white-wine data.
whitewine_data['quality'].unique()

In [None]:
# Elbow search for the white wines: fit KMeans for k = 1..10 and record each fit's inertia.
# random_state is fixed so the curve is identical on every Restart & Run All;
# KMeans initialisation is otherwise random.
inertias2 = []

for i in range(1, 11):
    km2 = KMeans(n_clusters=i, random_state=1).fit(X2_scaled)
    inertias2.append(km2.inertia_)

# Plot inertia against k; the "elbow" marks where extra clusters stop paying off.
fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=list(range(1, 11)), y=inertias2, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')

plt.show()

- Possible number of clusters = 2 or 3. Let's pick 3 for our KMeans for white wines


In [None]:
# 3-cluster KMeans for the white wines. Seeded like the red-wine KMeans so the cluster
# ids (0/1/2) are stable between runs; the later analysis refers to clusters by id.
# NOTE(review): fixing the seed may number the clusters differently from the run the
# written observations were based on — re-check them after re-running.
kmeans2 = KMeans(n_clusters=3, random_state=1)
kmeans2

In [None]:
# Fit the white-wine KMeans on the scaled white-wine features.
kmeans2.fit(X2_scaled)

In [None]:
# Cluster label assigned by the fitted KMeans to each scaled white-wine row.
clusters2 = kmeans2.predict(X2_scaled)
clusters2

# Visualisation

In [None]:
# Project the scaled white-wine features onto two principal components and attach each
# row's KMeans label. This refits the shared `pca` object on the white wines.
X2_pca = pd.DataFrame(
    pca.fit_transform(X2_scaled),
    columns=['PC1', 'PC2'],
).assign(cluster=clusters2)
X2_pca

In [None]:
# White-wine cluster centres in the scaled feature space.
kmeans2.cluster_centers_

In [None]:
# Project the white-wine cluster centres into the PCA space already fitted on X2_scaled.
# Use `transform`, not `fit_transform`: refitting PCA on the three centres alone would
# place them in a different coordinate system from the points in X2_pca, so the crosses
# on the scatter plot would not mark the real cluster centres.
centers2_pca = pd.DataFrame(pca.transform(kmeans2.cluster_centers_), columns=['PC1', 'PC2'])
centers2_pca

In [None]:
# Scatter the white wines in PCA space, one colour per KMeans cluster,
# then overlay the projected cluster centres as black crosses.
plt.figure(figsize=[14, 10])
for cluster_id, colour in enumerate(['slateblue', 'springgreen', 'red']):
    members = X2_pca.loc[X2_pca['cluster'] == cluster_id]
    plt.scatter(members['PC1'], members['PC2'], color=colour)

plt.scatter(centers2_pca['PC1'], centers2_pca['PC2'], marker='x', color='black', s=300)
plt.xlabel('PC1')
plt.ylabel('PC2')

# Training
- Again we'll use MLP for our white wines

In [None]:
# 80/20 split of the scaled white-wine features; the targets are the KMeans cluster labels.
# Seeded (as for the red wines) so that the test rows — and therefore the reported score
# and the positional row indices used in later cells — are the same on every run.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_scaled, clusters2, train_size=0.8, random_state=1)

In [None]:
# Same MLP architecture as the red-wine model (two hidden layers of 256 units).
# random_state is fixed, matching the red-wine model, so the reported accuracy is reproducible.
model2 = MLPClassifier(hidden_layer_sizes=(256, 256), max_iter=500, random_state=1)
model2

In [None]:
# Train the white-wine classifier to reproduce the KMeans cluster labels.
model2.fit(X2_train, y2_train)

In [None]:
# Score the classifier on the held-out white wines and report it.
white_test_score = model2.score(X2_test, y2_test)
print(f"Model score for white wine: {white_test_score}")

# Explaining model prediction with LIME

In [None]:
import lime
from lime import lime_tabular

# LIME explainer for the white-wine classifier, built from the white-wine training features.
# This rebinds `explainer`, replacing the red-wine explainer created earlier.
explainer = lime_tabular.LimeTabularExplainer(
    np.array(X2_train),
    mode='classification',
    feature_names=X2_train.columns,
    class_names=['Cluster_0', 'Cluster_1', 'Cluster_2'],
    random_state=1,
)

In [None]:
# Scaled feature values of the first white-wine test sample (position 0).
X2_test.iloc[0]

In [None]:
# Cluster label of the first white-wine test sample (position 0).
y2_test[0]

In [None]:
# LIME explanation of the classifier's cluster-0 probability for white-wine test row 0,
# allowing up to 11 features in the explanation.
# NOTE(review): the hard-coded row is assumed to belong to cluster 0 — confirm against y2_test[0].
exp = explainer.explain_instance(
    X2_test.iloc[0],
    model2.predict_proba,
    num_features=11,
    labels=(0,),
)
exp.show_in_notebook(show_table=True)

In [None]:
# Scaled feature values of the white-wine test sample at position 18.
X2_test.iloc[18]

In [None]:
# Cluster label of the white-wine test sample at position 18.
y2_test[18]

In [None]:
# LIME explanation of the classifier's cluster-1 probability for white-wine test row 18,
# allowing up to 11 features in the explanation.
# NOTE(review): the hard-coded row is assumed to belong to cluster 1 — confirm against y2_test[18].
exp = explainer.explain_instance(
    X2_test.iloc[18],
    model2.predict_proba,
    num_features=11,
    labels=(1,),
)
exp.show_in_notebook(show_table=True)

In [None]:
# Scaled feature values of the white-wine test sample at position 23.
X2_test.iloc[23]

In [None]:
# Cluster label of the white-wine test sample at position 23.
y2_test[23]

In [None]:
# LIME explanation of the classifier's cluster-2 probability for white-wine test row 23,
# allowing up to 11 features in the explanation.
# NOTE(review): the hard-coded row is assumed to belong to cluster 2 — confirm against y2_test[23].
exp = explainer.explain_instance(
    X2_test.iloc[23],
    model2.predict_proba,
    num_features=11,
    labels=(2,),
)
exp.show_in_notebook(show_table=True)

# Discussion
- For our white wines dataset, we have achieved a model accuracy of 0.995

# Conclusion
- the model for white wines gives a slightly higher accuracy than for red wines (99.5% vs 98.8%)

# Further EDA but using clusters to colour our graphs
- We'll ignore the quality but use the clusters produced by our KMeans algorithm to see if there are any trends or groups in the graphs
- For red wines, we're interested in the properties of residual sugar and chlorides, as they are the dominant features according to our LIME explainer.
- For white wines, we're interested in the properties of fixed acidity, citric acid, residual sugar, chlorides, free sulphur dioxide, density, and alcohol, as they are the dominant features according to our LIME explainer.

In [None]:
# Attach each red wine's KMeans label as a `cluster` column so the raw (unscaled)
# properties can be compared between clusters.
redwine_data = redwine_data.assign(cluster=clusters)
redwine_data

- Comparing clusters with quality in redwine_data

In [None]:
# Distinct quality scores in the red-wine data, for comparison with the three cluster ids.
redwine_data['quality'].unique()

In [None]:
# Attach each white wine's KMeans label as a `cluster` column so the raw (unscaled)
# properties can be compared between clusters.
whitewine_data = whitewine_data.assign(cluster=clusters2)
whitewine_data

In [None]:
# Distinct quality scores in the white-wine data, for comparison with the three cluster ids.
whitewine_data['quality'].unique()

In [None]:
# Show the red-wine table, which now includes the `cluster` column.
redwine_data

In [None]:
# Split the red wines into one DataFrame per KMeans cluster.
redwine_cluster_0 = redwine_data.loc[redwine_data['cluster'].eq(0)]
redwine_cluster_1 = redwine_data.loc[redwine_data['cluster'].eq(1)]
redwine_cluster_2 = redwine_data.loc[redwine_data['cluster'].eq(2)]

In [None]:
# Residual sugar of the red wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, redwine_cluster_0, 'slateblue'),
    (ax2, redwine_cluster_1, 'springgreen'),
    (ax3, redwine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'residual sugar'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} red wines)'])

fig.suptitle('Residual sugar')
fig.supylabel('g / dm^3')

fig.tight_layout()

In [None]:
# Chlorides of the red wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, redwine_cluster_0, 'slateblue'),
    (ax2, redwine_cluster_1, 'springgreen'),
    (ax3, redwine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'chlorides'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} red wines)'])

fig.suptitle('Chlorides')
fig.supylabel('g / dm^3')

fig.tight_layout()

In [None]:
# Residual sugar against chlorides for the red wines, coloured by KMeans cluster.
plt.figure(figsize=(18,10))
cluster_frames = (
    (redwine_cluster_0, 'slateblue'),
    (redwine_cluster_1, 'springgreen'),
    (redwine_cluster_2, 'red'),
)
for subset, shade in cluster_frames:
    plt.scatter(subset.loc[:, 'residual sugar'], subset.loc[:, 'chlorides'], color=shade)

# Legend labels follow the order in which the clusters were drawn above.
plt.legend([0,1,2], title='Cluster')
plt.xlabel('Residual sugar')
plt.ylabel('Chlorides')
plt.grid()

- Observations in red wines:
    - Cluster 2 has relatively high residual sugar
    - Cluster 1 has relatively high chlorides
    - Cluster 0 does not have relatively high residual sugar nor chlorides
    - However, the cluster sizes are highly uneven


In [None]:
# Show the white-wine table, which now includes the `cluster` column.
whitewine_data

In [None]:
# Split the white wines into one DataFrame per KMeans cluster.
whitewine_cluster_0 = whitewine_data.loc[whitewine_data['cluster'].eq(0)]
whitewine_cluster_1 = whitewine_data.loc[whitewine_data['cluster'].eq(1)]
whitewine_cluster_2 = whitewine_data.loc[whitewine_data['cluster'].eq(2)]

In [None]:
# Fixed acidity of the white wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, whitewine_cluster_0, 'slateblue'),
    (ax2, whitewine_cluster_1, 'springgreen'),
    (ax3, whitewine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'fixed acidity'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} white wines)'])

fig.suptitle('Fixed Acidity')
fig.supylabel('g / dm^3')

fig.tight_layout()

In [None]:
# Citric acid of the white wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, whitewine_cluster_0, 'slateblue'),
    (ax2, whitewine_cluster_1, 'springgreen'),
    (ax3, whitewine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'citric acid'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} white wines)'])

fig.suptitle('Citric acid')
fig.supylabel('g / dm^3')

fig.tight_layout()

In [None]:
# Residual sugar of the white wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, whitewine_cluster_0, 'slateblue'),
    (ax2, whitewine_cluster_1, 'springgreen'),
    (ax3, whitewine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'residual sugar'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} white wines)'])

fig.suptitle('Residual sugar')
fig.supylabel('g / dm^3')

fig.tight_layout()

In [None]:
# Chlorides of the white wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, whitewine_cluster_0, 'slateblue'),
    (ax2, whitewine_cluster_1, 'springgreen'),
    (ax3, whitewine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'chlorides'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} white wines)'])

fig.suptitle('Chlorides')
fig.supylabel('g / dm^3')

fig.tight_layout()

In [None]:
# Free sulfur dioxide of the white wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, whitewine_cluster_0, 'slateblue'),
    (ax2, whitewine_cluster_1, 'springgreen'),
    (ax3, whitewine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'free sulfur dioxide'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} white wines)'])

fig.suptitle('Free sulfur dioxide')
fig.supylabel('mg / dm^3')

fig.tight_layout()

In [None]:
# Density of the white wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

ax1.boxplot(whitewine_cluster_0.loc[:,'density'], patch_artist=True, boxprops=dict(facecolor='slateblue'))
ax2.boxplot(whitewine_cluster_1.loc[:,'density'], patch_artist=True, boxprops=dict(facecolor='springgreen'))
ax3.boxplot(whitewine_cluster_2.loc[:,'density'], patch_artist=True, boxprops=dict(facecolor='red'))

# The tick label names the cluster and shows how many wines it contains.
ax1.set_xticklabels([f'Cluster 0\n({len(whitewine_cluster_0)} white wines)'])
ax2.set_xticklabels([f'Cluster 1\n({len(whitewine_cluster_1)} white wines)'])
ax3.set_xticklabels([f'Cluster 2\n({len(whitewine_cluster_2)} white wines)'])

# Fix the y-range to 0.98-1.02 on every panel; points outside this window are not shown.
ax1.set_ylim(0.98,1.02)
ax2.set_ylim(0.98,1.02)
ax3.set_ylim(0.98,1.02)

fig.suptitle('Density')
# Density values around 1.0 are in g/cm^3 (close to water), not g/dm^3,
# so the axis label uses the correct unit.
fig.supylabel('g / cm^3')

fig.tight_layout()

In [None]:
# Alcohol content of the white wines: one box per KMeans cluster on a shared y-axis.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(10,6), sharey=True)

panels = [
    (ax1, whitewine_cluster_0, 'slateblue'),
    (ax2, whitewine_cluster_1, 'springgreen'),
    (ax3, whitewine_cluster_2, 'red'),
]
for cluster_id, (panel, subset, shade) in enumerate(panels):
    panel.boxplot(subset.loc[:, 'alcohol'], patch_artist=True, boxprops=dict(facecolor=shade))
    # The tick label names the cluster and shows how many wines it contains.
    panel.set_xticklabels([f'Cluster {cluster_id}\n({len(subset)} white wines)'])

fig.suptitle('Alcohol')
fig.supylabel('% by volume')

fig.tight_layout()

In [None]:
# Alcohol against chlorides for the white wines, coloured by KMeans cluster.
plt.figure(figsize=(18,10))
cluster_frames = (
    (whitewine_cluster_0, 'slateblue'),
    (whitewine_cluster_1, 'springgreen'),
    (whitewine_cluster_2, 'red'),
)
for subset, shade in cluster_frames:
    plt.scatter(subset.loc[:, 'alcohol'], subset.loc[:, 'chlorides'], color=shade)

# Legend labels follow the order in which the clusters were drawn above.
plt.legend([0,1,2], title='Cluster')
plt.xlabel('Alcohol')
plt.ylabel('Chlorides')
plt.grid()

- Observations in white wines:
    - Cluster 2 has relatively high chlorides
    - Cluster 0 has relatively high alcohol %
    - Cluster 1 does not have relatively high chlorides nor alcohol %
    - The wines are split roughly evenly between two of the clusters


# Conclusion of Further EDA 
- Red wines
    - Cluster 0 – normal red wines
    - Cluster 1 – relatively high chlorides – dry red wines
    - Cluster 2 – relatively high residual sugar – sweet red wines
- White wines
    - Cluster 0 – relatively high alcohol % – high alcohol white wines
    - Cluster 1 – normal white wines
    - Cluster 2 – relatively high chlorides – dry white wines