Project Objective: Given a wine dataset, apply unsupervised learning algorithms to determine whether a wine is good or not.

In [None]:
#Importing the essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the white-wine quality table from its CSV file
DATA_PATH = '../input/wine-quality-selection/winequality-white.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Exploring the data: preview the first rows of the table
data.head()

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple
data.shape

In [None]:
# Per-column dtype and non-null count
data.info()

The dataset has no missing values and 12 columns. The last column, 'quality', is the labelled result that I will use to measure the accuracy of my clustering algorithms.

In [None]:
# Summary statistics for each column
data.describe()

In [None]:
# Distinct values that occur in the 'quality' column
data['quality'].unique()

There are no negative values, and the wines are classified into 6 different quality levels.

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", ax=ax)

Correlation exists between some features, such as 'density' and 'residual sugar', but it is not strong enough to justify eliminating either feature.

In [None]:
# One box per column; points beyond the whiskers are outlier candidates
sns.set()
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=data, palette="Set3", ax=ax)
plt.show()

Some outliers can be seen for free sulfur dioxide, total sulfur dioxide and residual sugar. They are removed below.

In [None]:
def remove_outliers(frame, column, n_std=3):
    """Return the rows of `frame` whose `column` value lies strictly within
    `n_std` standard deviations of that column's mean.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; it is not modified.
    column : str
        Name of the column the limits are computed on.
    n_std : int or float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing only the rows inside the band.
    """
    center = frame[column].mean()
    spread = n_std * frame[column].std()
    lower_limit, upper_limit = center - spread, center + spread
    # Both comparisons are strict: values exactly on a limit are dropped.
    in_range = (frame[column] > lower_limit) & (frame[column] < upper_limit)
    return frame[in_range].copy()


# Applied one column at a time, in this order: each pass recomputes the mean and
# standard deviation on the rows that survived the previous pass.
for column in ("free sulfur dioxide", "total sulfur dioxide", "residual sugar"):
    data = remove_outliers(data, column)


In [None]:
# Dimensions of the table after the outlier filters above
data.shape

In [None]:
# Same box plot as before, redrawn on the filtered data
sns.set()
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=data, palette="Set3", ax=ax)
plt.show()

In [None]:
# Separate the features (df) from the 'quality' label (y).
# Columns are selected by name rather than by position: later cells append
# 'cluster' and 'Labels' columns to `data`, so positional slicing would pick the
# wrong target and leak 'quality' into the features if this cell were re-run.
y = data['quality']
df = data.drop(columns=['quality', 'cluster', 'Labels'], errors='ignore')

In [None]:
#K-Means Clustering: elbow search over the number of clusters

from sklearn.cluster import KMeans

# n_init and random_state are fixed so the curve is identical on every run.
# Note: df has not been standardized at this point, so inertia is measured in
# the raw feature units.
cluster_counts = list(range(1, 11))
clusters = [
    KMeans(n_clusters=k, n_init=10, random_state=0).fit(df).inertia_
    for k in cluster_counts
]

fig, ax = plt.subplots(figsize=(12, 8))
sns.lineplot(x=cluster_counts, y=clusters, ax=ax)
ax.set_title('Searching for Elbow')
ax.set_xlabel('Clusters')
ax.set_ylabel('Inertia')

plt.show()

Possible number of clusters: 2 or 3.

In [None]:
# Standardize the features; df becomes the scaled array from here on
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(df)
df = scaler.transform(df)

In [None]:
# Dimensionality reduction: keep the first two principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
df = pca.fit_transform(df)

In [None]:
# K-means with 3 clusters on the PCA-reduced data
from sklearn import metrics

kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, max_iter=300, random_state=0)
y_kmeans = kmeans.fit_predict(df)

# Score the cluster assignment against the quality labels held in y
print('With PCA')
report = (
    ('Homogeneity: {}', metrics.homogeneity_score),
    ('Completeness: {}', metrics.completeness_score),
    ('V-measure: {}', metrics.v_measure_score),
)
for template, score_fn in report:
    print(template.format(score_fn(y, kmeans.labels_)))


In [None]:
# Scatter the two reduced dimensions, one series per k-means cluster id
u_labels = np.unique(kmeans.labels_)

for cluster_id in u_labels:
    members = kmeans.labels_ == cluster_id
    plt.scatter(df[members, 0], df[members, 1], label=cluster_id)
plt.legend()
plt.show()

In [None]:
# Store each row's k-means cluster id alongside its features
data = data.assign(cluster=y_kmeans)

In [None]:
# Preview the table, now including the 'cluster' column
data.head()

In [None]:
# Density vs. alcohol, coloured by k-means cluster.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=data, x='density', y='alcohol', hue='cluster',
                palette=sns.color_palette('hls', 3), ax=ax)
ax.set_title('KMeans with 3 Clusters')
plt.show()

In [None]:
# Alcohol content per cluster, drawn in the left slot of a 1x2 grid
fig = plt.figure(figsize=(20, 8))
ax = fig.add_subplot(1, 2, 1)
sns.swarmplot(data=data, x='cluster', y='alcohol', ax=ax)


Comparing different clustering algorithms to decide the number of classes: with 3 clusters there was a lot of overlap, so 2 clusters were tried instead.


In [None]:
from sklearn.cluster import AgglomerativeClustering, MeanShift, estimate_bandwidth

# X is an alias of `data`, not a copy: the 'Labels' column written below also
# appears on `data`, and ends up holding the MeanShift labels.
X = data

##### KMeans #####
# Seeded so the panel is identical on every run.
km5 = KMeans(n_clusters=2, n_init=10, random_state=0).fit(df)

##### Agglomerative Clustering #####
agglom = AgglomerativeClustering(n_clusters=2, linkage='average').fit(df)

##### MEAN SHIFT #####
# bandwidth must be passed by keyword: current scikit-learn estimators take
# keyword-only constructor arguments.
bandwidth = estimate_bandwidth(df, quantile=0.1)
ms = MeanShift(bandwidth=bandwidth).fit(df)

# One panel per algorithm: (subplot position, fitted model, title)
panels = [
    (221, km5, 'KMeans with 2 Clusters'),
    (222, agglom, 'Agglomerative with 2 Clusters'),
    (223, ms, 'MeanShift'),
]

fig = plt.figure(figsize=(20, 15))
for position, model, title in panels:
    ax = fig.add_subplot(position)
    X['Labels'] = model.labels_
    # Palette size follows the number of clusters the model actually produced
    n_labels = np.unique(model.labels_).shape[0]
    # x and y by keyword: seaborn >= 0.12 rejects them as positional arguments
    sns.scatterplot(data=X, x='density', y='alcohol', hue='Labels', style='Labels',
                    palette=sns.color_palette('hls', n_labels), s=60, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

The last algorithm (MeanShift) automatically estimates the number of clusters to be 5, but only 2 clusters are prominent.

In [None]:
#Binning the wine quality to be good or bad

bins = (2,5.5,10) #classifying quality level below 6 as bad and above as good
labels = [0,1] #0 for bad, 1 for good
# Bin from y (the quality scores saved when the features were split off) rather
# than from data['quality'] itself, so re-running this cell gives the same result
# instead of trying to bin values that are already 0/1.
data['quality'] = pd.cut(y, bins=bins, labels=labels)

In [None]:
# Preview the table after 'quality' was replaced by the 0/1 bins
data.head()

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Two-cluster k-means on the reduced data, seeded for reproducibility
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(df)
kmeans_predict = kmeans.predict(df)

# Binned quality as plain integers (0 = bad, 1 = good)
y_true = data['quality'].astype(int)

# Cluster ids are arbitrary: cluster 0 is not necessarily the "bad" class.
# Flip the two ids if the flipped assignment agrees better with the labels.
if accuracy_score(y_true, 1 - kmeans_predict) > accuracy_score(y_true, kmeans_predict):
    kmeans_predict = 1 - kmeans_predict

# sklearn's argument order is (y_true, y_pred): rows are the true class and
# columns the predicted one, which matches the axis labels set below.
km_cm = confusion_matrix(y_true, kmeans_predict)
ax = sns.heatmap(km_cm, annot=True, fmt='d')
ax.set(xlabel='predict', ylabel='true')
km_as = accuracy_score(y_true, kmeans_predict)
print("KMeans clustering accuracy score: ",km_as)