In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt, seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage,dendrogram,cut_tree

In [None]:
# Read the wine measurements from the Kaggle dataset mount and preview the first rows.
wine = pd.read_csv(filepath_or_buffer='/kaggle/input/wine-dataset-for-clustering/wine-clustering.csv')
wine.head(n=5)

In [None]:
# Print column dtypes and non-null counts to check for missing values before modelling.
wine.info()

In [None]:
# Correlate every measured feature. The cluster-label columns that later cells
# append to `wine` are dropped by name (and ignored when absent), so the heatmap
# shows the same set of features on a fresh kernel and on a re-run. Slicing by
# position here would silently discard real features on a fresh kernel.
features = wine.drop(columns=['km_cluster_id', 'ag_cluster_id'], errors='ignore')
corr = features.corr()

# Boolean mask that hides the redundant cells strictly above the diagonal.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

plt.figure(figsize=(15,10))
sns.heatmap(corr, annot=True, mask=mask, cmap="YlGnBu")
plt.title('Feature correlation matrix')
plt.show()

In [None]:
# One box plot per column, laid out on a 4x4 grid, to eyeball outliers column by column.
wine.plot.box(subplots=True, layout=(4, 4), figsize=(15, 15), title='Outlier Visualization')
plt.show()

In [None]:
# Standardise the measured features before clustering. The cluster-label columns
# that later cells add to `wine` are excluded by name, so re-running this cell
# after the clustering cells does not feed the labels back in as features.
feature_cols = wine.columns.drop(['km_cluster_id', 'ag_cluster_id'], errors='ignore')

scaler = StandardScaler()
df = pd.DataFrame(scaler.fit_transform(wine[feature_cols]), columns=feature_cols)
df.head()

In [None]:
def hopkins(X, random_state=None):
    """Estimate the Hopkins statistic (clustering tendency) of a data set.

    A number of probe locations ``m`` (10% of the rows, at least one) is drawn
    uniformly inside the bounding box of ``X``, and the same number of real
    observations is sampled without replacement. With ``u`` the distance from
    each probe to its nearest observation and ``w`` the distance from each
    sampled observation to its nearest *other* observation, the statistic is
    ``sum(u) / (sum(u) + sum(w))``.

    Values near 0.5 indicate data that looks uniformly scattered; values
    approaching 1 indicate a strong cluster structure.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like of numbers
        Observations in rows, features in columns.
    random_state : None, int or numpy.random.Generator, optional
        Seed or generator for the random draws. ``None`` (the default) gives a
        different result on every call.

    Returns
    -------
    float
        The Hopkins statistic, or ``0.0`` when every distance involved is zero
        (for example when all rows are identical).

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or has fewer than two rows.
    """
    import numpy as np
    from scipy.spatial import KDTree

    data = np.asarray(X, dtype=float)
    if data.ndim != 2:
        raise ValueError("hopkins expects a 2-D table of observations")
    n, d = data.shape
    if n < 2:
        raise ValueError("hopkins needs at least two observations")

    # At least one probe, otherwise small inputs end in a 0 / 0 division.
    m = max(1, int(0.1 * n))
    rng = np.random.default_rng(random_state)
    tree = KDTree(data)

    # Probes are not part of the data, so their nearest neighbour is the
    # first (k=1) hit.
    probes = rng.uniform(data.min(axis=0), data.max(axis=0), size=(m, d))
    u_dist, _ = tree.query(probes, k=1)

    # Sampled observations are part of the data: the first hit is the point
    # itself at distance 0, so the nearest other observation is the second hit.
    sampled_rows = rng.choice(n, size=m, replace=False)
    w_dist, _ = tree.query(data[sampled_rows], k=2)
    w_dist = w_dist[:, 1]

    u_sum = float(np.sum(u_dist))
    w_sum = float(np.sum(w_dist))
    total = u_sum + w_sum
    if total == 0.0:
        # Degenerate data (all distances zero): the ratio is undefined.
        return 0.0
    return u_sum / total

In [None]:
# Each call to hopkins() draws fresh random samples, so average ten runs
# to get a steadier estimate.
hopkins_runs = [hopkins(df) for _ in range(10)]
hopkins_score = np.mean(hopkins_runs)
print('Hopkins score =',hopkins_score)

## K-Means

In [None]:
# Elbow method: fit K-Means for k = 2..7 and record the inertia of each fit;
# a bend in the curve suggests a reasonable number of clusters.
num_of_clusters = np.arange(2,8)
inertia = []
for k in num_of_clusters:
    km = KMeans(n_clusters=k, max_iter=100, random_state=100)
    km.fit(df)
    inertia.append(km.inertia_)

# Label the figure so it can be read on its own when the notebook is skimmed.
plt.plot(num_of_clusters, inertia, marker='o')
plt.title('K-Means elbow curve')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.grid(alpha=0.6)
plt.show()

In [None]:
# Silhouette analysis: fit K-Means for k = 2..7 and score each labelling with
# silhouette_score; higher is better.
num_of_clusters = np.arange(2,8)
sil = []
for k in num_of_clusters:
    km = KMeans(n_clusters=k, max_iter=100, random_state=100)
    km.fit(df)
    sil.append(silhouette_score(df, km.labels_))

# Label the figure so it can be read on its own when the notebook is skimmed.
plt.plot(num_of_clusters, sil, marker='o')
plt.title('K-Means silhouette score')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.grid(alpha=0.6)
plt.show()

In [None]:
# Three clusters look like a reasonable choice, so fit the final K-Means model with k = 3.
km = KMeans(n_clusters=3, max_iter=100, random_state=100).fit(df)

# Record each row's cluster assignment on the original (unscaled) table.
wine['km_cluster_id'] = km.labels_
wine.head(n=5)

In [None]:
# Compare the distribution of every measured feature across the K-Means clusters.
# Features are picked by name rather than by position, so the grid stays the same
# when this cell is re-run after further label columns have been added to `wine`.
feature_cols = wine.columns.drop(['km_cluster_id', 'ag_cluster_id'], errors='ignore')

plt.figure(figsize=(20,15))
for position, feature in enumerate(feature_cols, start=1):
    plt.subplot(4,4,position)
    sns.boxplot(x=wine.km_cluster_id, y=wine[feature])
    plt.xlabel('km_cluster_id',fontsize=15)
    plt.ylabel(feature,fontsize=15)
plt.show()

## Agglomerative Clustering

In [None]:
# Build the complete-linkage hierarchy on the scaled features and draw its
# dendrogram to judge where the tree could be cut.
links = linkage(df, method='complete', metric='euclidean')
dendrogram(Z=links)
plt.show()

In [None]:
# Group the scaled data into five clusters with complete linkage and record
# each row's cluster on the original (unscaled) table.
ag = AgglomerativeClustering(n_clusters=5, linkage='complete')
wine['ag_cluster_id'] = ag.fit_predict(df)
wine.head(n=5)

In [None]:
# Compare the distribution of each feature across the agglomerative clusters.
# The last two columns of `wine` are skipped: they hold the cluster labels.
plt.figure(figsize=(20,15))
for position, feature in enumerate(wine.columns[:-2], start=1):
    plt.subplot(4,4,position)
    sns.boxplot(x=wine['ag_cluster_id'], y=wine[feature])
    plt.xlabel('ag_cluster_id',fontsize=15)
    plt.ylabel(feature,fontsize=15)
plt.show()