In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the wine clustering CSV from the Kaggle input directory into a DataFrame
# and preview its first rows.
wine_data = pd.read_csv("../input/uci-wine-data/wine-clustering.csv") 
wine_data.head()

In [None]:
# Column names, dtypes and non-null counts, to check the data before normalizing.
wine_data.info()

In [None]:
#min max normalization
def normalize_data(data):
    """Min-max scale every column of ``data`` into the range [0, 1].

    For each column the maximum and minimum are printed, then the column is
    rescaled as ``(value - min) / (max - min)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all numeric.

    Returns
    -------
    pandas.DataFrame
        A rescaled copy; ``data`` itself is not modified.

    Notes
    -----
    A constant column has ``max == min``, so the plain formula would compute
    0/0 and fill the column with NaN. Such columns are set to 0.0 instead so
    that they contribute nothing to later distance computations.
    """
    data_normalized = data.copy()
    for col in data.columns:
        # Compute the extrema once and reuse them for both the log line and the scaling.
        col_max = data[col].max()
        col_min = data[col].min()
        print(col, "max:", col_max, "min:", col_min)
        col_range = col_max - col_min
        if col_range == 0:
            data_normalized[col] = 0.0
        else:
            data_normalized[col] = (data[col] - col_min) / col_range
    return data_normalized

# Keep the raw frame untouched; every later step works on this normalized copy.
wine_data_normalized = normalize_data(wine_data)
wine_data_normalized.head()

In [None]:
# dissimilarity matrix where the (i, j) entry gives the dissimilarity between objects i and j
# the dissimilarity is 0 when two objects are identical
# Euclidean distance formula: d(i,j) = [sum over all features f of (Xif - Xjf)^2]^(1/2)

def get_dissimilarity(data):
    """Return the pairwise Euclidean distance matrix of the rows of ``data``.

    The condensed distances from ``pdist`` are expanded to a square matrix
    and wrapped in a DataFrame with default integer row/column labels.
    """
    from scipy.spatial.distance import pdist, squareform

    condensed = pdist(data, 'euclidean')
    square = squareform(condensed)
    return pd.DataFrame(square)

# Despite the variable name, this holds pairwise *distances* (dissimilarities)
# between the normalized rows.
similarity_matrix = get_dissimilarity(wine_data_normalized)
similarity_matrix

In [None]:
#avg dissimilarity for each object 
def get_avg_dissimilarity(data):
    """Return the mean of ``data[i]`` for every ``i`` in ``range(data.shape[0])``.

    The result is a float array of shape ``(n, 1)``. When ``data`` is a
    DataFrame with default integer labels, ``data[i]`` selects column ``i``.
    """
    n_objects = data.shape[0]
    means = np.zeros((n_objects, 1))
    for idx in range(n_objects):
        means[idx, 0] = data[idx].mean()
    return means
        
# One average per object; preview the first ten values and confirm the (n, 1) shape.
avg_dissimilarity = get_avg_dissimilarity(similarity_matrix)
avg_dissimilarity[:10], avg_dissimilarity.shape

In [None]:
#forming m clusters, checking for each object the i-j pairs with dissimilarity less than avg for that object
def form_m_clusters(data, avg_data):
    """Build one candidate cluster per object.

    Cluster ``i`` lists every index ``j`` for which ``data[i][j]`` is strictly
    below ``avg_data[i]``, the threshold for object ``i``.

    Returns a list containing ``data.shape[0]`` lists of indices.
    """
    n_objects = data.shape[0]
    n_candidates = data.shape[1]
    return [
        [j for j in range(n_candidates) if data[i][j] < avg_data[i]]
        for i in range(n_objects)
    ]

# Start with one candidate cluster per object; the count shown is the initial m.
cluster_objects = form_m_clusters(similarity_matrix, avg_dissimilarity)
len(cluster_objects) 

In [None]:
#preview clusters
# Show the first five candidate clusters; the number in parentheses is the
# largest member index of that cluster.
for idx in range(5):
    members = cluster_objects[idx]
    print("cluster", idx, "(", max(members), ")", ": ", members)
    print("")

In [None]:
#Remove clusters that are a subset of some other cluster, to be left with p clusters
def remove_subset_clusters(cluster_objects):
    """Drop every cluster that is contained in some other cluster.

    A cluster is removed when it is a proper subset of any other cluster,
    whether that cluster comes earlier or later. When several clusters hold
    exactly the same members, only the first one is kept. A line is printed
    for each removed cluster, using positions in the input sequence.

    Parameters
    ----------
    cluster_objects : sequence of sequences of int
        Candidate clusters, each a collection of object indices.

    Returns
    -------
    numpy.ndarray
        1-D object array with the surviving clusters in their original order.
        The input sequence is not modified.
    """
    member_sets = [set(members) for members in cluster_objects]
    total = len(member_sets)

    kept_positions = []
    for j in range(total):
        absorbed_by = None
        for i in range(total):
            if i == j:
                continue
            # "<" is a proper-subset test; exact duplicates are resolved in
            # favour of the earliest occurrence so one copy always survives.
            if member_sets[j] < member_sets[i] or (i < j and member_sets[j] == member_sets[i]):
                absorbed_by = i
                break
        if absorbed_by is None:
            kept_positions.append(j)
        else:
            print("cluster", j, "subset of cluster", absorbed_by, "deleted!")

    # Fill an object array slot by slot: letting NumPy infer a shape from
    # clusters of different lengths raises ValueError on recent versions.
    remaining = np.empty(len(kept_positions), dtype=object)
    for slot, position in enumerate(kept_positions):
        remaining[slot] = cluster_objects[position]
    return remaining

# cluster_objects = remove_subset_clusters(cluster_objects)
# len(cluster_objects)

In [None]:
#create a p x p similarity matrix between the remaining clusters
#where Cij = |Ci ∩ Cj| / |Ci ∪ Cj|  (intersection size over union size, i.e. Jaccard similarity)
def get_similarity_matrix(cluster_objects):
    """Return the p x p matrix of overlap ratios between clusters.

    Entry ``(i, j)`` is the number of members clusters ``i`` and ``j`` have in
    common divided by the number of distinct members across both. The result
    is an object-dtype array.
    """
    n_clusters = len(cluster_objects)
    overlap = np.zeros((n_clusters, n_clusters), dtype=object)
    for row, col in np.ndindex(n_clusters, n_clusters):
        shared = np.intersect1d(cluster_objects[row], cluster_objects[col])
        combined = np.union1d(cluster_objects[row], cluster_objects[col])
        overlap[row, col] = np.abs(len(shared) / len(combined))
    return overlap

# similarity_matrix2 = get_similarity_matrix(cluster_objects)
# pd.DataFrame(similarity_matrix2)

In [None]:
#find max Cij and merge Ci & Cj into one 
def merge_max_clusters(similarity_matrix2, cluster_objects):
    """Merge the two distinct clusters with the highest similarity.

    Parameters
    ----------
    similarity_matrix2 : array-like of shape (p, p)
        Pairwise cluster similarities, e.g. from ``get_similarity_matrix``.
    cluster_objects : sequence of sequences of int
        The ``p`` clusters the matrix refers to.

    Returns
    -------
    numpy.ndarray
        1-D object array of ``p - 1`` clusters. The merged cluster (sorted
        unique members of both) takes the place of the first one of the pair
        and the second one is removed. With fewer than two clusters nothing
        is merged and the clusters are returned unchanged. The input sequence
        is not modified.
    """
    clusters = list(cluster_objects)

    if len(clusters) >= 2:
        scores = np.array(similarity_matrix2, dtype=float)
        # Every cluster has similarity 1 with itself, so without masking the
        # diagonal the "best pair" is always a cluster paired with itself.
        np.fill_diagonal(scores, -np.inf)
        maxrow, maxcol = np.unravel_index(np.argmax(scores), scores.shape)

        merged = np.unique(np.concatenate((clusters[maxrow], clusters[maxcol]), axis=0))
        clusters[maxrow] = merged
        del clusters[maxcol]

    # Fill an object array slot by slot: letting NumPy infer a shape from
    # clusters of different lengths raises ValueError on recent versions.
    remaining = np.empty(len(clusters), dtype=object)
    for slot, members in enumerate(clusters):
        remaining[slot] = members
    return remaining
    #len(cluster_objects) 

# cluster_objects = merge_max_clusters(similarity_matrix2, cluster_objects)
# len(cluster_objects)

In [None]:
#iterate to get k clusters 
# Alternate pruning (drop contained clusters) and merging (join the most
# similar pair) until at most k clusters remain.
k = 3
while len(cluster_objects) > k:
    cluster_objects = remove_subset_clusters(cluster_objects)
    # Pruning alone can already reach the target; merging once more would
    # leave fewer than k clusters.
    if len(cluster_objects) <= k:
        break
    similarity_matrix2 = get_similarity_matrix(cluster_objects)
    cluster_objects = merge_max_clusters(similarity_matrix2, cluster_objects)
len(cluster_objects)

In [None]:
# Total membership over the three final clusters; an object that belongs to
# more than one cluster is counted once per cluster.
sum(len(cluster_objects[idx]) for idx in range(3))

In [None]:
#for each index from 0-177, for each cluster: find avg row, calc dissimilarity b/w avg row and index, keep index in the cluster with least dissimilarity, remove from others
# df0 = wine_data_normalized.copy()
# for i in range(178):
#     if i not in cluster_objects[0]:
#         df0 = df0.drop(i)
# dissimilarity = get_dissimilarity(df0)
# avg_dissimilarity = get_avg_dissimilarity(dissimilarity)
# df0['avg_dissimilarity'] = avg_dissimilarity

# df1 = wine_data_normalized.copy()
# for i in range(178):
#     if i not in cluster_objects[1]:
#         df1 = df1.drop(i)
# dissimilarity = get_dissimilarity(df1)
# avg_dissimilarity = get_avg_dissimilarity(dissimilarity)
# df1['avg_dissimilarity'] = avg_dissimilarity

# df2 = wine_data_normalized.copy()
# for i in range(178):
#     if i not in cluster_objects[2]:
#         df2 = df2.drop(i)
# dissimilarity = get_dissimilarity(df2)
# avg_dissimilarity = get_avg_dissimilarity(dissimilarity)
# df2['avg_dissimilarity'] = avg_dissimilarity

# # df0.index, df0.loc[41,:]["avg_dissimilarity"]
# for i in range(178):
#     a=100
#     b=100
#     c=100
#     if i in df0.index:
#         a = df0.loc[i,:]['avg_dissimilarity']
#     if i in df1.index:
#         b = df1.loc[i,:]['avg_dissimilarity']
#     if i in df2.index:
#         c = df2.loc[i,:]['avg_dissimilarity']
#     smallest=min(a,b,c)
#     if i in df0.index and a != smallest:
#         df0 = df0.drop(i)
#     if i in df1.index and b != smallest:
#         df1 = df1.drop(i)
#     if i in df2.index and c != smallest:
#         df2 = df2.drop(i)
# df0.tail()

In [None]:
# df1.tail()

In [None]:
# df2.tail()

In [None]:
# len(df0) + len(df1) + len(df2)

In [None]:
# Turn the final clusters into one integer label per object.
# - The array length follows the data instead of a hard-coded row count.
# - Clusters are written in order, so an object that belongs to several
#   clusters ends up with the label of the last one containing it.
# - NOTE(review): objects that appear in no cluster keep the initial label 0
#   and are therefore indistinguishable from members of cluster 0.
our_cluster = np.zeros(len(wine_data_normalized), dtype=int)
for label, members in enumerate(cluster_objects):
    for i in members:
        our_cluster[i] = label
our_cluster

In [None]:
import matplotlib.pyplot as plt
# One point per object: x is its assigned cluster label, y is its row index.
plt.scatter(our_cluster, range(len(our_cluster)), c=our_cluster)
plt.xlabel("cluster label")
plt.ylabel("object index")
plt.title("Cluster assignment: custom agglomerative method")
plt.show()

In [None]:
from sklearn.cluster import KMeans
# Baseline: k-means with the same number of clusters as the custom method.
# n_init is set explicitly because its default differs between scikit-learn
# versions; 10 restarts with a fixed seed keeps the labels reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(wine_data_normalized)
kmeans.labels_

In [None]:
# Predict on the same frame the model was fitted on, for comparison with
# kmeans.labels_ shown in the previous cell.
kmeans.predict(wine_data_normalized)

In [None]:
# Group row indices by k-means label. Labels 0 and 1 get their own list;
# any other label goes to cluster2. Iterating over the labels themselves
# avoids hard-coding the number of rows.
cluster0 = []
cluster1 = []
cluster2 = []

for i, x in enumerate(kmeans.labels_):
    if x == 0:
        cluster0.append(i)
    elif x == 1:
        cluster1.append(i)
    else:
        cluster2.append(i)

In [None]:
# One point per object: x is its k-means label, y is its row index.
plt.scatter(kmeans.labels_, range(len(kmeans.labels_)), c=kmeans.labels_)
plt.xlabel("cluster label")
plt.ylabel("object index")
plt.title("Cluster assignment: k-means")
plt.show()

In [None]:
#Dunn index

In [None]:
#Davies Bouldin index
from sklearn.metrics import davies_bouldin_score
# Score both labelings on the same normalized features so the two numbers
# are directly comparable (k-means first, custom method second).
kmeans_dbs = davies_bouldin_score(wine_data_normalized, kmeans.labels_)
our_dbs = davies_bouldin_score(wine_data_normalized, our_cluster)
kmeans_dbs, our_dbs

In [None]:
#Silhouette index
from sklearn.metrics import silhouette_score
# Score both labelings on the same normalized features so the two numbers
# are directly comparable (k-means first, custom method second).
kmeans_si = silhouette_score(wine_data_normalized, kmeans.labels_)
our_si = silhouette_score(wine_data_normalized, our_cluster)
kmeans_si, our_si