In [28]:
from sklearn.cluster import AgglomerativeClustering
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from scipy import spatial

In [29]:
from sklearn.neighbors._nearest_centroid import NearestCentroid

In [30]:
data = pd.read_csv('FINAL_FINAL_SCALED_DATA.csv') 

df = pd.DataFrame(data, columns=['feature', 'feature_neutral', 'feature_frequency', 'feature_dissatisfaction','feature_satisfaction'])

In [31]:
for x in range(len(df)):
    if df['feature_satisfaction'][x] == 0 and df['feature_dissatisfaction'][x] == 0:
        print(df['feature'][x])
        df.drop(x, inplace=True)
        
df = df.reset_index(drop=True)

In [32]:
points = df[['feature_dissatisfaction','feature_satisfaction']].to_numpy()
# points = df[['feature_dissatisfaction','feature_satisfaction', 'hungerstation_e', 'h_dissatisfaction', 'h_satisfaction', 'jahez_e', 'j_dissatisfaction', 'j_satisfaction', 'shgardi_e', 's_dissatisfaction', 's_satisfaction', 'toyou_e', 't_dissatisfaction', 't_satisfaction', 'thechefz_e', 'c_dissatisfaction', 'c_satisfaction']].to_numpy()

# 1. Experiments

## 3 clusters

In [33]:
cluster = AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
y = cluster.fit_predict(points)
df['cluster'] = cluster.labels_



In [34]:
table = {'feature': [], "cluster": [], "feature_dissatisfaction": [], "feature_satisfaction": [], "feature_neutral": [], "feature_frequency": []}

for x in range(len(df)):
        table["feature"].append(df['feature'][x])
        table["cluster"].append(df['cluster'][x])
        
        table["feature_dissatisfaction"].append(df['feature_dissatisfaction'][x])
        table["feature_satisfaction"].append(df['feature_satisfaction'][x])
        table["feature_neutral"].append(df['feature_neutral'][x])
        table["feature_frequency"].append(df['feature_frequency'][x])
            
       

In [35]:
df=pd.DataFrame.from_dict(table,orient='index').transpose()
df.to_csv('ah_3_clusters_2v_NEW.csv', index = False)

## 4 clusters 

In [36]:
cluster = AgglomerativeClustering(n_clusters=4, affinity='euclidean', linkage='ward')
y = cluster.fit_predict(points)
df['cluster'] = cluster.labels_



In [37]:
table = {'feature': [], "cluster": [], "feature_dissatisfaction": [], "feature_satisfaction": [], "feature_neutral": [], "feature_frequency": []}

for x in range(len(df)):
        table["feature"].append(df['feature'][x])
        table["cluster"].append(df['cluster'][x])
        
        table["feature_dissatisfaction"].append(df['feature_dissatisfaction'][x])
        table["feature_satisfaction"].append(df['feature_satisfaction'][x])
        table["feature_neutral"].append(df['feature_neutral'][x])
        table["feature_frequency"].append(df['feature_frequency'][x])
        

In [38]:
df=pd.DataFrame.from_dict(table,orient='index').transpose()
df.to_csv('ah_4_clusters_2v_NEW.csv', index = False)

## 5 clusters 

In [39]:
cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
y = cluster.fit_predict(points)
df['cluster'] = cluster.labels_



In [40]:
table = {'feature': [], "cluster": [], "feature_dissatisfaction": [], "feature_satisfaction": [], "feature_neutral": [], "feature_frequency": []}

for x in range(len(df)):
        table["feature"].append(df['feature'][x])
        table["cluster"].append(df['cluster'][x])
        
        table["feature_dissatisfaction"].append(df['feature_dissatisfaction'][x])
        table["feature_satisfaction"].append(df['feature_satisfaction'][x])
        table["feature_neutral"].append(df['feature_neutral'][x])
        table["feature_frequency"].append(df['feature_frequency'][x])
            
        

In [41]:
df=pd.DataFrame.from_dict(table,orient='index').transpose()
df.to_csv('ah_5_clusters_2v_NEW.csv', index = False)

# 2. Evaluation

In [42]:
clf = NearestCentroid()
clf.fit(points, y)
print("Centroids:")
centroids = clf.centroids_
print(centroids)

Centroids:
[[0.0143012  0.09931127]
 [0.03844829 0.58759521]
 [0.28532958 0.01651705]
 [0.01556327 0.01249315]
 [1.         0.        ]]


In [43]:
labels = cluster.labels_

In [44]:
cluster_sizes = np.bincount(labels)

In [45]:
cluster_sizes

array([ 159,   15,   13, 1063,    1], dtype=int64)

## 2.1 Dunn index 

In [46]:
import pylab as pl
from scipy.spatial import ConvexHull

In [22]:
# compute the diameter based on convex hull 
def diameter(pts):
  # need at least 3 points to construct the convex hull
  if pts.shape[0] <= 1:
    return 0
  if pts.shape[0] == 2:
    return ((pts[0] - pts[1])**2).sum()
  # two points which are fruthest apart will occur as vertices of the convex hull
  hull = ConvexHull(pts)
  candidates = pts[ConvexHull(pts).vertices]
  
  return spatial.distance_matrix(candidates, candidates).max()

def dunn_index(pts, labels, centroids):
  max_intracluster_dist = max(diameter(pts[labels==i]) for i in np.unique(labels))
  # get pairwise distances between centroids
  cluster_dmat = spatial.distance_matrix(centroids, centroids)
  # fill diagonal with +inf: ignore zero distance to self in "min" computation
  np.fill_diagonal(cluster_dmat, np.inf)
  min_intercluster_dist = cluster_sizes.min()
  return min_intercluster_dist / max_intracluster_dist

In [47]:
dunn_index(points, labels, centroids)

1.5801216341980495

## 2.2 Silhouette index 

In [48]:
from sklearn.metrics import silhouette_score

In [49]:
s_score = silhouette_score(points, labels)
print("Silhouette score =", s_score)

Silhouette score = 0.6546742478244202


## 2.3 Calinski Harabasz  

In [50]:
from sklearn.metrics import calinski_harabasz_score

In [51]:
ch_score = calinski_harabasz_score(points, labels)
print("Calinski Harabasz score =", ch_score)

Calinski Harabasz score = 1173.7435908438597
