In [21]:
# import hierarchical clustering libraries
import scipy.cluster.hierarchy as sch # to build dendrogram and build the plotting
from sklearn.cluster import AgglomerativeClustering

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sn

In [22]:
# Load the universities dataset from a CSV file in the working directory.
universities_csv = "Universities.csv"
Univ = pd.read_csv(universities_csv)

In [23]:
# Show the loaded DataFrame as the cell output to inspect its columns and rows.
Univ

Unnamed: 0,Univ,SAT,Top10,Accept,SFRatio,Expenses,GradRate
0,Brown,1310,89,22,13,22704,94
1,CalTech,1415,100,25,6,63575,81
2,CMU,1260,62,59,9,25026,72
3,Columbia,1310,76,24,12,31510,88
4,Cornell,1280,83,33,13,21864,90
5,Dartmouth,1340,89,23,10,32162,95
6,Duke,1315,90,30,12,31585,95
7,Georgetown,1255,74,24,12,20126,92
8,Harvard,1400,91,14,11,39525,97
9,JohnsHopkins,1305,75,44,7,58691,87


In [29]:
# Custom (user-defined) min-max normalization function.
# sklearn ships built-in scalers as well; this one is hand-written to show how it is done.
def norm_func(i):  # the name norm_func is arbitrary
    """Min-max scale ``i`` into [0, 1] using (x - min) / (max - min).

    Parameters
    ----------
    i : pandas.DataFrame or pandas.Series
        Numeric data. A DataFrame is scaled column by column (each column
        uses its own min and max); a Series is scaled as a whole.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Object of the same shape as ``i`` holding the scaled values.
        A constant column (max == min) is mapped to 0.0 rather than the
        NaN that the plain formula would produce from 0 / 0.
    """
    lowest = i.min()
    value_range = i.max() - lowest  # range = max - min
    # A zero range would turn every value of that column into NaN (0 / 0),
    # so divide by 1 instead: the numerator is already 0 for such columns.
    if isinstance(value_range, pd.Series):
        value_range = value_range.replace(0, 1)
    elif value_range == 0:
        value_range = 1
    return (i - lowest) / value_range

In [None]:
# Endpoints of the SAT column that feed the min-max formula
# (i - i.min()) / (i.max() - i.min())
sat_scores = Univ['SAT']
(sat_scores.min(), sat_scores.max())

In [None]:
# Worked example of the min-max formula for the SAT value 1310 (row 0),
# using 1005 as the column minimum and 1415 as the column maximum.
sat_value, sat_min, sat_max = 1310, 1005, 1415
(sat_value - sat_min) / (sat_max - sat_min)

In [27]:
# Normalized data frame built from the numeric feature columns only.
# Columns are dropped by name rather than sliced by position (iloc[:, 1:]):
# 'Univ' holds the university names, and 'h_clusterid' is added to Univ
# later in the notebook. With a positional slice, re-running this cell after
# that point would feed the cluster label back in as a feature.
# errors='ignore' covers the first run, when 'h_clusterid' does not exist yet.
df_norm = norm_func(Univ.drop(columns=['Univ', 'h_clusterid'], errors='ignore'))

In [28]:
# Display the normalized feature frame as the cell output.
df_norm

Unnamed: 0,SAT,Top10,Accept,SFRatio,Expenses,GradRate
0,0.743902,0.847222,0.105263,0.368421,0.255144,0.9
1,1.0,1.0,0.144737,0.0,1.0,0.466667
2,0.621951,0.472222,0.592105,0.157895,0.297461,0.166667
3,0.743902,0.666667,0.131579,0.315789,0.415629,0.7
4,0.670732,0.763889,0.25,0.368421,0.239835,0.766667
5,0.817073,0.847222,0.118421,0.210526,0.427512,0.933333
6,0.756098,0.861111,0.210526,0.315789,0.416996,0.933333
7,0.609756,0.638889,0.131579,0.315789,0.208161,0.833333
8,0.963415,0.875,0.0,0.263158,0.561699,1.0
9,0.731707,0.652778,0.394737,0.052632,0.910991,0.666667


In [None]:
# Build the single-linkage merge tree on the normalized features,
# then draw it as a dendrogram.
linkage_matrix = sch.linkage(df_norm, method='single')
dendrogram = sch.dendrogram(linkage_matrix)

In [31]:
# Configure the clusterer; the customer asked for 4 clusters.
hc = AgglomerativeClustering(
    linkage='single',
    metric='euclidean',
    n_clusters=4,
)

In [None]:
# Display the estimator object as the cell output.
hc

In [36]:
# Fit the clusterer on the normalized features and keep the cluster number
# assigned to each row (saved for the chart).
y_hc = hc.fit_predict(df_norm)
# Wrap the label array in a one-column DataFrame named 'Cluster_No'.
Clusters = pd.DataFrame({'Cluster_No': y_hc})

In [34]:
# Raw array of cluster labels returned by fit_predict, one entry per row of df_norm.
y_hc

array([0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0])

In [37]:
# Display the label table: row index on the left, assigned cluster label on the right.
Clusters # Row 0 is assigned cluster label 0, row 1 is assigned cluster label 3, and so on

Unnamed: 0,Cluster_No
0,0
1,3
2,1
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [38]:
# Attach each row's cluster label to the original data as a new column.
# The labels are matched to Univ's rows by index.
Univ['h_clusterid'] = Clusters['Cluster_No']

In [None]:
# Display the data together with the new 'h_clusterid' column.
Univ  # Brown has cluster label 0, CalTech has cluster label 3, and so on. Four labels in total: 0, 1, 2, 3

In [None]:
# Order the universities by their cluster label so members of the same
# cluster appear together.
Univ1 = Univ.sort_values(by="h_clusterid")
# Position 0 is the university name column and position 7 is 'h_clusterid'
# (the column appended after the seven original ones).
name_and_cluster_positions = [0, 7]
Univ1.iloc[:, name_and_cluster_positions]