In [1]:
from datetime import datetime
str(datetime.now()).replace("-","").replace(":","").replace(".","").replace(" ","_")

'20180509_211512943638'

# Clustering
## Intention: Identify best clustering algorithm for a set of data

## 1. Instantiate Dataset

In [6]:
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN, Birch 

In [7]:
from sklearn.datasets import load_iris
from pandas import DataFrame

In [8]:
from pandas import DataFrame
data = DataFrame(load_iris().data, columns=load_iris().feature_names)
labels = load_iris().target
features = load_iris().feature_names

## 2. Score Function

In [9]:
from sklearn.metrics import silhouette_score, calinski_harabaz_score

def scores(data,labels):
    return {"silhouette score" : silhouette_score(data,labels), "calinski harabaz score" : calinski_harabaz_score(data,labels)}

In [71]:
def generate_parameters(parameter_grid):
    
    parameter_names = list(parameter_grid.keys())
    
    axes = []
    for key in parameter_names:
        axes += [parameter_grid[key]]
    
    parameters = []
    
    for point in product(*axes):
        parameter = {}
        for i in range(0,len(point)):
            parameter[parameter_names[i]] = point[i]
        parameters += [parameter]
        
    return parameters        

## K-Means

In [29]:
algorithm = KMeans

parameter_options = {}
parameter_options["n_cluster"] = [i for i in range(2,21)]

score_table = []
for i in range(2, 21):
    
    labels = algorithm(n_clusters=i).fit(data).predict(data)

    row = scores(data,labels)
    row["number of clusters"] = i
    
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters", "calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,calinski harabaz score,silhouette score
0,2,513.3,0.68
1,3,560.4,0.55
2,4,529.4,0.5
3,5,494.09,0.49
4,6,474.85,0.37
6,8,438.8,0.36
5,7,449.86,0.36
7,9,409.42,0.34
8,10,391.64,0.33
10,12,363.26,0.32


In [13]:
# Affinity Propagation

In [98]:
from math import floor

n = len(data)

parameter_grid = {
    "damping" : [0.50 + 0.03 * i for i in range(0,floor(0.50/0.03)+1)]#,
    #"preference" : [0.2 * i for i in range(1,floor(1/0.2)+1)]
    #"max_iter" : ,
    #"convergence_iter":,
}

# ===

score_table = []

for parameter in generate_parameters(parameter_grid):
    
    labels = AffinityPropagation(**parameter).fit(data).predict(data)
    row = scores(data,labels)
    row["number of clusters"] = int(len(set(labels)))
    row = {**row,**parameter}
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters"] + list(parameter_grid.keys()) + ["calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,damping,calinski harabaz score,silhouette score
16,4,0.98,446.08,0.5
8,5,0.74,487.02,0.49
7,6,0.71,435.83,0.47
4,6,0.62,427.2,0.47
5,6,0.65,427.2,0.47
6,6,0.68,427.2,0.47
9,6,0.77,427.2,0.47
10,6,0.8,427.2,0.47
11,6,0.83,427.2,0.47
12,7,0.86,393.49,0.44


In [88]:
# TO-IMPLEMENT: preference, max_iter, convergence_iter

score_table = []
for i in range(1, 16):
    
    damping = (0.50 + (0.03 * i))
    labels = AffinityPropagation(damping = damping).fit(data).predict(data)
    
    row = scores(data,labels)
    row["damping"] = damping
    row["number of clusters"] = int(len(set(labels)))
    
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters", "damping", "calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,damping,calinski harabaz score,silhouette score
7,5,0.74,487.02,0.49
6,6,0.71,435.83,0.47
3,6,0.62,427.2,0.47
4,6,0.65,427.2,0.47
5,6,0.68,427.2,0.47
8,6,0.77,427.2,0.47
9,6,0.8,427.2,0.47
10,6,0.83,427.2,0.47
11,7,0.86,393.49,0.44
12,7,0.89,393.49,0.44


In [15]:
#MeanShift

In [16]:
score_table = []
for i in range(1, 21):
    
    bandwidth = i * 0.05
    labels = MeanShift(bandwidth=bandwidth).fit(data).predict(data)
    
    row = scores(data,labels)
    row["bandwidth"] = bandwidth
    row["number of clusters"] = int(len(set(labels)))
    
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters","bandwidth", "calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,bandwidth,calinski harabaz score,silhouette score
19,2,1.0,508.88,0.69
18,2,0.95,508.88,0.69
17,2,0.9,508.88,0.69
16,3,0.85,558.92,0.55
15,4,0.8,463.04,0.49
14,5,0.75,358.65,0.46
12,9,0.65,290.23,0.43
13,6,0.7,342.87,0.41
11,10,0.6,281.01,0.32
9,16,0.5,212.3,0.3


In [17]:
#Spectral Clustering

In [18]:
score_table = []
for i in range(2, 21):
    
    labels = SpectralClustering(n_clusters=i).fit_predict(data)
    
    row = scores(data,labels)
    row["number of clusters"] = i
    
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters", "calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,calinski harabaz score,silhouette score
0,2,501.92,0.69
1,3,558.92,0.55
2,4,526.59,0.49
3,5,493.13,0.49
4,6,473.66,0.37
5,7,443.64,0.35
6,8,438.29,0.35
8,10,368.25,0.34
7,9,399.59,0.34
9,11,354.63,0.31


In [19]:
# Agglomerative Clustering

In [20]:
score_table = []
for i in range(2, 21):
    
    labels = AgglomerativeClustering(n_clusters=i).fit_predict(data)
    
    row = scores(data,labels)
    row["number of clusters"] = i
    
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters", "calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,calinski harabaz score,silhouette score
0,2,501.92,0.69
1,3,556.84,0.55
2,4,513.77,0.49
3,5,487.07,0.48
4,6,465.73,0.36
6,8,417.14,0.35
5,7,432.83,0.34
7,9,389.63,0.33
8,10,367.76,0.33
9,11,351.72,0.31


In [21]:
#DBSCAN

In [22]:
score_table = []
for i in range(1, 21):
    eps = i * 0.05
    labels = DBSCAN(eps=eps).fit_predict(data)
    
    if len(set(labels)) == 1: continue
    
    row = scores(data,labels)
    row["eps"] = eps
    row["number of clusters"] = int(len(set(labels)))
    
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters", "eps", "calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,eps,calinski harabaz score,silhouette score
17,2,1.0,501.92,0.69
15,2,0.9,501.92,0.69
16,2,0.95,501.92,0.69
9,3,0.6,229.7,0.54
12,3,0.75,277.49,0.51
13,3,0.8,277.49,0.51
10,3,0.65,259.76,0.5
11,3,0.7,259.76,0.5
8,3,0.55,226.65,0.5
14,3,0.85,262.13,0.49


In [23]:
#Birch

In [24]:
score_table = []
for i in range(1, 21):
    threshold = i * 0.05
    labels = Birch(threshold=threshold).fit_predict(data)
    
    row = scores(data,labels)
    row["threshold"] = threshold
    row["number of clusters"] = int(len(set(labels)))
    
    score_table += [row]
    
DataFrame(score_table).sort_values("silhouette score", ascending=False).round(2)[["number of clusters","threshold", "calinski harabaz score","silhouette score"]]

Unnamed: 0,number of clusters,threshold,calinski harabaz score,silhouette score
19,3,1.0,554.91,0.56
7,3,0.4,554.91,0.56
1,3,0.1,554.91,0.56
2,3,0.15,556.84,0.55
0,3,0.05,558.06,0.55
18,3,0.95,546.88,0.55
14,3,0.75,484.71,0.54
5,3,0.3,399.95,0.53
6,3,0.35,399.95,0.53
4,3,0.25,399.95,0.53
