In [118]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering as AC
import statistics as st
from sklearn.preprocessing import StandardScaler

In [119]:
# Column names are supplied explicitly via `names=`; the last one ('class')
# is the label column, the preceding seven are the measurements.
columnas = ['area', 'perimeter', 'compactness', 'kernel_length',
            'kernel_width', 'asymmetry', 'length_groove', 'class']

# Fields are separated by runs of whitespace. The pattern must be a raw
# string: '\s' in a normal string literal is an invalid escape sequence
# (DeprecationWarning, and a SyntaxWarning from Python 3.12 onwards).
df = pd.read_csv('seeds_dataset.txt', names=columnas, sep=r'\s+')
df.head()

Unnamed: 0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry,length_groove,class
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [120]:
# The last entry of `columnas` is the label column; everything before it is
# treated as a feature.
labels = df[columnas[-1]]
X = df[columnas[:-1]]

# Standardise the features (StandardScaler: per-column zero mean and unit
# variance). Fitting and transforming are done as two explicit steps on the
# same data.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)
X_std

array([[ 1.42097769e-01,  2.15462437e-01,  6.06017918e-05, ...,
         1.41701823e-01, -9.86151745e-01, -3.83577423e-01],
       [ 1.11880257e-02,  8.22375713e-03,  4.28515270e-01, ...,
         1.97432229e-01, -1.78816620e+00, -9.22013487e-01],
       [-1.92066576e-01, -3.60200562e-01,  1.44238325e+00, ...,
         2.08047544e-01, -6.67479334e-01, -1.18919199e+00],
       ...,
       [-5.67570840e-01, -6.90247348e-01,  7.33948301e-01, ...,
        -7.06044846e-02,  3.07658816e+00, -7.18060432e-01],
       [-1.03608992e+00, -1.03564515e+00, -8.01701104e-01, ...,
        -1.12152071e+00, -6.81351965e-02, -7.42534799e-01],
       [-8.77620233e-01, -9.35863561e-01, -1.10234659e-01, ...,
        -7.55292327e-01,  1.29122264e+00, -7.03783718e-01]])

In [121]:
# Range of cluster counts to evaluate (inclusive on both ends).
min_clusters = 3
max_clusters = 50

num_clusters = list(range(min_clusters, max_clusters + 1))
# Linkage criteria to compare; these become the columns of the results table.
col_names = ['single', 'complete', 'average', 'ward']

# Zero-filled results grid: one row per cluster count, one column per linkage.
# The axis names are attached to the Index objects at construction time:
# setting `table.index.name` and then assigning a plain list to `table.index`
# replaces the Index and silently drops the 'n_clusters' name.
table = pd.DataFrame(
    np.zeros((len(num_clusters), len(col_names))),
    index=pd.Index(num_clusters, name='n_clusters'),
    columns=pd.Index(col_names, name='LINKAGE'),
)

In [122]:
# For every linkage criterion and every cluster count, fit an agglomerative
# clustering on the standardised features and score it by majority-label
# agreement: each cluster is credited with the number of its samples that
# carry its most frequent true label, and the total is divided by the number
# of samples.
for linkage in col_names:
    results = []

    for c in range(min_clusters, max_clusters + 1):
        # The distance argument is left at its default (Euclidean). The
        # `affinity` keyword was deprecated in scikit-learn 1.2 and removed in
        # 1.4 (renamed to `metric`), so omitting it works on old and new
        # releases alike.
        clt = AC(n_clusters=c, linkage=linkage)
        clusters = clt.fit_predict(X_std)

        # Size of the largest true-label group inside each cluster. Counting
        # it directly replaces statistics.mode, which raises on ties before
        # Python 3.8; the count is the same whichever tied label would win.
        aciertos = sum(
            labels[clusters == k].value_counts().max() for k in range(c)
        )

        accuracy = aciertos / labels.shape[0]
        results.append(accuracy)

    # One score per cluster count, in the same order as the table rows.
    table.loc[:, linkage] = results

table

LINKAGE,single,complete,average,ward
3,0.347619,0.87619,0.880952,0.928571
4,0.352381,0.87619,0.880952,0.928571
5,0.357143,0.87619,0.880952,0.928571
6,0.357143,0.87619,0.880952,0.928571
7,0.361905,0.87619,0.890476,0.928571
8,0.361905,0.87619,0.890476,0.928571
9,0.366667,0.9,0.890476,0.928571
10,0.37619,0.928571,0.890476,0.928571
11,0.380952,0.928571,0.890476,0.928571
12,0.390476,0.928571,0.890476,0.928571


In [123]:
# Report, for each linkage criterion, the cluster count with the highest score
# in `table` (idxmax returns the first such row label when there are ties).
# NOTE(review): the score stored in `table` looks like a purity-style measure,
# which cannot decrease when a hierarchical partition is split further, so
# this "optimum" tends toward max_clusters -- confirm whether a
# size-penalised metric (e.g. silhouette, adjusted Rand index) is wanted.
for metodo in col_names:
    cluster_optimo = table[metodo].idxmax()
    # ANSI escape codes: \033[1m turns bold on, \033[0m resets it.
    nombre_negrita = '\033[1m' + metodo + '\033[0m'
    optimo_negrita = '\033[1m' + str(cluster_optimo) + '\033[0m'
    print("Para el metodo de montaje", nombre_negrita, "el numero de clusters optimo es:", optimo_negrita)

Para el metodo de montaje [1msingle[0m el numero de clusters optimo es: [1m50[0m
Para el metodo de montaje [1mcomplete[0m el numero de clusters optimo es: [1m43[0m
Para el metodo de montaje [1maverage[0m el numero de clusters optimo es: [1m45[0m
Para el metodo de montaje [1mward[0m el numero de clusters optimo es: [1m48[0m
