In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering as AC
from sklearn.cluster import KMeans, DBSCAN
from statistics import mode
from sklearn_extra.cluster import KMedoids
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.metrics import accuracy_score

In [2]:
columnas = ['area','perimeter','compactness','kernel_length','kernel_width','asymmetry','length_groove','class']
df = pd.read_csv('seeds_dataset.txt', names=columnas, delimiter='\s+')
df.head()

Unnamed: 0,area,perimeter,compactness,kernel_length,kernel_width,asymmetry,length_groove,class
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [5]:
X = df.iloc[:,:-1]
y = df[columnas[-1]].values

In [15]:
escalado = StandardScaler()
X_scaled = escalado.fit_transform(X)

# Kmedias

In [16]:
kmedias_init = KMeans(n_clusters=30) #n_clusters = 12 + (12*1.5)
kmedias_init.fit(X_scaled)
#Asignamos etiquetas a los puntos de datos escalados basándose en cuál de los centros de clúster iniciales está más cerca de cada punto.
kmedias_init_centros = X_scaled[np.argmin(pairwise_distances(kmedias_init.cluster_centers_, X_scaled),axis=1)]
kmedias_init_centros

array([[-9.53410084e-01, -1.02029413e+00, -1.95077168e-01,
        -1.19801941e+00, -8.56137823e-01, -1.52349476e+00,
        -9.11815834e-01],
       [ 1.07913593e+00,  9.90688609e-01,  1.20058210e+00,
         7.92897905e-01,  1.13688764e+00, -9.62151312e-01,
         1.04205443e+00],
       [-6.74365631e-01, -6.05816775e-01, -7.16858596e-01,
        -3.02106620e-01, -6.17293227e-01,  1.65656256e+00,
        -1.87782491e-01],
       [-1.23589953e+00, -1.18915528e+00, -1.50589393e+00,
        -1.00119009e+00, -1.45590314e+00,  1.08988568e+00,
        -6.44637333e-01],
       [-2.54076455e-01, -2.91121002e-01,  4.66694399e-01,
        -1.98035942e-01, -9.97966019e-02, -6.30812006e-01,
        -7.54771982e-01],
       [ 1.40985528e+00,  1.48192103e+00,  1.69745619e-01,
         1.84717912e+00,  1.00950386e+00,  9.09215755e-01,
         1.94556646e+00],
       [ 2.90232479e-01,  1.46382877e-01,  1.47632025e+00,
        -2.29709627e-01,  6.77775250e-01, -1.40082589e+00,
        -7.3845573

In [17]:
ward_init = AC(n_clusters=12, linkage='ward')
ward_init.fit(kmedias_init_centros)

#Generamos baricentros para cada uno de los 12 clusters después de aplicar el método de enlace Ward a los centros de clúster iniciales proporcionados por k-medias
ward_baricentros = np.array([np.mean(kmedias_init_centros[ward_init.labels_ == 1], axis=0) for i in range(12)])
ward_baricentros.shape

(12, 7)

In [18]:
#Utilizamos ward para generar inicializaciones de centros de clúster (ward_init_centros) y luego aplica el algoritmo de k-medias con estos centros iniciales 
#para asignar etiquetas de clúster a los datos escalados

a = np.argmin(pairwise_distances(ward_baricentros, kmedias_init_centros), axis=1)
ward_init_centros = kmedias_init_centros[a]
kmedias = KMeans(n_clusters=12, init=ward_init_centros)
kmedias.fit(X_scaled)
kmedias.labels_

  self._check_params(X)


array([ 8,  8,  8,  8,  8,  8,  8,  8,  5,  9, 11,  8,  8,  8,  8,  8, 10,
        8,  8, 10,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 11, 11,  8,
        8,  8, 11,  9,  8, 10,  8,  8,  8, 11,  8,  8,  8,  8,  8,  8,  8,
       11, 11,  8,  8,  8,  8,  8,  8,  0,  0,  8,  8, 10,  8,  8,  8,  8,
        8,  0,  5,  5,  5,  9,  5, 11,  5,  4,  6,  9, 11,  3,  1,  7,  2,
        9,  9,  2,  1,  1,  7,  9,  9,  3,  6,  5,  2,  7,  2,  7, 11,  9,
        3,  2,  2,  9,  9,  5,  2,  9,  9,  2,  9,  6,  4,  7,  3,  2,  9,
        2,  1,  5, 11,  9,  8,  3,  6,  9,  7,  3,  9,  9, 11, 11, 11,  8,
        5, 11, 11,  5,  0, 10, 10, 10,  0,  0,  0, 10,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 10,  0,  0, 10,  0,  0,
        0, 10,  0,  0,  0, 10,  0, 10, 10, 10, 10,  0,  8, 10,  8,  0, 10,
        0, 10,  0, 10,  0, 10], dtype=int32)

In [30]:
# Vamos a utilizar la moda de las etiquetas de clase en cada clúster como una forma de asignar etiquetas de clase 
#a los puntos de datos dentro de esos clústeres y luego evalúa la precisión en comparación con las etiquetas reales.

moda = -1*np.ones(12, dtype=int)
for c in range(12):
    moda[c] = mode(y[kmedias.labels_ == c])
y_predict = moda[kmedias.labels_]
accuracy_kmedias_init = accuracy_score(y_predict, y)
accuracy_kmedias_init

0.8952380952380953

# Kmedoids

In [20]:
kmediodes = KMedoids(n_clusters=12, init='k-medoids++')
kmediodes.fit(X)
kmediodes.labels_

array([10, 10,  1,  8, 10,  8,  1,  1,  5,  5,  7,  8,  1,  1,  1,  1,  6,
       10,  8,  4,  1,  1, 10, 11, 10, 10,  4, 11,  1,  1,  8,  5,  1,  8,
       10,  5,  5,  5,  1,  6,  8,  8, 11,  7,  1,  8, 10,  1,  1, 10,  1,
        7,  1,  1,  8, 10,  1, 10, 10, 11, 11, 11,  4,  4, 11, 11,  8,  8,
        8,  4,  7,  7,  7,  3,  7,  7,  7,  0,  0,  5,  7,  0,  0,  3,  3,
        3,  3,  3,  0,  0,  3,  3,  3,  0,  0,  7,  3,  3,  3,  3,  7,  3,
        0,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  0,  0,  3,  0,  3,  3,
        3,  0,  3,  7,  3,  5,  0,  0,  3,  3,  7,  3,  3,  7,  7,  7,  1,
        7,  5,  5,  7,  6,  6,  6,  9,  4,  9, 11,  4,  4,  9,  9,  2,  4,
        4,  4,  9,  4,  4,  4,  9, 11,  4,  4,  4,  9, 11,  4,  4,  4,  4,
        2,  2,  4,  9,  9,  9,  9,  9,  9, 11,  9,  4,  4,  9,  6,  4,  9,
        9,  2,  9,  9,  4,  4,  9,  4,  4,  6,  6,  4, 11,  9, 11,  4,  2,
        4,  4,  4,  2,  4,  9])

In [32]:
#from scipy.stats import mode as sp_mode 
moda = -1 * np.ones(12, dtype=int)
for c in range(12):
    #moda[c] = sp_mode(y[np.argwhere(kmediodes.labels_ == c)])[0][0].astype(int)
    moda[c] = mode(y[kmediodes.labels_ == c])
y_predict = moda[kmediodes.labels_]
accuracy_kmedoides = accuracy_score(y_predict, y)
accuracy_kmedoides

0.9

# DBSCAN

In [22]:
dbscan = DBSCAN(eps=0.5, min_samples=3)
dbscan.fit(X_scaled)
np.unique(dbscan.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [23]:
np.sum(dbscan.labels_ == -1)

150

In [24]:
# Eliminamos los puntos aislados
k = np.max(dbscan.labels_)+1
y_dbscan = y[dbscan.labels_ != -1]
clusters_dbscan = dbscan.labels_[dbscan.labels_ != -1]

moda = -1 * np.ones(k, dtype=int)
for i in range(k):
    moda[i] = mode(y_dbscan[clusters_dbscan == i])
y_test_predict = moda[clusters_dbscan]
accuracy_score(y_dbscan, y_test_predict)

0.95

In [25]:
# Definir los valores de eps que quieres probar
eps_values = np.linspace(0.5, 1.0, 5)

# Inicializar una lista para almacenar los resultados
results = []

# Iterar sobre los valores de eps
for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=3)
    dbscan.fit(X_scaled)

    k = np.max(dbscan.labels_) + 1
    y_dbscan = y[dbscan.labels_ != -1]
    clusters_dbscan = dbscan.labels_[dbscan.labels_ != -1]

    moda = -1 * np.ones(k, dtype=int)
    for i in range(k):
        moda[i] = mode(y_dbscan[clusters_dbscan == i])

    y_test_predict = moda[clusters_dbscan]
    accuracy = accuracy_score(y_dbscan, y_test_predict)

    # Almacenar los resultados en un diccionario
    result_entry = {
        'eps': eps,
        'num_clusters': k,
        'accuracy': accuracy
    }
    results.append(result_entry)

# Crear un DataFrame a partir de la lista de resultados
results_df = pd.DataFrame(results)

# Aplicar formato para centrar los datos y poner los títulos en negrita
styled_df = results_df.style.set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'center'), ('font-weight', 'bold')]},
    {'selector': 'td', 'props': [('text-align', 'center')]},
])

# Mostrar el DataFrame sin índice a la izquierda
styled_df.hide_index()


  styled_df.hide_index()


eps,num_clusters,accuracy
0.5,12,0.95
0.625,10,0.960938
0.75,6,0.710843
0.875,2,0.380208
1.0,1,0.338308
