# Ejemplo de clustering jerárquico

In [None]:
# Importo librerías
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Helper that draws the dendrogram of a fitted hierarchical-clustering model
def plot_dendrogram(model, **kwargs):
    """Plot the dendrogram of a fitted agglomerative-clustering model.

    Builds a SciPy-style linkage matrix (children, merge distance, sample
    count per merge) from the model's merge tree and passes it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``children_``, ``distances_`` and
        ``labels_``. NOTE(review): for scikit-learn's
        ``AgglomerativeClustering`` the ``distances_`` attribute is presumably
        only present when ``distance_threshold`` (or ``compute_distances``)
        was set at construction time -- confirm against the installed version.
    **kwargs
        Forwarded unchanged to ``dendrogram`` (e.g. ``truncate_mode``, ``p``,
        ``ax``).

    Returns
    -------
    None
    """
    n_samples = len(model.labels_)

    # counts[i] is the number of original samples gathered under merge i.
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Indices >= n_samples refer to the node created by an
                # earlier merge, whose size has already been computed.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Open the wide default figure only after the matrix has been built (so a
    # failure above leaves no empty figure behind) and only when the caller
    # did not supply their own axes to draw on.
    if kwargs.get("ax") is None:
        plt.figure(figsize=(20, 3))

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [None]:
# Synthetic data set: 200 samples generated around 3 blob centres.
# The fixed random_state keeps the data reproducible between runs.
X, y = make_blobs(n_samples=200, centers=3, random_state=2)

In [None]:
# Scatter plot of the synthetic samples, one series per true class.
plt.scatter(X[y == 0, 0], X[y == 0, 1], s=40, label="Clase Y=0", marker="^")
plt.scatter(X[y == 1, 0], X[y == 1, 1], s=40, label="Clase Y=1", c="green", marker="x")
plt.scatter(X[y == 2, 0], X[y == 2, 1], s=40, label="Clase Y=2", c="orange", marker="x")
plt.xlabel("X1")
plt.ylabel("X2")
# Every series carries a label, so draw the legend; without this call the
# labels are never displayed.
plt.legend()
plt.show()

In [None]:
# Ajusto el clustering sólamente a los datos X
dfX = pd.DataFrame(X, columns=['X1', 'X2'])

In [None]:
dfX.describe()

In [None]:
copy_data = dfX.copy()

In [None]:
copy_data["clusters"] = np.ones(len(copy_data))

In [None]:
copy_data

In [None]:
_ = AgglomerativeClustering(n_clusters=None, 
                            affinity='euclidean', 
                            linkage='single', 
                            distance_threshold=0)

_.fit(dfX)

# Represento el dendograma
plot_dendrogram(_, truncate_mode='level')
plt.xlabel(u"Número de puntos en cada nodo")
plt.show()


In [None]:
_.labels_

In [None]:
cluster = AgglomerativeClustering(n_clusters=None, 
                                  affinity='euclidean', 
                                  linkage='ward', distance_threshold=0)

In [None]:
cluster

In [None]:
cluster.fit(dfX)

In [None]:
# Represento el dendograma
plot_dendrogram(cluster, truncate_mode='level')
plt.xlabel(u"Número de puntos en cada nodo")
plt.show()


In [None]:
# Represento el dendograma
plot_dendrogram(cluster, truncate_mode='level', p=3)
plt.xlabel(u"Número de puntos en cada nodo")
plt.show()


In [None]:
# Force exactly 3 clusters.
# The metric is left at its default (Euclidean, the only one Ward supports):
# the `affinity` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4, so passing it raises a TypeError on current releases.
cluster = AgglomerativeClustering(n_clusters=3,
                                  linkage='ward',
                                  distance_threshold=None)
cluster.fit(dfX)

In [None]:
# Side-by-side comparison: true classes (left) vs. clustering result (right)
plt.figure(figsize=(10, 5))

# Left panel: the true classes
plt.subplot(121)
plt.scatter(X[y == 0, 0], X[y == 0, 1], s=40, label="Clase Y=0", marker="^")
plt.scatter(X[y == 1, 0], X[y == 1, 1], s=40, label="Clase Y=1", c="green", marker="x")
plt.scatter(X[y == 2, 0], X[y == 2, 1], s=40, label="Clase Y=2", c="orange", marker="x")
plt.xlabel("X1")
plt.ylabel("X2")
plt.title("Clases reales")
# Every series carries a label, so draw the legend; without this call the
# labels are never displayed.
plt.legend()

# Right panel: samples coloured by the label the fitted model assigned
plt.subplot(122)
plt.scatter(X[:, 0], X[:, 1], c=cluster.labels_, cmap='rainbow')
plt.xlabel('X1')
plt.ylabel('X2')
plt.title(u"Asignación del clustering")
plt.tight_layout()
plt.show()

In [None]:
# Cluster index assigned to each sample by the fitted model
cluster.labels_

In [None]:
# Store the assignment as an extra column next to the features.
# NOTE(review): this mutates dfX in place; re-running an earlier
# `cluster.fit(dfX)` cell afterwards would feed this label column to the
# clustering as if it were a feature.
dfX["cluster"] = cluster.labels_

In [None]:
# Display the frame, now including the "cluster" column
dfX

In [None]:
import seaborn as sns

# Pairwise plots of the frame's columns, coloured by the assigned cluster
sns.pairplot(dfX, hue="cluster")