In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score

from sklearn.manifold import Isomap
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift

In [2]:
type_standar = "minmax"
# type_standar = "zscore"

In [3]:
# file_path = 'data/embeddings/df_tfidf_' + type_standar + '.csv'
file_path = 'data/embeddingstrain/df_embeddings_' + type_standar + '.csv'
df_embeddings = pd.read_csv(file_path, delimiter=',')

In [4]:
df_embeddings.head()

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_378,embed_379,embed_380,embed_381,embed_382,embed_383,ANIO,VOLUMEN,CUARTIL,PAGINAS
0,-0.22988,0.249529,0.081523,0.202636,-0.193608,-0.030638,0.188579,-0.098298,-0.218437,-0.08878,...,-0.042838,-0.232117,-0.262647,0.127149,-0.106181,-0.089396,1.0,9e-06,0.0,0.113402
1,-0.012822,0.14934,0.024315,-0.009801,0.088035,-0.033907,0.11128,-0.171367,-0.193791,0.28999,...,0.178573,-0.004228,-0.047453,-0.051901,0.115226,0.034473,1.0,1e-05,0.25,0.164948
2,-0.055708,0.190835,-0.065515,0.225034,0.369172,0.113739,0.287937,-0.156514,-0.296518,0.024632,...,0.300612,-0.066641,-0.076587,-0.150014,0.088423,-0.016509,1.0,6e-06,0.25,0.154639
3,0.137942,0.176798,-0.03231,0.311785,0.261316,0.186766,0.018803,-0.187183,-0.103112,0.044219,...,0.363133,0.012749,-0.090156,-0.0919,-0.104825,0.144255,0.888889,1e-05,0.25,0.185567
4,-0.118311,-0.016865,0.039658,0.219906,-0.250331,0.054814,-0.052862,0.115424,-0.129113,-0.247627,...,-0.027158,-0.341358,-0.035781,0.166203,-0.286782,0.02461,1.0,0.00081,0.75,0.082474


In [5]:
# Parámetros del modelo
n_clusters = 2
n_components = 2  # Dimensiones para Isomap

# Paso 1: Aplicar Isomap a todos los datos
isomap = Isomap(n_components=n_components)
data_reducido = isomap.fit_transform(df_embeddings)  # Aplicar Isomap a todo el 

df_reducido = pd.DataFrame(data_reducido, columns=['Isomap1', 'Isomap2'])

In [6]:
# Paso 2: Dividir en entrenamiento y prueba
train_data, test_data = train_test_split(df_reducido, test_size=0.2, random_state=42)

In [7]:
# Paso 3: Entrenar K-Means en todo el dataset reducido para obtener los clusters completos
kmeans_completo = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_completo.fit(df_reducido)
clusters_completos = kmeans_completo.predict(df_reducido)
df_reducido['Cluster_Completo'] = clusters_completos  # Añadir columna de clusters completos

# ms_completo = MeanShift(bin_seeding=True)
# ms_completo.fit(df_reducido)
# clusters_completos = ms_completo.predict(df_reducido)
# df_reducido['Cluster_Completo'] = clusters_completos

In [8]:
# Paso 4: Entrenar K-Means en el conjunto de entrenamiento
kmeans_train = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_train.fit(train_data)
clusters_train = kmeans_train.predict(test_data)

# ms_train = MeanShift(bin_seeding=True)
# ms_train.fit(train_data)
# clusters_train = ms_train.predict(test_data)

In [9]:
# Paso 5: Comparar los clusters del conjunto de prueba con los clusters originales generados con todo el dataset
# (utilizamos adjusted_rand_score para ver la concordancia entre los dos agrupamientos)

clusters_test_original = kmeans_completo.predict(test_data)  # Clusters originales en el conjunto de prueba
# clusters_test_original = ms_completo.predict(test_data)  # Clusters originales en el conjunto de prueba

concordancia = adjusted_rand_score(clusters_test_original, clusters_train)

print("Concordancia entre clusters en conjunto de prueba y modelo completo (Adjusted Rand Index):", concordancia)

Concordancia entre clusters en conjunto de prueba y modelo completo (Adjusted Rand Index): -0.020788246787776857


In [10]:
clusters_test_original

array([1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1])

In [11]:
clusters_train

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0])