In [1]:
import pandas as pd
import os
import time
import numpy as np
from gensim.models import LdaModel
import ast
from gsdmm import MovieGroupProcess
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import Word2Vec
from sklearn.cluster import KMeans,AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, v_measure_score, confusion_matrix, rand_score, fowlkes_mallows_score, davies_bouldin_score, calinski_harabasz_score, silhouette_score
from collections import Counter,defaultdict
from bcubed_metrics.bcubed import Bcubed
from gensim import models
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

## Funciones para tratar datos 

In [2]:
def filter_documents_for_dictionary(tokenized_documents, filtered_words):
    filtered_documents = []
    for doc in tokenized_documents:
        filtered_doc = [word for word in doc if word in filtered_words]
        filtered_documents.append(filtered_doc)
    return filtered_documents

def guardar_objeto(resultados, nombre_archivo):
    with open(nombre_archivo, "wb") as archivo:
        pickle.dump(resultados, archivo)
        
# matriz donde cada fila representa un vector de documento, obtenido como el promedio de los vectores de las palabras que lo componen
def get_document_vectors(model, documents):
    document_vectors = []
    for doc in documents:
        word_vectors = [model.wv[word] for word in doc if word in model.wv.key_to_index]
        if word_vectors:
            document_vector = np.mean(word_vectors, axis=0)
            document_vectors.append(document_vector)
        else:
            # Manejar documentos vacíos o sin palabras válidas
            # Omitir el documento o asignar un vector de ceros
            document_vectors.append(np.zeros(model.vector_size)) #vector de ceros
            pass #omitir el documento
    return document_vectors

## Funciones para evaluar modelos

In [13]:
def create_bcubed_predicted_clustering(labels, true_labels):
    cluster_to_service_counts = defaultdict(lambda: defaultdict(int))
    for i, cluster in enumerate(labels):
        service = true_labels[i]
        cluster_to_service_counts[cluster][service] += 1
    predicted_clustering = []
    for cluster, service_counts in cluster_to_service_counts.items():
        predicted_clustering.append(dict(service_counts))
    return predicted_clustering


def convertir_claves_a_string_en_predicted_clustering(predicted_clustering):
    nuevo_predicted_clustering = []
    for topic_counts in predicted_clustering:
        nuevo_topic_counts = convertir_claves_a_string(topic_counts)
        nuevo_predicted_clustering.append(nuevo_topic_counts)
    return nuevo_predicted_clustering


def contar_codigos_servicio(true_labels):
    conteo_servicios = {}
    for servicio in true_labels:
        if servicio in conteo_servicios:
            conteo_servicios[servicio] += 1
        else:
            conteo_servicios[servicio] = 1
    return conteo_servicios

def convertir_claves_a_string(diccionario):
    nuevo_diccionario = {}
    for clave, valor in diccionario.items():
        nueva_clave = str(clave)  # Convierte la clave a string
        nuevo_diccionario[nueva_clave] = valor
    return nuevo_diccionario


def evaluar_modelo(etiquetas_reales, etiquetas_predichas, X=None):
    
    resultados = {}

    resultados["adjusted_rand_score"] = adjusted_rand_score(
        etiquetas_reales, etiquetas_predichas
    )
    resultados["normalized_mutual_info_score"] = normalized_mutual_info_score(
        etiquetas_reales, etiquetas_predichas
    )
    
    resultados["confusion_matrix"] = confusion_matrix(
        etiquetas_reales, etiquetas_predichas
    )
   
    resultados["fowlkes_mallows_score"] = fowlkes_mallows_score(
        etiquetas_reales, etiquetas_predichas
    )
    
    # Métricas B-Cubed
    #Preparar diccionario para cluster indicar documentos asociados a etiquetas reales
    
    predicted_clustering = create_bcubed_predicted_clustering(etiquetas_predichas , etiquetas_reales)
    predicted_clustering_string = convertir_claves_a_string_en_predicted_clustering(predicted_clustering)
   
    #Preparar diccionario con ground_truth_cluster
    
    conteo_servicios = contar_codigos_servicio(etiquetas_reales)
    ground_labels = convertir_claves_a_string(conteo_servicios)
   
    b = Bcubed(ground_truth_clustering=ground_labels, predicted_clustering=predicted_clustering_string)
   
    resultados["bcubed_precision"] = b.bcubed_precision
    resultados["bcubed_recall"] = b.bcubed_recall
    resultados["bcubed_f1"] = b.bcubed_f1
    
    # Métricas de validación interna
    
    if X is not None:
        resultados["davies_bouldin_score"] = davies_bouldin_score(
          X, etiquetas_predichas
      )
        resultados["calinski_harabasz_score"] = calinski_harabasz_score(
          X, etiquetas_predichas
      )
        resultados["silhouette_score"] = silhouette_score(
          X, etiquetas_predichas
      )
        resultados["homogeneity_score"] = homogeneity_score(
        etiquetas_reales, etiquetas_predichas
    )
        resultados["completeness_score"] = completeness_score(
        etiquetas_reales, etiquetas_predichas
    )
        



    return resultados

## Función principal para generar y evaluar modelos 

In [3]:
def generar_y_evaluar_modelos_topicos(etiquetas_reales, modelos=["lda-bi", "hlda", "gsdmm", "bertopic_Roberta","bertopic", "agglomerative_word2vec","agglomerative_tfidf", "kmeans_Tfidf","kmeans_word2vec_100","kmeans_word2vec_150","kmeans_word2vec_200","kmeans_word2vec_250"], num_ejecuciones=5, **kwargs):
    
    for modelo_nombre in modelos:
        resultados_bi[modelo_nombre] = {}
        resultados_bi[modelo_nombre]["etiquetas_predichas"] = []  # Inicializa la lista de etiquetas predichas
        resultados_ejecuciones = []
        tiempos_entrenamiento = []
        random_seeds = [42, 123, 567, 890, 1001]
        modelo = None
        X = None
        if modelo_nombre == "bertopic" or modelo_nombre == "bertopic_Roberta": 
            for ejecucion in range(num_ejecuciones):
                tiempo_inicio = time.time()
                if modelo_nombre == "bertopic":
                    documentos = kwargs.get("documentos")
                    #text_documents = [" ".join(doc) for doc in documentos]
                    vectorizador = TfidfVectorizer(ngram_range=(2, 2))
                    bertopic_model = BERTopic(vectorizer_model=vectorizador,language="spanish",nr_topics=len(set(etiquetas_reales)),verbose=True)
                    etiquetas_predichas, _ = bertopic_model.fit_transform(documentos)
                    # Obtenemos las representaciones vectoriales de BERTopic
                    # embedding_model = SentenceTransformer("all-mpnet-base-v2")
                    # X = embedding_model.encode(documentos)  
                elif modelo_nombre == "bertopic_Roberta":
                    documentos = kwargs.get("documentos")
                    #text_documents = [" ".join(doc) for doc in documentos]
                    # Cargar un modelo RoBERTa pre-entrenado
                    embedding_model = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")
                    bertopic_model = BERTopic(nr_topics=len(set(etiquetas_reales)),verbose=True,embedding_model=embedding_model)
                    etiquetas_predichas, _ = bertopic_model.fit_transform(documentos)
                tiempo_fin = time.time()
                tiempos_entrenamiento.append(tiempo_fin - tiempo_inicio)
                if 'bertopic_model' in locals(): 
                    modelo = bertopic_model
                    nombre_archivo = os.path.join(ruta_guardado, f"{modelo_nombre}_ejecucion_{ejecucion}.pkl") # Cambia la extensión a .pkl
                    with open(nombre_archivo, "wb") as archivo:
                        pickle.dump(modelo, archivo)
                    resultados_ejecuciones.append(evaluar_modelo(etiquetas_reales, etiquetas_predichas, X))
                    resultados_bi[modelo_nombre]["resultados_ejecuciones"] = resultados_ejecuciones

        if modelo_nombre == "agglomerative_word2vec":
            documentos = kwargs.get("documentos")
            agg_model = AgglomerativeClustering(n_clusters=len(set(etiquetas_reales)), linkage='ward')
            etiquetas_predichas = agg_model.fit_predict(documentos)
        elif modelo_nombre == "agglomerative_Tfidf":
            tiempo_inicio = time.time()
            documentos = kwargs.get("documentos")
            vectorizer = TfidfVectorizer()
            documentos_como_cadenas = [" ".join(doc) for doc in documentos]
            # Calcular TF-IDF
            tfidf_matrix = vectorizer.fit_transform(documentos_como_cadenas)
            tfidf_matrix_dense = tfidf_matrix.toarray()
            agg_model = AgglomerativeClustering(n_clusters=len(set(etiquetas_reales)), linkage='ward')
            etiquetas_predichas = agg_model.fit_predict(tfidf_matrix_dense)
            tiempo_fin = time.time()
            tiempos_entrenamiento.append(tiempo_fin - tiempo_inicio)
        if 'agg_model' in locals(): 
            modelo = agg_model
            nombre_archivo = os.path.join(ruta_guardado, f"{modelo_nombre}_ejecucion_0.pkl") # Cambia la extensión a .pkl
            with open(nombre_archivo, "wb") as archivo:
                pickle.dump(modelo, archivo)
                    
            resultados_bi[modelo_nombre]["etiquetas_predichas"].append(etiquetas_predichas)
            resultados_ejecuciones.append(evaluar_modelo(etiquetas_reales, etiquetas_predichas, X))
            resultados_bi[modelo_nombre]["resultados_ejecuciones"] = resultados_ejecuciones
            
        for seed in random_seeds:
            tiempo_inicio = time.time()
           

            if modelo_nombre == "lda-bi":
                corpus = kwargs.get("corpus")
                diccionario = kwargs.get("diccionario")
                if corpus is None:
                    raise ValueError("corpus debe proporcionarse para el modelo LDA.")
                if diccionario is None:
                    raise ValueError("diccionario debe proporcionarse para el modelo GSDMM.")                
                lda_model = models.LdaModel(corpus, num_topics=len(set(etiquetas_reales)), id2word=diccionario, alpha=0.05, eta=0.01, iterations=100, random_state=seed  )
                etiquetas_predichas = [max(lda_model[doc], key=lambda x: x[1])[0] for doc in corpus]
                
                
                          
                
            elif modelo_nombre == "gsdmm":
                n_terms = kwargs.get("n_terms")
                documentos = kwargs.get("documentos")
                if n_terms is None:
                    raise ValueError("n_terms debe proporcionarse para el modelo GSDMM.")
                if documentos is None:
                    raise ValueError("documentos debe proporcionarse para el modelo GSDMM.")    
                gsdmm_model = MovieGroupProcess(K=len(set(etiquetas_reales)), n_iters=30, alpha=0.1, beta=0.1)
                y = gsdmm_model.fit(documentos,n_terms)
                etiquetas_predichas = np.array(y)
                
         
          
                            
            elif modelo_nombre == "kmeans_Tfidf":
                documentos = kwargs.get("documentos")
                vectorizador = TfidfVectorizer(ngram_range=(2, 2))
                documentos_como_cadenas = [" ".join(doc) for doc in documentos]
                # Calcular TF-IDF
                tfidf_matrix = vectorizador.fit_transform(documentos_como_cadenas)
                kmeans_model = KMeans(n_clusters=len(set(etiquetas_reales)), max_iter=100,random_state=seed )
                etiquetas_predichas = kmeans_model.fit_predict(tfidf_matrix)
            elif modelo_nombre == "kmeans_word2vec" or modelo_nombre == "kmeans_word2vec_100" or modelo_nombre == "kmeans_word2vec_150" or modelo_nombre == "kmeans_word2vec_200" or modelo_nombre == "kmeans_word2vec_250":
                documentos = kwargs.get("documentos")
                kmeans_model = KMeans(n_clusters=len(set(etiquetas_reales)), max_iter=100, random_state=seed )
                etiquetas_predichas = kmeans_model.fit_predict(documentos)                

            tiempo_fin = time.time()
            tiempos_entrenamiento.append(tiempo_fin - tiempo_inicio)
            
        # Guardar el modelo
            if 'lda_model' in locals(): modelo = lda_model
            elif 'hlda_model' in locals(): modelo = hlda_model
            elif 'gsdmm_model' in locals(): modelo = gsdmm_model
            elif 'kmeans_model' in locals(): modelo = kmeans_model
                 
            # Guardar el modelo
            if modelo is not None:
                nombre_archivo = os.path.join(ruta_guardado, f"{modelo_nombre}_ejecucion_{seed}.pkl") # Cambia la extensión a .pkl
                with open(nombre_archivo, "wb") as archivo:
                    pickle.dump(modelo, archivo)
                    
            resultados_bi[modelo_nombre]["etiquetas_predichas"].append(etiquetas_predichas)
                
            resultados_ejecuciones.append(evaluar_modelo(etiquetas_reales, etiquetas_predichas, X))
            resultados_bi[modelo_nombre]["resultados_ejecuciones"] = resultados_ejecuciones
            
    # Promediar y calcular la desviación estándar de los resultados
        for metrica in resultados_ejecuciones[0]:
            valores_metrica = [resultado[metrica] for resultado in resultados_ejecuciones]
            resultados_bi[modelo_nombre][metrica] = {
                "promedio": np.mean(valores_metrica),
                "desviacion_estandar": np.std(valores_metrica)
            }

        resultados_bi[modelo_nombre]["tiempo_entrenamiento"] = {
            "promedio": np.mean(tiempos_entrenamiento),
            "desviacion_estandar": np.std(tiempos_entrenamiento)
        }
    guardar_objeto(resultados_bi, "resultados_modelos_bi.pkl")   
    return resultados_bi

### Función para visualizar resultados 

In [4]:
def generar_tabla_metricas_fijas2(metricas, metricas_deseadas, modelos_deseados=None):
    tabla_html = "<table>\n"
    tabla_html += "  <tr>\n"
    tabla_html += "    <th>Métrica</th>\n"

    # Determinar qué modelos mostrar
    modelos_a_mostrar = modelos_deseados if modelos_deseados else list(metricas.keys())

    for modelo in modelos_a_mostrar:
        tabla_html += f"    <th>{modelo.upper()}</th>\n"
    tabla_html += "  </tr>\n"

    for metrica in metricas_deseadas:
        tabla_html += "  <tr>\n"
        tabla_html += f"    <td>{metrica.replace('_', ' ').title()}</td>\n"
        for modelo in modelos_a_mostrar:
            if metrica in metricas[modelo]:
                promedio = metricas[modelo][metrica]['promedio']
                desviacion = metricas[modelo][metrica]['desviacion_estandar']
                tabla_html += f"    <td>{promedio:.4f} ± {desviacion:.4f}</td>\n"
            else:
                tabla_html += "    <td>-</td>\n"  # Si la métrica no está para un modelo
        tabla_html += "  </tr>\n"

    tabla_html += "</table>"
    return tabla_html

# Ejemplo de uso con modelos específicos:
metricas_deseadas = [
    'adjusted_rand_score',
    'normalized_mutual_info_score',
    'fowlkes_mallows_score',
    'bcubed_precision',
    'bcubed_recall',
    'bcubed_f1',
    'tiempo_entrenamiento'
]



# carga de datos

In [5]:
#Cargar datos, diccionario y corpus preparados en preprocesado 

df_preprocesado = pd.read_csv('df_preprocesado.csv', sep=';')
dictionary = Dictionary.load('dictionary_filtrado_bi.gensim')
corpus = MmCorpus('corpus_bi.mm')


#Cargar etiquetas de servicios asociados a documentos

true_labels = df_preprocesado['code_service_unificado'].tolist()

#Iniciarlizar resultados y documentos

documentos=[]
resultados_bi = {}

#Preparar ruta para guardado

ruta_guardado = "./modelos_topicos_bi"
if not os.path.exists(ruta_guardado):
    os.makedirs(ruta_guardado)
 

In [6]:
#preparar documentos para partir de las mismas condiciones 
filtered_words = set(dictionary.token2id.keys())
n_term=len(filtered_words)
preprocesado_documents=df_preprocesado['bigramas_preprocesado'].apply(ast.literal_eval)    
documents = filter_documents_for_dictionary(preprocesado_documents, filtered_words)  

## Generar y evaluar modelos 

### LDA

In [23]:
resultados_bi = generar_y_evaluar_modelos_topicos(true_labels, modelos=["lda-bi"],diccionario=dictionary, corpus=corpus)      

In [24]:
modelos_a_mostrar = ['lda-bi'] # Solo muestra estos modelos.

tabla_html = generar_tabla_metricas_fijas2(resultados_bi, metricas_deseadas, modelos_a_mostrar)

In [25]:
from IPython.display import display, HTML
display(HTML(tabla_html))

Métrica,LDA-BI
Adjusted Rand Score,0.0473 ± 0.0066
Normalized Mutual Info Score,0.1181 ± 0.0025
Fowlkes Mallows Score,0.1025 ± 0.0084
Bcubed Precision,0.1762 ± 0.0036
Bcubed Recall,0.0673 ± 0.0056
Bcubed F1,0.0766 ± 0.0053
Tiempo Entrenamiento,19.8727 ± 0.4344


### GSDMM

In [21]:
with open('resultados_modelos.pkl', 'rb') as archivo:
    resultados = pickle.load(archivo)

In [27]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["gsdmm"],n_terms=n_term,documentos=documents)

In stage 0: transferred 60690 clusters with 61 clusters populated
In stage 1: transferred 38670 clusters with 61 clusters populated
In stage 2: transferred 21536 clusters with 61 clusters populated
In stage 3: transferred 14021 clusters with 61 clusters populated
In stage 4: transferred 10702 clusters with 61 clusters populated
In stage 5: transferred 9327 clusters with 61 clusters populated
In stage 6: transferred 8624 clusters with 61 clusters populated
In stage 7: transferred 8194 clusters with 61 clusters populated
In stage 8: transferred 8004 clusters with 61 clusters populated
In stage 9: transferred 7563 clusters with 61 clusters populated
In stage 10: transferred 7179 clusters with 61 clusters populated
In stage 11: transferred 7081 clusters with 61 clusters populated
In stage 12: transferred 6828 clusters with 61 clusters populated
In stage 13: transferred 6736 clusters with 61 clusters populated
In stage 14: transferred 6538 clusters with 61 clusters populated
In stage 15: tr

In stage 5: transferred 9045 clusters with 61 clusters populated
In stage 6: transferred 8287 clusters with 61 clusters populated
In stage 7: transferred 7689 clusters with 61 clusters populated
In stage 8: transferred 7382 clusters with 61 clusters populated
In stage 9: transferred 7186 clusters with 61 clusters populated
In stage 10: transferred 7002 clusters with 61 clusters populated
In stage 11: transferred 6731 clusters with 61 clusters populated
In stage 12: transferred 6687 clusters with 61 clusters populated
In stage 13: transferred 6614 clusters with 61 clusters populated
In stage 14: transferred 6623 clusters with 61 clusters populated
In stage 15: transferred 6539 clusters with 61 clusters populated
In stage 16: transferred 6470 clusters with 61 clusters populated
In stage 17: transferred 6490 clusters with 61 clusters populated
In stage 18: transferred 6374 clusters with 61 clusters populated
In stage 19: transferred 6385 clusters with 61 clusters populated
In stage 20: tr

In [30]:
modelos_a_mostrar = ['gsdmm'] # Solo muestra estos modelos.
tabla5_html = generar_tabla_metricas_fijas2(resultados, metricas_deseadas, modelos_a_mostrar)
display(HTML(tabla5_html))

Métrica,GSDMM
Adjusted Rand Score,0.1913 ± 0.0103
Normalized Mutual Info Score,0.3507 ± 0.0034
Fowlkes Mallows Score,0.2638 ± 0.0109
Bcubed Precision,0.3857 ± 0.0046
Bcubed Recall,0.2024 ± 0.0123
Bcubed F1,0.2104 ± 0.0095
Tiempo Entrenamiento,2017.8562 ± 18.9466


### BERTOPIC

In [16]:
with open('resultados_modelos.pkl', 'rb') as archivo:
    resultados = pickle.load(archivo)

In [16]:
#Si utilizamos documentos originales
documents_original=df_preprocesado['texto']
documents_original

0        Equipo Flip-Flap Me dirijo a ustedes aprovecha...
1        Instalación deportiva para gimnasia deportiva ...
2        Duda Permisos Hola, escribo en nombre de un gr...
3        Noches Jueves viernes y sábado Cada vez es más...
4        Falta de instalaciones deportivas Somos una fa...
                               ...                        
66450    Reposición o reparación de cubo de basura Buen...
66451    Baldosas defectuosas Justo en la puerta de la ...
66452    Aviso de que reclamación sigue sin respuesta  ...
66453    Recogida de basura  Buenos dias, muestro mi ma...
66454    Residencia xiort pontoneros- madre Rafols Buen...
Name: texto, Length: 66455, dtype: object

In [21]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["bertopic"],documentos=documents_original)

2025-03-30 14:01:47,282 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2077 [00:00<?, ?it/s]

2025-03-30 14:15:10,990 - BERTopic - Embedding - Completed ✓
2025-03-30 14:15:10,990 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-30 14:15:22,920 - BERTopic - Dimensionality - Completed ✓
2025-03-30 14:15:22,920 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-30 14:15:25,936 - BERTopic - Cluster - Completed ✓
2025-03-30 14:15:25,952 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-30 14:15:32,978 - BERTopic - Representation - Completed ✓
2025-03-30 14:15:32,993 - BERTopic - Topic reduction - Reducing number of topics
2025-03-30 14:15:40,043 - BERTopic - Topic reduction - Reduced number of topics from 499 to 61
  self._set_arrayXarray(i, j, x)
2025-03-30 14:15:45,470 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2077 [00:00<?, ?it/s]

2025-03-30 14:29:26,691 - BERTopic - Embedding - Completed ✓
2025-03-30 14:29:26,691 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-30 14:29:38,610 - BERTopic - Dimensionality - Completed ✓
2025-03-30 14:29:38,610 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-30 14:29:41,432 - BERTopic - Cluster - Completed ✓
2025-03-30 14:29:41,432 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-30 14:29:48,607 - BERTopic - Representation - Completed ✓
2025-03-30 14:29:48,622 - BERTopic - Topic reduction - Reducing number of topics
2025-03-30 14:29:55,999 - BERTopic - Topic reduction - Reduced number of topics from 490 to 61
  self._set_arrayXarray(i, j, x)
2025-03-30 14:30:01,313 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2077 [00:00<?, ?it/s]

2025-03-30 14:43:51,480 - BERTopic - Embedding - Completed ✓
2025-03-30 14:43:51,480 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-30 14:44:03,297 - BERTopic - Dimensionality - Completed ✓
2025-03-30 14:44:03,297 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-30 14:44:06,102 - BERTopic - Cluster - Completed ✓
2025-03-30 14:44:06,102 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-30 14:44:13,480 - BERTopic - Representation - Completed ✓
2025-03-30 14:44:13,496 - BERTopic - Topic reduction - Reducing number of topics
2025-03-30 14:44:20,986 - BERTopic - Topic reduction - Reduced number of topics from 505 to 61
  self._set_arrayXarray(i, j, x)
2025-03-30 14:44:26,507 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2077 [00:00<?, ?it/s]

2025-03-30 14:58:04,185 - BERTopic - Embedding - Completed ✓
2025-03-30 14:58:04,185 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-30 14:58:16,005 - BERTopic - Dimensionality - Completed ✓
2025-03-30 14:58:16,005 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-30 14:58:18,782 - BERTopic - Cluster - Completed ✓
2025-03-30 14:58:18,782 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-30 14:58:30,895 - BERTopic - Representation - Completed ✓
2025-03-30 14:58:30,907 - BERTopic - Topic reduction - Reducing number of topics
2025-03-30 14:58:43,114 - BERTopic - Topic reduction - Reduced number of topics from 487 to 61
  self._set_arrayXarray(i, j, x)
2025-03-30 14:58:48,691 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2077 [00:00<?, ?it/s]

2025-03-30 15:12:27,904 - BERTopic - Embedding - Completed ✓
2025-03-30 15:12:27,904 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-03-30 15:12:39,880 - BERTopic - Dimensionality - Completed ✓
2025-03-30 15:12:39,880 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-03-30 15:12:42,688 - BERTopic - Cluster - Completed ✓
2025-03-30 15:12:42,688 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-03-30 15:12:50,172 - BERTopic - Representation - Completed ✓
2025-03-30 15:12:50,187 - BERTopic - Topic reduction - Reducing number of topics
2025-03-30 15:12:57,687 - BERTopic - Topic reduction - Reduced number of topics from 525 to 61
  self._set_arrayXarray(i, j, x)


In [22]:
modelos_a_mostrar = ['bertopic'] # Solo muestra estos modelos.

tabla_html2 = generar_tabla_metricas_fijas2(resultados, metricas_deseadas, modelos_a_mostrar)
display(HTML(tabla_html2))

Métrica,BERTOPIC
Adjusted Rand Score,0.0630 ± 0.0033
Normalized Mutual Info Score,0.2867 ± 0.0049
Fowlkes Mallows Score,0.2048 ± 0.0017
Bcubed Precision,0.2646 ± 0.0110
Bcubed Recall,0.3376 ± 0.0052
Bcubed F1,0.1983 ± 0.0080
Tiempo Entrenamiento,425.8352 ± 425.8867


### Aglomerative

In [11]:
with open('resultados_modelos_bi.pkl', 'rb') as archivo:
    resultados_bi = pickle.load(archivo)

In [22]:
with open('all_document_vectors.pkl', 'rb') as archivo:
    all_document_vectors = pickle.load(archivo)

In [24]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["agglomerative_word2vec"],documentos=all_document_vectors_bi["w2v_250_epochs"])

In [34]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["agglomerative_Tfidf"],documentos=documents)

### Kmeans_word2vec

In [8]:
with open('resultados_modelos.pkl', 'rb') as archivo:
    resultados = pickle.load(archivo)

#### Generar modelos partiendo de Word2Vec entre 100 y 250 epocs

In [7]:
epochs_to_test = [100, 150, 200, 250]  # Números de épocas a probar
all_document_vectors_bi = {}  # Diccionario para almacenar los vectores de documentos

for epochs in epochs_to_test:
    model_name = f"w2v_{epochs}_epochs"
    model = Word2Vec(
        sentences=documents, vector_size=100, window=5, min_count=1, workers=4, seed=42
    )
    model.train(documents, total_examples=len(documents), epochs=epochs)
    model.save(f"w2v_{epochs}_epochs.model") #guardamos el modelo.
    document_vectors = get_document_vectors(model, documents)
    document_vectors = [vector for vector in document_vectors if vector is not None]
    document_vectors = np.array(document_vectors)
    all_document_vectors_bi[model_name] = document_vectors  # Guardamos los vectores de documentos.

In [8]:
len(all_document_vectors_bi["w2v_100_epochs"])

66455

In [17]:
with open('all_document_vectors_bi.pkl', "wb") as archivo:
                    pickle.dump(all_document_vectors_bi, archivo)

In [9]:
resultados_word2vec={}

In [14]:
resultados_word2vec["kmeans_word2vec_100"] = generar_y_evaluar_modelos_topicos(true_labels, modelos=["kmeans_word2vec_100"],documentos=all_document_vectors_bi["w2v_100_epochs"])

In [15]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["kmeans_word2vec_150"],documentos=all_document_vectors_bi["w2v_150_epochs"])

In [16]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["kmeans_word2vec_200"],documentos=all_document_vectors_bi["w2v_200_epochs"])

In [18]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["kmeans_word2vec_250"],documentos=all_document_vectors_bi["w2v_250_epochs"])

In [21]:
from IPython.display import display, HTML
modelos_a_mostrar = ['kmeans_word2vec_100','kmeans_word2vec_150','kmeans_word2vec_200','kmeans_word2vec_250'] # Solo muestra estos modelos.

tabla_html = generar_tabla_metricas_fijas2(resultados_bi, metricas_deseadas, modelos_a_mostrar)
display(HTML(tabla_html))

Métrica,KMEANS_WORD2VEC_100,KMEANS_WORD2VEC_150,KMEANS_WORD2VEC_200,KMEANS_WORD2VEC_250
Adjusted Rand Score,0.1273 ± 0.0019,0.1247 ± 0.0035,0.1240 ± 0.0070,0.1239 ± 0.0032
Normalized Mutual Info Score,0.3574 ± 0.0041,0.3626 ± 0.0023,0.3608 ± 0.0027,0.3637 ± 0.0045
Fowlkes Mallows Score,0.1948 ± 0.0017,0.1922 ± 0.0035,0.1932 ± 0.0082,0.1927 ± 0.0025
Bcubed Precision,0.4090 ± 0.0031,0.4138 ± 0.0034,0.4131 ± 0.0028,0.4146 ± 0.0032
Bcubed Recall,0.1444 ± 0.0084,0.1443 ± 0.0074,0.1396 ± 0.0037,0.1415 ± 0.0039
Bcubed F1,0.1688 ± 0.0041,0.1671 ± 0.0048,0.1663 ± 0.0053,0.1658 ± 0.0035
Tiempo Entrenamiento,0.8057 ± 0.0283,0.6846 ± 0.0911,0.7843 ± 0.0628,0.8212 ± 0.0096


### KMEANS tfidf

In [12]:
resultados = generar_y_evaluar_modelos_topicos(true_labels, modelos=["kmeans_Tfidf"],documentos=documents)

In [14]:
modelos_a_mostrar = ['kmeans_Tfidf'] # Solo muestra estos modelos.

tabla_html = generar_tabla_metricas_fijas2(resultados, metricas_deseadas, modelos_a_mostrar)

In [15]:
from IPython.display import display, HTML
display(HTML(tabla_html))

Métrica,KMEANS_TFIDF
Adjusted Rand Score,0.0212 ± 0.0090
Normalized Mutual Info Score,0.2060 ± 0.0255
Fowlkes Mallows Score,0.2483 ± 0.0716
Bcubed Precision,0.2149 ± 0.0454
Bcubed Recall,0.6654 ± 0.2527
Bcubed F1,0.1821 ± 0.0306
Tiempo Entrenamiento,23.6173 ± 6.8145
