# Clusterización de Tópicos
Para la identificación de conjuntos de tópicos con temática relacionada. Actualizado al 12/04/2024

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import pymysql
from dotenv import load_dotenv
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Cargar variables de entorno
def cargar_variables():
    """Build the MySQL connection settings from environment variables.

    ``load_dotenv()`` is called first so that any variables it loads are
    visible to ``os.getenv``. Each setting falls back to a hard-coded
    default when its variable is not set.

    Returns:
        dict: keyword arguments for ``pymysql.connect`` with the keys
        ``host``, ``port`` (converted to ``int``), ``user``, ``passwd``
        and ``db``.
    """
    load_dotenv()
    leer = os.getenv
    return dict(
        host=leer("DB_HOST", "localhost"),
        port=int(leer("DB_PORT", 3306)),
        user=leer("DB_USER", "root"),
        passwd=leer("DB_PASS", "Password"),
        db=leer("DB_NAME", "BaseDeDatos"),
    )

# Conectar a MySQL y extraer datos
def obtener_datos():
    """Fetch the ``id`` and ``palabras`` columns of every topic row.

    Rows are requested in ascending ``id`` order. Without an ORDER BY the
    server may return rows in any order, and later steps in this file map
    a row's position back to its ``id``, so the order must be deterministic.

    Returns:
        pandas.DataFrame: one row per topic with columns ``id`` and
        ``palabras``. The columns are present even when the table is empty.
    """
    config = cargar_variables()
    conn = pymysql.connect(**config, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "SELECT id, palabras FROM siglodb_topicos_concatenados ORDER BY id;"
            )
            filas = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()
    # Explicit columns keep df['palabras'] valid for an empty result set.
    return pd.DataFrame(list(filas), columns=["id", "palabras"])

# Preprocesamiento de texto
def preparar_texto(df):
    """Return the ``palabras`` column as whitespace-normalized strings.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is removed. Missing values (``None``/NaN) become an empty
    string instead of raising, so the output keeps one entry per input row
    and positions stay aligned with ``df``.

    Args:
        df: DataFrame with a ``palabras`` column.

    Returns:
        list[str]: one cleaned string per row, in row order.
    """
    textos = []
    for valor in df['palabras'].tolist():
        if not isinstance(valor, str):
            # Missing values have no .split(); other non-strings are stringified.
            valor = "" if pd.isna(valor) else str(valor)
        textos.append(' '.join(valor.split()))
    return textos

# Vectorización y clustering
def encontrar_mejor_kmeans(textos, k_min=2, k_max=49):
    """Vectorize the texts with TF-IDF and pick the KMeans k with the best silhouette.

    Args:
        textos: sequence of documents (strings).
        k_min: smallest number of clusters to try (values below 2 are
            raised to 2, since a silhouette needs at least two clusters).
        k_max: largest number of clusters to try. The effective upper bound
            is also capped at ``n_samples - 1``.

    Returns:
        tuple: ``(best_k, best_score, best_labels, X)`` where ``X`` is the
        TF-IDF matrix. If no k could be evaluated (too few documents), the
        result is ``(-1, -1, None, X)``.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(textos)
    best_score, best_k, best_labels = -1, -1, None

    # KMeans needs n_clusters <= n_samples and silhouette_score needs
    # n_labels <= n_samples - 1; a fixed range would crash on small inputs.
    n_muestras = X.shape[0]
    k_tope = min(k_max, n_muestras - 1)

    for k in range(max(k_min, 2), k_tope + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(X)
        if len(set(labels)) < 2:
            # Degenerate partition (e.g. all documents identical): the
            # silhouette is undefined for a single cluster.
            continue
        score = silhouette_score(X, labels)
        if score > best_score:
            best_score, best_k, best_labels = score, k, labels

    return best_k, best_score, best_labels, X

# Evaluar similaridad interna de los clusters
def evaluar_similaridad(textos, labels, X):
    """Compute the mean pairwise cosine similarity inside each cluster.

    Args:
        textos: unused; kept so existing callers keep working.
        labels: cluster label per row of ``X`` (or ``None`` when no
            clustering was produced).
        X: feature matrix whose rows are indexed by position.

    Returns:
        list[tuple]: ``(cluster, avg_sim, indices)`` for every cluster with
        at least two members, in ascending cluster order. ``avg_sim`` is a
        plain ``float`` and ``indices`` are row positions into ``X``.
    """
    if labels is None:
        return []

    # Group row positions by label in one pass instead of rescanning
    # the labels once per cluster.
    grupos = {}
    for pos, etiqueta in enumerate(labels):
        etiqueta = int(etiqueta)
        if etiqueta < 0:
            # Only labels 0..max were ever treated as clusters.
            continue
        grupos.setdefault(etiqueta, []).append(pos)

    resultados = []
    for c in sorted(grupos):
        indices = grupos[c]
        n = len(indices)
        if n < 2:
            # A single member has no pairs to average.
            continue
        sim_matrix = cosine_similarity(X[indices])
        # Remove the self-similarities using the actual diagonal: it is not
        # always n, because an all-zero row has self-similarity 0.
        avg_sim = (sim_matrix.sum() - sim_matrix.trace()) / (n * (n - 1))
        resultados.append((c, float(avg_sim), indices))
    return resultados

# Subir resultados a MySQL
def subir_resultados(cluster_info, labels, ids=None, umbral=0.15):
    """Store cluster number and mean similarity for sufficiently cohesive clusters.

    Args:
        cluster_info: iterable of ``(cluster_num, avg_sim, indices)`` where
            ``indices`` are row positions in the data that was clustered.
        labels: unused; kept so existing callers keep working.
        ids: optional sequence mapping a row position to the ``id`` of that
            row in the table. When omitted, the legacy assumption
            ``id == position + 1`` is used, which only holds if ids are
            exactly 1..n in row order.
        umbral: clusters whose ``avg_sim`` is not strictly greater than this
            value are skipped.

    NOTE(review): rows of skipped clusters are not touched, so values from an
    earlier run may remain in the table — confirm whether that is intended.
    """
    parametros = []
    for cluster_num, avg_sim, indices in cluster_info:
        if avg_sim <= umbral:
            continue
        for i in indices:
            id_fila = ids[i] if ids is not None else i + 1
            # Plain int/float so the driver does not receive numpy scalars.
            parametros.append((int(cluster_num), float(avg_sim), id_fila))

    config = cargar_variables()
    conn = pymysql.connect(**config, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
    try:
        with conn.cursor() as cursor:
            cursor.executemany("""
                            UPDATE siglodb_topicos_concatenados
                            SET cluster = %s, similaridad = %s
                            WHERE id = %s
                        """, parametros)
        conn.commit()
    finally:
        # Always release the connection, even if an update fails.
        conn.close()

# Main entry point
def main():
    """Run the pipeline: load, clean, cluster, evaluate and store the results."""
    df = obtener_datos()
    textos = preparar_texto(df)
    best_k, best_score, labels, X = encontrar_mejor_kmeans(textos)
    print(f"Mejor número de clusters: {best_k}, Silhouette: {best_score:.4f}")
    if labels is None:
        # No clustering was produced, so there is nothing to evaluate or store.
        return
    cluster_info = evaluar_similaridad(textos, labels, X)
    # Pass the real table ids so updates do not rely on ids being 1..n.
    subir_resultados(cluster_info, labels, ids=df['id'].tolist())

In [None]:
# Run the pipeline only when this code is executed as the main program.
if __name__ == "__main__":
    main()