In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.utils import resample
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import umap.umap_ as umap_
import hdbscan
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
import re
from sklearn.preprocessing import MinMaxScaler

# Supondo que 'X_tsne' seja a matriz de embeddings do t-SNE
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score


from unidecode import unidecode

from sklearn.feature_extraction.text import CountVectorizer
from mpl_toolkits import mplot3d
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import os
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
import warnings
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Pick the ``n`` highest-scoring vocabulary terms for every topic.

    Parameters
    ----------
    tf_idf : array indexed as ``[word, topic]``; it is transposed here so
        that each row holds the scores of one topic.
    count : fitted vectorizer exposing ``get_feature_names_out()``.
    docs_per_topic : object with a ``Topic`` attribute listing the topic
        labels in the same order as the columns of ``tf_idf``.
    n : how many terms to keep per topic (taken with a ``-n:`` slice).

    Returns
    -------
    dict mapping each topic label to a list of ``(word, score)`` tuples,
    best score first.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n columns are the n largest scores.
    ranking = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, topic in enumerate(list(docs_per_topic.Topic)):
        best_first = ranking[row][::-1]
        top_n_words[topic] = [
            (vocabulary[col], scores_by_topic[row][col]) for col in best_first
        ]
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents assigned to each topic, largest topic first.

    ``df`` must have a ``Topic`` column and a ``Doc`` column. The result is
    a DataFrame with columns ``Topic`` and ``Size`` (number of non-null
    ``Doc`` values per topic), sorted by ``Size`` in descending order.
    """
    docs_per_topic = df.groupby(['Topic']).Doc.count()
    size_table = docs_per_topic.reset_index().rename(columns={"Doc": "Size"})
    return size_table.sort_values("Size", ascending=False)


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute a class-based TF-IDF matrix over ``documents``.

    Parameters
    ----------
    documents : iterable of strings; each string is treated as one class
        (the caller in this notebook passes one concatenated text per topic).
    m : numerator of the IDF ratio (the caller passes the total number of
        original documents).
    ngram_range : forwarded to ``CountVectorizer``.

    Returns
    -------
    (tf_idf, count) where ``tf_idf`` is indexed as ``[term, class]`` and
    ``count`` is the fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range).fit(documents)
    term_counts = count.transform(documents).toarray()

    # Term frequency: counts normalised by the number of tokens in each class.
    tokens_per_class = term_counts.sum(axis=1)
    tf = term_counts.T / tokens_per_class

    # Inverse "document" frequency: log(m / total occurrences of the term).
    occurrences_per_term = term_counts.sum(axis=0)
    idf = np.log(m / occurrences_per_term).reshape(-1, 1)

    return tf * idf, count

def map_labels_to_colors(labels,cmap_):
    """Return one colormap color per entry of ``labels``.

    The colormap named ``cmap_`` is sampled at ``max(labels) + 1`` evenly
    spaced positions in [0, 1]; ``labels`` is then used to index into that
    palette, so it must contain non-negative integers.
    """
    colormap = plt.get_cmap(cmap_)
    palette = colormap(np.linspace(0, 1, np.max(labels) + 1))
    return palette[labels]


def gera_comparacao(list_preprossing_,cluster_):
    """Return the top c-TF-IDF term of every cluster, for use as legend text.

    Parameters
    ----------
    list_preprossing_ : sequence of preprocessed document strings.
    cluster_ : cluster label of each document, aligned with
        ``list_preprossing_``.

    Returns
    -------
    list of ``(word, score)`` tuples, one per distinct label, in the order
    the labels appear in the ``groupby`` result (ascending label with the
    default ``sort=True``).
    """
    docs_df = pd.DataFrame(list_preprossing_,columns=["Doc"])
    docs_df['Topic'] = cluster_

    # Concatenate all documents of a cluster into one "class document".
    docs_per_topic = docs_df.groupby(["Topic"],as_index=False).agg({"Doc":' '.join})
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(list_preprossing_))
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=5)

    # Only the single best term per cluster is used; the unused Doc_ID column
    # and topic-size table computed by the previous version were dropped.
    return [ranked_terms[0] for ranked_terms in top_n_words.values()]




def preprocess_text(text):
    """Normalise a raw text before embedding.

    Steps, in order: lowercase; drop every character that is neither a word
    character nor whitespace; drop anything outside ASCII letters/digits,
    the listed accented letters and whitespace; finally transliterate to
    plain ASCII with ``unidecode``.
    """
    lowered = text.lower()

    # Punctuation removal.
    without_punct = re.sub(r'[^\w\s]', '', lowered)

    # Special-character removal that still keeps accented letters at this stage.
    allowed_only = re.sub(r'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôûÂÊÎÔÛàèìòùÀÈÌÒÙãõñÃÕÑçÇ\s]+', '', without_punct)

    return unidecode(allowed_only)




In [4]:
# Load the spreadsheet; every cell is read as a string and the first
# spreadsheet column becomes the index.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
MEDICAMENTOS_XLSX = "/home/rafael/Documentos/FACOM/Douturado/Doutorado/webcrawler/medicamentos.xlsx"
df_train = pd.read_excel(MEDICAMENTOS_XLSX, index_col=0, dtype=str)
# Not the last expression of the cell, so this preview is not displayed.
df_train.head()
# Accumulator filled by the next cell.
list_preprossing = []

In [5]:
# Build the corpus: one preprocessed text per row, made of the first two
# columns of df_train joined by a space.
#
# Missing cells are skipped instead of being stringified to "nan". The former
# `!= 'nan'` filter could never match, because every value contained the
# joining space, so "nan" tokens leaked into the texts. Rows left with no text
# are dropped. The list is rebuilt from scratch so the cell can be re-run.
def _is_missing(cell):
    """True for NaN/None cells and for cells already stringified to 'nan'."""
    return pd.isna(cell) or str(cell).strip().lower() == 'nan'


corpus_texts = []
for _, row in df_train.iloc[:, :2].iterrows():
    fields = [str(cell) for cell in row if not _is_missing(cell)]
    text = preprocess_text(" ".join(fields))
    if text.strip():
        corpus_texts.append(text)

list_preprossing = np.array(corpus_texts)
print(list_preprossing)

['cafeina  carisoprodol  diclofenaco sodico  paracetamol trimusk cafeina  carisoprodol  diclofenaco sodico  paracetamol e indicado para o tratamento de reumatismo nas suas formas inflamatoriodegenerativas agudas e cronicas crise aguda de gota estados inflamatorios agudos postraumaticos e poscirurgicos exacerbacoes agudas de artrite reumatoide ou outras artropatias reumaticas osteoartrites e estados agudos de reumatismo nos tecidos extraarticulares quadros de lombalgias ou lombociatalgias trimusk cafeina  carisoprodol  diclofenaco sodico  paracetamol e indicado como coadjuvante em processos inflamatorios graves decorrentes de quadros infecciosos'
 'probiotico bb12 bifidobacterium animalis subsp lactis suplemento alimentar de bifidobacterium animalis subsp lactis bb12 dsm 15954 em solucao gotas que contribui com a saude do trato gastrointestinal'
 'enoxaparina sodica versa enoxaparina sodica e indicado no  tratamento da trombose oclusao por trombo de veias profundas ja estabelecida com o

In [6]:
# Sentence-embedding model used below to encode the preprocessed texts.
# Commented-out alternative (different checkpoint, explicit CUDA device):
# embedder = SentenceTransformer('distilbert-base-nli-mean-tokens',device='cuda')
embedder = SentenceTransformer('all-mpnet-base-v2')


In [7]:
# Encode the preprocessed texts with the sentence-transformer, showing a progress bar.
corpus_embeddings = embedder.encode(list_preprossing,show_progress_bar=True)


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

In [8]:
# Three 2-D projections of the sentence embeddings, compared side by side below.
tsne = TSNE(random_state = 42, n_components=2,perplexity=5,metric='cosine')
pca = PCA(n_components=2)
# UMAP is stochastic; it is seeded like t-SNE so a fresh run reproduces the
# same layout (and therefore the same downstream clusters and figures).
umap= umap_.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42)


In [9]:

# Project the sentence embeddings to two dimensions with each reducer.
embeddings2d_pca = pca.fit_transform(corpus_embeddings)
embeddings2d_tsne = tsne.fit_transform(corpus_embeddings)
embeddings2d_umap =  umap.fit_transform(corpus_embeddings)


In [10]:
# Matplotlib colormap name shared by the cluster scatter plots and their legend colors.
color_maps = 'tab20'

In [11]:
# Projection name -> 2-D embedding. The name is also used as the output
# directory and file prefix by the plotting cells.
# Fix: 'umap' previously pointed at the t-SNE embedding, so every "umap"
# figure and score was actually a duplicate of the t-SNE one.
embeddings2d_dic ={'pca':embeddings2d_pca,'tsne':embeddings2d_tsne,'umap':embeddings2d_umap}

In [12]:
import numpy as np
from sklearn.metrics import pairwise_distances

# Fraction of a cluster's mean member-to-centroid distance used as circle radius.
radius_factor = 0.2

# For every projection, run KMeans for k = 4, 8, 12, 16 and save a scatter plot
# with centroids, a spread circle per cluster and the top term of each cluster.
for key in embeddings2d_dic:
    print(key)
    # savefig does not create missing directories.
    os.makedirs(f"./{key}", exist_ok=True)
    for k in range(4, 20, 4):
        # Seeded so that re-running the notebook reproduces the same figures.
        kmeans_ = KMeans(n_clusters=k, n_init=40, random_state=42).fit(embeddings2d_dic[key])
        result = pd.DataFrame(embeddings2d_dic[key], columns=['x', 'y'])
        result['labels'] = kmeans_.labels_
        centroids = kmeans_.cluster_centers_

        labels = gera_comparacao(list_preprossing, kmeans_.labels_)
        fig, ax = plt.subplots(figsize=(20, 10))
        points = ax.scatter(result.x, result.y, c=result.labels, cmap=color_maps)

        # distances[i, c] = distance from sample i to centroid c.
        distances = pairwise_distances(result[['x', 'y']], centroids)
        for cluster_id, centroid in enumerate(centroids):
            # Mean distance between this centroid and the points assigned to
            # it. The previous code averaged over centroids per sample and then
            # zipped those per-sample values with the centroids, so each circle
            # got an unrelated radius.
            members = kmeans_.labels_ == cluster_id
            mean_distance = distances[members, cluster_id].mean() if members.any() else 0.0
            circle = plt.Circle((centroid[0], centroid[1]),
                                radius=mean_distance * radius_factor,
                                color='red', fill=False)
            ax.add_artist(circle)
            # Marker at the centroid itself.
            ax.scatter(centroid[0], centroid[1], c='black', s=100, marker='x')

        # Bind the colorbar to the cluster scatter explicitly; a bare
        # plt.colorbar() picks the most recent scatter, i.e. a centroid marker.
        fig.colorbar(points, ax=ax)
        legend_colors = map_labels_to_colors(np.arange(len(labels)), color_maps)
        legend_elements = [mpatches.Patch(color=color, label=label)
                           for color, label in zip(legend_colors, labels)]
        # A single legend call; the second one used to replace the first.
        ax.legend(handles=legend_elements)
        fig.savefig(f"./{key}/{key}-kmeans-{k}.png")
        plt.close(fig)



pca
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
tsne
umap


In [13]:
# For every projection, run HDBSCAN with min_cluster_size = 5..8 and save a
# scatter plot of the clustered points (noise, label -1, is not drawn).
for key in embeddings2d_dic:
    print(key)
    # savefig does not create missing directories.
    os.makedirs(f"./{key}", exist_ok=True)
    for k in range (5,9):
        cluster = hdbscan.HDBSCAN(min_cluster_size=k,
                              metric='euclidean',
                              cluster_selection_method='eom').fit(embeddings2d_dic[key])
        result = pd.DataFrame(embeddings2d_dic[key], columns=['x', 'y'])
        result['labels'] = cluster.labels_
        clustered = result.loc[result.labels != -1, :]

        # gera_comparacao returns one entry per distinct label in ascending
        # order, so when noise exists its entry (label -1) comes first. The
        # scatter excludes noise, so that entry is dropped here; otherwise
        # every legend label and color is shifted by one cluster.
        legend_labels = gera_comparacao(list_preprossing,cluster.labels_)
        if (cluster.labels_ == -1).any():
            legend_labels = legend_labels[1:]

        fig, ax = plt.subplots(figsize=(20, 10))
        points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, cmap=color_maps)
        fig.colorbar(points, ax=ax)
        if legend_labels:  # nothing to label when every point is noise
            legend_colors = map_labels_to_colors(np.arange(len(legend_labels)),color_maps)
            legend_elements = [mpatches.Patch(color=color, label=label)
                               for color, label in zip(legend_colors, legend_labels)]
            # A single legend call; the second one used to replace the first.
            ax.legend(handles=legend_elements)
        fig.savefig(f"./{key}/{key}-hdbscan-{k}.png")
        plt.close(fig)


pca
tsne
umap


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import numpy as np


def _save_score_curve(k_values, scores, ylabel, path):
    """Plot one cluster-validity index against k and save the figure to ``path``."""
    fig, ax = plt.subplots(1, figsize=(20, 10))
    ax.plot(k_values, scores, marker='o')
    ax.set_xlabel('Número de clusters (k)')
    ax.set_ylabel(ylabel)
    fig.savefig(path)
    plt.close(fig)


# For every projection, sweep k and record three internal validity indices
# (silhouette, Calinski-Harabasz, Davies-Bouldin) of the KMeans partition.
k_values=range(5, 100)
print(k_values)
for key in embeddings2d_dic:
    points = embeddings2d_dic[key]
    # savefig does not create missing directories.
    os.makedirs(f"./{key}", exist_ok=True)
    silhouette_scores = []
    calinski_scores = []
    davies_bouldin_scores = []
    for k in k_values:
        # Cluster the projection being scored. The previous code always fitted
        # on the t-SNE embedding, so the PCA/UMAP curves scored foreign labels.
        kmeans = KMeans(n_clusters=k, n_init=40, random_state=42)
        labels = kmeans.fit(points).labels_
        silhouette_scores.append(silhouette_score(points, labels))
        calinski_scores.append(calinski_harabasz_score(points, labels))
        # Appended directly: assigning the result to a variable named
        # davies_bouldin_score rebound the function and broke the next iteration.
        davies_bouldin_scores.append(davies_bouldin_score(points, labels))

    _save_score_curve(k_values, silhouette_scores,
                      'Coeficiente de Silhueta Médio',
                      f"./{key}/{key}-kmeans-silhouette.png")
    _save_score_curve(k_values, calinski_scores,
                      'Calinski-Harabasz Score',
                      f"./{key}/{key}-kmeans-calinski.png")
    _save_score_curve(k_values, davies_bouldin_scores,
                      'Davies-Bouldin Score',
                      f"./{key}/{key}-kmeans-davies.png")


In [None]:
from top2vec import Top2Vec

# `documents` was never defined in this notebook (NameError). Feed Top2Vec the
# preprocessed corpus as a plain list of Python strings.
documents = [str(doc) for doc in list_preprossing]
model = Top2Vec(documents, embedding_model='universal-sentence-encoder')