In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.utils import resample
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import umap.umap_ as umap_
import hdbscan
from sklearn.manifold import TSNE
import matplotlib.patches as mpatches
import re
from sklearn.preprocessing import MinMaxScaler

# Supondo que 'X_tsne' seja a matriz de embeddings do t-SNE


from unidecode import unidecode

from sklearn.feature_extraction.text import CountVectorizer
from mpl_toolkits import mplot3d
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import os

In [40]:

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Return the n best-scoring vocabulary terms for every topic.

    Args:
        tf_idf: 2-D score array whose transpose has one row per topic and
            one column per vocabulary term.
        count: fitted vectorizer exposing ``get_feature_names_out()``.
        docs_per_topic: frame with a ``Topic`` column; its row order gives
            the topic label for each row of the transposed score matrix.
        n: number of terms to keep per topic.

    Returns:
        dict mapping each topic label to a list of ``(term, score)`` tuples
        ordered from highest to lowest score.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last n columns hold the best-scoring terms.
    best_columns = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, topic in enumerate(list(docs_per_topic.Topic)):
        ranked = [(vocabulary[col], scores_by_topic[row][col]) for col in best_columns[row]]
        ranked.reverse()  # highest score first
        top_n_words[topic] = ranked
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents assigned to each topic.

    Args:
        df: frame with at least the columns ``Topic`` and ``Doc``.

    Returns:
        Frame with columns ``Topic`` and ``Size`` (non-null ``Doc`` count),
        sorted by ``Size`` in descending order.
    """
    docs_per_topic = df.groupby(['Topic'])['Doc'].count()
    sizes = docs_per_topic.reset_index().rename(columns={"Doc": "Size"})
    return sizes.sort_values("Size", ascending=False)


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute a class-based TF-IDF matrix for the given documents.

    Args:
        documents: iterable of strings; each entry is treated as one class.
        m: numerator of the IDF ratio (callers in this file pass the number
            of original, un-joined documents).
        ngram_range: n-gram range forwarded to ``CountVectorizer``.

    Returns:
        ``(tf_idf, count)`` where ``tf_idf`` has one row per vocabulary term
        and one column per entry of ``documents``, and ``count`` is the
        fitted ``CountVectorizer``.
    """
    count = CountVectorizer(ngram_range=ngram_range).fit(documents)
    term_counts = count.transform(documents).toarray()

    tokens_per_doc = term_counts.sum(axis=1)
    occurrences_per_term = term_counts.sum(axis=0)

    # Term frequency: each term count normalised by the size of its document.
    # NOTE(review): a document with no tokens gives a zero divisor here.
    tf = term_counts.T / tokens_per_doc
    # Column vector so it broadcasts across the document axis of ``tf``.
    idf = np.log(m / occurrences_per_term).reshape(-1, 1)

    return tf * idf, count

def map_labels_to_colors(labels):
    """Map integer labels to RGBA colors sampled from the 'brg' colormap.

    The palette has ``max(labels) + 1`` evenly spaced colors, and each label
    is used as an index into it.

    Args:
        labels: integer array of labels.

    Returns:
        Array of RGBA rows, one per entry of ``labels``.
    """
    palette_size = np.max(labels) + 1
    sample_points = np.linspace(0, 1, palette_size)
    palette = plt.get_cmap('brg')(sample_points)
    return palette[labels]


def gera_comparacao(list_preprossing_,cluster_):
    """Build one legend entry (the top c-TF-IDF term) per cluster.

    All documents of a cluster are joined into a single string, c-TF-IDF is
    computed over those joined strings, and the best-scoring term of each
    cluster is returned.

    Args:
        list_preprossing_: sequence of preprocessed document strings.
        cluster_: cluster label of each document, aligned with
            ``list_preprossing_``.

    Returns:
        List of ``(term, score)`` tuples, one per distinct cluster label, in
        the row order of the grouped frame.
    """
    docs_df = pd.DataFrame(list_preprossing_, columns=["Doc"])
    docs_df['Topic'] = cluster_
    docs_df['Doc_ID'] = range(len(docs_df))
    docs_per_topic = docs_df.groupby(["Topic"], as_index=False).agg({"Doc": ' '.join})
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(list_preprossing_))
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=5)
    # Walk the dict in insertion (topic) order instead of indexing with
    # range(len(...)): topic ids are not guaranteed to be 0..n-1 (HDBSCAN
    # uses -1 for noise), which made the positional lookup raise KeyError.
    legends = [ranked_terms[0] for ranked_terms in top_n_words.values()]
    return legends


def gera_plot(result,legends):
    """Scatter-plot clustered 2-D points with one legend entry per cluster.

    Args:
        result: frame with columns ``x``, ``y`` and integer ``labels``.
        legends: one legend label per cluster; entry ``i`` is paired with
            the color that ``map_labels_to_colors`` assigns to label ``i``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    points = ax.scatter(result.x, result.y, c=result.labels, cmap='brg')
    fig.colorbar(points, ax=ax)

    legend_labels = list(legends)
    # map_labels_to_colors takes np.max of its input, which fails on an
    # empty array, so skip the legend when there is nothing to show.
    if legend_labels:
        legend_colors = map_labels_to_colors(np.arange(len(legend_labels)))
        legend_elements = [
            mpatches.Patch(color=color, label=label)
            for color, label in zip(legend_colors, legend_labels)
        ]
        # A single legend call: the earlier code created a legend and then
        # immediately replaced it with a second one.
        ax.legend(legend_elements, legend_labels)
    plt.show()

def preprocess_text(text):
    """Normalise a string: lowercase, strip symbols, fold accents to ASCII.

    Args:
        text: input string.

    Returns:
        The cleaned string after ``unidecode`` transliteration.
    """
    cleaned = text.lower()

    # Two removal passes, applied in order: first punctuation (anything that
    # is neither a word character nor whitespace), then every character
    # outside the explicit whitelist of ASCII alphanumerics, listed accented
    # letters and whitespace.
    removal_patterns = (
        r'[^\w\s]',
        r'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôûÂÊÎÔÛàèìòùÀÈÌÒÙãõñÃÕÑçÇ\s]+',
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Transliterate the remaining accented letters.
    return unidecode(cleaned)




In [41]:
# Location of the spreadsheet. The original absolute path remains the default;
# set the MEDICAMENTOS_XLSX environment variable to run on another machine.
DATA_PATH = os.environ.get(
    "MEDICAMENTOS_XLSX",
    "/home/rafael/Documentos/FACOM/Douturado/Doutorado/webcrawler/medicamentos.xlsx",
)
# Every cell is read as text (dtype=str); the first sheet column is the index.
df_train = pd.read_excel(DATA_PATH, index_col=0, dtype=str)
# Accumulator filled by the preprocessing cell.
list_preprossing = []
# Last expression of the cell so the preview is actually displayed.
df_train.head()

In [42]:
# Preprocess the first data column of df_train.
# The list is rebuilt from scratch inside this cell so the cell can be
# re-run: the earlier version appended to a list created in another cell and
# then rebound that name to an ndarray, so a second run failed on `.append`.
# (To use more columns, concatenate their string values with " " before
# calling preprocess_text.)
list_preprossing = np.array(
    [preprocess_text(str(value)) for value in df_train.iloc[:, 0]]
)
# Missing cells become the string 'nan' after str(); drop them.
list_preprossing = list_preprossing[list_preprossing != 'nan']
print(list_preprossing)

['cafeina  carisoprodol  diclofenaco sodico  paracetamol'
 'probiotico bb12 bifidobacterium animalis subsp lactis'
 'enoxaparina sodica' 'cetoprofeno'
 'fosfato dissodico de dexametasona cloridrato de tiamina cloridrato de piridoxina cianocobalamina'
 'drospirenona  etinilestradiol' 'ceftriaxona sodica' 'alprazolam'
 'acetato de caspofungina' 'poliestirenossulfonato de calcio'
 'valerato de betametasona  sulfato de gentamicina  tolnaftato  clioquinol'
 'letrozol' 'tinidazol  nitrato de miconazol' 'promestrieno'
 'losartana potassica' 'cloridrato de paroxetina' 'cloreto de sodio'
 'cloridrato de sibutramina' 'mesilato de imatinibe'
 'cloridrato de duloxetina' 'linezolida' 'desloratadina'
 'montelucaste de sodio' 'prednisolona' 'tadalafila'
 'cloridrato de venlafaxina' 'ifosfamida'
 'pantoprazol sodico sesquihidratado' 'topiramato'
 'cafeina  carisoprodol  diclofenaco sodico  paracetamol' 'azitromicina'
 'gestodeno  etinilestradiol' 'furoato de mometasona'
 'acetato de abiraterona' 'cole

In [8]:
# Sentence-embedding model used to encode the preprocessed texts.
# Earlier alternative, kept for reference:
# embedder = SentenceTransformer('distilbert-base-nli-mean-tokens',device='cuda')
embedder = SentenceTransformer('all-mpnet-base-v2')


In [55]:
# Encode every preprocessed text with the sentence-embedding model.
corpus_embeddings = embedder.encode(list_preprossing,show_progress_bar=True)
# Optional min-max scaling of the embeddings (currently disabled):
# scaler = MinMaxScaler()
# corpus_embeddings = scaler.fit_transform(corpus_embeddings)

In [None]:
# Three 2-D projection models that are compared downstream.
tsne = TSNE(random_state = 42, n_components=2,perplexity=5,metric='cosine')
pca = PCA(n_components=2)
# UMAP is stochastic; seed it like t-SNE above so that a fresh
# "Restart & Run All" reproduces the same projection.
umap = umap_.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42)


In [56]:

# Project the sentence embeddings to 2-D with each of the three reducers.
embeddings2d_pca = pca.fit_transform(corpus_embeddings)
embeddings2d_tsne = tsne.fit_transform(corpus_embeddings)
embeddings2d_umap =  umap.fit_transform(corpus_embeddings)


In [63]:
# Projection name -> 2-D embedding. The 'umap' entry previously pointed at the
# t-SNE embedding (copy-paste slip), so the UMAP projection was never used.
embeddings2d_dic = {
    'pca': embeddings2d_pca,
    'tsne': embeddings2d_tsne,
    'umap': embeddings2d_umap,
}

In [64]:
# For every 2-D projection, cluster with KMeans for k = 4, 8, 12, label each
# cluster with its top c-TF-IDF term and save the scatter plot.
for key in embeddings2d_dic:
    print(key)
    # savefig does not create missing directories; make ./<key>/ up front.
    os.makedirs(f"./{key}", exist_ok=True)
    for k in range(4, 16, 4):
        # Fixed seed so the saved figures are reproducible between runs.
        kmeans = KMeans(n_clusters=k, n_init=40, random_state=42)
        kmeans_ = kmeans.fit_predict(embeddings2d_dic[key])
        result = pd.DataFrame(embeddings2d_dic[key], columns=['x', 'y'])
        result['labels'] = kmeans_
        labels = gera_comparacao(list_preprossing, kmeans_)

        fig, ax = plt.subplots(figsize=(20, 10))
        points = ax.scatter(result.x, result.y, c=result.labels, cmap='brg')
        fig.colorbar(points, ax=ax)
        legend_labels = list(labels)
        legend_colors = map_labels_to_colors(np.arange(len(legend_labels)))
        legend_elements = [
            mpatches.Patch(color=color, label=label)
            for color, label in zip(legend_colors, legend_labels)
        ]
        # Single legend call (the earlier code built a legend twice and kept
        # only the second one).
        ax.legend(legend_elements, legend_labels)
        fig.savefig(f"./{key}/{key}-kmeans-{k}.png")
        # Display now and release the figure so the loop does not keep
        # every 20x10 canvas alive until the end of the cell.
        plt.show()
        plt.close(fig)


In [69]:
# For every 2-D projection, cluster with HDBSCAN, label each cluster with its
# top c-TF-IDF term and save the scatter plot of the clustered points.
for key in embeddings2d_dic:
    print(key)
    # savefig does not create missing directories; make ./<key>/ up front.
    os.makedirs(f"./{key}", exist_ok=True)
    cluster = hdbscan.HDBSCAN(min_cluster_size=10,metric='cosine')
    cluster_ = cluster.fit_predict(embeddings2d_dic[key])
    result = pd.DataFrame(embeddings2d_dic[key], columns=['x', 'y'])
    result['labels'] = cluster_

    # HDBSCAN marks noise with -1. Noise points are not plotted, so they must
    # also be left out of the legend computation: otherwise the legend holds
    # one extra (noise) entry, every entry is shifted against the colors, and
    # gera_comparacao's positional topic lookup can fail on the -1 label.
    clustered = cluster_ != -1
    if not clustered.any():
        print(f"{key}: HDBSCAN found no clusters, skipping plot")
        continue
    labels = gera_comparacao(np.asarray(list_preprossing)[clustered], cluster_[clustered])
    result = result[result['labels'] != -1]

    fig, ax = plt.subplots(figsize=(20, 10))
    points = ax.scatter(result.x, result.y, c=result.labels, cmap='brg')
    fig.colorbar(points, ax=ax)
    legend_labels = list(labels)
    legend_colors = map_labels_to_colors(np.arange(len(legend_labels)))
    legend_elements = [
        mpatches.Patch(color=color, label=label)
        for color, label in zip(legend_colors, legend_labels)
    ]
    # Single legend call (the earlier code built a legend twice and kept
    # only the second one).
    ax.legend(legend_elements, legend_labels)
    fig.savefig(f"./{key}/{key}-hdbscan_@.png")
    # Display now and release the figure before the next projection.
    plt.show()
    plt.close(fig)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import numpy as np


# Sweep the number of clusters and record the mean silhouette coefficient of
# each KMeans partition of the t-SNE embedding.
k_values = range(2, 15)  # candidate numbers of clusters
silhouette_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=7)
    # fit_predict fits the model and returns the fitted labels_ in one call.
    labels = kmeans.fit_predict(embeddings2d_tsne)
    silhouette_avg = silhouette_score(embeddings2d_tsne, labels)
    silhouette_scores.append(silhouette_avg)

# Plot the mean silhouette coefficient against k.
plt.plot(k_values, silhouette_scores, marker='o')
plt.xlabel('Número de clusters (k)')
plt.ylabel('Coeficiente de Silhueta Médio')
#plt.title('Coeficiente de Silhueta para Valores de k')
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

# Candidate numbers of clusters (k)
k_values = range(2, 15)

# Metric values, one entry per k
calinski_scores = []
davies_bouldin_scores = []

for k in k_values:
    # Cluster the t-SNE embedding with the same KMeans configuration as the
    # silhouette sweep. The earlier template line `kmeans.fit(X, k)` used an
    # undefined X, never changed n_clusters, and returned the estimator
    # rather than the labels.
    kmeans = KMeans(n_clusters=k, random_state=7)
    labels = kmeans.fit_predict(embeddings2d_tsne)

    # Append the results directly: binding them to a variable named
    # `davies_bouldin_score` shadowed the imported function and broke the
    # second iteration.
    calinski_scores.append(calinski_harabasz_score(embeddings2d_tsne, labels))
    davies_bouldin_scores.append(davies_bouldin_score(embeddings2d_tsne, labels))

# Plot the Calinski-Harabasz metric
plt.plot(k_values, calinski_scores, marker='o')
plt.xlabel('Número de clusters (k)')
plt.ylabel('Calinski-Harabasz Score')
plt.title('Calinski-Harabasz Score para Valores de k')
plt.show()

# Plot the Davies-Bouldin metric
plt.plot(k_values, davies_bouldin_scores, marker='o')
plt.xlabel('Número de clusters (k)')
plt.ylabel('Davies-Bouldin Score')
plt.title('Davies-Bouldin Score para Valores de k')
plt.show()




