In [1]:
import matplotlib as mpl
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA, TruncatedSVD, IncrementalPCA, KernelPCA, SparsePCA, FastICA, FactorAnalysis
import umap.umap_ as umap_
import hdbscan
from sklearn.utils import shuffle
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE, LocallyLinearEmbedding, Isomap, SpectralEmbedding, MDS
import matplotlib.patches as mpatches
import re
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sklearn.random_projection import SparseRandomProjection
from unidecode import unidecode
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import nltk
import numpy as np
from matplotlib import pyplot as plt
import os
import shutil
from sklearn.metrics import silhouette_samples, silhouette_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
import numpy as np
import skfuzzy as fuzz

In [2]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    """Pick the ``n`` highest-scoring vocabulary entries for every topic.

    Args:
        tf_idf: 2-D array indexed as ``[word, topic]``; it is transposed
            internally so each row holds one topic's scores.
        count: Object exposing ``get_feature_names_out()``; the returned
            sequence is indexed with the word positions of ``tf_idf``.
        docs_per_topic: Object with a ``Topic`` attribute listing the topic
            labels in the same order as the topic axis of ``tf_idf``.
        n: Number of entries to keep per topic.

    Returns:
        dict mapping each topic label to a list of ``(word, score)`` tuples
        ordered from highest to lowest score.
    """
    vocabulary = count.get_feature_names_out()
    scores_by_topic = tf_idf.T
    # argsort is ascending, so the last ``n`` columns are the best ones.
    best_columns = scores_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, topic in enumerate(list(docs_per_topic.Topic)):
        descending = best_columns[row][::-1]
        top_n_words[topic] = [
            (vocabulary[col], scores_by_topic[row][col]) for col in descending
        ]
    return top_n_words

def extract_topic_sizes(df):
    """Count the documents assigned to each topic.

    Args:
        df: DataFrame with at least the columns ``Topic`` and ``Doc``.

    Returns:
        DataFrame with columns ``Topic`` and ``Size`` (non-null ``Doc``
        count per topic), sorted by ``Size`` in descending order.
    """
    docs_per_topic = df.groupby(['Topic'])['Doc'].count()
    size_table = docs_per_topic.reset_index().rename(
        {"Topic": "Topic", "Doc": "Size"}, axis='columns'
    )
    return size_table.sort_values("Size", ascending=False)


def c_tf_idf(documents, m, ngram_range=(1, 1)):
    """Compute a class-based TF-IDF matrix for a collection of documents.

    Each entry of ``documents`` is vectorised with ``CountVectorizer``; term
    counts are normalised by the document's total count (TF) and multiplied
    by ``log(m / total_count_of_term)`` (IDF).

    Args:
        documents: Iterable of strings, one per class/topic.
        m: Numerator of the IDF ratio (the caller passes the number of
            original documents).
        ngram_range: Forwarded to ``CountVectorizer``.

    Returns:
        tuple ``(tf_idf, count)`` where ``tf_idf`` has shape
        ``(n_terms, n_documents)`` and ``count`` is the fitted vectorizer.
    """
    count = CountVectorizer(ngram_range=ngram_range).fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    # A document without any counted term has w == 0. A plain division would
    # produce NaN (0/0) for that whole column and poison the later argsort,
    # so those columns are left at 0 instead.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

def map_labels_to_colors(labels,cmap_):
    """Return one colormap colour per entry of ``labels``.

    The colormap named ``cmap_`` is sampled at ``max(labels) + 1`` evenly
    spaced positions in ``[0, 1]`` and the resulting table is indexed with
    ``labels``.
    """
    palette = plt.get_cmap(cmap_)(np.linspace(0, 1, np.max(labels) + 1))
    return palette[labels]


def gera_comparacao(list_preprossing_,cluster_):
    """Return the top c-TF-IDF term of every cluster, used as legend text.

    All documents of a cluster are joined into one string, scored with
    ``c_tf_idf`` and ranked with ``extract_top_n_words_per_topic``.

    Args:
        list_preprossing_: Sequence of preprocessed document strings.
        cluster_: Cluster label for each document. When a pandas Series is
            passed, the assignment below aligns it on the index, so rows
            without a label become NaN and are dropped by the group-by.

    Returns:
        list of ``(word, score)`` tuples, one per cluster, in the order the
        clusters come out of the group-by.
    """
    docs_df = pd.DataFrame(list_preprossing_,columns=["Doc"])
    docs_df['Topic'] = cluster_
    docs_per_topic = docs_df.groupby(["Topic"],as_index=False).agg({"Doc":' '.join})
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(list_preprossing_))
    top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=5)
    # Only the best-ranked term of each topic is needed. The previous version
    # also built a topic-size table and a Doc_ID column that were never read.
    return [ranked_words[0] for ranked_words in top_n_words.values()]



def silhouette(cluster_labels,x,save,valor_aceitavel=0.7):
    """Score a clustering and, when it is good enough, save its silhouette plot.

    Args:
        cluster_labels: Cluster label of every sample in ``x``.
        x: Sample coordinates passed to the scikit-learn silhouette functions.
        save: Path prefix of the output image. The approved score is appended
            to it before ``.png``.
        valor_aceitavel: Minimum mean silhouette (exclusive) for the
            clustering to be accepted. Defaults to the former hard-coded 0.7.

    Returns:
        ``(True, mean_silhouette * 100)`` when the clustering is accepted and
        the figure was written, otherwise ``(False, 0)``.
    """
    cluster_labels = np.asarray(cluster_labels)
    unique_labels = np.unique(cluster_labels)
    n_clusters = len(unique_labels)
    # The silhouette coefficient is only defined for 2 <= n_labels <= n_samples - 1;
    # reject such clusterings instead of letting scikit-learn raise.
    if n_clusters < 2 or n_clusters >= len(x):
        return False, 0

    silhouette_avg = silhouette_score(x, cluster_labels)
    if not (silhouette_avg > valor_aceitavel):
        return False, 0

    # Per-sample values are only needed for the plot, so compute them after
    # the acceptance test.
    sample_silhouette_values = silhouette_samples(x, cluster_labels)
    save = save+"___APROVADO___"+str(silhouette_avg*100)+"__"
    fig, ax = plt.subplots()
    ax.set_xlim([-0.1, 1])
    ax.set_ylim([0, len(x) + (n_clusters + 1) * 10])

    colors = mpl.colormaps["Spectral"]
    y_lower = 10
    # Iterate over the labels that actually occur (this includes -1 when the
    # caller passes HDBSCAN output) rather than assuming 0..n_clusters-1.
    for position, label in enumerate(unique_labels):
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == label]
        )

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # The colormap takes a position in [0, 1]. The former call passed
        # n_clusters as a second positional argument (alpha), which gave every
        # cluster after the first the same colour.
        color = colors(float(position) / n_clusters)
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(label))
        y_lower = y_upper + 10

    ax.set_title("Gráfico de Silhuetas para {} clusters".format(n_clusters))
    ax.set_xlabel("Valores de Silhueta")
    ax.set_ylabel("Cluster")

    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax.set_yticks([])
    fig.savefig(save+".png")
    plt.close(fig)
    return True, silhouette_avg * 100

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
import spacy

# Download the NLTK resources this notebook needs. Each one is requested once;
# the previous cell asked for 'wordnet' and 'punkt' twice.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords_ = set(stopwords.words('portuguese'))

# Load spaCy's Portuguese model (used by lemmatize_text).
nlp = spacy.load('pt_core_news_sm')


def lemmatize_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Returns:
        A single string made of the lemma of every token, joined by spaces.
    """
    return ' '.join([token.lemma_ for token in nlp(text)])


def preprocess_text(text):
    """Normalise raw text and return its lemmatized form.

    Steps, in order: lower-case, drop punctuation, drop any character that is
    not an ASCII letter/digit, a listed accented letter or whitespace,
    transliterate to ASCII with ``unidecode``, then lemmatize.

    NOTE(review): accents are removed before the text reaches the Portuguese
    lemmatizer; confirm that this ordering is intended.
    """
    lowered = text.lower()
    no_punctuation = re.sub(r'[^\w\s]', '', lowered)
    allowed_chars_only = re.sub(
        r'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôûÂÊÎÔÛàèìòùÀÈÌÒÙãõñÃÕÑçÇ\s]+', '', no_punctuation
    )
    return lemmatize_text(unidecode(allowed_chars_only))

# Smoke test: run the preprocessing pipeline on a sample sentence and print the result.
texto = "Eu estou correndo no parque. Está um belo dia ensolarado!"
lemmatized_text = preprocess_text(texto)
print(lemmatized_text)
# If the spaCy model used by this notebook is not installed, fetch it with:
#   !python -m spacy download pt_core_news_sm

In [3]:
def plot_info(reduce_embedding,metrics_,kmeans_options,name,modelo_direct,reduce,reduc_op):
    """Run KMeans over a parameter grid and save plots for the accepted runs.

    For every ``n_init`` / ``n_clusters`` combination the 2-D embedding is
    clustered. Runs that ``silhouette`` accepts get their scores appended to
    ``metrics_``, a scatter plot saved under ``./{name}/{modelo_direct}/kmeans/``
    and a line added to the report through ``save_relatorio``.

    Reads the module-level names ``list_preprossing``, ``figsize`` and
    ``color_maps``.

    Args:
        reduce_embedding: Array with two columns (x, y), one row per document.
        metrics_: Dict of lists. Every list is reset on entry; it must contain
            the keys ``silhouette_scores``, ``calinski_scores`` and
            ``davies_bouldin_scores``.
        kmeans_options: Dict with the lists ``n_init`` and ``n_clusters``.
        name: Experiment name (first directory level of the outputs).
        modelo_direct: Model directory name (second directory level).
        reduce: Reducer identifier, interpolated into file names and the report.
        reduc_op: Reducer option identifier, interpolated likewise.
    """
    # Reset the accumulators on the dict that was passed in. The previous code
    # wrote to the global ``metrics`` and ignored the parameter.
    for met_ in metrics_:
        metrics_[met_] = []

    for n_init in kmeans_options['n_init']:
        for n_cluster in kmeans_options['n_clusters']:
            print(f"Kmeans debug: {n_init}__cluster_{n_cluster}")
            kmeans = KMeans(n_clusters=n_cluster,n_init=n_init).fit(reduce_embedding)
            if len(set(kmeans.labels_)) <= 1:
                continue
            valor_aceite = silhouette(kmeans.labels_,reduce_embedding,f"./{name}/{modelo_direct}/silhouette/kmeans_{n_cluster}_{reduce}__{reduc_op}")
            if not valor_aceite[0]:
                continue

            result = pd.DataFrame(reduce_embedding,columns=['x', 'y'])
            result['labels'] = kmeans.labels_

            metrics_['silhouette_scores'].append(silhouette_score(reduce_embedding, kmeans.labels_))
            metrics_['calinski_scores'].append(calinski_harabasz_score(reduce_embedding, kmeans.labels_))
            metrics_['davies_bouldin_scores'].append(davies_bouldin_score(reduce_embedding, kmeans.labels_))

            centroids = kmeans.cluster_centers_
            labels = gera_comparacao(list_preprossing, kmeans.labels_)
            fig, ax = plt.subplots(figsize=figsize)
            ax.scatter(result.x, result.y, c=result.labels, cmap=color_maps)

            # distances has shape (n_samples, n_centroids). Averaging over the
            # samples (axis=0) yields one radius per centroid; the former
            # axis=1 produced one value per sample, which was then wrongly
            # zipped with the centroids.
            distances = pairwise_distances(result[['x', 'y']].to_numpy(), centroids)
            avg_distance = np.mean(distances, axis=0)
            radius_factor = 0.2
            adjusted_radius = avg_distance * radius_factor
            for centroid, radius in zip(centroids, adjusted_radius):
                circle = plt.Circle((centroid[0], centroid[1]), radius=radius, color='red', fill=False)
                ax.add_artist(circle)
                ax.scatter(centroid[0], centroid[1], c='red', s=50, marker='x')

            # One legend entry per cluster: its top term and the score as a percentage.
            legend_labels = [f"{label[0]}: {round(label[1]*100,2)}%" for label in labels]
            legend_colors = map_labels_to_colors(np.arange(len(legend_labels)), color_maps)
            legend_elements = [mpatches.Patch(color=color, label=label) for color, label in zip(legend_colors, legend_labels)]
            ax.legend(handles=legend_elements)
            fig.savefig(f"./{name}/{modelo_direct}/kmeans/kmeans_init_{n_init}_cluster_{n_cluster}_method_reduce_{reduce}_opt{reduc_op}.png")
            plt.close(fig)
            save_relatorio(name,modelo_direct,f"kmeans_init{n_init}_",n_cluster,reduce,reduc_op,valor_aceite[1])


def plot_info_hdbscan(reduce_embedding,hdbscan_options,name,modelo_direct,reduce,reduc_op="none",max_noise_ratio=0.20):
    """Run HDBSCAN for several ``min_cluster_size`` values and plot accepted runs.

    A run is considered only when it yields more than one distinct label and
    fewer than ``max_noise_ratio`` of the points are noise (label ``-1``). Runs
    that ``silhouette`` accepts are plotted under
    ``./{name}/{modelo_direct}/hdbscan/`` and reported through ``save_relatorio``.

    Reads the module-level names ``list_preprossing``, ``figsize`` and
    ``color_maps``.

    Args:
        reduce_embedding: Array with two columns (x, y), one row per document.
        hdbscan_options: Dict; only the list ``min_cluster_size`` is read here
            (the metric is fixed to ``'euclidean'``).
        name: Experiment name (first directory level of the outputs).
        modelo_direct: Model directory name (second directory level).
        reduce: Reducer identifier, interpolated into file names and the report.
        reduc_op: Reducer option identifier, interpolated likewise.
        max_noise_ratio: Upper bound (exclusive) on the fraction of noise
            points. Defaults to the former hard-coded 0.20.
    """
    for k in hdbscan_options['min_cluster_size']:
        hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=k,
                                   metric='euclidean',
                                   cluster_selection_method='eom').fit(reduce_embedding)
        result = pd.DataFrame(reduce_embedding,columns=['x', 'y'])
        result['labels'] = hdbscan_.labels_

        n_noise = result[result['labels']==-1].shape[0]
        max_noise = result.shape[0]*max_noise_ratio
        # The silhouette score needs at least two distinct labels. The former
        # ``>= 1`` test was always true and let single-label results crash.
        if len(set(hdbscan_.labels_)) <= 1 or n_noise >= max_noise:
            continue
        valor_aceite = silhouette(hdbscan_.labels_,reduce_embedding,f"./{name}/{modelo_direct}/silhouette/hdbscan_{k}_{reduce}__{reduc_op}")
        if not valor_aceite[0]:
            continue

        print(f"DB Scan{k}",set(hdbscan_.labels_),"Count -1: ",n_noise,f" Maximo Permitido: {max_noise}")
        outliers = result.loc[result.labels == -1, :]
        clustered = result.loc[result.labels != -1, :]
        # Mean position of every real cluster (noise excluded), ordered by label.
        centroids = clustered.groupby('labels')[['x', 'y']].mean().to_numpy()

        fig, ax = plt.subplots(figsize=figsize)
        ax.scatter(outliers.x, outliers.y, c='black' ,marker='x')
        ax.scatter(clustered.x, clustered.y,c=clustered.labels,  cmap=color_maps)
        # clustered.labels keeps the original row index, so gera_comparacao
        # aligns it with the documents and leaves the noise rows unlabelled.
        labels = gera_comparacao(list_preprossing,clustered.labels)

        # distances has shape (n_samples, n_centroids). Averaging over the
        # samples (axis=0) yields one radius per centroid; the former axis=1
        # produced one value per sample, which was then wrongly zipped with
        # the centroids.
        distances = pairwise_distances(result[['x', 'y']].to_numpy(), centroids)
        avg_distance = np.mean(distances, axis=0)
        radius_factor = 0.2
        adjusted_radius = avg_distance * radius_factor
        for centroid, radius in zip(centroids, adjusted_radius):
            circle = plt.Circle((centroid[0], centroid[1]), radius=radius, color='red', fill=False)
            ax.add_artist(circle)
            ax.scatter(centroid[0], centroid[1], c='red', s=50, marker='x')

        # One legend entry per cluster: its top term and the score as a percentage.
        legend_labels = [f"{label[0]}: {round(label[1]*100,2)}%" for label in labels]
        legend_colors = map_labels_to_colors(np.arange(len(legend_labels)),color_maps)
        legend_elements = [mpatches.Patch(color=color, label=label) for color, label in zip(legend_colors,legend_labels)]
        ax.legend(handles=legend_elements)
        fig.savefig(f"./{name}/{modelo_direct}/hdbscan/hdbscan_{k}_{reduce}___{reduc_op}.png")
        plt.close(fig)
        save_relatorio(name,modelo_direct,"hdbscan",k,reduce,reduc_op,valor_aceite[1])

def save_relatorio(nome_set,modelo_direct,method_group,method_op,reduce,reduce_op,value_s):
    """Append one result line to ``./result.txt``.

    The line has the form
    ``{nome_set}--{modelo_direct}--{method_group}--{method_op}--{reduce}--{reduce_op}:{score}``
    where ``score`` is ``value_s`` rounded to three decimals.
    """
    new_ = f"{nome_set}--{modelo_direct}--{method_group}--{method_op}--{reduce}--{reduce_op}:{round(value_s,3)}\n"
    # Append mode creates the file when it does not exist, so the former
    # exists()/"w+" branch was redundant (and could truncate the report if the
    # file appeared between the check and the open).
    with open("./result.txt","a") as documentos:
        documentos.write(new_)

In [4]:
from itertools import product

def _as_choices(value):
    """Return the candidate values of one option as a list.

    Strings/bytes and non-iterable values count as a single choice; any other
    iterable is expanded into its items.
    """
    if isinstance(value, (str, bytes)):
        return [value]
    try:
        return list(value)
    except TypeError:
        return [value]


def generate_reduce_params(reduces_class):
    """Expand every reducer's option grid into concrete parameter sets.

    Args:
        reduces_class: Dict mapping a reducer name to a dict with the keys
            ``'method'`` (the reducer class/callable) and ``'options'`` (option
            name -> candidate values). Entries without ``'options'`` are
            skipped. A candidate may be given as a list or as a bare value
            (e.g. ``'n_components': 2``); bare values and strings are treated
            as one choice instead of crashing or being split into characters.

    Returns:
        list of dicts ``{'options': {name: value, ...}, 'method': method}``,
        one per element of the Cartesian product of the candidates, in the
        order the reducers and options were declared.
    """
    params_list = []

    for reduce in reduces_class:
        spec = reduces_class[reduce]
        if 'options' not in spec:
            continue
        options = spec['options']
        param_names = list(options.keys())
        param_values = [_as_choices(options[param]) for param in param_names]

        for combination in product(*param_values):
            reduce_params = {'options': dict(zip(param_names, combination))}
            reduce_params['method'] = spec['method']
            params_list.append(reduce_params)

    return params_list

# Experiments to run: experiment name -> positional column indices of df_train
# whose cells are concatenated to form each document.
name_types = {
    "ativo_indicacao_contra_indicacao": [0, 1, 2],
    "indicacao": [1],
    # Disabled experiments:
    # "ativo_classe": [0, 3],
    # "ativo": [0],
    # "classe": [3],
    # "ativo_indicacao_contra_indicacao_classes": [0, 1, 2, 3],
    # "ativo_indicacao": [0, 1],
}

# Location of the spreadsheet. Set the MEDICAMENTOS_XLSX environment variable to
# run the notebook on another machine; the default is the original path.
DATA_PATH = os.environ.get(
    "MEDICAMENTOS_XLSX",
    "/home/rafael/Documentos/FACOM/Douturado/Doutorado/webcrawler/medicamentos.xlsx",
)
# Only the first MAX_ROWS rows of the spreadsheet are used.
MAX_ROWS = 200

# df_train = shuffle(pd.read_csv("./medicamentos_reduzidos"))
df_train = pd.read_excel(DATA_PATH,index_col=0,dtype=str)[:MAX_ROWS]
color_maps = 'tab20'
lemmatizer = WordNetLemmatizer()
figsize=(10,5)
# Start every run with a fresh report file.
if os.path.exists("./result.txt"):
    os.remove("./result.txt")
# --- Experiment grid ---------------------------------------------------------
# Nothing here depends on the experiment name, so it is defined once instead of
# being rebuilt on every iteration of the outer loop.
lnp_models = [
    # Disabled models:
    # 'distiluse-base-multilingual-cased-v2',
    # 'allenai/scibert_scivocab_uncased',
    # 'all-mpnet-base-v2',
    # 'all-distilroberta-v1',
    # 'neuralmind/bert-base-portuguese-cased',
    # 'distiluse-base-multilingual-cased-v1',
    'all-MiniLM-L12-v2',
    'all-MiniLM-L6-v2',
    'multi-qa-distilbert-dot-v1',
    'multi-qa-distilbert-cos-v1',
    'multi-qa-mpnet-base-dot-v1',
    'multi-qa-MiniLM-L6-cos-v1',
    'multi-qa-MiniLM-L6-dot-v1',
    'paraphrase-multilingual-mpnet-base-v2',
    'paraphrase-albert-small-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'paraphrase-MiniLM-L3-v2',
]
hdbscan_options = {"min_cluster_size": list(range(6, 20, 2)), 'metric': ['euclidean', 'cosine']}
kmeans_options = {'n_clusters': list(range(5, 30, 3)), 'n_init': ['auto']}
# Dimensionality reducers to try. Every option maps to a LIST of candidates.
# Reducers tried earlier and currently disabled: Isomap, LocallyLinearEmbedding
# (standard / hessian / modified / ltsa), SparseRandomProjection,
# LinearDiscriminantAnalysis, MDS, SpectralEmbedding, KernelPCA, TruncatedSVD,
# PCA, FactorAnalysis, IncrementalPCA, SparsePCA, FastICA and a fuzzy c-means
# ('efcm') configuration.
reduces_class = {
    'umap': {
        'options': {'n_neighbors': list(range(2, 10, 2)), 'n_components': [2],
                    'metric': ['cosine', 'euclidean']},
        'method': umap_.UMAP
    },
    'tnse': {
        'options': {'perplexity': list(range(1, 15, 2)), 'n_components': [2],
                    'metric': ['cosine', 'euclidean']},
        'method': TSNE
    },
}
metrics = {
    'silhouette_scores': [],
    'calinski_scores': [],
    'davies_bouldin_scores': []
}
map_type = ["silhouette", "kmeans", "hdbscan"]

reduces_compile = generate_reduce_params(reduces_class)
print(reduces_compile)

# preprocess_text strips accents, so the stop words must be matched in their
# accent-free form as well (e.g. "não" reaches the filter as "nao"). A set also
# replaces the list that used to be rebuilt for every single token.
stopword_lookup = stopwords_ | {unidecode(word) for word in stopwords_}

# NOTE(review): UMAP, t-SNE and KMeans are run without a fixed random seed, so
# results differ between runs.
for name in name_types:
    # --- Build one preprocessed document per spreadsheet row ---
    documents = []
    for dta in range(df_train.shape[0]):
        fields = []
        for select in name_types[name]:
            cell = df_train.iloc[dta, select]
            # Skip missing cells instead of deleting the substring "nan" from
            # the concatenated text, which also damaged real words.
            if pd.isna(cell) or str(cell).strip().lower() == 'nan':
                continue
            fields.append(str(cell))
        # Join with a space so the last word of one column is not glued to the
        # first word of the next.
        value = ' '.join(fields)
        tokens = word_tokenize(preprocess_text(value))
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_lookup]
        filtered_text = ' '.join(filtered_tokens)
        documents.append(filtered_text)
    # Read as a global by plot_info / plot_info_hdbscan.
    list_preprossing = np.array(documents, dtype=str)

    for model_index in lnp_models:
        # Hugging Face ids look like "org/model"; keep only the model part for paths.
        if len(model_index.split('/'))>1:
            modelo_direct = model_index.split('/')[1]
        else:
            modelo_direct = model_index

        # Recreate ./{name}/{modelo_direct}/ with one empty sub-directory per output kind.
        model_dir = f"./{name}/{modelo_direct}"
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        for output_kind in map_type:
            os.makedirs(f"{model_dir}/{output_kind}", exist_ok=True)

        corpus_embeddings = SentenceTransformer(model_index).encode(list_preprossing.tolist(),show_progress_bar=True)
        for reduce in reduces_compile:
            print(f"Modelo:{model_index} reduce:{reduce}")
            reduce_embedding=reduce['method'](**reduce['options']).fit_transform(corpus_embeddings)
            # Short identifiers for file names and the report. The whole
            # ``reduce`` dict (including the class repr) used to be
            # interpolated into the paths.
            reduce_name = reduce['method'].__name__
            reduce_opts = "_".join(f"{key}-{val}" for key, val in reduce['options'].items())
            plot_info(reduce_embedding,metrics,kmeans_options,name,modelo_direct,reduce_name,reduce_opts)
            plot_info_hdbscan(reduce_embedding,hdbscan_options,name,modelo_direct,reduce_name,reduce_opts)