Agora vamos criar dois modelos de LDA que irão permitir agrupar os documentos.

In [None]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import scispacy
import spacy
import en_core_sci_lg
import joblib
import os

In [None]:
# Load the dataset into a DataFrame; fields in this file are separated by '#'.
# train_lda() later reads the 'summary' column of this frame.
df = pd.read_csv('dataset_gastric_cancer.csv', sep='#')

In [None]:
def print_top_words(model, vectorizer, n_top_words):
    """Print the highest-weighted vocabulary terms of every topic in *model*.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of term
            weights per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            (or the legacy ``get_feature_names()``).
        n_top_words: Number of terms to print per topic.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement API and fall back only for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it to rank terms by descending weight.
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_terms = " ".join(feature_names[i] for i in top_indices)
        print(f"\nTopic #{topic_idx}: {top_terms}")
    print()
    
# Load the scispaCy `en_core_sci_lg` pipeline with the tagger, parser and NER
# components disabled.
# NOTE(review): spacy_tokenizer() reads token.lemma_; confirm lemmas are still
# populated with the tagger disabled on the installed spaCy version.
nlp=en_core_sci_lg.load(disable=["tagger", "parser", "ner"])
# Raise the pipeline's maximum accepted text length (in characters).
nlp.max_length = 3000000

def spacy_tokenizer(sentence):
    """Run *sentence* through the module-level ``nlp`` pipeline and return lemmas.

    Tokens that look like numbers, are flagged as stop words, punctuation or
    whitespace, or are exactly one character long are left out.
    """
    lemmas = []
    for token in nlp(sentence):
        if token.like_num or token.is_stop or token.is_punct or token.is_space:
            continue
        if len(token) == 1:
            continue
        lemmas.append(token.lemma_)
    return lemmas

In [None]:
def create_customize_words(delete_by_frequency=False, *, min_frequency=100,
                           max_frequency=5000, word_count_path='word__count.csv'):
    """Build the list of extra stop words used before vectorizing the corpus.

    Args:
        delete_by_frequency: When true, also add every word from the word-count
            CSV whose frequency lies outside ``[min_frequency, max_frequency]``.
        min_frequency: Words with a frequency strictly below this are added.
        max_frequency: Words with a frequency strictly above this are added.
        word_count_path: CSV file with ``Word`` and ``Frequency`` columns.

    Returns:
        A new list of stop-word strings (a fresh list on every call).
    """
    # Publishing boilerplate and LaTeX preamble fragments to treat as stop words.
    customize_stop_words = [
        'doi', 'preprint', 'copyright', 'org', 'https', 'et', 'al', 'author', 'figure', 'table',
        'rights', 'reserved', 'permission', 'use', 'used', 'using', 'biorxiv', 'medrxiv', 'license',
        'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', '-PRON-', 'usually',
        r'\usepackage{amsbsy', r'\usepackage{amsfonts', r'\usepackage{mathrsfs', r'\usepackage{amssymb', r'\usepackage{wasysym',
        r'\setlength{\oddsidemargin}{-69pt',  r'\usepackage{upgreek', r'\documentclass[12pt]{minimal',
    ]

    if delete_by_frequency:
        words = pd.read_csv(word_count_path)
        frequency = words['Frequency']
        # Vectorized filter instead of a row-by-row iterrows() loop.
        out_of_range = (frequency < min_frequency) | (frequency > max_frequency)
        # read_csv turns empty/NA-like cells into NaN; drop them so the result
        # only ever contains real words.
        customize_stop_words.extend(words.loc[out_of_range, 'Word'].dropna().tolist())

    return customize_stop_words

In [None]:
def train_lda(folder_path,delete_by_frequency):
    """Fit a 32-topic LDA model on ``df['summary']`` and save the artifacts.

    Files written inside *folder_path*:
        vectorizer.csv, data_vectorized.csv, lda.csv -- joblib dumps of the
            fitted vectorizer, the document-term matrix and the LDA model
            (despite the extension these are not CSV text files).
        doc_topic_dist.csv -- CSV of the per-document output of
            ``lda.transform``.

    Args:
        folder_path: Output directory; created if it does not exist.
        delete_by_frequency: Forwarded to ``create_customize_words`` to decide
            whether frequency-based stop words are added.
    """
    # makedirs(exist_ok=True) also creates missing parent folders and avoids
    # the check-then-create race of isdir() + mkdir().
    os.makedirs(folder_path, exist_ok=True)

    # Extra stop words on top of the pipeline defaults.
    customize_stop_words = create_customize_words(delete_by_frequency=delete_by_frequency)

    # Flag them in the shared vocabulary. NOTE: this mutates the module-level
    # `nlp`, so the flags persist for later calls in the same session.
    for w in customize_stop_words:
        # Skip anything that is not text (e.g. NaN read from the word-count CSV).
        if isinstance(w, str):
            nlp.vocab[w].is_stop = True

    # Turn the documents into a matrix of token counts.
    vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, min_df=2)

    # astype('U') converts every entry to a unicode string before fitting, so
    # non-string values (such as NaN) do not break the tokenizer.
    data_vectorized = vectorizer.fit_transform(df['summary'].values.astype('U'))

    # Persist the fitted objects with joblib.
    joblib.dump(vectorizer, os.path.join(folder_path, 'vectorizer.csv'))
    joblib.dump(data_vectorized, os.path.join(folder_path, 'data_vectorized.csv'))

    # Fixed random_state keeps the fitted topics reproducible between runs.
    lda = LatentDirichletAllocation(n_components=32, random_state=0)
    lda.fit(data_vectorized)
    joblib.dump(lda, os.path.join(folder_path, 'lda.csv'))

    print_top_words(lda, vectorizer, n_top_words=25)

    # One row per document, one column per topic.
    doc_topic_dist = pd.DataFrame(lda.transform(data_vectorized))
    doc_topic_dist.to_csv(os.path.join(folder_path, 'doc_topic_dist.csv'), index=False)

In [None]:
# Baseline model: only the fixed custom stop-word list is applied
# (no frequency-based filtering). Artifacts go to the 'baseline' folder.
train_lda('baseline',False)

In [None]:
# Second model: additionally treat words with out-of-range corpus frequency as
# stop words. The flag must be True here; with False this run would be
# identical to the baseline.
train_lda('delete_by_frequency',True)