In [23]:
import numpy as np
import pandas as pd
from pycaret.nlp import *
import nltk
from nltk.corpus import stopwords
import unicodedata
import unidecode
import re
import spacy

# download das stopwords para o idioma português
nltk.download('stopwords')
stop_words = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df_covid = pd.read_csv('data/exemplos-treinamento-covid.csv', delimiter=';')
df_seloturismo = pd.read_csv('data/exemplos-treinamento-seloturismo.csv', delimiter=';')
df_tuberculose = pd.read_csv('data/exemplos-treinamento-tuberculose.csv', delimiter=';')
df_teste = pd.read_csv('data/dados-testes-experimentos.csv', delimiter=';')

df_all = pd.concat([df_covid, df_seloturismo, df_tuberculose, df_teste], axis=0)
df_all.reset_index(drop=True, inplace=True)
df_all


Unnamed: 0,input,category
0,dor no abdomen é covid?,covid
1,Pego covid tocando numa objeto_contaminado,covid
2,Pego covid através de meio,covid
3,onde começou o covid,covid
4,onde coemçou a infestação do covid,covid
...,...,...
1194,Para os setores de Meios de Hospedagem as prem...,seloturismo
1195,"Ao final da estada do hóspede, deverá ser real...",seloturismo
1196,É proibido pernoitar na embarcação,seloturismo
1197,Fornecer Equipamentos de Proteção Individual (...,seloturismo


In [5]:
### TO-DO
## Normalizar a string de entrada

# lower case
df_all.input = df_all.input.str.lower()

# retira pontuação
df_all.input = df_all.input.str.replace(r'[^\w\s]+', '')

# remove numeração
df_all.input = df_all.input.str.replace(r'[0-9]+', '')

# remove underscore
df_all.input = df_all.input.str.replace('_', ' ')

# retira acentuação
for i in df_all.index:
    df_all.input[i] = unidecode.unidecode(df_all.input[i])

# Remove Emails
df_all.input = [re.sub('\S*@\S*\s?', '', sent) for sent in df_all.input]

# Remove new line characters
df_all.input = [re.sub('\s+', ' ', sent) for sent in df_all.input]

# Remove distracting single quotes
df_all.input = [re.sub("\'", "", sent) for sent in df_all.input]

df_all

Unnamed: 0,input,category
0,dor no abdomen e covid,covid
1,pego covid tocando numa objeto contaminado,covid
2,pego covid atraves de meio,covid
3,onde comecou o covid,covid
4,onde coemcou a infestacao do covid,covid
...,...,...
1194,para os setores de meios de hospedagem as prem...,seloturismo
1195,ao final da estada do hospede devera ser reali...,seloturismo
1196,e proibido pernoitar na embarcacao,seloturismo
1197,fornecer equipamentos de protecao individual e...,seloturismo


In [6]:
s = setup(df_all, target='input', custom_stopwords=stop_words)

Description,Value
session_id,6817
Documents,1199
Vocab Size,586
Custom Stopwords,True


In [7]:
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
lda,Latent Dirichlet Allocation,gensim/models/ldamodel
lsi,Latent Semantic Indexing,gensim/models/lsimodel
hdp,Hierarchical Dirichlet Process,gensim/models/hdpmodel
rp,Random Projections,gensim/models/rpmodel
nmf,Non-Negative Matrix Factorization,sklearn.decomposition.NMF


In [8]:
lda = create_model('lda', num_topics=3)
print(lda)


LdaModel<num_terms=586, num_topics=3, decay=0.5, chunksize=100>


In [9]:

lsi = create_model('lsi', num_topics=3)
print(lsi)


LsiModel<num_terms=586, num_topics=3, decay=1.0, chunksize=20000>


In [10]:

hdp = create_model('hdp', num_topics=3)
print(hdp)


<gensim.models.hdpmodel.HdpModel object at 0x00000201C5980100>


In [11]:

rp = create_model('rp', num_topics=3)
print(rp)


RpModel<num_terms=586, num_topics=3>


In [12]:

nmf = create_model('nmf', num_topics=3)
print(nmf)

NMF(alpha='deprecated', alpha_H='same', alpha_W=0.0, beta_loss='frobenius',
    init='nndsvd', l1_ratio=0.0, max_iter=200, n_components=3,
    random_state=6817, regularization='deprecated', shuffle=False, solver='cd',
    tol=0.0001, verbose=0)


In [13]:
evaluate_model(lda)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [14]:
#evaluate_model(lsi)

In [15]:
evaluate_model(hdp)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [16]:
evaluate_model(rp)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [17]:
evaluate_model(nmf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [18]:
lda_results = assign_model(lda)
lda_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic,Perc_Dominant_Topic
0,,covid,0.403434,0.280807,0.315759,Topic 0,0.40
1,pego,covid,0.186575,0.127872,0.685553,Topic 2,0.69
2,pego atrave meio,covid,0.363721,0.060733,0.575546,Topic 2,0.58
3,onde comecou covid,covid,0.870949,0.060686,0.068365,Topic 0,0.87
4,,covid,0.403434,0.280807,0.315759,Topic 0,0.40
...,...,...,...,...,...,...,...
1194,setore meio sistematizada setore bebida evento...,seloturismo,0.042539,0.029585,0.927876,Topic 2,0.93
1195,final estada unidade habitacional,seloturismo,0.069118,0.049828,0.881054,Topic 2,0.88
1196,,seloturismo,0.403434,0.280807,0.315759,Topic 0,0.40
1197,colaboradore,seloturismo,0.182764,0.126967,0.690269,Topic 2,0.69


In [19]:
#lsi_results = assign_model(lsi)
#lsi_results

In [20]:
hdp_results = assign_model(hdp)
hdp_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic
0,,covid,,,,
1,pego,covid,0.652473,0.173372,0.174155,Topic 0
2,pego atrave meio,covid,0.805084,0.095677,0.099239,Topic 0
3,onde comecou covid,covid,0.402458,0.102101,0.495441,Topic 2
4,,covid,,,,
...,...,...,...,...,...,...
1194,setore meio sistematizada setore bebida evento...,seloturismo,0.907877,0.046705,0.045418,Topic 0
1195,final estada unidade habitacional,seloturismo,0.235312,0.077027,0.687660,Topic 2
1196,,seloturismo,,,,
1197,colaboradore,seloturismo,0.164872,0.664740,0.170388,Topic 1


In [21]:
rp_results = assign_model(rp)
rp_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic
0,,covid,,,,
1,pego,covid,0.577350,-0.577350,0.577350,Topic 0
2,pego atrave meio,covid,0.577350,-0.577350,0.577350,Topic 0
3,onde comecou covid,covid,-0.577350,1.732051,-1.732051,Topic 1
4,,covid,,,,
...,...,...,...,...,...,...
1194,setore meio sistematizada setore bebida evento...,seloturismo,-1.732051,-1.732051,-1.732051,Topic 0
1195,final estada unidade habitacional,seloturismo,-1.154701,1.154701,0.000000,Topic 1
1196,,seloturismo,,,,
1197,colaboradore,seloturismo,0.577350,0.577350,-0.577350,Topic 0


In [22]:
nmf_results = assign_model(nmf)
nmf_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic,Perc_Dominant_Topic
0,,covid,0.000000,0.000000e+00,0.000000e+00,Topic 0,
1,pego,covid,0.000009,1.052093e-07,2.984715e-13,Topic 0,0.99
2,pego atrave meio,covid,0.001214,8.011750e-06,3.775291e-07,Topic 0,0.99
3,onde comecou covid,covid,0.000000,8.100618e-04,1.344473e-01,Topic 2,0.99
4,,covid,0.000000,0.000000e+00,0.000000e+00,Topic 0,
...,...,...,...,...,...,...,...
1194,setore meio sistematizada setore bebida evento...,seloturismo,0.000001,3.024357e-06,2.282964e-07,Topic 1,0.67
1195,final estada unidade habitacional,seloturismo,0.000081,1.583989e-08,1.850438e-06,Topic 0,0.98
1196,,seloturismo,0.000000,0.000000e+00,0.000000e+00,Topic 0,
1197,colaboradore,seloturismo,0.000000,0.000000e+00,0.000000e+00,Topic 0,


In [24]:
## Verificar o uso do spacy pois tem em português
nlp = spacy.load("pt_core_news_lg")