In [1]:
import numpy as np
import pandas as pd
from pycaret.nlp import *
import nltk
from nltk.corpus import stopwords
import unicodedata
import unidecode
import re
import spacy
import pprint
from gensim import corpora
from collections import defaultdict    

In [2]:
# Download the NLTK stop-word lists and load the Portuguese one
nltk.download('stopwords')
stop_words = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Assemble the training/validation dataset from the semicolon-separated example files
df_covid = pd.read_csv('data/exemplos-treinamento-covid.csv', sep=';')
df_seloturismo = pd.read_csv('data/exemplos-treinamento-seloturismo.csv', sep=';')
df_tuberculose = pd.read_csv('data/exemplos-treinamento-tuberculose.csv', sep=';')
df_teste = pd.read_csv('data/dados-testes-experimentos.csv', sep=';')

# Stack all sources row-wise; ignore_index=True gives a fresh 0..n-1 index
df_all = pd.concat(
    [df_covid, df_seloturismo, df_tuberculose, df_teste],
    axis=0,
    ignore_index=True,
)
df_all


Unnamed: 0,input,category
0,dor no abdomen é covid?,covid
1,Pego covid tocando numa objeto_contaminado,covid
2,Pego covid através de meio,covid
3,onde começou o covid,covid
4,onde coemçou a infestação do covid,covid
...,...,...
1194,Para os setores de Meios de Hospedagem as prem...,seloturismo
1195,"Ao final da estada do hóspede, deverá ser real...",seloturismo
1196,É proibido pernoitar na embarcação,seloturismo
1197,Fornecer Equipamentos de Proteção Individual (...,seloturismo


In [4]:
# Assemble the test dataset (read with pandas' default comma separator)
df_testdata = pd.read_csv('data/testdata.csv')
df_testdata

Unnamed: 0,input,category
0,Diante da quantidade limitada de vacinas qual ...,covid
1,Qual é a melhor vacina contra a COVID-19 desen...,covid
2,Considerando a rapidez com que as vacinas cont...,covid
3,Se o gargalo está na produção há uma alternati...,covid
4,As vacinas funcionarão contra as novas cepas d...,covid
...,...,...
472,Quais os protocolos de higiene do selo turismo?,seloturismo
473,O selo turismo se aplica a qualquer setor?,seloturismo
474,Quais estados já aderiram o selo turismo?,seloturismo
475,Qual seria a estratégia mais efetiva para o us...,seloturismo


In [5]:
# Normalize the input strings of the training and test frames.
# regex=True is passed explicitly: the first two patterns are regular
# expressions, and pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave punctuation and digits in place.
for frame in (df_all, df_testdata):
    frame['input'] = (
        frame['input']
        # strip punctuation (anything that is neither a word char nor whitespace)
        .str.replace(r'[^\w\s]+', '', regex=True)
        # strip digits
        .str.replace(r'[0-9]+', '', regex=True)
        # underscores count as word chars above, so turn them into spaces here
        .str.replace('_', ' ', regex=False)
    )
df_all

  df_all.input = df_all.input.str.replace(r'[^\w\s]+', '')
  df_testdata.input = df_testdata.input.str.replace(r'[^\w\s]+', '')
  df_all.input = df_all.input.str.replace(r'[0-9]+', '')
  df_testdata.input = df_testdata.input.str.replace(r'[0-9]+', '')


Unnamed: 0,input,category
0,dor no abdomen é covid,covid
1,Pego covid tocando numa objeto contaminado,covid
2,Pego covid através de meio,covid
3,onde começou o covid,covid
4,onde coemçou a infestação do covid,covid
...,...,...
1194,Para os setores de Meios de Hospedagem as prem...,seloturismo
1195,Ao final da estada do hóspede deverá ser reali...,seloturismo
1196,É proibido pernoitar na embarcação,seloturismo
1197,Fornecer Equipamentos de Proteção Individual E...,seloturismo


In [6]:
# Display the test dataset
df_testdata

Unnamed: 0,input,category
0,Diante da quantidade limitada de vacinas qual ...,covid
1,Qual é a melhor vacina contra a COVID desenvol...,covid
2,Considerando a rapidez com que as vacinas cont...,covid
3,Se o gargalo está na produção há uma alternati...,covid
4,As vacinas funcionarão contra as novas cepas d...,covid
...,...,...
472,Quais os protocolos de higiene do selo turismo,seloturismo
473,O selo turismo se aplica a qualquer setor,seloturismo
474,Quais estados já aderiram o selo turismo,seloturismo
475,Qual seria a estratégia mais efetiva para o us...,seloturismo


In [7]:
# Lemmatize the training input strings.
## spaCy is used because it provides a Portuguese pipeline.
nlp = spacy.load("pt_core_news_lg")
lem = nlp.get_pipe("lemmatizer")

# Each sentence becomes the space-joined lemmas of its spaCy tokens
inputs = [
    ' '.join(token.lemma_ for token in nlp(sentence))
    for sentence in df_all.input
]
df_all['input'] = inputs
df_all



Unnamed: 0,input,category
0,dor em o abdomen ser Covid,covid
1,pego Covid tocar em um objeto contaminar,covid
2,pego Covid através de meio,covid
3,onde começar o Covid,covid
4,onde coemçar o infestação de o Covid,covid
...,...,...
1194,para o setor de Meios de Hospedagem o premissa...,seloturismo
1195,a o final de o estada de o hóspede dever ser r...,seloturismo
1196,ser proibir pernoitar em o embarcação,seloturismo
1197,Fornecer Equipamentos de Proteção Individual E...,seloturismo


In [8]:
# Lemmatize the test input strings.
## Reuses the `nlp` spaCy pipeline already loaded for the training data.

# Each sentence becomes the space-joined lemmas of its spaCy tokens
inputs = [
    ' '.join(token.lemma_ for token in nlp(sentence))
    for sentence in df_testdata.input
]
df_testdata['input'] = inputs
df_testdata

Unnamed: 0,input,category
0,diante de o quantidade limitar de vacina qual ...,covid
1,qual ser o bom vacina contra o COVID desenvolver,covid
2,considerar o rapidez com que o vacina contra o...,covid
3,se o gargalo estar em o produção haver um alte...,covid
4,o vacina funcionar contra o novo cepa de o vírus,covid
...,...,...
472,qual o protocolo de higiene de o selo turismo,seloturismo
473,o selo turismo se aplicar a qualquer setor,seloturismo
474,qual estado já aderir o selo turismo,seloturismo
475,qual ser o estratégia mais efetivo para o uso ...,seloturismo


In [9]:
# Build the training corpus: tokenize and remove stop words
text_corpus = df_all.input.to_numpy()
# Hoist the stop words into a set so each membership test is O(1) instead of
# a scan of the whole list; `stop_words` itself is left unchanged.
stop_word_set = set(stop_words)
# Lowercase each document, split it on whitespace and filter out stop words
texts = [
    [word for word in document.lower().split() if word not in stop_word_set]
    for document in text_corpus
]
# Count how often each token occurs across the whole corpus
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# Only keep tokens that appear more than once in the corpus
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['covid'],
 ['pego', 'covid', 'tocar', 'objeto', 'contaminar'],
 ['pego', 'covid', 'através', 'meio'],
 ['onde', 'começar', 'covid'],
 ['onde', 'covid'],
 ['vírus', 'transmitir', 'através', 'meio'],
 ['significar', 'covid'],
 ['comorbidade', 'poder', 'tomar', 'vacina'],
 ['covid', 'família', 'vírus'],
 ['vacina', 'disponível', 'covid'],
 ['criança', 'poder', 'vacinar', 'contra', 'covid'],
 ['bom', 'vacina'],
 ['vacina', 'contra', 'covid', 'seguro'],
 ['ter', 'tomar', 'quanto', 'dose', 'vacina'],
 ['prevenir', 'contagio'],
 ['covid', 'começar', 'china'],
 ['covid', 'doença', 'grave'],
 ['covid', 'grave'],
 ['covid', 'virus', 'sarscov'],
 ['covid', 'vírus', 'grave'],
 ['covid'],
 ['covid', 'doença', 'grave'],
 ['covid', 'gripe'],
 ['covid', 'criar', 'laboratorio'],
 ['covid', 'vir'],
 ['covid', 'vir'],
 ['covid'],
 ['falar', 'covid'],
 ['covid'],
 ['querer', 'dizer', 'covid'],
 ['virus', 'família', 'covid'],
 ['covid', 'vir', 'china'],
 ['virus', 'comum', 'família', 'covid'],
 ['pegar',

In [10]:
# Build the test corpus: tokenize and remove stop words
ttext_corpus = df_testdata.input.to_numpy()
# Hoist the stop words into a set so each membership test is O(1) instead of
# a scan of the whole list; `stop_words` itself is left unchanged.
stop_word_set = set(stop_words)
# Lowercase each document, split it on whitespace and filter out stop words
ttexts = [
    [word for word in document.lower().split() if word not in stop_word_set]
    for document in ttext_corpus
]
# Count how often each token occurs across the test corpus
tfrequency = defaultdict(int)
for text in ttexts:
    for token in text:
        tfrequency[token] += 1
# Only keep tokens that appear more than once.
# NOTE(review): the frequency threshold is computed on the test documents
# themselves, not on the training vocabulary - confirm this is intended.
tprocessed_corpus = [[token for token in text if tfrequency[token] > 1] for text in ttexts]
pprint.pprint(tprocessed_corpus)

[['diante', 'vacina', 'vacinação', 'contra', 'covid'],
 ['bom', 'vacina', 'contra', 'covid', 'desenvolver'],
 ['considerar', 'vacina', 'contra', 'covid', 'poder', 'considerar', 'seguro'],
 ['alternativa', 'acesso', 'vacina'],
 ['vacina', 'funcionar', 'contra', 'novo', 'vírus'],
 ['vacina', 'previner', 'doença', 'forma', 'grave', 'morte', 'transmissão'],
 ['temperatura', 'vacina'],
 ['país', 'poder', 'obter', 'vacina'],
 ['diante', 'impacto', 'sobre', 'outro', 'vacinação'],
 ['quanto', 'tempo', 'coronavírus', 'ativo'],
 ['existir',
  'algum',
  'direto',
  'ministério',
  'saúde',
  'tirar',
  'sobre',
  'novo',
  'coronavírus'],
 ['verdade',
  'ter',
  'uso',
  'paciente',
  'recuperar',
  'covid',
  'tratamento',
  'doença'],
 ['funcionar', 'tratamento', 'contra', 'covid'],
 ['vírus', 'causar', 'doença', 'covid', 'ar'],
 ['período', 'incubação', 'período', 'incubação', 'novo', 'coronavírus'],
 ['possível', 'contaminar', 'novo', 'coronavírus', 'meio', 'mão'],
 ['qualquer', 'hospital', 

In [11]:
# Write the filtered tokens back to the frame as space-joined strings.
# A document whose tokens were all filtered out becomes an empty string.
inputs = [' '.join(tokens) for tokens in processed_corpus]
df_all['input'] = inputs
df_all

Unnamed: 0,input,category
0,covid,covid
1,pego covid tocar objeto contaminar,covid
2,pego covid através meio,covid
3,onde começar covid,covid
4,onde covid,covid
...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo
1195,final dever realizar limpeza unidade,seloturismo
1196,,seloturismo
1197,proteção,seloturismo


In [12]:
# Write the filtered test tokens back to the frame as space-joined strings.
# A document whose tokens were all filtered out becomes an empty string.
inputs = [' '.join(tokens) for tokens in tprocessed_corpus]
df_testdata['input'] = inputs
df_testdata

Unnamed: 0,input,category
0,diante vacina vacinação contra covid,covid
1,bom vacina contra covid desenvolver,covid
2,considerar vacina contra covid poder considera...,covid
3,alternativa acesso vacina,covid
4,vacina funcionar contra novo vírus,covid
...,...,...
472,protocolo higiene selo turismo,seloturismo
473,selo turismo aplicar qualquer setor,seloturismo
474,estado aderir selo turismo,seloturismo
475,uso selo,seloturismo


In [13]:
## Continue normalizing the training input strings
# lower case (left disabled, as in the original cell)
#df_all.input = df_all.input.str.lower()

# Strip accents. The column is assigned as a whole: the previous per-row
# `df_all.input[i] = ...` was chained assignment, which pandas warns about and
# which does not write back to the frame when Copy-on-Write is enabled.
df_all['input'] = df_all['input'].map(unidecode.unidecode)

# Remove e-mail addresses (raw strings avoid invalid-escape warnings for \S, \s)
df_all['input'] = [re.sub(r'\S*@\S*\s?', '', sent) for sent in df_all['input']]

# Collapse newlines and other whitespace runs into a single space
df_all['input'] = [re.sub(r'\s+', ' ', sent) for sent in df_all['input']]

# Remove distracting single quotes
df_all['input'] = [re.sub("'", "", sent) for sent in df_all['input']]

df_all

Unnamed: 0,input,category
0,covid,covid
1,pego covid tocar objeto contaminar,covid
2,pego covid atraves meio,covid
3,onde comecar covid,covid
4,onde covid,covid
...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo
1195,final dever realizar limpeza unidade,seloturismo
1196,,seloturismo
1197,protecao,seloturismo


In [14]:
## Continue normalizing the test input strings
# lower case
df_testdata['input'] = df_testdata['input'].str.lower()

# Strip accents. The column is assigned as a whole: the previous per-row
# `df_testdata.input[i] = ...` was chained assignment, which pandas warns about
# and which does not write back to the frame when Copy-on-Write is enabled.
df_testdata['input'] = df_testdata['input'].map(unidecode.unidecode)

# Remove e-mail addresses (raw strings avoid invalid-escape warnings for \S, \s)
df_testdata['input'] = [re.sub(r'\S*@\S*\s?', '', sent) for sent in df_testdata['input']]

# Collapse newlines and other whitespace runs into a single space
df_testdata['input'] = [re.sub(r'\s+', ' ', sent) for sent in df_testdata['input']]

# Remove distracting single quotes
df_testdata['input'] = [re.sub("'", "", sent) for sent in df_testdata['input']]

df_testdata

Unnamed: 0,input,category
0,diante vacina vacinacao contra covid,covid
1,bom vacina contra covid desenvolver,covid
2,considerar vacina contra covid poder considera...,covid
3,alternativa acesso vacina,covid
4,vacina funcionar contra novo virus,covid
...,...,...
472,protocolo higiene selo turismo,seloturismo
473,selo turismo aplicar qualquer setor,seloturismo
474,estado aderir selo turismo,seloturismo
475,uso selo,seloturismo


In [15]:
# Rebuild the training corpus after normalization has finished
text_corpus = df_all.input.to_numpy()
# Hoist the stop words into a set so each membership test is O(1) instead of
# a scan of the whole list; `stop_words` itself is left unchanged.
stop_word_set = set(stop_words)
# Lowercase each document, split it on whitespace and filter out stop words
texts = [
    [word for word in document.lower().split() if word not in stop_word_set]
    for document in text_corpus
]
# Count how often each token occurs across the whole corpus
# (defaultdict is already imported in the notebook's first cell)
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep tokens that appear more than once in the corpus
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['covid'],
 ['pego', 'covid', 'tocar', 'objeto', 'contaminar'],
 ['pego', 'covid', 'atraves', 'meio'],
 ['onde', 'comecar', 'covid'],
 ['onde', 'covid'],
 ['virus', 'transmitir', 'atraves', 'meio'],
 ['significar', 'covid'],
 ['comorbidade', 'poder', 'tomar', 'vacina'],
 ['covid', 'familia', 'virus'],
 ['vacina', 'disponivel', 'covid'],
 ['crianca', 'poder', 'vacinar', 'contra', 'covid'],
 ['bom', 'vacina'],
 ['vacina', 'contra', 'covid', 'seguro'],
 ['ter', 'tomar', 'quanto', 'dose', 'vacina'],
 ['prevenir', 'contagio'],
 ['covid', 'comecar', 'china'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'grave'],
 ['covid', 'virus', 'sarscov'],
 ['covid', 'virus', 'grave'],
 ['covid'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'gripe'],
 ['covid', 'criar', 'laboratorio'],
 ['covid', 'vir'],
 ['covid', 'vir'],
 ['covid'],
 ['falar', 'covid'],
 ['covid'],
 ['querer', 'dizer', 'covid'],
 ['virus', 'familia', 'covid'],
 ['covid', 'vir', 'china'],
 ['virus', 'comum', 'familia', 'covid'],
 ['pegar',

In [16]:
# Rebuild the test corpus after normalization: tokenize and remove stop words
ttext_corpus = df_testdata.input.to_numpy()
# Hoist the stop words into a set so each membership test is O(1) instead of
# a scan of the whole list; `stop_words` itself is left unchanged.
stop_word_set = set(stop_words)
# Lowercase each document, split it on whitespace and filter out stop words
ttexts = [
    [word for word in document.lower().split() if word not in stop_word_set]
    for document in ttext_corpus
]
# Count how often each token occurs across the test corpus
tfrequency = defaultdict(int)
for text in ttexts:
    for token in text:
        tfrequency[token] += 1
# Only keep tokens that appear more than once.
# NOTE(review): the frequency threshold is computed on the test documents
# themselves, not on the training vocabulary - confirm this is intended.
tprocessed_corpus = [[token for token in text if tfrequency[token] > 1] for text in ttexts]
pprint.pprint(tprocessed_corpus)

[['diante', 'vacina', 'vacinacao', 'contra', 'covid'],
 ['bom', 'vacina', 'contra', 'covid', 'desenvolver'],
 ['considerar', 'vacina', 'contra', 'covid', 'poder', 'considerar', 'seguro'],
 ['alternativa', 'acesso', 'vacina'],
 ['vacina', 'funcionar', 'contra', 'novo', 'virus'],
 ['vacina', 'previner', 'doenca', 'forma', 'grave', 'morte', 'transmissao'],
 ['temperatura', 'vacina'],
 ['pais', 'poder', 'obter', 'vacina'],
 ['diante', 'impacto', 'sobre', 'outro', 'vacinacao'],
 ['quanto', 'tempo', 'coronavirus', 'ativo'],
 ['existir',
  'algum',
  'direto',
  'ministerio',
  'saude',
  'tirar',
  'sobre',
  'novo',
  'coronavirus'],
 ['verdade',
  'ter',
  'uso',
  'paciente',
  'recuperar',
  'covid',
  'tratamento',
  'doenca'],
 ['funcionar', 'tratamento', 'contra', 'covid'],
 ['virus', 'causar', 'doenca', 'covid', 'ar'],
 ['periodo', 'incubacao', 'periodo', 'incubacao', 'novo', 'coronavirus'],
 ['possivel', 'contaminar', 'novo', 'coronavirus', 'meio', 'mao'],
 ['qualquer', 'hospital', 

In [17]:
# Build the gensim Dictionary from the processed training corpus
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary<657 unique tokens: ['covid', 'contaminar', 'objeto', 'pego', 'tocar']...>


In [18]:
# Build a separate gensim Dictionary from the processed test corpus
tdictionary = corpora.Dictionary(tprocessed_corpus)
print(tdictionary)

Dictionary<312 unique tokens: ['contra', 'covid', 'diante', 'vacina', 'vacinacao']...>


In [19]:
# Build the bag-of-words representation of the training corpus
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)

[[(0, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (3, 1), (5, 1), (6, 1)],
 [(0, 1), (7, 1), (8, 1)],
 [(0, 1), (8, 1)],
 [(5, 1), (6, 1), (9, 1), (10, 1)],
 [(0, 1), (11, 1)],
 [(12, 1), (13, 1), (14, 1), (15, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (15, 1), (17, 1)],
 [(0, 1), (13, 1), (18, 1), (19, 1), (20, 1)],
 [(15, 1), (21, 1)],
 [(0, 1), (15, 1), (18, 1), (22, 1)],
 [(14, 1), (15, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1)],
 [(0, 1), (7, 1), (28, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (30, 1)],
 [(0, 1), (10, 1), (31, 1)],
 [(0, 1), (10, 1), (30, 1)],
 [(0, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (32, 1)],
 [(0, 1), (33, 1), (34, 1)],
 [(0, 1), (35, 1)],
 [(0, 1), (35, 1)],
 [(0, 1)],
 [(0, 1), (36, 1)],
 [(0, 1)],
 [(0, 1), (37, 1), (38, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (28, 1), (35, 1)],
 [(0, 1), (10, 1), (16, 1), (39, 1)],
 [(0, 1), (40, 1), (41, 1)],
 [(10, 1), (39, 1), (42, 1), (43, 1)],
 [(0, 1), (39, 1)],
 [(41, 1), (44, 1)

In [20]:
# Build the bag-of-words representation of the test corpus
# NOTE(review): this encodes with `tdictionary` (built from the test corpus), so
# the token ids presumably do not line up with those of `dictionary` - confirm.
tbow_corpus = [tdictionary.doc2bow(text) for text in tprocessed_corpus]
pprint.pprint(tbow_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (1, 1), (3, 1), (5, 1), (6, 1)],
 [(0, 1), (1, 1), (3, 1), (7, 2), (8, 1), (9, 1)],
 [(3, 1), (10, 1), (11, 1)],
 [(0, 1), (3, 1), (12, 1), (13, 1), (14, 1)],
 [(3, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(3, 1), (21, 1)],
 [(3, 1), (8, 1), (22, 1), (23, 1)],
 [(2, 1), (4, 1), (24, 1), (25, 1), (26, 1)],
 [(27, 1), (28, 1), (29, 1), (30, 1)],
 [(13, 1),
  (26, 1),
  (28, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1)],
 [(1, 1), (15, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)],
 [(0, 1), (1, 1), (12, 1), (40, 1)],
 [(1, 1), (14, 1), (15, 1), (43, 1), (44, 1)],
 [(13, 1), (28, 1), (45, 2), (46, 2)],
 [(13, 1), (28, 1), (47, 1), (48, 1), (49, 1), (50, 1)],
 [(8, 1), (28, 1), (37, 1), (51, 1), (52, 1), (53, 1)],
 [(54, 1), (55, 1), (56, 1), (57, 1), (58, 1)],
 [(0, 1), (1, 1), (3, 1), (41, 1)],
 [(0, 1), (1, 1), (3, 1), (59, 1), (60, 1), (61, 1), (62, 1)],
 [(3, 1), (59, 1), (60, 

In [21]:
# Create the PyCaret NLP setup on df_all, with 'input' as the text column
setup_nlp = setup(df_all, target='input', session_id=999)

Description,Value
session_id,999
Documents,1199
Vocab Size,357
Custom Stopwords,False


In [22]:
# Override PyCaret's 'text' setting with the corpus processed for Portuguese
# (by default PyCaret processes text as English)
set_config('text', processed_corpus)
get_config('text')

[['covid'],
 ['pego', 'covid', 'tocar', 'objeto', 'contaminar'],
 ['pego', 'covid', 'atraves', 'meio'],
 ['onde', 'comecar', 'covid'],
 ['onde', 'covid'],
 ['virus', 'transmitir', 'atraves', 'meio'],
 ['significar', 'covid'],
 ['comorbidade', 'poder', 'tomar', 'vacina'],
 ['covid', 'familia', 'virus'],
 ['vacina', 'disponivel', 'covid'],
 ['crianca', 'poder', 'vacinar', 'contra', 'covid'],
 ['bom', 'vacina'],
 ['vacina', 'contra', 'covid', 'seguro'],
 ['ter', 'tomar', 'quanto', 'dose', 'vacina'],
 ['prevenir', 'contagio'],
 ['covid', 'comecar', 'china'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'grave'],
 ['covid', 'virus', 'sarscov'],
 ['covid', 'virus', 'grave'],
 ['covid'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'gripe'],
 ['covid', 'criar', 'laboratorio'],
 ['covid', 'vir'],
 ['covid', 'vir'],
 ['covid'],
 ['falar', 'covid'],
 ['covid'],
 ['querer', 'dizer', 'covid'],
 ['virus', 'familia', 'covid'],
 ['covid', 'vir', 'china'],
 ['virus', 'comum', 'familia', 'covid'],
 ['pegar',

In [23]:
# Override PyCaret's 'corpus' setting with the bag-of-words processed for Portuguese
# (by default PyCaret processes text as English)
set_config('corpus', bow_corpus)
get_config('corpus')

[[(0, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (3, 1), (5, 1), (6, 1)],
 [(0, 1), (7, 1), (8, 1)],
 [(0, 1), (8, 1)],
 [(5, 1), (6, 1), (9, 1), (10, 1)],
 [(0, 1), (11, 1)],
 [(12, 1), (13, 1), (14, 1), (15, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (15, 1), (17, 1)],
 [(0, 1), (13, 1), (18, 1), (19, 1), (20, 1)],
 [(15, 1), (21, 1)],
 [(0, 1), (15, 1), (18, 1), (22, 1)],
 [(14, 1), (15, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1)],
 [(0, 1), (7, 1), (28, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (30, 1)],
 [(0, 1), (10, 1), (31, 1)],
 [(0, 1), (10, 1), (30, 1)],
 [(0, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (32, 1)],
 [(0, 1), (33, 1), (34, 1)],
 [(0, 1), (35, 1)],
 [(0, 1), (35, 1)],
 [(0, 1)],
 [(0, 1), (36, 1)],
 [(0, 1)],
 [(0, 1), (37, 1), (38, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (28, 1), (35, 1)],
 [(0, 1), (10, 1), (16, 1), (39, 1)],
 [(0, 1), (40, 1), (41, 1)],
 [(10, 1), (39, 1), (42, 1), (43, 1)],
 [(0, 1), (39, 1)],
 [(41, 1), (44, 1)

In [24]:
# Override PyCaret's 'data_' setting with the frame processed for Portuguese
# (by default PyCaret processes text as English)
set_config('data_', df_all)
get_config('data_')

Unnamed: 0,input,category
0,covid,covid
1,pego covid tocar objeto contaminar,covid
2,pego covid atraves meio,covid
3,onde comecar covid,covid
4,onde covid,covid
...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo
1195,final dever realizar limpeza unidade,seloturismo
1196,,seloturismo
1197,protecao,seloturismo


In [25]:
# Override PyCaret's 'id2word' dictionary with the one processed for Portuguese
# (by default PyCaret processes text as English)
set_config('id2word', dictionary)
d = get_config('id2word')
print(d)

Dictionary<657 unique tokens: ['covid', 'contaminar', 'objeto', 'pego', 'tocar']...>


In [26]:
# modelos disponíveis no Pycaret
#models()

In [27]:
# Latent Dirichlet Allocation
#lda = create_model('lda', num_topics=3, multi_core=True)
#print(lda)


In [28]:
# Latent Semantic Indexing
# aparentemente algum erro no pacote
#lsi = create_model('lsi', num_topics=3, multi_core=True)
#print(lsi)


In [29]:
# Hierarchical Dirichlet Process
#hdp = create_model('hdp', num_topics=3, multi_core=True)
#print(hdp)


In [30]:
# Random Projections
#rp = create_model('rp', num_topics=3, multi_core=True)
#print(rp)


In [32]:
# Non-Negative Matrix Factorization: create a topic model with 3 topics
nmf = create_model('nmf', num_topics=3, multi_core=True)
print(nmf)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=3, random_state=999, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)


In [33]:
# Avaliação do modelo
#evaluate_model(lda)

In [34]:
# Avaliação do modelo
#evaluate_model(lsi)

In [35]:
# Avaliação do modelo
#evaluate_model(hdp)

In [36]:
# Avaliação do modelo
#evaluate_model(rp)

In [37]:
# Evaluate the NMF model
evaluate_model(nmf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [38]:
# Resultados do modelo
#lda_results = assign_model(lda)
#lda_results

In [39]:
# Resultados do modelo
#lsi_results = assign_model(lsi)
#lsi_results

In [40]:
# Resultados do modelo
#hdp_results = assign_model(hdp)
#hdp_results

In [41]:
# Resultados do modelo
#rp_results = assign_model(rp)
#rp_results

In [42]:
# Model results: assign the NMF topics to the documents
nmf_results = assign_model(nmf)
nmf_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic,Perc_Dominant_Topic
0,covid,covid,0.000000,0.473614,0.000000,Topic 1,1.00
1,pego covid tocar objeto contaminar,covid,0.000000,0.058017,0.000000,Topic 1,1.00
2,pego covid atraves meio,covid,0.000000,0.073348,0.000000,Topic 1,1.00
3,onde comecar covid,covid,0.000000,0.117244,0.001173,Topic 1,0.99
4,onde covid,covid,0.000000,0.206630,0.002319,Topic 1,0.99
...,...,...,...,...,...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo,0.000000,0.000000,0.009163,Topic 2,1.00
1195,final dever realizar limpeza unidade,seloturismo,0.000066,0.001263,0.001381,Topic 2,0.51
1196,,seloturismo,0.000000,0.000000,0.000000,Topic 0,
1197,protecao,seloturismo,0.000000,0.000000,0.005243,Topic 2,1.00


In [43]:
## WARNING ##
# This is a hack: the topic -> label mapping is hard-coded.
# TODO: query the labels of each topic instead, and apply the other model.
nmf_results.rename(columns={'Topic_0':'tuberculose','Topic_1':'covid','Topic_2':'seloturismo'}, inplace=True)
nmf_results

Unnamed: 0,input,category,tuberculose,covid,seloturismo,Dominant_Topic,Perc_Dominant_Topic
0,covid,covid,0.000000,0.473614,0.000000,Topic 1,1.00
1,pego covid tocar objeto contaminar,covid,0.000000,0.058017,0.000000,Topic 1,1.00
2,pego covid atraves meio,covid,0.000000,0.073348,0.000000,Topic 1,1.00
3,onde comecar covid,covid,0.000000,0.117244,0.001173,Topic 1,0.99
4,onde covid,covid,0.000000,0.206630,0.002319,Topic 1,0.99
...,...,...,...,...,...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo,0.000000,0.000000,0.009163,Topic 2,1.00
1195,final dever realizar limpeza unidade,seloturismo,0.000066,0.001263,0.001381,Topic 2,0.51
1196,,seloturismo,0.000000,0.000000,0.000000,Topic 0,
1197,protecao,seloturismo,0.000000,0.000000,0.005243,Topic 2,1.00


In [44]:
# Build the classification dataset: drop the text and dominant-topic columns,
# keeping the label and the per-topic weights.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second run no longer raises KeyError for columns already dropped.
nmf_results = nmf_results.drop(
    columns=['input', 'Dominant_Topic', 'Perc_Dominant_Topic'],
    errors='ignore',
)
nmf_results

Unnamed: 0,category,tuberculose,covid,seloturismo
0,covid,0.000000,0.473614,0.000000
1,covid,0.000000,0.058017,0.000000
2,covid,0.000000,0.073348,0.000000
3,covid,0.000000,0.117244,0.001173
4,covid,0.000000,0.206630,0.002319
...,...,...,...,...
1194,seloturismo,0.000000,0.000000,0.009163
1195,seloturismo,0.000066,0.001263,0.001381
1196,seloturismo,0.000000,0.000000,0.000000
1197,seloturismo,0.000000,0.000000,0.005243


In [45]:
# Point PyCaret's 'text' setting at the processed test corpus
set_config('text', tprocessed_corpus)
get_config('text')

[['diante', 'vacina', 'vacinacao', 'contra', 'covid'],
 ['bom', 'vacina', 'contra', 'covid', 'desenvolver'],
 ['considerar', 'vacina', 'contra', 'covid', 'poder', 'considerar', 'seguro'],
 ['alternativa', 'acesso', 'vacina'],
 ['vacina', 'funcionar', 'contra', 'novo', 'virus'],
 ['vacina', 'previner', 'doenca', 'forma', 'grave', 'morte', 'transmissao'],
 ['temperatura', 'vacina'],
 ['pais', 'poder', 'obter', 'vacina'],
 ['diante', 'impacto', 'sobre', 'outro', 'vacinacao'],
 ['quanto', 'tempo', 'coronavirus', 'ativo'],
 ['existir',
  'algum',
  'direto',
  'ministerio',
  'saude',
  'tirar',
  'sobre',
  'novo',
  'coronavirus'],
 ['verdade',
  'ter',
  'uso',
  'paciente',
  'recuperar',
  'covid',
  'tratamento',
  'doenca'],
 ['funcionar', 'tratamento', 'contra', 'covid'],
 ['virus', 'causar', 'doenca', 'covid', 'ar'],
 ['periodo', 'incubacao', 'periodo', 'incubacao', 'novo', 'coronavirus'],
 ['possivel', 'contaminar', 'novo', 'coronavirus', 'meio', 'mao'],
 ['qualquer', 'hospital', 

In [46]:
# Override PyCaret's 'corpus' setting with the test bag-of-words processed for Portuguese
# (by default PyCaret processes text as English)
set_config('corpus', tbow_corpus)
get_config('corpus')

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (1, 1), (3, 1), (5, 1), (6, 1)],
 [(0, 1), (1, 1), (3, 1), (7, 2), (8, 1), (9, 1)],
 [(3, 1), (10, 1), (11, 1)],
 [(0, 1), (3, 1), (12, 1), (13, 1), (14, 1)],
 [(3, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(3, 1), (21, 1)],
 [(3, 1), (8, 1), (22, 1), (23, 1)],
 [(2, 1), (4, 1), (24, 1), (25, 1), (26, 1)],
 [(27, 1), (28, 1), (29, 1), (30, 1)],
 [(13, 1),
  (26, 1),
  (28, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1)],
 [(1, 1), (15, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1)],
 [(0, 1), (1, 1), (12, 1), (40, 1)],
 [(1, 1), (14, 1), (15, 1), (43, 1), (44, 1)],
 [(13, 1), (28, 1), (45, 2), (46, 2)],
 [(13, 1), (28, 1), (47, 1), (48, 1), (49, 1), (50, 1)],
 [(8, 1), (28, 1), (37, 1), (51, 1), (52, 1), (53, 1)],
 [(54, 1), (55, 1), (56, 1), (57, 1), (58, 1)],
 [(0, 1), (1, 1), (3, 1), (41, 1)],
 [(0, 1), (1, 1), (3, 1), (59, 1), (60, 1), (61, 1), (62, 1)],
 [(3, 1), (59, 1), (60, 

In [47]:
# Override PyCaret's 'id2word' dictionary with the test dictionary processed for Portuguese
# (by default PyCaret processes text as English)
set_config('id2word', tdictionary)
d = get_config('id2word')
print(d)

Dictionary<312 unique tokens: ['contra', 'covid', 'diante', 'vacina', 'vacinacao']...>


In [48]:
# Point PyCaret's 'data_' setting at the processed test frame
set_config('data_', df_testdata)
get_config('data_')

Unnamed: 0,input,category
0,diante vacina vacinacao contra covid,covid
1,bom vacina contra covid desenvolver,covid
2,considerar vacina contra covid poder considera...,covid
3,alternativa acesso vacina,covid
4,vacina funcionar contra novo virus,covid
...,...,...
472,protocolo higiene selo turismo,seloturismo
473,selo turismo aplicar qualquer setor,seloturismo
474,estado aderir selo turismo,seloturismo
475,uso selo,seloturismo


In [49]:
# Create a 3-topic NMF model with the configuration now pointing at the test data.
# NOTE(review): this presumably fits a second, independent NMF on the test
# documents rather than projecting them with `nmf` - confirm this is intended.
tnmf = create_model('nmf', num_topics=3, multi_core=True)
print(tnmf)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=3, random_state=999, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)


In [50]:
# Model results for the test data: assign the `tnmf` topics to the documents
nmf_results_test = assign_model(tnmf)
nmf_results_test

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic,Perc_Dominant_Topic
0,diante vacina vacinacao contra covid,covid,0.000000,0.000000,0.081986,Topic 2,1.00
1,bom vacina contra covid desenvolver,covid,0.000000,0.000000,0.079624,Topic 2,1.00
2,considerar vacina contra covid poder considera...,covid,0.000952,0.000000,0.064799,Topic 2,0.99
3,alternativa acesso vacina,covid,0.001509,0.002560,0.012262,Topic 2,0.75
4,vacina funcionar contra novo virus,covid,0.002447,0.007521,0.015967,Topic 2,0.62
...,...,...,...,...,...,...,...
472,protocolo higiene selo turismo,seloturismo,0.124177,0.000000,0.000000,Topic 0,1.00
473,selo turismo aplicar qualquer setor,seloturismo,0.087195,0.000000,0.001007,Topic 0,0.99
474,estado aderir selo turismo,seloturismo,0.117481,0.000000,0.000000,Topic 0,1.00
475,uso selo,seloturismo,0.130259,0.000000,0.003107,Topic 0,0.98


In [51]:
## WARNING ##
# This is a hack: the topic -> label mapping is hard-coded.
# TODO: query the labels of each topic according to the first trained model.
nmf_results_test.rename(columns={'Topic_0':'seloturismo','Topic_1':'tuberculose','Topic_2':'covid'}, inplace=True)
nmf_results_test

Unnamed: 0,input,category,seloturismo,tuberculose,covid,Dominant_Topic,Perc_Dominant_Topic
0,diante vacina vacinacao contra covid,covid,0.000000,0.000000,0.081986,Topic 2,1.00
1,bom vacina contra covid desenvolver,covid,0.000000,0.000000,0.079624,Topic 2,1.00
2,considerar vacina contra covid poder considera...,covid,0.000952,0.000000,0.064799,Topic 2,0.99
3,alternativa acesso vacina,covid,0.001509,0.002560,0.012262,Topic 2,0.75
4,vacina funcionar contra novo virus,covid,0.002447,0.007521,0.015967,Topic 2,0.62
...,...,...,...,...,...,...,...
472,protocolo higiene selo turismo,seloturismo,0.124177,0.000000,0.000000,Topic 0,1.00
473,selo turismo aplicar qualquer setor,seloturismo,0.087195,0.000000,0.001007,Topic 0,0.99
474,estado aderir selo turismo,seloturismo,0.117481,0.000000,0.000000,Topic 0,1.00
475,uso selo,seloturismo,0.130259,0.000000,0.003107,Topic 0,0.98


In [52]:
# Build the test dataset: drop the text and dominant-topic columns,
# keeping the label and the per-topic weights.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second run no longer raises KeyError for columns already dropped.
nmf_results_test = nmf_results_test.drop(
    columns=['input', 'Dominant_Topic', 'Perc_Dominant_Topic'],
    errors='ignore',
)
nmf_results_test

Unnamed: 0,category,seloturismo,tuberculose,covid
0,covid,0.000000,0.000000,0.081986
1,covid,0.000000,0.000000,0.079624
2,covid,0.000952,0.000000,0.064799
3,covid,0.001509,0.002560,0.012262
4,covid,0.002447,0.007521,0.015967
...,...,...,...,...
472,seloturismo,0.124177,0.000000,0.000000
473,seloturismo,0.087195,0.000000,0.001007
474,seloturismo,0.117481,0.000000,0.000000
475,seloturismo,0.130259,0.000000,0.003107


In [53]:
# NOTE(review): the notebook also star-imports pycaret.nlp at the top; this star import
# presumably rebinds same-named functions (e.g. `setup`), so it has to run after the
# NLP cells -- confirm before moving it to the import cell.
from pycaret.classification import *

# Create the PyCaret classification setup: `nmf_results` is the training data,
# `nmf_results_test` is passed as the test data, `category` is the target and the
# session id is fixed.
setup_class = setup(
    data=nmf_results,
    target='category',
    test_data=nmf_results_test,
    session_id=9999,
)

Unnamed: 0,Description,Value
0,session_id,9999
1,Target,category
2,Target Type,Multiclass
3,Label Encoded,"covid: 0, seloturismo: 1, tuberculose: 2"
4,Original Data,"(1199, 4)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [54]:
# Compare the candidate models and keep the 5 with the highest Accuracy
top5 = compare_models(n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9373,0.9826,0.9212,0.9435,0.9385,0.893,0.8947,0.054
catboost,CatBoost Classifier,0.9365,0.983,0.9238,0.9438,0.9378,0.8925,0.8948,0.817
knn,K Neighbors Classifier,0.9356,0.9711,0.9151,0.941,0.9362,0.8886,0.8901,0.008
et,Extra Trees Classifier,0.9356,0.986,0.9187,0.9402,0.9363,0.8892,0.8905,0.046
gbc,Gradient Boosting Classifier,0.9323,0.9859,0.917,0.939,0.933,0.884,0.8861,0.079
dt,Decision Tree Classifier,0.929,0.9443,0.9132,0.9341,0.9296,0.879,0.8809,0.003
xgboost,Extreme Gradient Boosting,0.9223,0.981,0.9086,0.93,0.9238,0.8679,0.87,0.066
lightgbm,Light Gradient Boosting Machine,0.9215,0.9815,0.9043,0.927,0.9223,0.8658,0.8677,0.04
nb,Naive Bayes,0.919,0.9837,0.8693,0.9238,0.9126,0.8501,0.8601,0.003
qda,Quadratic Discriminant Analysis,0.919,0.9835,0.8723,0.9228,0.9139,0.8516,0.8595,0.006


In [55]:
# Blend the five selected models into a single ensemble
blender = blend_models(estimator_list=top5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9417,0.9858,0.9357,0.9421,0.9418,0.9007,0.9008
1,0.975,0.9982,0.9714,0.9779,0.9754,0.9571,0.9579
2,0.9333,0.989,0.9227,0.9341,0.933,0.8858,0.8865
3,0.975,0.9942,0.9662,0.9755,0.9748,0.9565,0.9569
4,0.9917,0.9998,0.9855,0.992,0.9916,0.9856,0.9857
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9583,0.9975,0.9301,0.9581,0.9571,0.9267,0.9279
7,0.9667,0.9974,0.9446,0.9663,0.966,0.9417,0.9422
8,0.9083,0.9693,0.8718,0.9108,0.9076,0.8391,0.8416
9,0.7647,0.9274,0.7352,0.8091,0.7747,0.6035,0.6121


In [56]:
# Stack the five selected models
stacker = stack_models(estimator_list=top5)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9333,0.9798,0.9218,0.9333,0.9333,0.8859,0.8859
1,0.9833,0.9998,0.9762,0.9838,0.9831,0.971,0.9714
2,0.9417,0.9892,0.9275,0.9414,0.9412,0.8995,0.8999
3,0.975,0.9944,0.9662,0.9755,0.9748,0.9565,0.9569
4,0.9917,0.9996,0.9855,0.992,0.9916,0.9856,0.9857
5,0.9917,1.0,0.9855,0.992,0.9916,0.9856,0.9857
6,0.9583,0.9968,0.9301,0.9581,0.9571,0.9267,0.9279
7,0.9667,0.9978,0.9446,0.9663,0.966,0.9417,0.9422
8,0.9,0.9698,0.8573,0.9022,0.8986,0.8235,0.8268
9,0.7479,0.9185,0.7158,0.7948,0.7595,0.5752,0.5833


In [57]:
# AutoML: retrieve the best model of the session, optimizing for Accuracy,
# and print its configuration
best = automl(optimize='Accuracy')
print(best)

VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
        

In [58]:
# Calibrate the selected model, then score it (predict_model is called without
# explicit data) and display the resulting predictions
best_calib = calibrate_model(best)
predict_model(best_calib)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9417,0.9861,0.9357,0.9421,0.9418,0.9007,0.9008
1,0.975,0.9979,0.9714,0.9763,0.975,0.9568,0.9573
2,0.95,0.9903,0.9324,0.9495,0.9492,0.9134,0.9139
3,0.975,0.9931,0.9662,0.9755,0.9748,0.9565,0.9569
4,0.9917,0.9998,0.9855,0.992,0.9916,0.9856,0.9857
5,0.9917,1.0,0.9855,0.992,0.9916,0.9856,0.9857
6,0.9583,0.997,0.9301,0.9581,0.9571,0.9267,0.9279
7,0.9667,0.9967,0.9446,0.9663,0.966,0.9417,0.9422
8,0.9083,0.969,0.8718,0.9108,0.9076,0.8391,0.8416
9,0.7647,0.9247,0.7352,0.8142,0.7765,0.6056,0.6154


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7736,0.927,0.7736,0.79,0.7766,0.6604,0.6652


Unnamed: 0,tuberculose,covid,seloturismo,category,Label,Score
0,0.000000,0.081986,0.000000,covid,covid,0.9534
1,0.000000,0.079624,0.000000,covid,covid,0.9534
2,0.000000,0.064799,0.000952,covid,covid,0.9502
3,0.002560,0.012262,0.001509,covid,covid,0.9339
4,0.007521,0.015967,0.002447,covid,covid,0.9243
...,...,...,...,...,...,...
472,0.000000,0.000000,0.124177,seloturismo,seloturismo,0.9634
473,0.000000,0.001007,0.087195,seloturismo,seloturismo,0.9634
474,0.000000,0.000000,0.117481,seloturismo,seloturismo,0.9634
475,0.000000,0.003107,0.130259,seloturismo,seloturismo,0.9633


In [59]:
# Model evaluation: opens the interactive plot selector for the calibrated model
evaluate_model(best_calib)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [60]:
# Prediction on the validation data (predict_model is called without explicit data);
# the scored frame is kept in `valid` and displayed
valid = predict_model(best_calib)
valid

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7736,0.927,0.7736,0.79,0.7766,0.6604,0.6652


Unnamed: 0,tuberculose,covid,seloturismo,category,Label,Score
0,0.000000,0.081986,0.000000,covid,covid,0.9534
1,0.000000,0.079624,0.000000,covid,covid,0.9534
2,0.000000,0.064799,0.000952,covid,covid,0.9502
3,0.002560,0.012262,0.001509,covid,covid,0.9339
4,0.007521,0.015967,0.002447,covid,covid,0.9243
...,...,...,...,...,...,...
472,0.000000,0.000000,0.124177,seloturismo,seloturismo,0.9634
473,0.000000,0.001007,0.087195,seloturismo,seloturismo,0.9634
474,0.000000,0.000000,0.117481,seloturismo,seloturismo,0.9634
475,0.000000,0.003107,0.130259,seloturismo,seloturismo,0.9633


In [61]:
# Finalize the calibrated model and display the resulting estimator.
# NOTE(review): PyCaret documents finalize_model as refitting on the complete dataset,
# hold-out included -- confirm this is the intended last step before deployment.
final = finalize_model(best_calib)
final

CalibratedClassifierCV(base_estimator=VotingClassifier(estimators=[('rf',
                                                                    RandomForestClassifier(bootstrap=True,
                                                                                           ccp_alpha=0.0,
                                                                                           class_weight=None,
                                                                                           criterion='gini',
                                                                                           max_depth=None,
                                                                                           max_features='auto',
                                                                                           max_leaf_nodes=None,
                                                                                           max_samples=None,
                                                      