In [1]:
import numpy as np
import pandas as pd
from pycaret.nlp import *
import nltk
from nltk.corpus import stopwords
import unicodedata
import unidecode
import re
import spacy
import pprint
from gensim import corpora

In [2]:
# download das stopwords para o idioma português
nltk.download('stopwords')
stop_words = stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# montagem do dataset
df_covid = pd.read_csv('data/exemplos-treinamento-covid.csv', delimiter=';')
df_seloturismo = pd.read_csv('data/exemplos-treinamento-seloturismo.csv', delimiter=';')
df_tuberculose = pd.read_csv('data/exemplos-treinamento-tuberculose.csv', delimiter=';')
df_teste = pd.read_csv('data/dados-testes-experimentos.csv', delimiter=';')

df_all = pd.concat([df_covid, df_seloturismo, df_tuberculose, df_teste], axis=0)
df_all.reset_index(drop=True, inplace=True)
df_all


Unnamed: 0,input,category
0,dor no abdomen é covid?,covid
1,Pego covid tocando numa objeto_contaminado,covid
2,Pego covid através de meio,covid
3,onde começou o covid,covid
4,onde coemçou a infestação do covid,covid
...,...,...
1194,Para os setores de Meios de Hospedagem as prem...,seloturismo
1195,"Ao final da estada do hóspede, deverá ser real...",seloturismo
1196,É proibido pernoitar na embarcação,seloturismo
1197,Fornecer Equipamentos de Proteção Individual (...,seloturismo


In [4]:
# Normalização string de entrada 
# retira pontuação
df_all.input = df_all.input.str.replace(r'[^\w\s]+', '')

# remove numeração
df_all.input = df_all.input.str.replace(r'[0-9]+', '')

# remove underscore
df_all.input = df_all.input.str.replace('_', ' ')
df_all

  df_all.input = df_all.input.str.replace(r'[^\w\s]+', '')
  df_all.input = df_all.input.str.replace(r'[0-9]+', '')


Unnamed: 0,input,category
0,dor no abdomen é covid,covid
1,Pego covid tocando numa objeto contaminado,covid
2,Pego covid através de meio,covid
3,onde começou o covid,covid
4,onde coemçou a infestação do covid,covid
...,...,...
1194,Para os setores de Meios de Hospedagem as prem...,seloturismo
1195,Ao final da estada do hóspede deverá ser reali...,seloturismo
1196,É proibido pernoitar na embarcação,seloturismo
1197,Fornecer Equipamentos de Proteção Individual E...,seloturismo


In [5]:
# Lematização string de entrada
## Verificar o uso do spacy pois tem em português
nlp = spacy.load("pt_core_news_lg")
lem = nlp.get_pipe("lemmatizer")

inputs = []
for doc in df_all.input:
    d = nlp(doc)
    s = ' '.join([token.lemma_ for token in d])       
    inputs.append(s)
df_all['input'] = inputs
df_all



Unnamed: 0,input,category
0,dor em o abdomen ser Covid,covid
1,pego Covid tocar em um objeto contaminar,covid
2,pego Covid através de meio,covid
3,onde começar o Covid,covid
4,onde coemçar o infestação de o Covid,covid
...,...,...
1194,para o setor de Meios de Hospedagem o premissa...,seloturismo
1195,a o final de o estada de o hóspede dever ser r...,seloturismo
1196,ser proibir pernoitar em o embarcação,seloturismo
1197,Fornecer Equipamentos de Proteção Individual E...,seloturismo


In [6]:
# geração de corpus com dataset, remoção de stopwords e tokenização
text_corpus = df_all.input.to_numpy()
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in text_corpus]
# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['covid'],
 ['pego', 'covid', 'tocar', 'objeto', 'contaminar'],
 ['pego', 'covid', 'através', 'meio'],
 ['onde', 'começar', 'covid'],
 ['onde', 'covid'],
 ['vírus', 'transmitir', 'através', 'meio'],
 ['significar', 'covid'],
 ['comorbidade', 'poder', 'tomar', 'vacina'],
 ['covid', 'família', 'vírus'],
 ['vacina', 'disponível', 'covid'],
 ['criança', 'poder', 'vacinar', 'contra', 'covid'],
 ['bom', 'vacina'],
 ['vacina', 'contra', 'covid', 'seguro'],
 ['ter', 'tomar', 'quanto', 'dose', 'vacina'],
 ['prevenir', 'contagio'],
 ['covid', 'começar', 'china'],
 ['covid', 'doença', 'grave'],
 ['covid', 'grave'],
 ['covid', 'virus', 'sarscov'],
 ['covid', 'vírus', 'grave'],
 ['covid'],
 ['covid', 'doença', 'grave'],
 ['covid', 'gripe'],
 ['covid', 'criar', 'laboratorio'],
 ['covid', 'vir'],
 ['covid', 'vir'],
 ['covid'],
 ['falar', 'covid'],
 ['covid'],
 ['querer', 'dizer', 'covid'],
 ['virus', 'família', 'covid'],
 ['covid', 'vir', 'china'],
 ['virus', 'comum', 'família', 'covid'],
 ['pegar',

In [7]:
# atualização de strings de entrada pós criação de corpus
inputs = []
for arr in processed_corpus:
    s = ' '.join([w for w in arr])         
    inputs.append(s)
inputs
df_all['input'] = inputs
df_all

Unnamed: 0,input,category
0,covid,covid
1,pego covid tocar objeto contaminar,covid
2,pego covid através meio,covid
3,onde começar covid,covid
4,onde covid,covid
...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo
1195,final dever realizar limpeza unidade,seloturismo
1196,,seloturismo
1197,proteção,seloturismo


In [8]:
## Continuação da normalização da string de entrada

# retira acentuação
for i in df_all.index:
    df_all.input[i] = unidecode.unidecode(df_all.input[i])

# Remove Emails
df_all.input = [re.sub('\S*@\S*\s?', '', sent) for sent in df_all.input]

# Remove new line characters
df_all.input = [re.sub('\s+', ' ', sent) for sent in df_all.input]

# Remove distracting single quotes
df_all.input = [re.sub("\'", "", sent) for sent in df_all.input]

df_all

Unnamed: 0,input,category
0,covid,covid
1,pego covid tocar objeto contaminar,covid
2,pego covid atraves meio,covid
3,onde comecar covid,covid
4,onde covid,covid
...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo
1195,final dever realizar limpeza unidade,seloturismo
1196,,seloturismo
1197,protecao,seloturismo


In [9]:
# atualização do corpus pós término da normalização
text_corpus = df_all.input.to_numpy()
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in text_corpus]
# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['covid'],
 ['pego', 'covid', 'tocar', 'objeto', 'contaminar'],
 ['pego', 'covid', 'atraves', 'meio'],
 ['onde', 'comecar', 'covid'],
 ['onde', 'covid'],
 ['virus', 'transmitir', 'atraves', 'meio'],
 ['significar', 'covid'],
 ['comorbidade', 'poder', 'tomar', 'vacina'],
 ['covid', 'familia', 'virus'],
 ['vacina', 'disponivel', 'covid'],
 ['crianca', 'poder', 'vacinar', 'contra', 'covid'],
 ['bom', 'vacina'],
 ['vacina', 'contra', 'covid', 'seguro'],
 ['ter', 'tomar', 'quanto', 'dose', 'vacina'],
 ['prevenir', 'contagio'],
 ['covid', 'comecar', 'china'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'grave'],
 ['covid', 'virus', 'sarscov'],
 ['covid', 'virus', 'grave'],
 ['covid'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'gripe'],
 ['covid', 'criar', 'laboratorio'],
 ['covid', 'vir'],
 ['covid', 'vir'],
 ['covid'],
 ['falar', 'covid'],
 ['covid'],
 ['querer', 'dizer', 'covid'],
 ['virus', 'familia', 'covid'],
 ['covid', 'vir', 'china'],
 ['virus', 'comum', 'familia', 'covid'],
 ['pegar',

In [10]:
# criação de dicionário
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary<657 unique tokens: ['covid', 'contaminar', 'objeto', 'pego', 'tocar']...>


In [11]:
# criação de bag of words com corpus
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)

[[(0, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (3, 1), (5, 1), (6, 1)],
 [(0, 1), (7, 1), (8, 1)],
 [(0, 1), (8, 1)],
 [(5, 1), (6, 1), (9, 1), (10, 1)],
 [(0, 1), (11, 1)],
 [(12, 1), (13, 1), (14, 1), (15, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (15, 1), (17, 1)],
 [(0, 1), (13, 1), (18, 1), (19, 1), (20, 1)],
 [(15, 1), (21, 1)],
 [(0, 1), (15, 1), (18, 1), (22, 1)],
 [(14, 1), (15, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1)],
 [(0, 1), (7, 1), (28, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (30, 1)],
 [(0, 1), (10, 1), (31, 1)],
 [(0, 1), (10, 1), (30, 1)],
 [(0, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (32, 1)],
 [(0, 1), (33, 1), (34, 1)],
 [(0, 1), (35, 1)],
 [(0, 1), (35, 1)],
 [(0, 1)],
 [(0, 1), (36, 1)],
 [(0, 1)],
 [(0, 1), (37, 1), (38, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (28, 1), (35, 1)],
 [(0, 1), (10, 1), (16, 1), (39, 1)],
 [(0, 1), (40, 1), (41, 1)],
 [(10, 1), (39, 1), (42, 1), (43, 1)],
 [(0, 1), (39, 1)],
 [(41, 1), (44, 1)

In [12]:
# criação do setup Pycaret para NLP
setup_nlp = setup(df_all, target='input')

Description,Value
session_id,1503
Documents,1199
Vocab Size,357
Custom Stopwords,False


In [13]:
# alteração da configuração do text, para o processado em português (por padrão o Pycaret processa em inglês)
set_config('text', processed_corpus)
get_config('text')

[['covid'],
 ['pego', 'covid', 'tocar', 'objeto', 'contaminar'],
 ['pego', 'covid', 'atraves', 'meio'],
 ['onde', 'comecar', 'covid'],
 ['onde', 'covid'],
 ['virus', 'transmitir', 'atraves', 'meio'],
 ['significar', 'covid'],
 ['comorbidade', 'poder', 'tomar', 'vacina'],
 ['covid', 'familia', 'virus'],
 ['vacina', 'disponivel', 'covid'],
 ['crianca', 'poder', 'vacinar', 'contra', 'covid'],
 ['bom', 'vacina'],
 ['vacina', 'contra', 'covid', 'seguro'],
 ['ter', 'tomar', 'quanto', 'dose', 'vacina'],
 ['prevenir', 'contagio'],
 ['covid', 'comecar', 'china'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'grave'],
 ['covid', 'virus', 'sarscov'],
 ['covid', 'virus', 'grave'],
 ['covid'],
 ['covid', 'doenca', 'grave'],
 ['covid', 'gripe'],
 ['covid', 'criar', 'laboratorio'],
 ['covid', 'vir'],
 ['covid', 'vir'],
 ['covid'],
 ['falar', 'covid'],
 ['covid'],
 ['querer', 'dizer', 'covid'],
 ['virus', 'familia', 'covid'],
 ['covid', 'vir', 'china'],
 ['virus', 'comum', 'familia', 'covid'],
 ['pegar',

In [14]:
# alteração da configuração do corpus, para o processado em português (por padrão o Pycaret processa em inglês)
set_config('corpus', bow_corpus)
get_config('corpus')

[[(0, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 1), (3, 1), (5, 1), (6, 1)],
 [(0, 1), (7, 1), (8, 1)],
 [(0, 1), (8, 1)],
 [(5, 1), (6, 1), (9, 1), (10, 1)],
 [(0, 1), (11, 1)],
 [(12, 1), (13, 1), (14, 1), (15, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (15, 1), (17, 1)],
 [(0, 1), (13, 1), (18, 1), (19, 1), (20, 1)],
 [(15, 1), (21, 1)],
 [(0, 1), (15, 1), (18, 1), (22, 1)],
 [(14, 1), (15, 1), (23, 1), (24, 1), (25, 1)],
 [(26, 1), (27, 1)],
 [(0, 1), (7, 1), (28, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (30, 1)],
 [(0, 1), (10, 1), (31, 1)],
 [(0, 1), (10, 1), (30, 1)],
 [(0, 1)],
 [(0, 1), (29, 1), (30, 1)],
 [(0, 1), (32, 1)],
 [(0, 1), (33, 1), (34, 1)],
 [(0, 1), (35, 1)],
 [(0, 1), (35, 1)],
 [(0, 1)],
 [(0, 1), (36, 1)],
 [(0, 1)],
 [(0, 1), (37, 1), (38, 1)],
 [(0, 1), (10, 1), (16, 1)],
 [(0, 1), (28, 1), (35, 1)],
 [(0, 1), (10, 1), (16, 1), (39, 1)],
 [(0, 1), (40, 1), (41, 1)],
 [(10, 1), (39, 1), (42, 1), (43, 1)],
 [(0, 1), (39, 1)],
 [(41, 1), (44, 1)

In [15]:
# alteração da configuração do data_, para o processado em português (por padrão o Pycaret processa em inglês)
set_config('data_', df_all)
get_config('data_')

Unnamed: 0,input,category
0,covid,covid
1,pego covid tocar objeto contaminar,covid
2,pego covid atraves meio,covid
3,onde comecar covid,covid
4,onde covid,covid
...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo
1195,final dever realizar limpeza unidade,seloturismo
1196,,seloturismo
1197,protecao,seloturismo


In [16]:
# alteração da configuração do dicionário, para o processado em português (por padrão o Pycaret processa em inglês)
set_config('id2word', dictionary)
d = get_config('id2word')
print(d)

Dictionary<657 unique tokens: ['covid', 'contaminar', 'objeto', 'pego', 'tocar']...>


In [17]:
# modelos disponíveis no Pycaret
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
lda,Latent Dirichlet Allocation,gensim/models/ldamodel
lsi,Latent Semantic Indexing,gensim/models/lsimodel
hdp,Hierarchical Dirichlet Process,gensim/models/hdpmodel
rp,Random Projections,gensim/models/rpmodel
nmf,Non-Negative Matrix Factorization,sklearn.decomposition.NMF


In [18]:
# Latent Dirichlet Allocation
lda = create_model('lda', num_topics=3, multi_core=True)
print(lda)


LdaMulticore<num_terms=657, num_topics=3, decay=0.5, chunksize=100>


In [19]:
# Latent Semantic Indexing
# aparentemente algum erro no pacote
lsi = create_model('lsi', num_topics=3, multi_core=True)
print(lsi)


LsiModel<num_terms=657, num_topics=3, decay=1.0, chunksize=20000>


In [20]:
# Hierarchical Dirichlet Process
hdp = create_model('hdp', num_topics=3, multi_core=True)
print(hdp)


<gensim.models.hdpmodel.HdpModel object at 0x000002CDF32D5850>


In [21]:
# Random Projections
rp = create_model('rp', num_topics=3, multi_core=True)
print(rp)


RpModel<num_terms=657, num_topics=3>


In [22]:
# Non-Negative Matrix Factorization
nmf = create_model('nmf', num_topics=3, multi_core=True)
print(nmf)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=3, random_state=1503, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)


In [23]:
# Avaliação do modelo
evaluate_model(lda)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [24]:
# Avaliação do modelo
#evaluate_model(lsi)

In [25]:
# Avaliação do modelo
evaluate_model(hdp)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [26]:
# Avaliação do modelo
evaluate_model(rp)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [27]:
# Avaliação do modelo
evaluate_model(nmf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

In [28]:
# Resultados do modelo
lda_results = assign_model(lda)
lda_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic,Perc_Dominant_Topic
0,covid,covid,0.185142,0.648167,0.166691,Topic 1,0.65
1,pego covid tocar objeto contaminar,covid,0.057223,0.887099,0.055678,Topic 1,0.89
2,pego covid atraves meio,covid,0.068073,0.865104,0.066822,Topic 1,0.87
3,onde comecar covid,covid,0.085708,0.830843,0.083449,Topic 1,0.83
4,onde covid,covid,0.115830,0.773017,0.111153,Topic 1,0.77
...,...,...,...,...,...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo,0.055598,0.888786,0.055616,Topic 1,0.89
1195,final dever realizar limpeza unidade,seloturismo,0.825220,0.056066,0.118714,Topic 0,0.83
1196,,seloturismo,0.333333,0.333333,0.333333,Topic 0,0.33
1197,protecao,seloturismo,0.166773,0.666357,0.166870,Topic 1,0.67


In [29]:
# Resultados do modelo
#lsi_results = assign_model(lsi)
#lsi_results

In [30]:
# Resultados do modelo
hdp_results = assign_model(hdp)
hdp_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic
0,covid,covid,0.164791,0.644363,0.190846,Topic 1
1,pego covid tocar objeto contaminar,covid,0.297691,0.472568,0.229741,Topic 1
2,pego covid atraves meio,covid,0.076817,0.345622,0.577561,Topic 2
3,onde comecar covid,covid,0.640749,0.264275,0.094977,Topic 0
4,onde covid,covid,0.128802,0.748667,0.122531,Topic 1
...,...,...,...,...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo,0.853460,0.070993,0.075547,Topic 0
1195,final dever realizar limpeza unidade,seloturismo,0.862665,0.074345,0.062991,Topic 0
1196,,seloturismo,,,,
1197,protecao,seloturismo,0.172949,0.604878,0.222173,Topic 1


In [31]:
# Resultados do modelo
rp_results = assign_model(rp)
rp_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic
0,covid,covid,0.577350,0.577350,-0.577350,Topic 0
1,pego covid tocar objeto contaminar,covid,0.577350,0.577350,0.577350,Topic 0
2,pego covid atraves meio,covid,-1.154701,0.000000,-1.154701,Topic 1
3,onde comecar covid,covid,0.577350,1.732051,-0.577350,Topic 1
4,onde covid,covid,1.154701,1.154701,-1.154701,Topic 0
...,...,...,...,...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo,-0.577350,0.577350,1.732051,Topic 2
1195,final dever realizar limpeza unidade,seloturismo,-0.577350,-0.577350,-1.732051,Topic 0
1196,,seloturismo,,,,
1197,protecao,seloturismo,0.577350,0.577350,0.577350,Topic 0


In [32]:
# Resultados do modelo
nmf_results = assign_model(nmf)
nmf_results

Unnamed: 0,input,category,Topic_0,Topic_1,Topic_2,Dominant_Topic,Perc_Dominant_Topic
0,covid,covid,0.000000,0.473615,0.000000,Topic 1,1.00
1,pego covid tocar objeto contaminar,covid,0.000000,0.058017,0.000000,Topic 1,1.00
2,pego covid atraves meio,covid,0.000000,0.073348,0.000000,Topic 1,1.00
3,onde comecar covid,covid,0.000000,0.117244,0.001173,Topic 1,0.99
4,onde covid,covid,0.000000,0.206631,0.002319,Topic 1,0.99
...,...,...,...,...,...,...,...
1194,setor meios hospedagem recomendar setor,seloturismo,0.000000,0.000000,0.009162,Topic 2,1.00
1195,final dever realizar limpeza unidade,seloturismo,0.000066,0.001263,0.001381,Topic 2,0.51
1196,,seloturismo,0.000000,0.000000,0.000000,Topic 0,
1197,protecao,seloturismo,0.000000,0.000000,0.005243,Topic 2,1.00


In [33]:
# Criação de dataset para classificação
nmf_results.drop(['input','Dominant_Topic', 'Perc_Dominant_Topic'], axis=1, inplace=True)
nmf_results

Unnamed: 0,category,Topic_0,Topic_1,Topic_2
0,covid,0.000000,0.473615,0.000000
1,covid,0.000000,0.058017,0.000000
2,covid,0.000000,0.073348,0.000000
3,covid,0.000000,0.117244,0.001173
4,covid,0.000000,0.206631,0.002319
...,...,...,...,...
1194,seloturismo,0.000000,0.000000,0.009162
1195,seloturismo,0.000066,0.001263,0.001381
1196,seloturismo,0.000000,0.000000,0.000000
1197,seloturismo,0.000000,0.000000,0.005243


In [34]:
from pycaret.classification import *

# # criação do setup Pycaret para Classificação
setup_class = setup(data=nmf_results, target='category', train_size=0.7)

Unnamed: 0,Description,Value
0,session_id,3895
1,Target,category
2,Target Type,Multiclass
3,Label Encoded,"covid: 0, seloturismo: 1, tuberculose: 2"
4,Original Data,"(1199, 4)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [35]:
# comparação de modelos e salva o de maior Accuracy
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9464,0.99,0.9242,0.9478,0.9459,0.9061,0.9073,0.042
catboost,CatBoost Classifier,0.9392,0.9905,0.921,0.9412,0.939,0.8944,0.8955,0.762
rf,Random Forest Classifier,0.938,0.9894,0.9172,0.9399,0.9378,0.8921,0.8932,0.049
knn,K Neighbors Classifier,0.9356,0.9769,0.9087,0.9368,0.9349,0.8867,0.8882,0.006
gbc,Gradient Boosting Classifier,0.9356,0.9901,0.9184,0.9382,0.9357,0.8883,0.8893,0.069
xgboost,Extreme Gradient Boosting,0.9344,0.9862,0.9196,0.9368,0.9346,0.8869,0.8877,0.056
lightgbm,Light Gradient Boosting Machine,0.9344,0.9887,0.9131,0.9361,0.9339,0.8858,0.887,0.027
dt,Decision Tree Classifier,0.9273,0.9407,0.9063,0.9301,0.9274,0.874,0.8752,0.005
qda,Quadratic Discriminant Analysis,0.9177,0.9874,0.8674,0.9253,0.9134,0.8511,0.86,0.004
nb,Naive Bayes,0.9142,0.9866,0.8612,0.9225,0.9094,0.8441,0.8539,0.003


In [36]:
# Avaliação do modelo
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [37]:
# Predição com os dados de validação
predict_model(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.95,0.9937,0.9455,0.953,0.9506,0.9141,0.9148


Unnamed: 0,Topic_0,Topic_1,Topic_2,category,Label,Score
0,0.040895,0.000118,0.000023,tuberculose,tuberculose,0.98
1,0.000085,0.110319,0.000000,covid,covid,1.00
2,0.000000,0.000000,0.026939,seloturismo,seloturismo,1.00
3,0.000000,0.000000,0.063151,seloturismo,seloturismo,1.00
4,0.056054,0.002482,0.000000,tuberculose,tuberculose,0.98
...,...,...,...,...,...,...
355,0.000000,0.000000,0.051404,seloturismo,seloturismo,1.00
356,0.061643,0.007615,0.006979,tuberculose,tuberculose,0.97
357,0.142097,0.000000,0.000000,tuberculose,tuberculose,1.00
358,0.000000,0.001884,0.052354,seloturismo,seloturismo,1.00
