In [1]:
#import sys
#!{sys.executable} -m pip install BitVector

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords
import time
from pycaret.classification import *

import warnings
warnings.filterwarnings('ignore')

In [3]:
nlp = spacy.load('pt_core_news_lg')
parser = Portuguese()

# Preparação dos dados

In [4]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation for Binary
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

In [5]:
# Lemmatizing input string
nlp = spacy.load("pt_core_news_lg")
lem = nlp.get_pipe("lemmatizer")

def lemmatize(data):
    inputs = []
    for doc in data.input:
        d = nlp(doc)
        s = ' '.join([token.lemma_ for token in d])       
        inputs.append(s)
    data['input'] = inputs
    return data

traindata = lemmatize(df_traindata)
testdata = lemmatize(df_testdata)

In [6]:
# Normalizing and TF-IDF preparation
stop_words = stopwords.words('portuguese')

tv = TfidfVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(traindata['input'])
test_tf_idf = tv.transform(testdata['input'])

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names())
traindata_vect['target_cat'] = traindata.reset_index().category

testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names())
testdata_vect['target_cat'] = testdata.reset_index().category

# Treinamento do modelo com os dados originais

In [7]:
s = setup(data=traindata_vect, target='target_cat', numeric_features=list(traindata_vect.iloc[:,:-1].columns) ,session_id=9999, fold=10, test_data=testdata_vect, silent=True, fix_imbalance=True)
svm = create_model('svm')


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.95,0.0,0.9426,0.9531,0.9496,0.9142,0.9156
1,0.9333,0.0,0.8991,0.938,0.932,0.8818,0.8868
2,0.95,0.0,0.9156,0.954,0.9484,0.9106,0.9148
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,0.9917,0.0,0.9855,0.9918,0.9916,0.9855,0.9856
5,0.9,0.0,0.8313,0.9072,0.8919,0.8161,0.8295
6,0.9667,0.0,0.9543,0.967,0.9666,0.9417,0.9421
7,1.0,0.0,1.0,1.0,1.0,1.0,1.0
8,0.9917,0.0,0.9855,0.992,0.9916,0.9856,0.9857
9,0.9496,0.0,0.9156,0.952,0.9477,0.9112,0.9139


In [8]:

tempo_ini = time.time()
predict_model(svm)
tempo_fim = time.time()
print('######### ', testdata_vect.shape[0], 'registros.')
print("Tempo médio de inferência: {:.5f} segundos".format((tempo_fim - tempo_ini) / testdata_vect.shape[0]))

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.8532,0,0.8532,0.8646,0.853,0.7799,0.7847


#########  477 registros.
Tempo médio de inferência: 0.00030 segundos


# Inferência para 3.721 registros

In [9]:
df_novotestdata = pd.read_csv('../data/novodatasetgerado.csv', delimiter=';')
df_novotestdata['category'] = df_novotestdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

# Lemmatizing input string
novo_testdata = lemmatize(df_novotestdata)

novo_test_tf_idf = tv.transform(novo_testdata['input'])

novo_testdata_vect = pd.DataFrame(novo_test_tf_idf.toarray(), columns=tv.get_feature_names())
novo_testdata_vect['target_cat'] = novo_testdata.reset_index().category

In [10]:

tempo_ini = time.time()
predict_model(svm, data=novo_testdata_vect)
tempo_fim = time.time()
print('######### ', novo_testdata_vect.shape[0], 'registros.')
print("Tempo médio de inferência: {:.5f} segundos".format((tempo_fim - tempo_ini) / novo_testdata_vect.shape[0]))

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.6665,0,0.6575,0.6641,0.6392,0.4975,0.521


#########  3721 registros.
Tempo médio de inferência: 0.00235 segundos


# Inferência para MI registros

In [11]:
df_novotestdata = pd.read_csv('../data/novodatasetgerado-frasesduplicadas.csv', delimiter=';')
df_novotestdata['category'] = df_novotestdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})


In [18]:

# Lemmatizing input string
novo_testdata = lemmatize(df_novotestdata.head(1000))

novo_test_tf_idf = tv.transform(novo_testdata['input'])

novo_testdata_vect = pd.DataFrame(novo_test_tf_idf.toarray(), columns=tv.get_feature_names())
novo_testdata_vect['target_cat'] = novo_testdata.reset_index().category



In [15]:
tempo_ini = time.time()
predict_model(svm, data=novo_testdata_vect)
tempo_fim = time.time()
print('######### ', novo_testdata_vect.shape[0], 'registros.')
print("Tempo médio de inferência: {:.5f} segundos".format((tempo_fim - tempo_ini) / novo_testdata_vect.shape[0]))

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.753,0,0.7196,0.7718,0.7492,0.6151,0.6279


#########  1000 registros.
Tempo médio de inferência: 0.00291 segundos
