In [80]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords
import time

import warnings
warnings.filterwarnings('ignore')

In [71]:
nlp = spacy.load('pt_core_news_lg')
parser = Portuguese()

# Preparação dos dados

In [81]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation for Binary
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

In [82]:
# monta dataset
df_total = pd.concat([df_traindata, df_testdata])

textos = df_total['input'].tolist()
rotulos = df_total['category'].tolist()

# Normalizing and TF-IDF preparation
stop_words = stopwords.words('portuguese')

# Transforma os textos em vetores de recursos usando TF-IDF
vectorizer = TfidfVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
X = vectorizer.fit_transform(textos)

# Divide o dataset em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, rotulos, test_size=0.2)

# Treinamento do modelo com os dados originais

In [85]:
# Cria o classificador LSH
start_time_train = time.time()
lsh = NearestNeighbors(n_neighbors=1, n_jobs=-1, algorithm='auto')
lsh.fit(X_train)
end_time_train = time.time()
print("Tempo de treino: {:.5f} segundos".format(end_time_train - start_time_train))

# Realize a inferência usando o classificador
tempo_ini = time.time()
_, indices = lsh.kneighbors(X_test)
y_pred = [y_train[i[0]] for i in indices]
tempo_fim = time.time()

print("Tempo médio de inferência: {:.5f} segundos".format((tempo_fim - tempo_ini) / X_test.shape[0]))
print(classification_report(y_test, y_pred, target_names=['0', '1', '2']))


Tempo de treino: 0.00000 segundos
Tempo médio de inferência: 0.00007 segundos
              precision    recall  f1-score   support

           0       0.74      0.79      0.77        72
           1       0.95      0.94      0.95       178
           2       0.80      0.77      0.78        86

    accuracy                           0.87       336
   macro avg       0.83      0.83      0.83       336
weighted avg       0.87      0.87      0.87       336



# Inferência para 3.721 registros

In [87]:
df_novotestdata = pd.read_csv('../data/novodatasetgerado.csv', delimiter=';')
df_novotestdata['category'] = df_novotestdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

rotulos = df_novotestdata['category'].tolist()

novos_dados_vetorizados = vectorizer.transform(df_novotestdata['input'].tolist())

# Realize a inferência usando o classificador
tempo_ini = time.time()
_, indices = lsh.kneighbors(novos_dados_vetorizados)
y_pred = [y_train[i[0]] for i in indices]
tempo_fim = time.time()

print('######### ', novos_dados_vetorizados.shape[0], 'registros.')
print("Tempo médio de inferência: {:.5f} segundos".format((tempo_fim - tempo_ini) / novos_dados_vetorizados.shape[0]))
print(classification_report(rotulos, y_pred, target_names=['0', '1', '2']))

#########  3721 registros.
Tempo médio de inferência: 0.00002 segundos
              precision    recall  f1-score   support

           0       0.43      0.34      0.38      1158
           1       0.73      0.78      0.75      1236
           2       0.60      0.66      0.63      1327

    accuracy                           0.60      3721
   macro avg       0.58      0.60      0.59      3721
weighted avg       0.59      0.60      0.59      3721



# Inferência para MI registros

In [90]:
headers = ['input', 'category']
df_novotestdata = pd.read_csv('../data/novodatasetgerado-frasesduplicadas.csv', delimiter=';', error_bad_lines=False, names=headers, header=0)
df_novotestdata['category'] = df_novotestdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})


Skipping line 6: expected 2 fields, saw 3
Skipping line 433: expected 2 fields, saw 3
Skipping line 626: expected 2 fields, saw 3
Skipping line 1053: expected 2 fields, saw 3
Skipping line 1246: expected 2 fields, saw 3
Skipping line 1673: expected 2 fields, saw 3
Skipping line 1866: expected 2 fields, saw 3
Skipping line 2293: expected 2 fields, saw 3
Skipping line 2486: expected 2 fields, saw 3
Skipping line 2913: expected 2 fields, saw 3
Skipping line 3106: expected 2 fields, saw 3
Skipping line 3533: expected 2 fields, saw 3
Skipping line 3726: expected 2 fields, saw 3
Skipping line 4153: expected 2 fields, saw 3
Skipping line 4346: expected 2 fields, saw 3
Skipping line 4773: expected 2 fields, saw 3
Skipping line 4966: expected 2 fields, saw 3
Skipping line 5393: expected 2 fields, saw 3
Skipping line 5586: expected 2 fields, saw 3
Skipping line 6013: expected 2 fields, saw 3
Skipping line 6206: expected 2 fields, saw 3
Skipping line 6633: expected 2 fields, saw 3
Skipping line 6

In [98]:
subset = df_novotestdata.head(1000000)

rotulos = subset['category'].tolist()

novos_dados_vetorizados = vectorizer.transform(subset['input'].tolist())

# Realize a inferência usando o classificador
tempo_ini = time.time()
_, indices = lsh.kneighbors(novos_dados_vetorizados)
y_pred = [y_train[i[0]] for i in indices]
tempo_fim = time.time()

print('######### ', novos_dados_vetorizados.shape[0], 'registros.')
print("Tempo médio de inferência: {:.5f} segundos".format((tempo_fim - tempo_ini) / novos_dados_vetorizados.shape[0]))
print(classification_report(rotulos, y_pred, target_names=['0', '1', '2']))

#########  1000000 registros.
Tempo médio de inferência: 0.00002 segundos
              precision    recall  f1-score   support

           0       0.43      0.35      0.38    310653
           1       0.73      0.78      0.76    333308
           2       0.60      0.66      0.63    356039

    accuracy                           0.60   1000000
   macro avg       0.59      0.60      0.59   1000000
weighted avg       0.59      0.60      0.59   1000000

