In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [11]:
nlp = spacy.load('pt_core_news_lg')
parser = Portuguese()

# Preparação dos dados

In [12]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation for Binary
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

In [24]:
# monta dataset
df_total = pd.concat([df_traindata, df_testdata])

textos = df_total['input'].tolist()
rotulos = df_total['category'].tolist()

# Transforma os textos em vetores de recursos usando TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(textos)

# Divide o dataset em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, rotulos, test_size=0.2)

In [25]:
# Cria o classificador LSH
lsh = NearestNeighbors(n_neighbors=1, n_jobs=-1, algorithm='auto')
lsh.fit(X_train)

# Realize a inferência usando o classificador
_, indices = lsh.kneighbors(X_test)
y_pred = [y_train[i[0]] for i in indices]

# Avalie a acurácia do modelo
acc = accuracy_score(y_test, y_pred)
print("Acurácia:", acc)


Acurácia: 0.8869047619047619


In [15]:


# Normalizing and TF-IDF preparation
stop_words = stopwords.words('portuguese')

X_train = tv.fit_transform(df_traindata['input'])
y_train = df_traindata['category']

X_test = tv.transform(df_testdata['input'])
y_test = df_testdata['category']

tv = TfidfVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(df_traindata['input'])
test_tf_idf = tv.transform(df_testdata['input'])

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names())
traindata_vect['target_cat'] = df_traindata.reset_index().category

testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names())
testdata_vect['target_cat'] = df_testdata.reset_index().category

Unnamed: 0,abdomen,abracar,abrace,abraco,acessibilidade,acesso,acho,acionar,acometer,acometidos,...,vistoriar,vistorias,vivem,vivendo,vivo,voltar,vou,vulneraveis,vulneravel,target_cat
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1195,0.0,0.0,0.0,0.0,0.0,0.0,0.499533,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1196,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1197,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
