In [1]:
# Load packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
from nltk.stem.porter import PorterStemmer
import string
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords

nlp = spacy.load('pt_core_news_lg')
parser = Portuguese()

### Preparação dos dados

In [2]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':-1, 'seloturismo':-1, 'tuberculose':1})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':-1, 'seloturismo':-1, 'tuberculose':1})

# create a new dataset with only covid category data
dados_classe_alvo = df_traindata[df_traindata['category'] == 1]

dados_outliers = df_traindata[df_traindata['category'] == -1].sample(frac=0.05)
dados_outliers

Unnamed: 0,input,category
429,quais são os protocolos e medidas de proteção ...,-1
772,as empresas serão fiscalizadas,-1
371,atendimento ao cidadao,-1
43,Existe risco se uma pessoa tomar vacinas difer...,-1
508,A adoção dos protocolos do selo levou em consi...,-1
258,posso contrair covid na troca de curativos,-1
259,protejo sempre a boca com máscara posso pegar ...,-1
597,quanto custa tres selos,-1
292,minhas encomendas de área infectadas via corre...,-1
655,o que o governo brasileiro espera com o selo,-1


In [3]:
# create train and test data
train_text = dados_classe_alvo['input'].tolist()
train_labels = dados_classe_alvo['category'].tolist()

outliers_text = dados_outliers['input'].tolist()
outliers_labels = dados_outliers['category'].tolist()

test_text = df_testdata['input'].tolist()
test_labels = df_testdata['category'].tolist()

### Limpeza e tokenização dos dados

In [4]:
# stop words list
STOPLIST = set(stopwords.words('portuguese'))
print(STOPLIST)
# special characters
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

{'tiver', 'até', 'dos', 'estivéramos', 'estejamos', 'houvéramos', 'tinham', 'elas', 'essas', 'eu', 'esteve', 'hei', 'houverá', 'havemos', 'pelo', 'tivéssemos', 'seus', 'fomos', 'estão', 'eles', 'seu', 'tu', 'no', 'num', 'mas', 'tivera', 'teríamos', 'foram', 'foi', 'às', 'como', 'hajam', 'tenhamos', 'estive', 'éramos', 'o', 'sou', 'há', 'suas', 'tivesse', 'seria', 'ou', 'pela', 'fosse', 'fora', 'à', 'estiveram', 'ser', 'você', 'hão', 'lhes', 'nossos', 'estar', 'estes', 'ela', 'por', 'tuas', 'aqueles', 'aquele', 'teria', 'haja', 'seja', 'para', 'tínhamos', 'pelas', 'que', 'estamos', 'são', 'já', 'minhas', 'não', 'tenho', 'quem', 'com', 'estiverem', 'nem', 'estivessem', 'fôramos', 'qual', 'sejam', 'só', 'está', 'esta', 'estejam', 'entre', 'houverei', 'sejamos', 'tivemos', 'a', 'estiver', 'do', 'esse', 'hajamos', 'meus', 'houvera', 'houvéssemos', 'seríamos', 'teremos', 'tenham', 'isto', 'depois', 'de', 'houveríamos', 'nossa', 'forem', 'se', 'esteja', 'te', 'isso', 'ele', 'os', 'delas', 'ti

In [5]:
# class for cleaning the text
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
            return {}

def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text

In [6]:
# tokenizing the raw text
def tokenizeText(sample):
    tokens = parser(sample)

    # lemmatization
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # remove stop words and special characters
    tokens = [tok for tok in tokens if tok.lower() not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # only take words with length greater than or equal to 3
    tokens = [tok for tok in tokens if len(tok) >= 3]

    # remove remaining tokens that are not alphabetic
    tokens = [tok for tok in tokens if tok.isalpha()]

    # stemming of words
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]

    return list(set(tokens))

In [7]:
# lets see tokenized random text
tokenizeText(nlp(train_text[9]))

['laríngea',
 'pulmonar',
 'tratamento',
 'média',
 'pessoa',
 'tuberculos',
 'poder',
 'ativo',
 'infectar']

### Representação dos dados em TF-IDF

In [8]:
tv = TfidfVectorizer(strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(train_text)
outliers_tf_idf = tv.transform(outliers_text)
test_tf_idf = tv.transform(test_text)

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names_out())
outliers_vect = pd.DataFrame(outliers_tf_idf.toarray(), columns=tv.get_feature_names_out())
testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names_out())

### One-class SVM
One-class SVM is an unsupervised algorithm that learns a decision function for novelty detection: classifying new data as similar or different to the training set.

In [44]:
# OneClassSVM algorithm
clf = OneClassSVM(nu=0.0002, kernel="rbf", gamma='scale')
pipe_clf = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', tv), ('clf', clf)])

In [45]:
# fit OneClassSVM model
pipe_clf.fit(train_text, train_labels)

In [46]:
# validate OneClassSVM model with train set
preds_train = pipe_clf.predict(train_text)
print("accuracy:", accuracy_score(train_labels, preds_train))

accuracy: 0.11387900355871886


In [47]:
# validate OneClassSVM model with outliers set
preds_outliers = pipe_clf.predict(outliers_text)
print("accuracy:", accuracy_score(outliers_labels, preds_outliers))

accuracy: 0.9130434782608695


In [48]:
# validate OneClassSVM model with test set
preds_test = pipe_clf.predict(test_text)
preds_test

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [49]:
# calculate prediction errors
n_error_train = preds_train[preds_train == -1].size
n_error_test = preds_test[preds_test == -1].size
n_error_outliers = preds_outliers[preds_outliers == 1].size

print('Train erros: ', n_error_train)
print('Test erros: ', n_error_test)
print('Outliers erros: ', n_error_outliers)

Train erros:  249
Test erros:  469
Outliers erros:  4


In [50]:
# let's take random text from dataset
test_text[289]

'Qual o protocolo de acompanhamento dos acometidos por tuberculose?'

In [51]:
# results
results = confusion_matrix(test_labels, preds_test)
print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_score(test_labels, preds_test))
print('Report : ')
print(classification_report(test_labels, preds_test))

Confusion Matrix :
[[316   2]
 [153   6]]
Accuracy Score : 0.6750524109014675
Report : 
              precision    recall  f1-score   support

          -1       0.67      0.99      0.80       318
           1       0.75      0.04      0.07       159

    accuracy                           0.68       477
   macro avg       0.71      0.52      0.44       477
weighted avg       0.70      0.68      0.56       477



In [None]:
# check actual category
test_labels[289]

In [None]:
# let's predict the category of above random text
pipe_clf.predict([test_text[289]])