In [119]:
# Load packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
from nltk.stem.porter import PorterStemmer
import string
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords

# Portuguese pipeline loaded from the installed 'pt_core_news_lg' package;
# a later cell uses it to pre-process one sample text before tokenizeText.
nlp = spacy.load('pt_core_news_lg')
# Portuguese language object built directly from the language class (no
# trained package is loaded here); tokenizeText calls this one on its input.
parser = Portuguese()

### Preparação dos dados

In [120]:
# Read both splits; the CSV files use ';' as the field separator
df_traindata = pd.read_csv('../data/traindata.csv', sep=';')
df_testdata = pd.read_csv('../data/testdata.csv', sep=';')

# Encode the category names numerically: 'covid' is the target class (+1),
# the other two categories are treated as outliers (-1)
CATEGORY_TO_LABEL = {'covid': 1, 'seloturismo': -1, 'tuberculose': -1}
df_traindata['category'] = df_traindata.reset_index()['category'].map(CATEGORY_TO_LABEL)
df_testdata['category'] = df_testdata.reset_index()['category'].map(CATEGORY_TO_LABEL)

# Keep only the covid rows of the training split: the one-class model is
# trained on the target class alone
covid_df = df_traindata.loc[df_traindata['category'] == 1]

In [121]:
# (rows, columns) of the covid-only training subset
covid_df.shape

(231, 2)

In [122]:
# Training material: texts and labels of the covid-only subset
train_text, train_labels = (covid_df[col].tolist() for col in ('input', 'category'))

# Evaluation material: texts and labels of the complete test split
test_text, test_labels = (df_testdata[col].tolist() for col in ('input', 'category'))

### Limpeza e tokenização dos dados

In [123]:
# stop words list
# Portuguese stop words from the NLTK corpus, kept as a set because
# tokenizeText only ever tests membership against it.
STOPLIST = set(stopwords.words('portuguese'))
# NOTE(review): this prints the entire set, which produces a very long output.
print(STOPLIST)
# special characters: every ASCII punctuation mark, plus a few multi-character
# and typographic symbols that string.punctuation does not contain.
# The opening curly quote was missing before (the closing one was listed twice).
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”", "''"]

{'houvera', 'houverem', 'tua', 'elas', 'terá', 'os', 'dele', 'sou', 'minhas', 'nossos', 'fomos', 'houveríamos', 'na', 'somos', 'houve', 'ser', 'também', 'tivesse', 'estiverem', 'minha', 'houveram', 'tivera', 'tenham', 'mais', 'pela', 'essas', 'estivéssemos', 'houvesse', 'vos', 'da', 'tuas', 'isto', 'pelos', 'que', 'estas', 'há', 'hei', 'estavam', 'te', 'este', 'numa', 'não', 'esses', 'houver', 'estivesse', 'nosso', 'ele', 'aqueles', 'me', 'houvéssemos', 'são', 'estivermos', 'tiveram', 'hão', 'houvermos', 'fôramos', 'nas', 'vocês', 'isso', 'teve', 'temos', 'eu', 'qual', 'serei', 'estejamos', 'tivessem', 'esta', 'teria', 'delas', 'estou', 'haver', 'meu', 'de', 'fosse', 'tivemos', 'a', 'estamos', 'o', 'estes', 'houveremos', 'será', 'deles', 'eles', 'pelo', 'por', 'sejam', 'tinha', 'meus', 'estivera', 'nossa', 'estejam', 'um', 'às', 'formos', 'serão', 'só', 'havemos', 'depois', 'seremos', 'tínhamos', 'estão', 'fossem', 'terei', 'tenho', 'hajamos', 'dela', 'era', 'estivessem', 'estar', 'ten

In [124]:
# class for cleaning the text
class CleanTextTransformer(TransformerMixin):
    """Pipeline step that normalises every raw text with ``cleanText``.

    The step keeps no state: fitting learns nothing and the parameter
    mapping it reports is empty.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; hand back the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        return list(map(cleanText, X))

    def get_params(self, deep=True):
        """Report an empty parameter mapping."""
        return {}

def cleanText(text):
    """Normalise one text: trim it, turn line breaks into spaces, lower-case it.

    Leading and trailing whitespace is removed first; every newline or
    carriage-return character that remains inside the text is then replaced
    by a single space, and the result is lower-cased.
    """
    line_breaks_to_space = str.maketrans({"\n": " ", "\r": " "})
    return text.strip().translate(line_breaks_to_space).lower()

In [125]:
# tokenizing the raw text
# A single stemmer instance is shared by all calls; the function runs once per
# document, so rebuilding the stemmer every time was wasted work.
# NOTE(review): Porter is an English stemming algorithm while the corpus is
# Portuguese - confirm this is intended before relying on the stems.
_STEMMER = PorterStemmer()


def tokenizeText(sample):
    """Turn one text into a list of unique, stemmed tokens.

    Steps: run ``sample`` through the module-level ``parser``, take each
    token's lemma, drop stop words (``STOPLIST``), symbols (``SYMBOLS``),
    tokens shorter than three characters and non-alphabetic tokens, stem what
    is left, and remove duplicates.

    Parameters
    ----------
    sample
        The text to tokenize. In this notebook it is called both with raw
        strings (as the vectorizer's ``tokenizer``) and with a document that
        was already processed by ``nlp``.

    Returns
    -------
    list of str
        Distinct stems in order of first appearance, so the result is the
        same on every run.
    """
    doc = parser(sample)

    # lemmatization
    lemmas = []
    for tok in doc:
        lemma = tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        # `parser` is created without a trained package, so a token that reaches
        # it as raw text may carry no lemma at all; fall back to the lower-cased
        # surface form instead of silently losing the token in the length filter.
        lemmas.append(lemma or tok.lower_)

    # remove stop words and special characters, keep only alphabetic tokens
    # with a length greater than or equal to 3
    kept = [
        tok for tok in lemmas
        if tok.lower() not in STOPLIST
        and tok not in SYMBOLS
        and len(tok) >= 3
        and tok.isalpha()
    ]

    # stemming of words
    stems = [_STEMMER.stem(word) for word in kept]

    # de-duplicate while preserving order (a plain set() yields a different
    # ordering from one interpreter run to the next)
    return list(dict.fromkeys(stems))

In [126]:
# lets see tokenized random text
# The sample is first run through the `nlp` pipeline and the resulting
# document (not the raw string) is handed to tokenizeText.
tokenizeText(nlp(train_text[9]))

['estarei', 'completament', 'vacinar', 'imun', 'coronavíru']

### Extração de características

In [127]:
# getting features
# Hash the tokens produced by tokenizeText into a fixed 20-column space.
# NOTE(review): 20 buckets is a very small hash space, so distinct tokens are
# likely to collide; `vectorizer` and `features` are not referenced by any
# later cell visible in this notebook.
vectorizer = HashingVectorizer(n_features=20, tokenizer=tokenizeText)

# Dense array with one row per training text; the shape is displayed below
features = vectorizer.fit_transform(train_text).toarray()
features.shape



(231, 20)

In [128]:
# TF-IDF over purely alphabetic tokens, with accent stripping enabled.
# The vocabulary is learned from the training texts only and then applied
# unchanged to the test texts.
tv = TfidfVectorizer(strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(train_text)
test_tf_idf = tv.transform(test_text)

# Dense, column-labelled copies of both matrices with the label attached
tfidf_columns = tv.get_feature_names_out()

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tfidf_columns).assign(
    target_cat=covid_df.reset_index().category
)

testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tfidf_columns).assign(
    target_cat=df_testdata.reset_index().category
)

### One-class SVM
One-class SVM is an unsupervised algorithm that learns a decision function for novelty detection: it classifies new data as either similar to or different from the training set.

In [129]:
# OneClassSVM algorithm
# RBF-kernel one-class SVM; see the scikit-learn documentation for the exact
# meaning of `nu` and of gamma='scale'.
clf = OneClassSVM(nu=0.1, kernel="rbf", gamma='scale')
# Three steps: text cleaning -> TF-IDF -> one-class SVM.
# NOTE(review): the `tv` instance that was already fitted in the previous cell
# is reused here, so fitting the pipeline presumably refits that same object;
# the hashing `vectorizer` built earlier is not part of the pipeline.
pipe_clf = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', tv), ('clf', clf)])

In [130]:
# fit OneClassSVM model
# Training uses the covid-only texts; every entry of train_labels is 1 because
# covid_df was filtered on category == 1.
pipe_clf.fit(train_text, train_labels)

In [131]:
# validate OneClassSVM model with train set
# All true labels are 1 here, so this accuracy is simply the share of training
# texts that the model itself accepts as inliers.
preds_train = pipe_clf.predict(train_text)
print("accuracy:", accuracy_score(train_labels, preds_train))

accuracy: 0.6363636363636364


In [132]:
# validate OneClassSVM model with test set
# Predictions use the same encoding as the labels: 1 = covid, -1 = anything else.
preds_test = pipe_clf.predict(test_text)
# NOTE(review): displays the complete prediction array
preds_test

array([ 1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1,  1,  1,  1,  1,  1, -1,
       -1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,  1,  1, -1,  1, -1,  1,
       -1, -1,  1,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1,
        1, -1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1, -1, -1,  1,  1,  1, -1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1,
        1, -1,  1, -1, -1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,
        1,  1, -1, -1,  1, -1,  1, -1, -1, -1,  1,  1, -1, -1, -1,  1,  1,
        1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1,
       -1,  1, -1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1,  1, -1, -1,  1,  1,  1, -1,  1, -1, -1,
       -1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1, -1,  1, -1, -1, -1, -1,
       -1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1,  1,  1,  1,  1, -1, -1,
       -1, -1, -1, -1, -1

In [133]:
# Evaluate the test-set predictions against the true labels.
# Per scikit-learn's convention the confusion matrix has true labels on the
# rows and predictions on the columns, in sorted label order (-1, then 1).
results = confusion_matrix(test_labels, preds_test)
print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_score(test_labels, preds_test))
print('Report : ')
# Per-class precision / recall / f1 and support
print(classification_report(test_labels, preds_test))

Confusion Matrix :
[[225  93]
 [ 61  98]]
Accuracy Score : 0.6771488469601677
Report : 
              precision    recall  f1-score   support

          -1       0.79      0.71      0.75       318
           1       0.51      0.62      0.56       159

    accuracy                           0.68       477
   macro avg       0.65      0.66      0.65       477
weighted avg       0.70      0.68      0.68       477



In [134]:
# let's take random text from dataset
# (fixed position 289 of the test split; the next two cells use the same index)
test_text[289]

'Qual o protocolo de acompanhamento dos acometidos por tuberculose?'

In [135]:
# check actual category
# (encoded label of the same test item: 1 = covid, -1 = other category)
test_labels[289]

-1

In [136]:
# let's predict the category of above random text
# The text is wrapped in a list so that a single item is passed to predict
pipe_clf.predict([test_text[289]])

array([-1], dtype=int64)