In [1]:
# Load packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
from nltk.stem.porter import PorterStemmer
import string
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords

# Portuguese pipeline loaded from the installed 'pt_core_news_lg' package;
# used further down to parse a sample text before it is handed to tokenizeText.
nlp = spacy.load('pt_core_news_lg')
# Portuguese language object built with default arguments; tokenizeText uses it.
# NOTE(review): presumably this is a bare tokenizer without a lemmatizer — confirm.
parser = Portuguese()

### Preparação dos dados

In [2]:
# Import data (semicolon-separated files; paths are relative to the notebook's working directory)
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation
# One-class encoding of the 'category' column: the target class ('seloturismo')
# becomes +1 and every other known category becomes -1. Categories missing from
# this mapping are turned into NaN by Series.map.
CATEGORY_TO_LABEL = {'covid': -1, 'seloturismo': 1, 'tuberculose': -1}
df_traindata['category'] = df_traindata['category'].map(CATEGORY_TO_LABEL)
df_testdata['category'] = df_testdata['category'].map(CATEGORY_TO_LABEL)

# create a new dataset with only the target-class data ('seloturismo', label 1)
dados_classe_alvo = df_traindata[df_traindata['category'] == 1]

In [3]:
# Build plain Python lists of texts and labels.
# Training data comes only from the target-class subset; test data is the full test frame.
train_text, train_labels = (
    dados_classe_alvo[column].tolist() for column in ('input', 'category')
)
test_text, test_labels = (
    df_testdata[column].tolist() for column in ('input', 'category')
)

### Limpeza e tokenização dos dados

In [4]:
# stop words list (NLTK's Portuguese stop words, as a set for fast membership tests)
STOPLIST = set(stopwords.words('portuguese'))
print(STOPLIST)
# special characters: every ASCII punctuation character plus a few extra tokens.
# Both the opening (“) and the closing (”) typographic quote are listed; the
# previous list repeated the closing quote and never matched the opening one.
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”", "''"]

{'tivéssemos', 'nossa', 'tiveram', 'estamos', 'sejamos', 'do', 'estas', 'estivemos', 'nosso', 'tivera', 'tuas', 'houveria', 'pelos', 'fosse', 'os', 'estão', 'tivessem', 'estejam', 'sou', 'era', 'num', 'será', 'houveriam', 'pelas', 'deles', 'e', 'já', 'esteve', 'seja', 'estivera', 'vos', 'tivermos', 'como', 'éramos', 'houverão', 'te', 'tiverem', 'ou', 'na', 'tivéramos', 'qual', 'teve', 'entre', 'hão', 'aquela', 'vocês', 'estivéssemos', 'que', 'ele', 'teriam', 'teríamos', 'estiveram', 'somos', 'serão', 'teremos', 'suas', 'fora', 'mas', 'forem', 'não', 'seria', 'estivesse', 'tínhamos', 'nos', 'houverei', 'tem', 'haver', 'essas', 'depois', 'pela', 'dos', 'tenhamos', 'estou', 'estes', 'sejam', 'houvessem', 'de', 'for', 'houvéssemos', 'tenham', 'houvesse', 'esse', 'hei', 'me', 'nas', 'tém', 'ela', 'isto', 'por', 'se', 'houvemos', 'tivemos', 'numa', 'nós', 'seríamos', 'hajamos', 'tinham', 'estivessem', 'à', 'muito', 'o', 'tinha', 'há', 'eu', 'eram', 'no', 'terá', 'mesmo', 'nossos', 'seriam', 

In [5]:
# class for cleaning the text
class CleanTextTransformer(TransformerMixin, BaseEstimator):
    """Stateless scikit-learn transformer that applies ``cleanText`` to every document.

    Inheriting from ``BaseEstimator`` supplies ``get_params`` / ``set_params``,
    replacing the hand-written ``get_params`` that returned an empty dict.
    The class has no constructor parameters, so ``get_params()`` is still empty.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self``.

        Parameters
        ----------
        X : iterable of str
            Raw documents (unused).
        y : ignored
        **fit_params : ignored

        Returns
        -------
        CleanTextTransformer
            This instance, so calls can be chained.
        """
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.
        **transform_params : ignored

        Returns
        -------
        list of str
            One cleaned string per input document, in input order.
        """
        return [cleanText(text) for text in X]

def cleanText(text):
    """Normalise one document: trim it, flatten line breaks and lower-case it.

    Leading and trailing whitespace is stripped first; every remaining newline
    and carriage return is then replaced by a single space (so an inner
    ``"\\r\\n"`` becomes two spaces), and finally the text is lower-cased.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    str
        The cleaned document.
    """
    line_breaks_to_space = str.maketrans({"\n": " ", "\r": " "})
    return text.strip().translate(line_breaks_to_space).lower()

In [6]:
# tokenizing the raw text
def tokenizeText(sample):
    """Reduce a text to its unique, normalised tokens.

    Steps: lemmatise, drop stop words and symbols, keep only alphabetic tokens
    of at least three characters, stem, then de-duplicate.

    Parameters
    ----------
    sample : str or parsed document
        A raw string is tokenised with the module-level ``parser``. Anything
        else (e.g. the result of ``nlp(text)``) is assumed to be an
        already-parsed document and is iterated as-is, keeping its lemmas.

    Returns
    -------
    list of str
        Unique stemmed tokens in order of first appearance. The order is
        deterministic; the previous ``list(set(...))`` returned the same
        tokens in a hash-dependent order.
    """
    # Only raw strings need parsing; re-parsing an existing document is
    # redundant and not accepted by every spaCy version.
    doc = parser(sample) if isinstance(sample, str) else sample

    # lemmatization
    lemmas = []
    for tok in doc:
        lemma = tok.lemma_
        if not lemma or lemma == "-PRON-":
            # No usable lemma (empty, or the "-PRON-" placeholder): fall back to
            # the lower-cased surface form instead of losing the token.
            lemmas.append(tok.lower_)
        else:
            lemmas.append(lemma.lower().strip())

    # NOTE(review): PorterStemmer is designed for English; a Portuguese stemmer
    # may suit this corpus better — confirm before changing, results would differ.
    porter = PorterStemmer()

    # remove stop words and special characters, keep only alphabetic words with
    # length greater than or equal to 3, then stem what is left
    stems = [
        porter.stem(tok)
        for tok in lemmas
        if tok.lower() not in STOPLIST
        and tok not in SYMBOLS
        and len(tok) >= 3
        and tok.isalpha()
    ]

    # dict.fromkeys de-duplicates while preserving first-seen order
    return list(dict.fromkeys(stems))

In [7]:
# Show the tokenizer output for one fixed training example (index 9).
# The text is parsed with the full `nlp` pipeline first, then passed to tokenizeText.
tokenizeText(nlp(train_text[9]))

['ministério', 'lançar', 'selo', 'turismo']

### Representação dos dados em TF-IDF

In [8]:
# Fit the TF-IDF vocabulary on the target-class training texts only, then
# project the test texts onto that same vocabulary.
tv = TfidfVectorizer(token_pattern="[A-Za-z]+", strip_accents='unicode')
train_tf_idf = tv.fit_transform(train_text)
test_tf_idf = tv.transform(test_text)

# Dense, column-labelled views of both matrices (one column per vocabulary term).
traindata_vect, testdata_vect = (
    pd.DataFrame(matrix.toarray(), columns=tv.get_feature_names_out())
    for matrix in (train_tf_idf, test_tf_idf)
)

### One-class SVM
One-class SVM is an unsupervised algorithm that learns a decision function for novelty detection: classifying new data as similar or different to the training set.

In [9]:
# One-class SVM estimator and the end-to-end pipeline:
# text cleaning -> TF-IDF vectorisation -> OneClassSVM.
clf = OneClassSVM(kernel="rbf", gamma='scale', nu=0.1)
pipe_clf = Pipeline(steps=[
    ('cleanText', CleanTextTransformer()),
    ('vectorizer', tv),
    ('clf', clf),
])

In [10]:
# fit OneClassSVM model on the target-class training texts
# NOTE(review): the model is described above as unsupervised, so the labels
# passed here are presumably ignored during fitting — confirm.
pipe_clf.fit(train_text, train_labels)

In [12]:
# Check the fitted model against its own training texts.
# Every training label is 1 (target class only), so this accuracy is the
# share of training texts the model predicts as 1.
preds_train = pipe_clf.predict(train_text)
print("accuracy:", accuracy_score(y_true=train_labels, y_pred=preds_train))

accuracy: 0.7132459970887919


In [13]:
# validate OneClassSVM model with test set
# One prediction per test text; the displayed values are 1 or -1.
preds_test = pipe_clf.predict(test_text)
preds_test

array([ 1,  1, -1,  1, -1, -1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1, -1,
        1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1,
        1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1,  1,  1,  1,  1, -1,  1,
        1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1, -1, -1,  1,  1,
        1,  1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1, -1,
        1,  1, -1,  1, -1,  1,  1, -1,  1,  1,  1,  1, -1, -1, -1,  1,  1,
        1, -1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,
        1,  1, -1, -1,  1, -1,  1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,
        1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1, -1,
       -1,  1, -1, -1, -1, -1,  1,  1,  1,  1, -1,  1,  1, -1, -1, -1, -1,
       -1,  1, -1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
       -1, -1,  1,  1,  1,  1, -1,  1, -1, -1,  1, -1,  1, -1, -1, -1, -1,
       -1, -1, -1,  1,  1, -1, -1,  1,  1, -1, -1,  1, -1,  1, -1, -1, -1,
       -1,  1, -1, -1,  1

In [14]:
# Test-set evaluation: confusion matrix, overall accuracy and per-class report,
# all comparing the true test labels with the pipeline's predictions.
results = confusion_matrix(y_true=test_labels, y_pred=preds_test)
print('Confusion Matrix :', results, sep='\n')
print('Accuracy Score :', accuracy_score(y_true=test_labels, y_pred=preds_test))
print('Report : ', classification_report(y_true=test_labels, y_pred=preds_test), sep='\n')

Confusion Matrix :
[[163 155]
 [ 70  89]]
Accuracy Score : 0.5283018867924528
Report : 
              precision    recall  f1-score   support

          -1       0.70      0.51      0.59       318
           1       0.36      0.56      0.44       159

    accuracy                           0.53       477
   macro avg       0.53      0.54      0.52       477
weighted avg       0.59      0.53      0.54       477



In [15]:
# Inspect one fixed example from the test texts (index 289).
test_text[289]

'Qual o protocolo de acompanhamento dos acometidos por tuberculose?'

In [16]:
# Actual label of that same test example (index 289).
test_labels[289]

-1

In [17]:
# Predict the label of that same test example (index 289).
# The text is wrapped in a list because predict receives a collection of texts.
pipe_clf.predict([test_text[289]])

array([1], dtype=int64)