In [1]:
# Load packages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
from nltk.stem.porter import PorterStemmer
import string
import spacy
from spacy.lang.pt import Portuguese
from nltk.corpus import stopwords

nlp = spacy.load('pt_core_news_lg')
parser = Portuguese()

### Preparação dos dados

In [17]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':-1, 'seloturismo':-1, 'tuberculose':1})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':-1, 'seloturismo':-1, 'tuberculose':1})

# create a new dataset with only covid category data
dados_classe_alvo = df_traindata[df_traindata['category'] == 1]

dados_outliers = df_traindata[df_traindata['category'] == -1].sample(frac=0.02)
dados_outliers

Unnamed: 0,input,category
654,qual a finalidade do do programa Selo Turismo ...,-1
193,pode me dizer o que é covid,-1
495,caso surjam novas orientações de higiene preci...,-1
140,Valorização do destino e das pessoas que vivem...,-1
389,quero me desligar do programa selo turismo res...,-1
492,Sairam novas o orientações de higiene divulgad...,-1
890,SOLICITAR O SELO MESMO ESTANDO COM A ATIVIDADE...,-1
810,Para o turista vindo do exterior o que represe...,-1
181,covid vem do morcego,-1
659,o que é o selo turismo responsavel,-1


In [77]:
# create train and test data
train_text = dados_classe_alvo['input'].tolist()
train_labels = dados_classe_alvo['category'].tolist()

outliers_text = dados_outliers['input'].tolist()
outliers_labels = dados_outliers['category'].tolist()

test_text = df_testdata['input'].tolist()
test_labels = df_testdata['category'].tolist()

### Limpeza e tokenização dos dados

In [19]:
# stop words list
STOPLIST = set(stopwords.words('portuguese'))
print(STOPLIST)
# special characters
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”","''"]

{'tínhamos', 'no', 'com', 'fomos', 'minha', 'numa', 'às', 'foi', 'estivéssemos', 'teus', 'ao', 'estávamos', 'nos', 'será', 'tivermos', 'estão', 'houveremos', 'estive', 'formos', 'hajamos', 'terão', 'houveríamos', 'já', 'tiver', 'terei', 'nossa', 'suas', 'seriam', 'sejam', 'houver', 'não', 'num', 'teríamos', 'aquilo', 'isso', 'que', 'sou', 'seu', 'tivera', 'as', 'houvesse', 'estava', 'meu', 'estivemos', 'por', 'o', 'quem', 'esta', 'eram', 'ser', 'sem', 'seria', 'deles', 'como', 'dela', 'nosso', 'teremos', 'estiveram', 'dele', 'estivessem', 'temos', 'tivemos', 'ou', 'serão', 'essas', 'hajam', 'for', 'nas', 'são', 'tive', 'uma', 'na', 'houveria', 'vocês', 'éramos', 'esteja', 'aquelas', 'aqueles', 'tivéramos', 'só', 'houveram', 'tivéssemos', 'tua', 'aquela', 'para', 'em', 'os', 'teu', 'tinham', 'a', 'eles', 'até', 'houvera', 'isto', 'me', 'tivesse', 'houvéssemos', 'haver', 'fosse', 'houvéramos', 'aquele', 'dos', 'esse', 'muito', 'somos', 'tém', 'tiverem', 'nossas', 'fossem', 'tenho', 'este

In [20]:
# class for cleaning the text
class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
            return {}

def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text

In [21]:
# tokenizing the raw text
def tokenizeText(sample):
    tokens = parser(sample)

    # lemmatization
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # remove stop words and special characters
    tokens = [tok for tok in tokens if tok.lower() not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # only take words with length greater than or equal to 3
    tokens = [tok for tok in tokens if len(tok) >= 3]

    # remove remaining tokens that are not alphabetic
    tokens = [tok for tok in tokens if tok.isalpha()]

    # stemming of words
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]

    return list(set(tokens))

In [22]:
# lets see tokenized random text
tokenizeText(nlp(train_text[9]))

['média',
 'pessoa',
 'infectar',
 'poder',
 'ativo',
 'tratamento',
 'tuberculos',
 'laríngea',
 'pulmonar']

### Representação dos dados em TF-IDF

In [23]:
tv = TfidfVectorizer(strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(train_text)
outliers_tf_idf = tv.transform(outliers_text)
test_tf_idf = tv.transform(test_text)

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names_out())
outliers_vect = pd.DataFrame(outliers_tf_idf.toarray(), columns=tv.get_feature_names_out())
testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names_out())

### One-class SVM
One-class SVM is an unsupervised algorithm that learns a decision function for novelty detection: classifying new data as similar or different to the training set.

In [55]:
# OneClassSVM algorithm
clf = OneClassSVM(nu=0.0002, kernel="rbf", gamma='scale')
pipe_clf = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', tv), ('clf', clf)])

In [56]:
# fit OneClassSVM model
pipe_clf.fit(train_text, train_labels)

In [57]:
# validate OneClassSVM model with train set
preds_train = pipe_clf.predict(train_text)
print("accuracy:", accuracy_score(train_labels, preds_train))

accuracy: 0.11387900355871886


In [58]:
# validate OneClassSVM model with outliers set
preds_outliers = pipe_clf.predict(outliers_text)
print("accuracy:", accuracy_score(outliers_labels, preds_outliers))

accuracy: 0.9444444444444444


In [59]:
# validate OneClassSVM model with test set
preds_test = pipe_clf.predict(test_text)
preds_test

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [60]:
# calculate prediction errors
n_error_train = preds_train[preds_train == -1].size
n_error_test = preds_test[preds_test == -1].size
n_error_outliers = preds_outliers[preds_outliers == 1].size

print('Train erros: ', n_error_train)
print('Test erros: ', n_error_test)
print('Outliers erros: ', n_error_outliers)

Train erros:  249
Test erros:  469
Outliers erros:  1


In [61]:
# let's take random text from dataset
test_text[289]

'Qual o protocolo de acompanhamento dos acometidos por tuberculose?'

In [62]:
# results
results = confusion_matrix(test_labels, preds_test)
print('Confusion Matrix :')
print(results)
print('Accuracy Score :', accuracy_score(test_labels, preds_test))
print('Report : ')
print(classification_report(test_labels, preds_test))

Confusion Matrix :
[[316   2]
 [153   6]]
Accuracy Score : 0.6750524109014675
Report : 
              precision    recall  f1-score   support

          -1       0.67      0.99      0.80       318
           1       0.75      0.04      0.07       159

    accuracy                           0.68       477
   macro avg       0.71      0.52      0.44       477
weighted avg       0.70      0.68      0.56       477



In [32]:
# check actual category
test_labels[289]

1

In [33]:
# let's predict the category of above random text
pipe_clf.predict([test_text[289]])

array([-1], dtype=int64)

In [81]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split

# Seleciona as categorias desejadas
categories = ['sci.med', 'comp.graphics', 'rec.sport.baseball']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)

# Converte os textos em vetores de características com TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)

# Cria o modelo One-class SVM
clf = OneClassSVM(kernel='rbf', gamma='auto')

# Separa os dados em conjunto de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, newsgroups.target, test_size=0.2, random_state=42)

# Treina o modelo com os dados de treinamento
clf.fit(X_train, y_train)

# Realiza as predições com os dados de teste
y_pred = clf.predict(X_test)

# Calcula a precisão do modelo
accuracy = np.mean(y_pred == y_test)
print("Accuracy: {:.2f}".format(accuracy))


Accuracy: 0.11
