In [1]:
# Import packages
import numpy as np
import pandas as pd
import nltk
import spacy

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from pycaret.classification import *

# download das stopwords para o idioma português
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation for Binary
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':0, 'seloturismo':0, 'tuberculose':1})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':0, 'seloturismo':0, 'tuberculose':1})

# Data separation for One-Class
#traindata_covid = df_traindata[df_traindata['category']=='covid']
#traindata_tuberculose = df_traindata[df_traindata['category']=='tuberculose']
#traindata_seloturismo = df_traindata[df_traindata['category']=='seloturismo']

In [3]:
# Lemmatizing input string
nlp = spacy.load("pt_core_news_lg")
lem = nlp.get_pipe("lemmatizer")

def lemmatize(data):
    inputs = []
    for doc in data.input:
        d = nlp(doc)
        s = ' '.join([token.lemma_ for token in d])       
        inputs.append(s)
    data['input'] = inputs
    return data

traindata = lemmatize(df_traindata)
testdata = lemmatize(df_testdata)

In [4]:
# Normalizing and TF-IDF preparation
stop_words = stopwords.words('portuguese')

tv = TfidfVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(traindata['input'])
test_tf_idf = tv.transform(testdata['input'])

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names())
traindata_vect['target_cat'] = traindata.reset_index().category

testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names())
testdata_vect['target_cat'] = testdata.reset_index().category
#testdata_vect.head()
#traindata_vect.head()

In [5]:
s = setup(data=traindata_vect, target='target_cat',numeric_features=list(traindata_vect.iloc[:,:-1].columns), session_id=9999, fold=10, test_data=testdata_vect, silent=True, fix_imbalance=True)

Unnamed: 0,Description,Value
0,session_id,9999
1,Target,target_cat
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(1199, 1286)"
5,Missing Values,False
6,Numeric Features,1285
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
#top5 = compare_models(n_select = 5)
model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.9817,0.984,0.9393,0.9805,0.9583,0.9466,0.9478,3.306
gbc,Gradient Boosting Classifier,0.9808,0.9765,0.9357,0.98,0.9561,0.9439,0.9453,0.388
lightgbm,Light Gradient Boosting Machine,0.98,0.984,0.9217,0.9923,0.9538,0.9412,0.9436,0.096
dt,Decision Tree Classifier,0.9792,0.9665,0.9429,0.9669,0.954,0.9406,0.9412,0.083
rf,Random Forest Classifier,0.9783,0.9857,0.911,0.9962,0.9496,0.9359,0.939,0.165
ada,Ada Boost Classifier,0.9783,0.9804,0.9357,0.9704,0.951,0.9372,0.9388,0.179
et,Extra Trees Classifier,0.9783,0.9872,0.9321,0.9763,0.9513,0.9374,0.9397,0.207
xgboost,Extreme Gradient Boosting,0.9775,0.9854,0.9251,0.9788,0.949,0.9347,0.9369,0.932
lr,Logistic Regression,0.9758,0.994,0.9002,0.9963,0.9422,0.9272,0.9317,0.486
svm,SVM - Linear Kernel,0.9758,0.0,0.925,0.9704,0.9439,0.9287,0.9316,0.078


In [7]:
# tune models
#tuned = [tune_model(i) for i in top5]

In [8]:
# ensemble models
#bagged = [ensemble_model(i) for i in top5]

In [9]:
# blend models
#blended = blend_models(estimator_list = tuned) 

In [10]:
# stack models
#stacked = stack_models(estimator_list = tuned) 

In [11]:
# automl 
#best = automl(optimize = 'F1')

In [12]:
# calibrate model
#model = calibrate_model(best)

In [13]:
predict_model(model, probability_threshold=0.75)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.8637,0.9275,0.5912,1.0,0.7431,0.6585,0.7006


Unnamed: 0,abdomen,abracar,abraco,acessibilidade,acesso,achar,acionar,acometir,acompanhar,acontecer,...,vistoria,vistorias,viver,vivo,voltar,vou,vulneravel,target_cat,Label,Score
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9942
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9942
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9939
3,0.0,0.0,0.0,0.0,0.493011,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9939
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9962
473,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9959
474,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9961
475,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.9957


In [14]:
print(model)

<catboost.core.CatBoostClassifier object at 0x000001FEFA0F9100>
