In [1]:
# Import packages
import numpy as np
import pandas as pd
import nltk
import spacy

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from pycaret.classification import *

# download das stopwords para o idioma português
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation for Binary
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':0, 'seloturismo':0, 'tuberculose':1})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':0, 'seloturismo':0, 'tuberculose':1})

# Data separation for One-Class
#traindata_covid = df_traindata[df_traindata['category']=='covid']
#traindata_tuberculose = df_traindata[df_traindata['category']=='tuberculose']
#traindata_seloturismo = df_traindata[df_traindata['category']=='seloturismo']

In [3]:
# Lemmatizing input string
nlp = spacy.load("pt_core_news_lg")
lem = nlp.get_pipe("lemmatizer")

def lemmatize(data):
    inputs = []
    for doc in data.input:
        d = nlp(doc)
        s = ' '.join([token.lemma_ for token in d])       
        inputs.append(s)
    data['input'] = inputs
    return data

traindata = lemmatize(df_traindata)
testdata = lemmatize(df_testdata)

In [4]:
# Normalizing and TF-IDF preparation
stop_words = stopwords.words('portuguese')

tv = TfidfVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(traindata['input'])
test_tf_idf = tv.transform(testdata['input'])

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names())
traindata_vect['target_cat'] = traindata.reset_index().category

testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names())
testdata_vect['target_cat'] = testdata.reset_index().category
#testdata_vect.head()
#traindata_vect.head()

In [5]:
s = setup(data=traindata_vect, target='target_cat', session_id=9999, fold=10, test_data=testdata_vect, silent=True, fix_imbalance=True)

Unnamed: 0,Description,Value
0,session_id,9999
1,Target,target_cat
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(1199, 1286)"
5,Missing Values,False
6,Numeric Features,665
7,Categorical Features,620
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
top5 = compare_models(n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.9825,0.9816,0.9393,0.9839,0.96,0.9489,0.9501,4.467
gbc,Gradient Boosting Classifier,0.9792,0.9752,0.9286,0.98,0.9523,0.939,0.9406,0.38
ada,Ada Boost Classifier,0.9758,0.9858,0.9254,0.9717,0.9461,0.9306,0.9325,0.178
lr,Logistic Regression,0.975,0.994,0.9001,0.9926,0.9403,0.9248,0.9293,0.687
xgboost,Extreme Gradient Boosting,0.975,0.983,0.9179,0.9727,0.943,0.9271,0.9288,0.811
lightgbm,Light Gradient Boosting Machine,0.9741,0.9834,0.9252,0.9639,0.9429,0.9262,0.9276,0.099
dt,Decision Tree Classifier,0.9708,0.9686,0.9286,0.9457,0.9361,0.9173,0.9181,0.08
et,Extra Trees Classifier,0.9633,0.9887,0.9466,0.9085,0.9247,0.9006,0.903,0.216
ridge,Ridge Classifier,0.9625,0.0,0.9143,0.9384,0.9192,0.8952,0.9007,0.08
rf,Random Forest Classifier,0.94,0.9901,0.936,0.8608,0.8868,0.8474,0.8576,0.172


In [7]:
# tune models
tuned = [tune_model(i) for i in top5]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9,0.9503,0.8929,0.7353,0.8065,0.7399,0.7462
1,0.9417,0.9532,0.7857,0.9565,0.8627,0.8262,0.8326
2,0.9583,0.9823,0.9643,0.871,0.9153,0.8877,0.8897
3,0.9917,1.0,1.0,0.9655,0.9825,0.977,0.9773
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
7,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# ensemble models
bagged = [ensemble_model(i) for i in tuned]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9333,0.9422,0.8214,0.8846,0.8519,0.8089,0.8098
1,0.95,0.9536,0.7857,1.0,0.88,0.849,0.8588
2,0.9583,0.9839,0.9643,0.871,0.9153,0.8877,0.8897
3,0.9917,1.0,1.0,0.9655,0.9825,0.977,0.9773
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
6,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
7,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9748,1.0,0.8929,1.0,0.9434,0.9272,0.9297


In [9]:
# blend models
blended = blend_models(estimator_list = top5) 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.925,0.9627,0.7857,0.88,0.8302,0.7823,0.7843
1,0.925,0.9631,0.7143,0.9524,0.8163,0.7704,0.783
2,0.9833,0.9899,0.9286,1.0,0.963,0.9522,0.9533
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.9833,1.0,0.9286,1.0,0.963,0.9522,0.9533
6,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
7,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9916,1.0,0.9643,1.0,0.9818,0.9764,0.9766


In [10]:
# stack models
stacked = stack_models(estimator_list = top5) 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.925,0.9666,0.7857,0.88,0.8302,0.7823,0.7843
1,0.925,0.9666,0.7143,0.9524,0.8163,0.7704,0.783
2,0.9833,0.9903,0.9286,1.0,0.963,0.9522,0.9533
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.9833,1.0,0.9286,1.0,0.963,0.9522,0.9533
6,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
7,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
# automl 
best = automl(optimize = 'Accuracy')

In [12]:
# calibrate model
model = calibrate_model(best)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9333,0.9356,0.8214,0.8846,0.8519,0.8089,0.8098
1,0.9333,0.9375,0.75,0.9545,0.84,0.7987,0.8079
2,0.9917,0.9798,0.9643,1.0,0.9818,0.9764,0.9767
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.9833,1.0,0.9286,1.0,0.963,0.9522,0.9533
6,0.9917,0.9984,0.9643,1.0,0.9818,0.9764,0.9767
7,0.9917,1.0,0.9643,1.0,0.9818,0.9764,0.9767
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
predict_model(model, probability_threshold=0.75)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.8679,0.8919,0.6038,1.0,0.7529,0.6702,0.7099


Unnamed: 0,abdomen,abracar,acesso,achar,acontecer,acrescimo,acrescimos,aderir,aderiram,aderiu,...,vao_0.0,varios_0.3994513750076294,vc_0.0,vendi_0.5771128535270691,verdura_0.5685370564460754,vistorias_0.0,vou_0.0,target_cat,Label,Score
0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9786
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9786
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9787
3,0.0,0.0,0.493011,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9786
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9792
473,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9792
474,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.458805,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9792
475,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0,0,0.9792


In [14]:
print(model)

CalibratedClassifierCV(base_estimator=<catboost.core.CatBoostClassifier object at 0x0000029B3DC42340>,
                       cv=5, method='sigmoid')
