In [15]:
# Import packages
import numpy as np
import pandas as pd
import nltk
import spacy

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from pycaret.classification import *

# download das stopwords para o idioma português
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation for Binary
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

# Data separation for One-Class
#traindata_covid = df_traindata[df_traindata['category']=='covid']
#traindata_tuberculose = df_traindata[df_traindata['category']=='tuberculose']
#traindata_seloturismo = df_traindata[df_traindata['category']=='seloturismo']

In [17]:
# Lemmatizing input string
nlp = spacy.load("pt_core_news_lg")
lem = nlp.get_pipe("lemmatizer")

def lemmatize(data):
    inputs = []
    for doc in data.input:
        d = nlp(doc)
        s = ' '.join([token.lemma_ for token in d])       
        inputs.append(s)
    data['input'] = inputs
    return data

traindata = lemmatize(df_traindata)
testdata = lemmatize(df_testdata)

In [18]:
# Normalizing and TF-IDF preparation
stop_words = stopwords.words('portuguese')

tv = TfidfVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(traindata['input'])
test_tf_idf = tv.transform(testdata['input'])

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names())
traindata_vect['target_cat'] = traindata.reset_index().category

testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names())
testdata_vect['target_cat'] = testdata.reset_index().category


In [19]:
s = setup(data=traindata_vect, target='target_cat', numeric_features=list(traindata_vect.iloc[:,:-1].columns) ,session_id=9999, fold=10, test_data=testdata_vect, silent=True, fix_imbalance=True)

Unnamed: 0,Description,Value
0,session_id,9999
1,Target,target_cat
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(1199, 1286)"
5,Missing Values,False
6,Numeric Features,1285
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [20]:
model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.9633,0.0,0.9429,0.9655,0.962,0.9347,0.9374,0.1
et,Extra Trees Classifier,0.955,0.9901,0.9278,0.9578,0.9533,0.9197,0.9231,0.234
lr,Logistic Regression,0.9541,0.9947,0.9284,0.9568,0.9525,0.9179,0.9215,0.159
rf,Random Forest Classifier,0.9491,0.9892,0.9187,0.9528,0.9472,0.9085,0.9131,0.19
ridge,Ridge Classifier,0.9474,0.0,0.9235,0.9497,0.9462,0.9076,0.91,0.098
gbc,Gradient Boosting Classifier,0.9433,0.9852,0.9097,0.9487,0.9405,0.8977,0.9036,1.218
xgboost,Extreme Gradient Boosting,0.9408,0.9877,0.9175,0.9432,0.9394,0.8955,0.8984,2.533
catboost,CatBoost Classifier,0.9375,0.9874,0.9015,0.9433,0.9343,0.8871,0.8936,8.838
lightgbm,Light Gradient Boosting Machine,0.9299,0.9785,0.9059,0.9311,0.9285,0.8772,0.8795,0.128
dt,Decision Tree Classifier,0.9283,0.9324,0.9085,0.9302,0.9273,0.8751,0.8773,0.092


In [29]:
# tune models
tuned = tune_model(model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.95,0.0,0.9426,0.9531,0.9496,0.9142,0.9156
1,0.9333,0.0,0.902,0.9374,0.9315,0.8825,0.8869
2,0.9583,0.0,0.9353,0.9598,0.9582,0.9273,0.9281
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,0.9917,0.0,0.9855,0.9918,0.9916,0.9855,0.9856
5,0.9083,0.0,0.8432,0.9116,0.8995,0.8332,0.8435
6,0.9667,0.0,0.9543,0.9669,0.9667,0.9421,0.9422
7,1.0,0.0,1.0,1.0,1.0,1.0,1.0
8,0.9917,0.0,0.9855,0.992,0.9916,0.9856,0.9857
9,0.958,0.0,0.9301,0.9599,0.9569,0.926,0.9282


In [30]:
# ensemble models
bagged = ensemble_model(model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.925,0.9931,0.9049,0.9305,0.9239,0.87,0.873
1,0.9167,0.9444,0.8731,0.9221,0.9145,0.8515,0.858
2,0.9333,0.9804,0.8918,0.9348,0.9316,0.881,0.8845
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.9917,1.0,0.9855,0.9918,0.9916,0.9855,0.9856
5,0.9083,0.9748,0.8432,0.9135,0.8999,0.8323,0.8441
6,0.9667,0.9952,0.9543,0.9665,0.9665,0.942,0.9422
7,0.9917,1.0,0.9952,0.992,0.9917,0.9857,0.9858
8,0.9917,0.9999,0.9855,0.992,0.9916,0.9856,0.9857
9,0.9496,0.9893,0.9182,0.9514,0.9485,0.9112,0.9133


In [32]:
# blend models
#blended = blend_models(estimator_list = model) 

In [34]:
# stack models
#stacked = stack_models(estimator_list = model) 

In [35]:
# automl 
model = automl(optimize = 'F1')

In [36]:
# calibrate model
#model = calibrate_model(best)

In [37]:
predict_model(model, probability_threshold=0.75)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,SVM - Linear Kernel,0.8344,0,0.8344,0.8544,0.8317,0.7516,0.7611


Unnamed: 0,abdomen,abracar,abraco,acessibilidade,acesso,achar,acionar,acometir,acompanhar,acontecer,...,visivel,vistoria,vistorias,viver,vivo,voltar,vou,vulneravel,target_cat,Label
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,0.0,0.0,0.0,0.0,0.493011,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
473,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
474,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
475,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1


In [38]:
print(model)

SGDClassifier(alpha=0.0002, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.2, fit_intercept=True,
              l1_ratio=0.8100000001, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=-1, penalty='l2',
              power_t=0.5, random_state=9999, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)
