In [1]:
# Import packages
import numpy as np
import pandas as pd
import nltk
import spacy

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import OneClassSVM
from pycaret.classification import *

# download das stopwords para o idioma português
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import data
df_traindata = pd.read_csv('../data/traindata.csv', delimiter=';')
df_testdata = pd.read_csv('../data/testdata.csv', delimiter=';')

# Data preparation for Binary
df_traindata['category'] = df_traindata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})
df_testdata['category'] = df_testdata.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

# Data separation for One-Class
#traindata_covid = df_traindata[df_traindata['category']=='covid']
#traindata_tuberculose = df_traindata[df_traindata['category']=='tuberculose']
#traindata_seloturismo = df_traindata[df_traindata['category']=='seloturismo']

In [3]:
# Lemmatizing input string
nlp = spacy.load("pt_core_news_lg")
lem = nlp.get_pipe("lemmatizer")

def lemmatize(data):
    inputs = []
    for doc in data.input:
        d = nlp(doc)
        s = ' '.join([token.lemma_ for token in d])       
        inputs.append(s)
    data['input'] = inputs
    return data

traindata = lemmatize(df_traindata)
testdata = lemmatize(df_testdata)

In [4]:
# Normalizing and TF-IDF preparation
stop_words = stopwords.words('portuguese')

tv = TfidfVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
train_tf_idf = tv.fit_transform(traindata['input'])
test_tf_idf = tv.transform(testdata['input'])

traindata_vect = pd.DataFrame(train_tf_idf.toarray(), columns=tv.get_feature_names())
traindata_vect['target_cat'] = traindata.reset_index().category

testdata_vect = pd.DataFrame(test_tf_idf.toarray(), columns=tv.get_feature_names())
testdata_vect['target_cat'] = testdata.reset_index().category
#testdata_vect.head()
#traindata_vect.head()

In [16]:
s = setup(data=traindata_vect, target='target_cat', session_id=9999, fold=10, test_data=testdata_vect, silent=True, preprocess=False)

Unnamed: 0,Description,Value
0,session_id,9999
1,Target,target_cat
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(1199, 1286)"
5,Missing Values,False
6,Numeric Features,665
7,Categorical Features,620
8,Transformed Train Set,"(1199, 1285)"
9,Transformed Test Set,"(477, 1285)"


In [25]:
top5 = compare_models(n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
svm,SVM - Linear Kernel,0.9575,0.0,0.9347,0.9602,0.9563,0.9241,0.9273,0.247
et,Extra Trees Classifier,0.9508,0.9879,0.9213,0.9542,0.9488,0.9121,0.9159,0.31
rf,Random Forest Classifier,0.9416,0.9893,0.9072,0.9462,0.9394,0.8945,0.9002,0.295
ridge,Ridge Classifier,0.9391,0.0,0.902,0.9425,0.9364,0.8903,0.8956,0.252
gbc,Gradient Boosting Classifier,0.9358,0.9848,0.8956,0.9421,0.9322,0.8837,0.8908,1.023
catboost,CatBoost Classifier,0.93,0.9896,0.8877,0.9378,0.9257,0.8723,0.8813,5.324
dt,Decision Tree Classifier,0.9291,0.9319,0.9087,0.9313,0.928,0.8765,0.8789,0.233
lr,Logistic Regression,0.9207,0.9944,0.8704,0.9295,0.9151,0.8538,0.8654,0.281
ada,Ada Boost Classifier,0.9074,0.9639,0.855,0.9125,0.9014,0.8318,0.8407,2.501
knn,K Neighbors Classifier,0.8732,0.9579,0.8223,0.8792,0.8696,0.7741,0.7806,0.311


In [26]:
# tune models
tuned = [tune_model(i) for i in top5]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.925,0.976,0.8919,0.9275,0.9238,0.8677,0.8711
1,0.9,0.9549,0.8471,0.915,0.8975,0.8179,0.8319
2,0.9,0.983,0.8339,0.9033,0.8939,0.8173,0.8274
3,0.9917,1.0,0.9855,0.992,0.9916,0.9856,0.9857
4,0.9833,0.996,0.971,0.9838,0.9831,0.9709,0.9714
5,0.8917,0.9954,0.8168,0.8976,0.8809,0.8006,0.8147
6,0.95,0.9976,0.9156,0.954,0.9484,0.9106,0.9148
7,0.9917,1.0,0.9881,0.9918,0.9916,0.9855,0.9856
8,0.95,0.9959,0.9375,0.9509,0.95,0.9127,0.9132
9,0.9076,0.9919,0.8483,0.9177,0.9027,0.8318,0.8429


In [27]:
# ensemble models
bagged = [ensemble_model(i) for i in top5]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9167,0.9748,0.873,0.9274,0.915,0.85,0.8598
1,0.875,0.9409,0.8066,0.8976,0.8706,0.7686,0.79
2,0.8833,0.9898,0.8075,0.8893,0.8755,0.7844,0.7987
3,0.9917,0.997,0.9855,0.9918,0.9916,0.9855,0.9856
4,0.975,0.9964,0.9565,0.976,0.9744,0.956,0.9572
5,0.875,0.9921,0.7878,0.8973,0.8595,0.7646,0.7905
6,0.95,0.9987,0.9156,0.954,0.9484,0.9106,0.9148
7,0.975,0.9983,0.9591,0.976,0.9748,0.9561,0.957
8,0.9167,1.0,0.8577,0.9272,0.9103,0.8475,0.8596
9,0.9328,0.9969,0.8841,0.9386,0.9272,0.8796,0.887


In [28]:
# blend models
blended = blend_models(estimator_list = top5) 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9333,0.0,0.9008,0.9384,0.9315,0.8819,0.8874
1,0.9083,0.0,0.8586,0.9211,0.9063,0.8339,0.8456
2,0.925,0.0,0.8773,0.9269,0.9225,0.8653,0.8701
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,0.9917,0.0,0.9855,0.9918,0.9916,0.9855,0.9856
5,0.9,0.0,0.8313,0.9049,0.8914,0.817,0.8286
6,0.9583,0.0,0.9301,0.9602,0.9572,0.9263,0.9284
7,1.0,0.0,1.0,1.0,1.0,1.0,1.0
8,0.9667,0.0,0.942,0.9685,0.9655,0.941,0.9432
9,0.9496,0.0,0.9182,0.9537,0.9487,0.9103,0.9142


In [29]:
# stack models
stacked = stack_models(estimator_list = top5) 

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8833,0.9894,0.8964,0.9142,0.8888,0.8087,0.8195
1,0.9333,0.9768,0.902,0.9372,0.9316,0.8832,0.8868
2,0.9583,0.9987,0.9353,0.9598,0.9582,0.9273,0.9281
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.9917,1.0,0.9855,0.9918,0.9916,0.9855,0.9856
5,0.9333,0.9973,0.8892,0.9317,0.9306,0.8821,0.8841
6,0.975,0.9997,0.9688,0.9757,0.9752,0.957,0.9571
7,0.9917,1.0,0.9952,0.992,0.9917,0.9857,0.9858
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.958,0.9996,0.9301,0.9596,0.9569,0.9264,0.9281


In [30]:
# automl 
best = automl(optimize = 'Accuracy')

In [31]:
# calibrate model
model = calibrate_model(best)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9333,0.9888,0.8988,0.9404,0.9323,0.8812,0.8876
1,0.9083,0.9695,0.8616,0.9142,0.9055,0.8368,0.8437
2,0.95,0.9948,0.9208,0.9511,0.9493,0.9117,0.9134
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.9917,0.9997,0.9855,0.9918,0.9916,0.9855,0.9856
5,0.8917,0.9953,0.8168,0.8976,0.8809,0.8006,0.8147
6,0.9833,0.9987,0.9736,0.9838,0.9833,0.9709,0.9713
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9496,0.9971,0.9156,0.9537,0.948,0.9103,0.9145


In [32]:
predict_model(model, probability_threshold=0.75)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0,0.9483,0,0,0,0,0


ValueError: Length mismatch: Expected axis has 3 elements, new values have 1 elements