# Pré-processamento

In [1]:
# Install packages
# Note: use a Python 3.8 environment.
# NOTE(review): the import cell of this notebook currently stops with
# "ImportError: Numba needs NumPy 1.20 or less". The environment presumably
# needs a NumPy release compatible with the installed Numba - TODO confirm
# the exact versions and pin them here.
#!pip install --pre pycaret
#!pip install pycaret[analysis]
#!pip install boto3
#!pip install pycaret[mlops]

In [2]:
# Import packages
import os
import unicodedata

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from pycaret.classification import *
import pandas as pd

# Download the NLTK stop-word corpus and load the Portuguese list
nltk.download('stopwords')
stop_words = stopwords.words('portuguese')

ImportError: Numba needs NumPy 1.20 or less

In [None]:
# Dataset assembly: one ';'-separated CSV per source, stacked row-wise
df_covid = pd.read_csv('data/exemplos-treinamento-covid.csv', delimiter=';')
df_seloturismo = pd.read_csv('data/exemplos-treinamento-seloturismo.csv', delimiter=';')
df_tuberculose = pd.read_csv('data/exemplos-treinamento-tuberculose.csv', delimiter=';')
df_experimento = pd.read_csv('data/dados-testes-experimentos.csv', delimiter=';')
df_all = pd.concat([df_covid, df_seloturismo, df_tuberculose, df_experimento], axis=0)

# Bag-of-words vectorization.
# CountVectorizer strips accents from the text before it filters stop words,
# so an accented stop word (e.g. "não") would never match its stripped token
# ("nao") and would leak into the vocabulary. Apply the same accent stripping
# (NFKD decomposition, combining marks dropped) to the stop-word list first.
stop_words_ascii = sorted({
    ''.join(ch for ch in unicodedata.normalize('NFKD', word) if not unicodedata.combining(ch))
    for word in stop_words
})
cv = CountVectorizer(lowercase=True, stop_words=stop_words_ascii, strip_accents='unicode', token_pattern="[A-Za-z]+")
BoW = cv.fit_transform(df_all['input'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported replacement (available since 1.0).
BoW_df = pd.DataFrame(BoW.toarray(), columns=cv.get_feature_names_out())
# df_all keeps the per-file row indices after concat, so the labels are
# assigned by position (row i of BoW_df is row i of df_all).
BoW_df['target_cat'] = df_all['category'].map({'covid':0, 'seloturismo':1, 'tuberculose':2}).to_numpy()

# Unseen data (disabled)
# NOTE(review): this reads the same file that is already concatenated into
# df_all above ('data/dados-testes-experimentos.csv'), so it is not unseen
# by the model as the notebook stands.
#df_test = pd.read_csv('data/dados-testes-experimentos.csv', delimiter=';')
#df_test_blind = df_test.copy()
#df_test_blind.drop('category', axis=1, inplace=True)

# Word vectorization (disabled)
# NOTE(review): if this is re-enabled, fitting a second CountVectorizer here
# learns a new vocabulary from the test texts, so the columns would not line
# up with the training features. Presumably the already fitted `cv` should be
# reused with cv.transform(...) - TODO confirm.
#cv = CountVectorizer(lowercase=True, stop_words=stop_words, strip_accents='unicode', token_pattern="[A-Za-z]+")
#BoW_blind = cv.fit_transform(df_test_blind['input'])
#BoW_df_blind = pd.DataFrame(BoW_blind.toarray(), columns=cv.get_feature_names())
#BoW_df_blind['target_cat'] = df_test.reset_index().category.map({'covid':0, 'seloturismo':1, 'tuberculose':2})

# Inicialização

In [None]:
# Setup
# Initialise the PyCaret experiment on the bag-of-words frame, using the
# 'target_cat' column as the target.
# The result is bound to `experiment`: binding it to `setup` would replace the
# PyCaret function with its return value, and the cell would then fail when
# executed a second time in the same kernel.
experiment = setup(data=BoW_df, target='target_cat', session_id=20221, train_size=0.7, fold=10)

# Treinamento

In [None]:
# Compare the candidate models and keep the five selected by compare_models
top5 = compare_models(n_select=5)

# Otimização

In [None]:
# Tune models: run tune_model on each of the selected models, in order
tuned_top5 = list(map(tune_model, top5))
# Single-model alternative (disabled):
#best_tune = tune_model(best, choose_better = True)
#predict_model(best_tune)

In [None]:
# Ensemble models: run ensemble_model on each tuned model, in order
bagged_top5 = list(map(ensemble_model, tuned_top5))
# Single-model alternative (disabled):
#best_ens = ensemble_model(best, choose_better = True)
#predict_model(best_ens)

In [None]:
# Blend the five selected (untuned) models into a single estimator
blender = blend_models(estimator_list=top5)

In [None]:
# Stack the five selected (untuned) models into a single estimator
stacker = stack_models(estimator_list=top5)

In [None]:
# AutoML: pick the best model of the session, optimizing for Accuracy,
# and print the chosen estimator
best = automl(optimize='Accuracy')
print(best)

In [None]:
# Calibrate the model chosen by automl, then show the predict_model output
# for the calibrated estimator (displayed as the cell result)
best_calib = calibrate_model(best)
predict_model(best_calib)

# Análise

In [None]:
# Plot the 'auc' chart for the calibrated model
plot_model(best_calib, plot='auc')

In [None]:
# Launch the interactive evaluation widget for the calibrated model
evaluate_model(best_calib)

In [None]:
# interpret model
#interpret_model(best_calib)

In [None]:
# launch dashboard
#dashboard(best_calib)

In [None]:
# deep check model
#deep_check(best_calib)

In [None]:
# Launch the exploratory data analysis view, rendered with bokeh
eda(display_format='bokeh')

In [None]:
# Show the leaderboard of the models trained in this session
get_leaderboard()

# Deploy

In [None]:
# Finalize the calibrated model; `final` is what the deploy and save cells use
final = finalize_model(best_calib)

In [None]:
# Deploy the finalized model to AWS, into the 'md-chatbot' bucket
deploy_model(
    final,
    model_name='bow_multi_class_aws',
    platform='aws',
    authentication={'bucket': 'md-chatbot'},
)

In [None]:

# Save the finalized model under models/
# Make sure the target directory exists first, so the cell also works on a
# fresh checkout where models/ has not been created yet.
os.makedirs('models', exist_ok=True)
save_model(final, 'models/bow_multi_class')

In [None]:
# Save the experiment configuration under config/
# Make sure the target directory exists first, so the cell also works on a
# fresh checkout where config/ has not been created yet.
os.makedirs('config', exist_ok=True)
save_config('config/bow_multi_class_config')

In [None]:
# Create the prediction API under api/
# Uses the finalized model, the same one that is deployed and saved in the
# cells above, so every shipped artifact wraps the same estimator.
# The directory is created up front because the API files are written into it.
os.makedirs('api', exist_ok=True)
create_api(final, 'api/bow_multi_class_api')

In [None]:
# Create the demo app
# Uses the finalized model, the same one that is deployed and saved in the
# cells above, so the app serves the same estimator as the other artifacts.
create_app(final)