In [None]:
import os

import cloudpickle
import mlflow
import numpy as np
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from mlflow.models import infer_signature
from sklearn.preprocessing import LabelEncoder

from app.modeling.evaluation import ClassificationEvaluator
from app.modeling.tuning import tune_model
from app.preprocessing.text import prepare_text_data
from app.utilities.utils import load_dataset, extract_score_by_trial, save_artifact_locally, save_experiment

In [None]:
# Point MLflow at the tracking server. MLFLOW_TRACKING_URI (MLflow's standard
# environment variable) takes precedence when set to a non-empty value, so the
# notebook can target another server without code edits; otherwise the local
# development server is used.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI") or "http://localhost:2020/"
mlflow.set_tracking_uri(tracking_uri)

In [None]:
# Every run started below is recorded under this single MLflow experiment.
experiment_name = "Domain Classification"
mlflow.set_experiment(experiment_name=experiment_name)

In [None]:
# Load the three pre-split datasets in train / validation / test order.
dataset_paths = ("data/train.csv", "data/validation.csv", "data/test.csv")
train_data, validation_data, test_data = (
    load_dataset(filepath=dataset_path) for dataset_path in dataset_paths
)

# Preprocessing

In [None]:
# Column holding the raw text and column holding the target to predict.
text_col = "item"
label_col = 'domain'

# pop() takes the label column out of each dataset and returns it, so the
# datasets no longer contain the target afterwards (running this cell a second
# time on the same objects would therefore fail to find the column).
train_label, validation_label, test_label = (
    dataset.pop(label_col) for dataset in (train_data, validation_data, test_data)
)

# Work on copies of the text column so later steps never touch the loaded data.
texts_train = train_data[text_col].copy()
texts_validation = validation_data[text_col].copy()
texts_test = test_data[text_col].copy()

In [None]:
# The vocabulary is learned from the training texts only, so nothing from the
# validation/test splits leaks into the tokenizer.
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=texts_train)

In [None]:
# Map the domain labels to integer ids. The mapping is fitted on the training
# labels only and then re-used unchanged for the other two splits.
encoder = LabelEncoder()
train_label = encoder.fit_transform(train_label)

validation_label, test_label = (
    encoder.transform(labels) for labels in (validation_label, test_label)
)

In [None]:
# Local destinations of the serialized preprocessing objects; the names are
# re-used when the files are logged to MLflow.
prepare_text_data_path = "models/preprocessing/prepare_text_data.pkl"
tokenizer_path = "models/preprocessing/tokenizer.pkl"
label_encoder_path = "models/preprocessing/label_encoder.pkl"

# Destination path -> object to serialize.
preprocessing_artifacts = {
    prepare_text_data_path: prepare_text_data,
    tokenizer_path: tokenizer,
    label_encoder_path: encoder,
}

for artifact_path, artifact in preprocessing_artifacts.items():
    # open(..., "wb") does not create missing parent folders, so make sure the
    # target directory exists (needed on a fresh checkout).
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
    with open(artifact_path, "wb") as file:
        cloudpickle.dump(obj=artifact, file=file)

In [None]:
# Attach the three serialized preprocessing files to a dedicated MLflow run.
with mlflow.start_run(run_name="preprocessing"):
    for artifact_path in (prepare_text_data_path, tokenizer_path, label_encoder_path):
        mlflow.log_artifact(local_path=artifact_path)

In [None]:
# Length passed to prepare_text_data for every split; the neural-network
# section further down assigns this same value to max_length again.
max_length = 30

# Build the model inputs from the raw text column rather than from texts_*
# themselves, so executing this cell more than once never feeds already
# prepared data back into prepare_text_data.
texts_train = prepare_text_data(texts=train_data[text_col].copy(), tokenizer=tokenizer, max_len=max_length)
texts_validation = prepare_text_data(texts=validation_data[text_col].copy(), tokenizer=tokenizer, max_len=max_length)
texts_test = prepare_text_data(texts=test_data[text_col].copy(), tokenizer=tokenizer, max_len=max_length)

In [None]:
# Size handed to the embedding models as input_dim: one slot per word known to
# the tokenizer plus one extra (presumably for the reserved index 0 - confirm).
# NOTE(review): the tokenizer was built with num_words=5000; if
# prepare_text_data only emits ids below that cap, a smaller input_dim would be
# enough - verify before changing.
vocab_size = 1 + len(tokenizer.word_index)

# Modeling

## ML Models

### Sklearn

In [None]:
# Encoded label id -> original label, used to annotate confusion matrices.
class_names = dict(enumerate(encoder.classes_))

In [None]:
# Hyper-parameter search for the scikit-learn model on the training split.
sklearn_tuner = tune_model(model_type="sklearn")
sklearn_tuner.search(texts_train, train_label)

In [None]:
# Keep only the top model returned by the search.
best_sklearn_models = sklearn_tuner.get_best_models(num_models=1)
gbm = best_sklearn_models[0]

In [None]:
# Score the selected model on the held-out test split.
test_predictions = gbm.predict(texts_test)

evaluation = ClassificationEvaluator(observed=test_label, predicted=test_predictions)
evaluation.print_metrics()
evaluation.confusion_matrix(class_names=class_names)

In [None]:
# Run description (kept in Spanish, it is stored verbatim in MLflow) and the
# location where the fitted model is written.
description = "Entrenamiento de HistGradientBoosting para clasificación del dominio"
model_path = "models/hist_gradient_boosting"

In [None]:
# Infer the model's input/output schema from the validation inputs and the
# predictions the model produces for them.
validation_predictions = gbm.predict(texts_validation)
signature = infer_signature(model_input=texts_validation, model_output=validation_predictions)

In [None]:
# Record the HistGradientBoosting experiment: test metrics, model parameters,
# the score extracted from the tuner, the confusion matrix and the model itself.
with mlflow.start_run(run_name="hist_gradient_boosting", description=description):
    mlflow.log_metrics(metrics=evaluation.calculate_metrics())
    mlflow.log_params(params=gbm.get_params())
    mlflow.log_metric(key="accuracy validation", value=extract_score_by_trial(tuner=sklearn_tuner))
    mlflow.log_table(data=evaluation.confusion_matrix(class_names=class_names), artifact_file="metrics/confusion_matrix_gbm.json")
    # Store the same signature with the local copy and the tracked copy so both
    # describe identical inputs/outputs.
    # NOTE(review): save_model is expected to refuse a non-empty existing
    # model_path, so a second execution may need that folder removed first.
    mlflow.sklearn.save_model(sk_model=gbm, path=model_path, signature=signature)
    mlflow.sklearn.log_model(sk_model=gbm, artifact_path=model_path, signature=signature)

### XGBoost

In [None]:
# Hyper-parameter search for the XGBoost model on the training split.
xgb_tuner = tune_model(model_type="xgboost")
xgb_tuner.search(texts_train, train_label)

In [None]:
# Keep only the top model returned by the search and score it on the test split.
xgb_model = xgb_tuner.get_best_models(num_models=1)[0]

evaluation = ClassificationEvaluator(observed=test_label, predicted=xgb_model.predict(texts_test))
evaluation.print_metrics()
# Pass the id -> label mapping, as the HistGradientBoosting evaluation does, so
# the matrices of all models are labelled the same way.
evaluation.confusion_matrix(class_names=class_names)

In [None]:
# Run description (kept in Spanish, it is stored verbatim in MLflow) and the
# location where the fitted model is written.
description = "Entrenamiento de XGBoost para clasificación del dominio"
model_path = "models/xgboost"

In [None]:
# Record the XGBoost experiment: test metrics, model parameters, the score
# extracted from the tuner, the confusion matrix and a local copy of the model.
with mlflow.start_run(run_name="xgb", description=description):
    mlflow.log_metrics(metrics=evaluation.calculate_metrics())
    mlflow.log_params(params=xgb_model.get_params())
    mlflow.log_metric(key="accuracy validation", value=extract_score_by_trial(tuner=xgb_tuner))
    # Label the logged matrix with class_names, matching the table logged for
    # the HistGradientBoosting run.
    mlflow.log_table(data=evaluation.confusion_matrix(class_names=class_names), artifact_file="metrics/confusion_matrix_xgboost.json")
    # NOTE(review): save_model is expected to refuse a non-empty existing
    # model_path, so a second execution may need that folder removed first.
    mlflow.xgboost.save_model(xgb_model=xgb_model, path=model_path)

## Neural Network Architecture

### RNN

In [None]:
# Shared settings for the embedding-based networks.
embedding_dim = 50
max_length = 30

# Stops a training run once val_accuracy has gone 10 epochs without improving.
early_stopping = EarlyStopping(patience=10, monitor="val_accuracy")

In [None]:
rnn_tuner = tune_model(model_type="rnn", input_dim=vocab_size, output_dim=embedding_dim, max_length=max_length)

# Fix the one-hot width to the number of classes the encoder knows. Without it
# to_categorical derives the width from the largest id present in each array,
# so a validation split lacking the highest class id would get a narrower
# target matrix than the training split.
num_classes = len(encoder.classes_)

rnn_tuner.search(
    texts_train,
    to_categorical(train_label, num_classes=num_classes),
    epochs=100,
    batch_size=256,
    validation_data=(texts_validation, to_categorical(validation_label, num_classes=num_classes)),
    callbacks=[early_stopping],
)

In [None]:
# Keep only the top RNN returned by the search and print its architecture.
best_rnn_models = rnn_tuner.get_best_models(num_models=1)
rnn = best_rnn_models[0]

rnn.summary()

In [None]:
# The network outputs one score per class; the predicted class id is the
# position of the largest score along axis 1.
test_predictions = np.argmax(rnn.predict(texts_test), axis=1)

evaluation = ClassificationEvaluator(observed=test_label, predicted=test_predictions)
evaluation.print_metrics()
# Pass the id -> label mapping, as the HistGradientBoosting evaluation does, so
# the matrices of all models are labelled the same way.
evaluation.confusion_matrix(class_names=class_names)

In [None]:
# Local destination of the trained network and the run description (kept in
# Spanish, it is stored verbatim in MLflow).
model_path = "models/rnn"

description = "Entrenamiento Arquitectura Embedding - RNN"

# Record the RNN experiment: test metrics, the best hyper-parameters found by
# the tuner, the score extracted from the tuner, the confusion matrix and a
# local copy of the model.
with mlflow.start_run(run_name="emb_rnn", description=description):
    mlflow.log_metrics(metrics=evaluation.calculate_metrics())
    mlflow.log_params(params=rnn_tuner.get_best_hyperparameters(num_trials=1)[0].values)
    mlflow.log_metric(key="accuracy validation", value=extract_score_by_trial(tuner=rnn_tuner))
    # Label the logged matrix with class_names, matching the table logged for
    # the HistGradientBoosting run.
    mlflow.log_table(data=evaluation.confusion_matrix(class_names=class_names), artifact_file="metrics/confusion_matrix_rnn.json")
    # NOTE(review): save_model is expected to refuse a non-empty existing
    # model_path, so a second execution may need that folder removed first.
    mlflow.tensorflow.save_model(model=rnn, path=model_path)

### LSTM

In [None]:
lstm_tuner = tune_model(model_type="lstm", input_dim=vocab_size, output_dim=embedding_dim, max_length=max_length)

# Fix the one-hot width to the number of classes the encoder knows. Without it
# to_categorical derives the width from the largest id present in each array,
# so a validation split lacking the highest class id would get a narrower
# target matrix than the training split.
num_classes = len(encoder.classes_)

lstm_tuner.search(
    texts_train,
    to_categorical(train_label, num_classes=num_classes),
    epochs=100,
    batch_size=256,
    validation_data=(texts_validation, to_categorical(validation_label, num_classes=num_classes)),
    callbacks=[early_stopping],
)

In [None]:
# Keep only the top LSTM returned by the search and print its architecture.
best_lstm_models = lstm_tuner.get_best_models(num_models=1)
lstm = best_lstm_models[0]

lstm.summary()

In [None]:
# The network outputs one score per class; the predicted class id is the
# position of the largest score along axis 1.
test_predictions = np.argmax(lstm.predict(texts_test), axis=1)

evaluation = ClassificationEvaluator(observed=test_label, predicted=test_predictions)
evaluation.print_metrics()
# Pass the id -> label mapping, as the HistGradientBoosting evaluation does, so
# the matrices of all models are labelled the same way.
evaluation.confusion_matrix(class_names=class_names)

In [None]:
# Local destination of the trained network and the run description (kept in
# Spanish, it is stored verbatim in MLflow).
model_path = "models/lstm"

description = "Entrenamiento Arquitectura Embedding - LSTM"

# Record the LSTM experiment: test metrics, the best hyper-parameters found by
# the tuner, the score extracted from the tuner, the confusion matrix and a
# local copy of the model.
with mlflow.start_run(run_name="emb_lstm", description=description):
    mlflow.log_metrics(metrics=evaluation.calculate_metrics())
    mlflow.log_params(params=lstm_tuner.get_best_hyperparameters(num_trials=1)[0].values)
    mlflow.log_metric(key="accuracy validation", value=extract_score_by_trial(tuner=lstm_tuner))
    # Label the logged matrix with class_names, matching the table logged for
    # the HistGradientBoosting run.
    mlflow.log_table(data=evaluation.confusion_matrix(class_names=class_names), artifact_file="metrics/confusion_matrix_lstm.json")
    # NOTE(review): save_model is expected to refuse a non-empty existing
    # model_path, so a second execution may need that folder removed first.
    mlflow.tensorflow.save_model(model=lstm, path=model_path)