In [None]:
import mlflow
from jobtools.arguments import file2namespace

In [None]:
from transformers import Trainer, TrainingArguments
from hatedetection.hate_detection_classifier import HateDetectionClassifier
from hatedetection.evaluation import compute_classification_metrics
from hatedetection.text_datasets import ClassificationDataset
from hatedetection.prep import load_examples

In [None]:
# Wildcard paths handed to load_examples() for the training and evaluation CSV files.
input_dataset_path = "data/train/*.csv"
eval_dataset_path = "data/eval/*.csv"

# Run configuration; its `model`, `data` and `trainer` sections are read by later cells.
params = file2namespace("train.params.yml")

In [None]:
# Route the MLflow logging done by this notebook to the named experiment.
mlflow.set_experiment("hate-pt-speech-mlflow")

In [None]:
# Project wrapper object; later cells read its `.model` and `.tokenizer` and log it to MLflow.
classifier = HateDetectionClassifier()

In [None]:
# Build the classifier from the baseline named in the run configuration.
# NOTE(review): presumably this is what populates `classifier.model` and
# `classifier.tokenizer` used further down — confirm in HateDetectionClassifier.build.
classifier.build(baseline=params.model.baseline)

In [None]:
# Copy the two `split_*` preprocessing options from the run configuration onto the classifier.
preprocessing_opts = params.data.preprocessing
classifier.split_unique_words = preprocessing_opts.split_unique_words
classifier.split_seq_len = preprocessing_opts.split_seq_len

In [None]:
# Load the (examples, labels) pairs for the training and evaluation splits
# from the paths configured at the top of the notebook.
examples_train, labels_train = load_examples(input_dataset_path)
examples_eval, labels_eval = load_examples(eval_dataset_path)

In [None]:
def _build_dataset(examples, labels):
    """Wrap examples and labels in a ClassificationDataset using the classifier's tokenizer."""
    return ClassificationDataset(
        examples=examples,
        labels=labels,
        tokenizer=classifier.tokenizer,
    )


# Both splits go through the same tokenizer.
train_dataset = _build_dataset(examples_train, labels_train)
eval_dataset = _build_dataset(examples_eval, labels_eval)

In [None]:
# Every attribute of the `trainer` configuration section becomes a TrainingArguments keyword.
trainer_options = vars(params.trainer)
training_args = TrainingArguments(**trainer_options)

In [None]:
# Wire the model, both datasets and the metric function into a Trainer.
trainer = Trainer(
    model=classifier.model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_classification_metrics,
)

In [None]:
# Run training; the returned object's `.metrics` is logged to MLflow further down.
history = trainer.train()

In [None]:
# Evaluate on `eval_dataset`; the resulting mapping is logged to MLflow further down.
evaluation_metrics = trainer.evaluate()

In [None]:
# Directory the trained classifier is written to: <output_dir>/<model name>.
saved_location = f"{params.model.output_dir}/{params.model.name}"

# The value returned here is passed as `artifacts=` when the model is logged to MLflow.
artifacts = classifier.save_pretrained(saved_location)

In [None]:
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec
from mlflow.types import DataType

# Model input: a single string column named "text".
input_schema = Schema(
    [
        ColSpec(DataType.string, name="text"),
    ]
)

# Model output: an integer "hate" column and a double "confidence" column.
output_schema = Schema(
    [
        ColSpec(DataType.integer, name="hate"),
        ColSpec(DataType.double, name="confidence"),
    ]
)

signature = ModelSignature(inputs=input_schema, outputs=output_schema)

In [None]:
# Log the evaluation results, skipping entries whose value is None.
mlflow.log_metrics(
    {name: value for name, value in evaluation_metrics.items() if value is not None}
)

# The values returned by trainer.train() are measurements of the run (outputs),
# not hyperparameters (inputs), so they are recorded as metrics. MLflow params
# are immutable within a run: logging these with log_params made this cell raise
# when re-executed after retraining, because the values change between runs.
mlflow.log_metrics(
    {name: value for name, value in history.metrics.items() if value is not None}
)
# Log the classifier object itself as a pyfunc model, bundling the project
# package and the saved artifacts, and register it under "hate-pt-speech".
model_info = mlflow.pyfunc.log_model(
    artifact_path=params.model.name,
    python_model=classifier,
    artifacts=artifacts,
    code_path=["./hatedetection"],
    signature=signature,
    registered_model_name="hate-pt-speech",
)

In [None]:
# Log the model through the `loader_module` / `data_path` route: MLflow stores
# the saved directory and names the project module to load it with.
# NOTE(review): this uses the same registered model name as the `python_model`
# log_model call earlier in this notebook, so a full run registers the model
# twice — confirm both registrations are intended.
mlflow.pyfunc.log_model(
    artifact_path="classifier",
    loader_module="hatedetection.hate_detection_classifier",
    data_path=saved_location,
    code_path=["hatedetection"],
    signature=signature,
    registered_model_name="hate-pt-speech",
)

In [None]:
# Close the active MLflow run now that metrics and models have been logged.
mlflow.end_run()