## Imports

In [None]:
import os
import re
import torch
import json
import transformers
import cloudpickle
import mlflow.pytorch
import mlflow

import pandas as pd
import numpy as np

from typing import Any, List, Dict
from pathlib import Path
from sys import version_info
from mlflow.models import ModelSignature
from mlflow.types.schema import Schema, ColSpec
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from transformers import (
    DataCollatorWithPadding,
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EvalPrediction,
)

from utils import merge_title_perex_body
from textra_tools import initialize_experiment

### Define constants

In [None]:
# --- Input CSV locations ---
SENSITIVE_DATA_FILEPATH = "data/processed-sensitive-data.csv"
SENSITIVE_DATA_AUGMENTED_FILEPATH = "data/augmented-all.csv"
NON_SENSITIVE_DATA_FILEPATH = "data/processed-nonsensitive-data.csv"

# --- Output locations ---
OUTPUT_DIR = "distilbert_5ep_weighted_CE_loss_w_augmented_data"

# The model directory is created right away; the constant stays a plain string.
OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR, "model")
os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# --- Data split / sampling settings ---
TEST_SET_RATIO = 0.10
VAL_SET_RATIO = TEST_SET_RATIO
RANDOM_SEED = 11
NONSENSITIVE_SAMPLE = 5000  # upper bound on the number of non-sensitive rows kept

# --- MLflow tracking settings ---
MLFLOW_URL = "https://mlflow.lsnews.eu"
EXPERIMENT_NAME = "en-sensitive-data"
RUN_NAME = "distilbert_5ep_weighted_CE_loss_w_augmented_data"

# Credentials file: exported through the environment and also passed to the
# experiment initialisation helper.
STAGE_AUTH = "textra-developers.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = STAGE_AUTH

# Interpreter version string, used for mlflow model logging.
PYTHON_VERSION = f"{version_info.major}.{version_info.minor}.{version_info.micro}"

# Prefer the GPU when one is visible to torch.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

In [None]:
# Obtain the MLflow experiment id from the project helper, given the experiment
# name, the tracking URL and the credentials file.
# NOTE(review): presumably this also points mlflow at MLFLOW_URL — confirm in textra_tools.
experiment_id = initialize_experiment(experiment_name=EXPERIMENT_NAME, url=MLFLOW_URL, credentials=STAGE_AUTH)

In [None]:
# Open the MLflow run that the following cells log into; it is closed by the
# mlflow.end_run() call in the final cell.
run = mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME)

## 1. Prepare Data

Run the commands below to merge the augmented splits into a single file

In [None]:
# !cat data/augmented_1.csv > data/augmented-all.csv
# !cat data/augmented_2.csv >> data/augmented-all.csv
# !cat data/augmented_3.csv >> data/augmented-all.csv
# !cat data/augmented_4.csv >> data/augmented-all.csv
# !wc -l data/*

In [None]:
# Load the three input tables: original sensitive articles, their augmented
# variants, and non-sensitive articles.
df_sensitive = pd.read_csv(SENSITIVE_DATA_FILEPATH)  # index_col=0
df_sensitive_augmented = pd.read_csv(SENSITIVE_DATA_AUGMENTED_FILEPATH)
# The augmented file is built by concatenating several CSV parts with `cat`, so
# each part's header line shows up as a data row; drop those rows.
df_sensitive_augmented = df_sensitive_augmented[df_sensitive_augmented.id != "id"]  # remove rows created due to merging
df_nonsensitive = pd.read_csv(NON_SENSITIVE_DATA_FILEPATH)

In [None]:
# Quick inspection of the augmented table (displayed as cell output).
df_sensitive_augmented.shape

In [None]:
df_sensitive_augmented.head(3)

In [None]:
df_sensitive_augmented.iloc[0].text

In [None]:
# Convert id to string for easier manipulation
# (augmented rows are later matched to their originals by slicing the id string).
df_sensitive_augmented["id"] = df_sensitive_augmented["id"].astype(str)
df_sensitive["id"] = df_sensitive["id"].astype(str)
df_nonsensitive["id"] = df_nonsensitive["id"].astype(str)

## Subsample nonsensitive

In [None]:
# Shuffle the non-sensitive articles reproducibly and keep at most
# NONSENSITIVE_SAMPLE of them.
df_nonsensitive = df_nonsensitive.sample(frac=1, random_state=RANDOM_SEED)[:NONSENSITIVE_SAMPLE]

In [None]:
# Binary target column: True for sensitive articles (original and augmented),
# False for non-sensitive ones.
df_sensitive["sensitive"] = True
df_sensitive_augmented["sensitive"] = True
df_nonsensitive["sensitive"] = False

In [None]:
# Sanity check: ids that occur in both the sensitive and the non-sensitive set
# (shown as cell output).
set(df_sensitive.id) & set(df_nonsensitive.id)

### 1.1 Concatenate the text

In [None]:
# Build the single `text` column row by row with the project helper.
# NOTE(review): presumably it joins title, perex and body — confirm in utils.
df_sensitive["text"] = df_sensitive.apply(merge_title_perex_body, axis=1)
df_nonsensitive["text"] = df_nonsensitive.apply(merge_title_perex_body, axis=1)

# The source columns are no longer needed once `text` exists.
df_sensitive.drop(["title", "perex", "body"], axis=1, inplace=True)
df_nonsensitive.drop(["title", "perex", "body"], axis=1, inplace=True)

# Shuffle reproducibly and renumber the rows 0..n-1.
df_sensitive = df_sensitive.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
df_nonsensitive = df_nonsensitive.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

### 1.2 Train/Validation/Test Split

In [None]:
def train_validation_test_split(df, train_ratio=0.6, validation_ratio=0.2, seed=RANDOM_SEED):
    """Randomly partition ``df`` into train, validation and test frames.

    The rows are shuffled with a seeded NumPy permutation. The first
    ``int(train_ratio * len(df))`` shuffled rows become the training set, the
    next ``int(validation_ratio * len(df))`` the validation set, and whatever
    remains the test set.

    Args:
        df: Frame to split. Any index is accepted because rows are selected
            by position.
        train_ratio: Fraction of rows assigned to the training set.
        validation_ratio: Fraction of rows assigned to the validation set.
        seed: Seed for the shuffle, so the split is reproducible.

    Returns:
        Tuple ``(train, validation, test)`` of DataFrames; each keeps the
        index labels its rows had in ``df``.
    """
    # NOTE: this reseeds NumPy's *global* RNG, exactly as the previous
    # implementation did.
    np.random.seed(seed)
    m = len(df)
    # Permute row *positions*, not index labels: the result is fed to
    # ``iloc``, which is positional. Permuting ``df.index`` only worked when
    # the index happened to be 0..m-1; for such an index this yields the very
    # same split as before.
    perm = np.random.permutation(m)
    train_end = int(train_ratio * m)
    validation_end = train_end + int(validation_ratio * m)
    train = df.iloc[perm[:train_end]]
    validation = df.iloc[perm[train_end:validation_end]]
    test = df.iloc[perm[validation_end:]]
    return train, validation, test

In [None]:
# Split the original sensitive articles: 80% train, VAL_SET_RATIO validation,
# and the remaining rows as test.
df_sensitive_train, df_sensitive_val, df_sensitive_test = train_validation_test_split(
    df_sensitive, train_ratio=0.8, validation_ratio=VAL_SET_RATIO
)

In [None]:
# The three parts must cover the input exactly once.
assert len(df_sensitive_train) + len(df_sensitive_val) + len(df_sensitive_test) == len(df_sensitive)
print(
    f"Number of sensitive training data: {len(df_sensitive_train)}\nNumber of sensitive validation data: {len(df_sensitive_val)}\nNumber of sensitive test data: {len(df_sensitive_test)}"
)

In [None]:
# Same split proportions for the (sub-sampled) non-sensitive articles.
df_nonsensitive_train, df_nonsensitive_val, df_nonsensitive_test = train_validation_test_split(
    df_nonsensitive, train_ratio=0.8, validation_ratio=VAL_SET_RATIO
)

In [None]:
# The three parts must cover the input exactly once.
assert len(df_nonsensitive_train) + len(df_nonsensitive_val) + len(df_nonsensitive_test) == len(df_nonsensitive)
print(
    f"Number of nonsensitive training data: {len(df_nonsensitive_train)}\nNumber of nonsensitive validation data: {len(df_nonsensitive_val)}\nNumber of nonsensitive test data: {len(df_nonsensitive_test)}"
)

### 1.3 Merge with augmented data

This needs to be done after the train/val/test split, so that each augmented row lands in the same split as its original article and we avoid data leakage between train, validation and test.

In [None]:
# Route every augmented row to the split that already holds its original
# article. The original id is recovered by dropping the first character of the
# augmented id.
#
# This replaces a row-by-row `DataFrame.append` loop: `append` was removed in
# pandas 2.0 and copied the whole frame for every appended row.
original_ids = df_sensitive_augmented["id"].str[1:]

# Same precedence as an if/elif chain: train first, then validation, then test.
in_train = original_ids.isin(df_sensitive_train["id"])
in_val = ~in_train & original_ids.isin(df_sensitive_val["id"])
in_test = ~in_train & ~in_val & original_ids.isin(df_sensitive_test["id"])

# Report augmented rows whose original article is in none of the splits.
for orphan_id in df_sensitive_augmented.loc[~(in_train | in_val | in_test), "id"]:
    print(f"ERROR: {orphan_id} could not be found anywhere")

# Append the matched rows after the original rows and renumber the index.
df_sensitive_train = pd.concat([df_sensitive_train, df_sensitive_augmented[in_train]], ignore_index=True)
df_sensitive_val = pd.concat([df_sensitive_val, df_sensitive_augmented[in_val]], ignore_index=True)
df_sensitive_test = pd.concat([df_sensitive_test, df_sensitive_augmented[in_test]], ignore_index=True)

In [None]:
# Every augmented row must have been added to exactly one split: the splits
# together now hold all original plus all augmented sensitive rows.
assert len(df_sensitive_augmented) + len(df_sensitive) == len(df_sensitive_train) + len(df_sensitive_val) + len(
    df_sensitive_test
)

### 1.4 Merge sensitive and nonsensitive data

In [None]:
# Combine the sensitive and non-sensitive parts of each split into one frame.
# No `on=` is given, so pandas joins on every column the two frames share.
# NOTE(review): an outer merge is used here as a stand-in for concatenation.
# Rows that are equal on all shared columns in both frames would be collapsed,
# and pandas documents that outer joins sort the keys — confirm this is
# intended, or consider pd.concat.
df_train = df_sensitive_train.merge(df_nonsensitive_train, how="outer")
df_val = df_sensitive_val.merge(df_nonsensitive_val, how="outer")
df_test = df_sensitive_test.merge(df_nonsensitive_test, how="outer")

In [None]:
# Final split sizes.
print(
    f"Number of training data: {len(df_train)}\nNumber of validation data: {len(df_val)}\nNumber of test data: {len(df_test)}"
)

In [None]:
# Peek at the combined training frame (displayed as cell output).
df_train.head(3)

## 2. Get Tokenizer and Model

Define label mappings

In [None]:
# Lookup tables between class index and class name; both are handed to the
# model config. The name -> index table is derived from the index -> name one
# so the two cannot drift apart.
id2label = dict(enumerate(("nonsensitive", "sensitive")))
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Pretrained checkpoint to fine-tune.
model_name = "distilbert-base-cased"

# Two-class configuration carrying the label mappings, then the
# sequence-classification model and its matching tokenizer.
config = AutoConfig.from_pretrained(model_name, label2id=label2id, id2label=id2label, num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## 3. Dataset

In [None]:
class DatasetRetriever(Dataset):
    """Map-style dataset that tokenizes one article text per item.

    Each item is a dict with ``input_ids`` and ``attention_mask`` exactly as
    returned by the tokenizer (truncated, not padded here), plus an integer
    ``label`` unless the dataset was created with ``is_test=True``.
    """

    def __init__(self, data, tokenizer, is_test=False):
        """Store the texts (and labels, when available) of ``data``.

        Args:
            data: DataFrame with a ``text`` column and, unless ``is_test`` is
                true, a ``sensitive`` column whose values convert to ``int``.
            tokenizer: Callable invoked as ``tokenizer(text, truncation=True)``
                that returns a mapping with ``"input_ids"`` and
                ``"attention_mask"``.
            is_test: Set when labels are not present; items then carry no
                ``"label"`` entry.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.text = self.data.text.values.tolist()
        self.is_test = is_test  # if the label is not present
        # Unlabelled data has no `sensitive` column, so do not require it in
        # test mode. When the column exists it is still loaded, as before.
        if is_test and "sensitive" not in self.data.columns:
            self.label = []
        else:
            self.label = self.data.sensitive.values.tolist()

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, item):
        """Tokenize the text at position ``item`` and return its features."""
        # Use the tokenizer given to this instance rather than whatever global
        # named `tokenizer` happens to exist in the notebook.
        tokenized = self.tokenizer(self.text[item], truncation=True)
        features = {}
        if not self.is_test:
            features["label"] = int(self.label[item])
        features["input_ids"] = tokenized["input_ids"]
        features["attention_mask"] = tokenized["attention_mask"]
        return features

In [None]:
# Wrap each split in a tokenizing dataset (labels included: is_test is left at
# its default of False).
train_dataset = DatasetRetriever(df_train, tokenizer)
val_dataset = DatasetRetriever(df_val, tokenizer)
test_dataset = DatasetRetriever(df_test, tokenizer)

# Items are tokenized without padding, so padding is applied per batch by the
# collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# save and log data
data_path = Path(os.path.join(OUTPUT_DIR, "data"))
data_path.mkdir(parents=True, exist_ok=True)

# Persist the exact splits used for this run (the frame index is written too).
df_train.to_csv(os.path.join(data_path, "train.csv"))
df_val.to_csv(os.path.join(data_path, "val.csv"))
df_test.to_csv(os.path.join(data_path, "test.csv"))

# Attach the saved splits to the active MLflow run.
mlflow.log_artifact(data_path)

## 4. Trainer

In [None]:
# Class weights for the weighted cross-entropy loss: each class gets
# 1 - (its share of the training rows), so the rarer class weighs more.
class_counts = df_train.sensitive.value_counts()
n_train_rows = len(df_train)
nonsensitive_weight = 1 - class_counts[False] / n_train_rows
sensitive_weight = 1 - class_counts[True] / n_train_rows

print(f"Nonsensitive weight {nonsensitive_weight}\nSensitive weight {sensitive_weight}")

In [None]:
def compute_metrics(p: EvalPrediction):
    """Compute accuracy from an ``EvalPrediction``.

    ``p.predictions`` holds the model outputs (the first element is used when
    it is a tuple) and ``p.label_ids`` the gold labels. The predicted class is
    the arg-max over axis 1.

    Returns:
        Dict mapping ``"accuracy"`` to a Python float, the string-to-float
        shape the trainer expects from a metrics function.
    """
    scores = p.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted_ids = np.argmax(scores, axis=1)
    hits = (predicted_ids == p.label_ids).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over two classes."""

    def compute_loss(
        self,
        model,
        inputs,
        weights=(nonsensitive_weight, sensitive_weight),
        return_outputs=False,
        num_items_in_batch=None,
    ):
        """Run a forward pass and return the weighted cross-entropy loss.

        Args:
            model: Model to call with ``**inputs``.
            inputs: Batch mapping; must contain ``"labels"``.
            weights: Two per-class weights, ordered by class index
                (nonsensitive, sensitive). The default is an immutable tuple
                so it cannot be altered between calls.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted and ignored. Newer ``Trainer``
                versions pass this keyword to ``compute_loss``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.

        Raises:
            ValueError: If ``weights`` does not contain exactly two values.
        """
        if len(weights) != 2:
            raise ValueError(f"Expected 2 class weights, got {len(weights)}")
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Keep the loss on the device the logits already live on; moving
        # logits and labels to the CPU every step forces a device-to-host copy
        # for no benefit.
        class_weights = torch.tensor(weights, dtype=torch.float, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1).to(logits.device),
        )
        return (loss, outputs) if return_outputs else loss

In [None]:
# Hyper-parameters for fine-tuning; checkpoints go to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,  # keep only one checkpoint on disk
    seed=RANDOM_SEED,
    overwrite_output_dir=True,  # a real boolean (was the string "True")
    evaluation_strategy="steps",
    eval_steps=500,  # run validation every 500 update steps
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # Without this the periodic evaluation reports the loss only; the
    # accuracy function defined above was never handed to the trainer.
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model and keep the training metrics, extended with the
# number of training samples.
train_result = trainer.train()
metrics = train_result.metrics
metrics["train_samples"] = len(train_dataset)

In [None]:
# Log training arguments to mlflow.
# The string form of `training_args` is parsed line by line as "name=value,".
# The first and last lines (presumably the opening `TrainingArguments(` and the
# closing `)`) are skipped.
for line in str(training_args).split("\n")[1:-1]:
    cleaned = line.strip(",").strip("_")
    # Split on the first "=" only: a value may itself contain "=", which made
    # the previous two-way unpacking of `split("=")` raise ValueError.
    name, separator, value = cleaned.partition("=")
    if not separator:
        # Not a "name=value" line (e.g. a continuation line); nothing to log.
        continue
    mlflow.log_param(name, value)

In [None]:
# Write the fine-tuned model to the output model directory.
trainer.save_model(OUTPUT_MODEL_DIR)  # Saves the tokenizer too for easy upload

# Print the training metrics, then persist them and the trainer state.
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

# Attach the JSON files from the trainer's output directory ("./results") to
# the MLflow run.
mlflow.log_artifact("results/all_results.json")
mlflow.log_artifact("results/trainer_state.json")

### Eval on train data

In [None]:
def log_metrics(df_report: pd.DataFrame, split: str):
    """Log the cells of a classification-report frame to MLflow.

    Every (row, column) cell is logged as ``{split}_{row}_{column}``. Spaces
    in the split/row part are replaced by underscores, and ``precision``,
    ``recall`` and ``f1-score`` are shortened to ``prec``, ``rec`` and ``f1``.
    The ``accuracy`` row is logged once, as ``{split}_acc``, using the value
    found in its precision column.

    Args:
        df_report: Report frame with one row per class/aggregate and one
            column per metric.
        split: Prefix for the metric names, e.g. the name of the data split.
    """
    row_aliases = {"accuracy": "acc"}
    column_aliases = {"precision": "prec", "recall": "rec", "f1-score": "f1"}
    accuracy_key = f"{split}_acc"

    for row_label, row in df_report.iterrows():
        row_key = f"{split}_{row_aliases.get(row_label, row_label)}".replace(" ", "_")
        for column_label, value in zip(row.index, row.values):
            metric_key = f"{row_key}_{column_aliases.get(column_label, column_label)}"
            if not metric_key.startswith(accuracy_key):
                mlflow.log_metric(metric_key, value)
            elif metric_key.endswith("prec"):
                # Accuracy is a single number: log it once, without a suffix.
                mlflow.log_metric(accuracy_key, value)

In [None]:
# Directory that receives the per-split classification reports (CSV).
reports_path = Path(os.path.join(OUTPUT_DIR, "reports"))
reports_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Predict on the training split and take the arg-max class for every row.
predictions = trainer.predict(train_dataset, metric_key_prefix="predict").predictions
predictions = np.argmax(predictions, axis=1)
# Read the gold labels straight from the dataset. Iterating the dataset would
# tokenize every text again just to fetch its label.
true = [int(label) for label in train_dataset.label]

print(classification_report(true, predictions, target_names=["nonsensitive", "sensitive"]))

# Same report as a table: saved as CSV and sent to the MLflow run.
clf_report = classification_report(true, predictions, target_names=["nonsensitive", "sensitive"], output_dict=True)
df_report = pd.DataFrame(clf_report).transpose()
df_report.to_csv(os.path.join(reports_path, "clf_report_train.csv"))
# The report was previously only written to disk; log its metrics as well.
log_metrics(df_report, split="train")

### Eval on val data

In [None]:
# Predict on the validation split and take the arg-max class for every row.
predictions = trainer.predict(val_dataset, metric_key_prefix="predict").predictions
predictions = np.argmax(predictions, axis=1)
# Read the gold labels straight from the dataset. Iterating the dataset would
# tokenize every text again just to fetch its label.
true = [int(label) for label in val_dataset.label]

print(classification_report(true, predictions, target_names=["nonsensitive", "sensitive"]))

# Same report as a table: saved as CSV and sent to the MLflow run.
clf_report = classification_report(true, predictions, target_names=["nonsensitive", "sensitive"], output_dict=True)
df_report = pd.DataFrame(clf_report).transpose()
df_report.to_csv(os.path.join(reports_path, "clf_report_val.csv"))
# The report was previously only written to disk; log its metrics as well.
log_metrics(df_report, split="val")

### Evaluate on test data

In [None]:
# Predict on the held-out test split and take the arg-max class for every row.
predictions = trainer.predict(test_dataset, metric_key_prefix="predict").predictions
predictions = np.argmax(predictions, axis=1)
# Read the gold labels straight from the dataset. Iterating the dataset would
# tokenize every text again just to fetch its label.
true = [int(label) for label in test_dataset.label]

print(classification_report(true, predictions, target_names=["nonsensitive", "sensitive"]))

# Same report as a table: saved as CSV and sent to the MLflow run.
clf_report = classification_report(true, predictions, target_names=["nonsensitive", "sensitive"], output_dict=True)
df_report = pd.DataFrame(clf_report).transpose()
df_report.to_csv(os.path.join(reports_path, "clf_report_test.csv"))
# The report was previously only written to disk; log its metrics as well.
log_metrics(df_report, split="test")

In [None]:
# Attach the directory with the classification reports to the MLflow run.
mlflow.log_artifact(reports_path)

### Log to MLflow

In [None]:
# Attach the saved model directory to the MLflow run.
mlflow.log_artifact(OUTPUT_MODEL_DIR)

In [None]:
class TransformerWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper around the fine-tuned sequence classifier."""

    def __init__(self, model_tokenizer_path: str):
        """Remember where the model and tokenizer files were saved.

        Args:
            model_tokenizer_path: Local directory holding the saved model and
                tokenizer. Loading itself uses the artifact path MLflow
                supplies through ``context.artifacts``.
        """
        self.model_tokenizer_path = model_tokenizer_path

    def load_context(self, context):
        """Load tokenizer and model from the logged artifact and quantize.

        Args:
            context: MLflow context whose ``artifacts["model_tokenizer_path"]``
                points at the directory with the saved model and tokenizer.
        """
        from transformers import (
            AutoTokenizer,
            AutoModelForSequenceClassification,
        )

        artifact_dir = context.artifacts["model_tokenizer_path"]
        print("Tokenizer initialization...")
        self.tokenizer = AutoTokenizer.from_pretrained(artifact_dir)
        print("Model initialization...")
        # Read the configuration stored next to the weights instead of the
        # notebook-level `config` object, which is not guaranteed to exist in
        # the process that serves the model.
        self.model = AutoModelForSequenceClassification.from_pretrained(artifact_dir)
        # Optimize model by quantization -> weights represented using int8 instead of float32
        self.model = torch.quantization.quantize_dynamic(self.model, {torch.nn.Linear}, dtype=torch.qint8)
        print("Model is initialized!")

    def predict(self, context, model_input: pd.DataFrame):
        """Classify each row of ``model_input``.

        Args:
            context: MLflow context (unused here).
            model_input: Frame with ``id``, ``title``, ``lead`` and ``body``
                columns.

        Returns:
            One dict per input row with ``isSensitive`` (True when class 1 is
            predicted), ``score`` (softmax probability of the predicted class)
            and ``id`` (copied from the input row).
        """
        # Missing fields become empty strings; otherwise a single missing
        # value turns the whole concatenated text into NaN.
        title = model_input["title"].fillna("")
        lead = model_input["lead"].fillna("")
        body = model_input["body"].fillna("")
        text = (title + " " + lead + " " + body).tolist()
        tokenized = self.tokenizer(text, truncation=True, max_length=512, return_tensors="pt", padding=True)
        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = self.model(**tokenized).logits
        # convert outputs to [0, 1] range, i.e. probability prediction
        probabilities = torch.softmax(logits, dim=1).cpu().numpy()
        pred_index = np.argmax(probabilities, axis=1)
        # Compare the per-row class `p`, not the whole `pred_index` sequence
        # (that comparison is always False, so nothing was flagged sensitive).
        return [
            {"isSensitive": bool(p == 1), "score": probabilities[i, p], "id": sample_id}
            for i, (p, sample_id) in enumerate(zip(pred_index, model_input.id))
        ]

In [None]:
# Local files to ship with the pyfunc model; the wrapper reads this entry from
# context.artifacts when it loads.
artifacts = {"model_tokenizer_path": OUTPUT_MODEL_DIR}

# Conda environment recorded with the model, pinned to the versions used in
# this session (torch is pinned explicitly).
conda_env = {
    "channels": ["defaults"],
    "dependencies": [
        f"python={PYTHON_VERSION}",
        "pip",
        {
            "pip": [
                f"mlflow=={mlflow.__version__}",
                f"transformers[onnx]=={transformers.__version__}",
                f"cloudpickle=={cloudpickle.__version__}",
                "torch==1.11.0",
            ],
        },
    ],
    "name": "transformers_env",
}

# Expected input: four string columns.
input_schema = Schema([ColSpec("string", column) for column in ("id", "title", "lead", "body")])
signature = ModelSignature(inputs=input_schema)

# Log PyTorch model
print(f"Running {run.info.run_id} run")
print(f"mlflow models serve -m runs:/{run.info.run_id}/{OUTPUT_MODEL_DIR} --no-conda")
# If --no-conda is deprecated, serve with `--env-manager local` instead.
mlflow.pyfunc.log_model(
    artifact_path=OUTPUT_MODEL_DIR,
    python_model=TransformerWrapper(model_tokenizer_path=OUTPUT_MODEL_DIR),
    artifacts=artifacts,
    conda_env=conda_env,
    signature=signature,
)

In [None]:
# Close the MLflow run opened at the top of the notebook.
mlflow.end_run()