In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import transformers
from transformers import TextDataset, DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import datasets
from transformers import pipeline
from sklearn.model_selection import train_test_split
from datasets import load_metric, load_dataset, Dataset

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Fix the random seed through the transformers helper for reproducibility.
seed = 228
transformers.set_seed(seed)

# Pretrained checkpoint to fine-tune, with its tokenizer and a seq2seq collator.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# sacreBLEU is the metric reported during evaluation.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package -- confirm against the installed version.
metric = load_metric("sacrebleu")

# No `clear_output()` here: the name is not imported by the notebook's import
# cell, so the call raised NameError on a fresh kernel, and clearing the output
# would also hide the device printed above.

In [None]:
# Load the pre-split train / validation tables; the first CSV column is the index.
train_df = pd.read_csv("./../data/interim/train_df", index_col=0)
val_df = pd.read_csv("./../data/interim/val_df", index_col=0)

In [None]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna()
val_df = val_df.dropna()

In [None]:
# Wrap both pandas frames as Hugging Face datasets, keyed by split name.
dataset = datasets.DatasetDict(
    {
        split: Dataset.from_pandas(frame)
        for split, frame in (("train", train_df), ("val", val_df))
    }
)

In [None]:
# `Dataset.from_pandas` carried the pandas index over as an extra column.
# Drop it per split, and only when it is present, so that re-running this cell
# (or feeding frames whose index was not materialised) does not raise.
dataset = datasets.DatasetDict(
    {
        split: (
            split_ds.remove_columns(["__index_level_0__"])
            if "__index_level_0__" in split_ds.column_names
            else split_ds
        )
        for split, split_ds in dataset.items()
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['reference', 'translation'],
        num_rows: 566221
    })
    val: Dataset({
        features: ['reference', 'translation'],
        num_rows: 11556
    })
})

In [None]:
# Task prefix prepended to every source sentence before tokenization.
prefix = "detox: "
# Token limit passed as `max_length` (with truncation) for both sources and targets.
max_len = 128

def preprocess_function(examples):
    '''
    Tokenize a batch of parallel sentences for seq2seq training.

    `examples` maps "reference" to the source texts and "translation" to
    their detoxified counterparts. Each source text gets the task prefix
    prepended; both sides are tokenized with truncation at `max_len`.

    Returns the tokenized sources with the target token ids stored under
    the "labels" key.
    '''
    # The prefix tells the model which task these inputs belong to.
    sources = [prefix + text for text in examples["reference"]]
    encoded = tokenizer(sources, max_length=max_len, truncation=True)

    # Targets go through the same tokenizer; only their ids are kept.
    target_encoding = tokenizer(
        list(examples["translation"]), max_length=max_len, truncation=True
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Sanity check: run the preprocessing on the first two training rows.
preprocess_function(dataset['train'][:2])

{'input_ids': [[16379, 10, 129, 34, 909, 5, 3, 5, 5, 21019, 66, 6426, 2302, 5, 1], [16379, 10, 25, 2714, 6, 6819, 388, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[830, 34, 30, 909, 2824, 432, 8, 6426, 11607, 1883, 5, 1], [25, 3, 27826, 6, 6819, 388, 5, 1]]}

In [None]:
# Apply the preprocessing to every split, one batch of rows at a time.
tokenized_datasets = dataset.map(preprocess_function, batched=True)
# Show one processed training example.
tokenized_datasets['train'][0]

Map:   0%|          | 0/566221 [00:00<?, ? examples/s]

Map:   0%|          | 0/11556 [00:00<?, ? examples/s]

{'reference': 'get it yourself. ..release all 42 numbers.',
 'translation': 'bring it on yourself.... All the 42 balls released.',
 'input_ids': [16379,
  10,
  129,
  34,
  909,
  5,
  3,
  5,
  5,
  21019,
  66,
  6426,
  2302,
  5,
  1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [830, 34, 30, 909, 2824, 432, 8, 6426, 11607, 1883, 5, 1]}

In [None]:
def compute_metrics(eval_preds):
    '''
    Score generated sequences against the reference sequences.

    Parameters:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds``
            may also be a tuple, in which case its first element is used.

    Returns:
        dict with "bleu" (the "score" field of the metric result) and
        "gen_len" (mean number of non-pad tokens per prediction), both
        rounded to 4 decimals.
    '''
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding / ignore marker, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in BOTH arrays before decoding; doing
    # it for predictions also keeps those positions out of gen_len below.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Strip surrounding whitespace on both sides so they are compared alike.
    decoded_preds = [
        pred.strip()
        for pred in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    # The metric expects a list of references per prediction, hence the nesting.
    decoded_labels = [
        [label.strip()]
        for label in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    # Average generated length, counting only non-pad tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Per-device batch size shared by training and evaluation.
batch_size = 16
# Keep only the last path component of the checkpoint name for the output directory.
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",          # output directory for checkpoints
    evaluation_strategy = "epoch",      # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,                 # cap on the number of checkpoints kept on disk
    num_train_epochs=5,
    predict_with_generate=True,         # evaluation metrics are computed on generated sequences
    # Hand the notebook-level seed to the trainer: the arguments object carries
    # its own seed and the Trainer re-seeds from it, so without this the seed
    # chosen earlier in the notebook would not be the one governing training.
    seed=seed,
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Launch fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.6618,1.512375,24.1634,13.3144
2,1.5896,1.464592,24.7885,13.3299
3,1.5669,1.441468,25.1846,13.2721
4,1.5642,1.428742,25.3381,13.2653
5,1.5567,1.42529,25.3949,13.2518




TrainOutput(global_step=176945, training_loss=1.6150634617301411, metrics={'train_runtime': 22043.4381, 'train_samples_per_second': 128.433, 'train_steps_per_second': 8.027, 'total_flos': 3.3247376759291904e+16, 'train_loss': 1.6150634617301411, 'epoch': 5.0})

In [None]:
# Persist the fine-tuned weights together with the tokenizer files, so the
# directory can be reloaded on its own with `from_pretrained`.
model.save_pretrained("./../models/modelka")
tokenizer.save_pretrained("./../models/modelka")

In [None]:
# Smoke-test sentence, carrying the same "detox: " task prefix used in preprocessing.
text = "detox: oh, I have fucked up, sorry. what can I do for you, fucking nerd?"
# Put the encoded tensors on whichever device the model lives on instead of a
# hard-coded "cuda", so this cell also runs on CPU-only machines.
tokenized_text = tokenizer(text, return_tensors = "pt").to(model.device)

In [None]:
# Generate one candidate of at most 128 tokens for the encoded sentence.
out = model.generate(
    input_ids=tokenized_text["input_ids"],
    attention_mask=tokenized_text["attention_mask"],
    max_length=128,
    num_return_sequences=1,
)
# Turn each generated id sequence back into text, dropping special tokens.
preds = tokenizer.batch_decode(
    out,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
preds[0]

"I'm sorry, what can I do for you, nerd?"