# MLM Fine-tuning with the training data

This notebook walks through fine-tuning the Longformer with a masked-language-modelling (MLM) objective using the Hugging Face APIs. Hugging Face makes it really easy to build models, and I would encourage going through the Hugging Face (HF) course and the new book *NLP with Transformers*.

Much of the code is taken from, or inspired by:
1. https://huggingface.co/
2. https://github.com/nlp-with-transformers/notebooks
3. The other shared notebooks, which provided a ton of inspiration!



In [None]:
# Install the Hugging Face libraries used in this notebook.
# NOTE(review): `huggingface` is presumably meant to be the `huggingface_hub`
# client package -- confirm the intended PyPI name.
!pip install transformers datasets tokenizers huggingface

In [None]:
import pandas as pd
import os
import torch
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from tqdm import tqdm
from datasets import Dataset
import warnings

# Every relative path below is resolved against the competition input directory.
os.chdir('/kaggle/input/feedback-prize-2021')

DATA_FOLDER = '.'
TRAIN_FOLDER, TEST_FOLDER = (
    os.path.join(DATA_FOLDER, split_name) for split_name in ('train', 'test')
)

# Silence all warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

### Here, we extract the text from the "train" folder

In [None]:
# Read every file in TRAIN_FOLDER into a list of {'text', 'id'} records.
train_data = []
for file_name in tqdm(os.listdir(TRAIN_FOLDER)):
    # Decode explicitly as UTF-8 so the text does not depend on the
    # platform's locale default encoding.
    with open(os.path.join(TRAIN_FOLDER, file_name), 'r', encoding='utf-8') as f:
        # The id is the file name without its extension; splitext replaces
        # the fixed 4-character slice, which assumed a 3-letter extension.
        train_data.append({'text': f.read(), 'id': os.path.splitext(file_name)[0]})
df_train = pd.DataFrame(train_data)
dataset = Dataset.from_pandas(df_train, split='train')
del df_train  # the DataFrame is not needed once the Dataset exists
dataset

### Import the pretrained Longformer model with the MLM head and its tokenizer (RoBERTa).

In [None]:
MODEL_CKPT = 'allenai/longformer-base-4096'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Load the checkpoint with its masked-LM head, then move it to the chosen device.
model = AutoModelForMaskedLM.from_pretrained(MODEL_CKPT)
model = model.to(device)

In [None]:
# Load the tokenizer from the same checkpoint (MODEL_CKPT) as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

### Using the map function to tokenize the dataset in batches

In [None]:
def tokenize_function(batched_data):
    """Tokenize a batch of essays to a fixed length of 1024 tokens.

    Args:
        batched_data: Batch whose ``'text'`` entry holds the raw texts.

    Returns:
        The tokenizer output for the batch, padded/truncated to
        ``max_length=1024``. When the tokenizer is a fast tokenizer, a
        ``'word_ids'`` entry is added holding ``word_ids(i)`` for each
        example ``i`` of the batch.
    """
    encoded = tokenizer(
        batched_data['text'],
        padding='max_length',
        truncation=True,
        max_length=1024,
    )
    # word_ids() is only queried when the tokenizer reports is_fast.
    if not tokenizer.is_fast:
        return encoded
    n_examples = len(encoded['input_ids'])
    encoded['word_ids'] = [encoded.word_ids(idx) for idx in range(n_examples)]
    return encoded

# Tokenize the whole dataset in batches; the raw 'text' and 'id' columns are
# removed from the result.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text', 'id'])
# tokenized_datasets[0]

In [None]:
# Notebook display: show the tokenized dataset object.
tokenized_datasets

### Breaking the text into chunks of 1024 for training

In [None]:
chunk_size = 1024


def group_texts(batched_data):
    """Concatenate each column of a batch and re-split it into fixed-size chunks.

    Args:
        batched_data: Mapping from column name to a list of per-example
            sequences. Must contain an ``'input_ids'`` column.

    Returns:
        dict with the same columns, each a list of ``chunk_size``-long slices
        of the concatenated column, plus a ``'labels'`` column that is a
        shallow copy of the ``'input_ids'`` chunks. A trailing remainder
        shorter than ``chunk_size`` is dropped.
    """
    # Flatten each column in one linear pass. The previous sum(lists, [])
    # re-copied the growing accumulator on every addition (quadratic time).
    concatenated_examples = {
        column: [item for sequence in batched_data[column] for item in sequence]
        for column in batched_data.keys()
    }
    # The length is measured on the first column only; the columns are
    # assumed to be aligned token-for-token.
    first_column = list(batched_data.keys())[0]
    total_length = len(concatenated_examples[first_column])
    # Round down to a whole number of chunks so every chunk is full.
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        column: [
            flat[start:start + chunk_size]
            for start in range(0, total_length, chunk_size)
        ]
        for column, flat in concatenated_examples.items()
    }
    result['labels'] = result['input_ids'].copy()
    return result

In [None]:
# Apply group_texts to the tokenized dataset in batches, then display the result.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

### Applying random masking of words (tokens, or even subwords) to enable MLM fine-tuning

In [None]:
# Collator for language-modelling batches, built on the notebook's tokenizer.
# mlm_probability=0.15 is, per the Hugging Face docs, the probability with
# which input tokens are masked.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Split into train/test parts with test_size=0.1, then display the result.
# NOTE(review): no seed is passed, so the split presumably differs between
# runs -- confirm whether reproducibility matters here.
dataset_split = lm_datasets.train_test_split(test_size=0.1)
dataset_split

In [None]:
from transformers import TrainingArguments

batch_size = 4
# Show the training loss with every epoch: log every
# len(train split) // batch_size steps. Clamped to at least 1 so a split
# smaller than one batch does not produce logging_steps=0.
logging_steps = max(1, len(dataset_split["train"]) // batch_size)
# Last path component of the checkpoint id, used to name the output directory.
model_name = MODEL_CKPT.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"/kaggle/working/{model_name}-finetuned-essay",
    overwrite_output_dir=False,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    # Mixed precision needs CUDA. Enable it only when a GPU is available so
    # the notebook still runs on the CPU fallback chosen for `device`.
    fp16=torch.cuda.is_available(),
    logging_steps=logging_steps,
)

In [None]:
from transformers import Trainer

# Bundle the model, the training arguments, the train/test splits and the
# language-modelling collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_split["train"],
    eval_dataset=dataset_split["test"],
    data_collator=data_collator,
)

In [None]:
# Notebook display: show the training split.
dataset_split['train']

In [None]:
# Run the fine-tuning loop with the configuration given to the Trainer.
trainer.train()

In [None]:
import math

# Evaluate with the Trainer and report perplexity, i.e. exp() of the
# 'eval_loss' value it returns.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity: {perplexity:.2f}")

In [None]:
# Save the fine-tuned model. No path is passed, so this presumably writes to
# the output_dir set in training_args -- confirm against the Trainer docs.
trainer.save_model()