In [1]:
import pandas as pd
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AdamW, get_scheduler
from transformers import pipeline
import torch
from datasets import Dataset
from datasets import load_dataset
from datasets import DatasetDict
import math

## Import data for converting to 🤗 dataset

In [2]:
# Read both corpora from tab-separated files; the first column of each file becomes the DataFrame index.
annotated_df = pd.read_csv("datasets/flashback_annotated.tsv", sep="\t", index_col=0)
unannotated_df = pd.read_csv("datasets/flashback_raw.tsv", sep="\t", index_col=0)

## Finetune for masked LM task

In [31]:
# Tokenizer and masked-LM model must come from the same pretrained weights, so name the hub id once.
pretrained_id = 'KB/bert-base-swedish-cased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_id)
model_checkpoint = AutoModelForMaskedLM.from_pretrained(pretrained_id)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/390k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

In [36]:
# Wrap the model and tokenizer loaded above in a fill-mask pipeline for quick qualitative checks.
default_mask_filler = pipeline(
    "fill-mask", model=model_checkpoint, tokenizer=tokenizer
)

In [37]:
default_mask_filler("Vad hjälper det att Paris har så otroligt mycket vackert och kul att uppleva, när det står araber och [MASK] precis över allt.")

[{'score': 0.16540421545505524,
  'token': 38404,
  'token_str': 'araber',
  'sequence': 'Vad hjälper det att Paris har så otroligt mycket vackert och kul att uppleva, när det står araber och araber precis över allt.'},
 {'score': 0.08115306496620178,
  'token': 17319,
  'token_str': 'muslimer',
  'sequence': 'Vad hjälper det att Paris har så otroligt mycket vackert och kul att uppleva, när det står araber och muslimer precis över allt.'},
 {'score': 0.06851934641599655,
  'token': 32241,
  'token_str': 'fransmän',
  'sequence': 'Vad hjälper det att Paris har så otroligt mycket vackert och kul att uppleva, när det står araber och fransmän precis över allt.'},
 {'score': 0.04426249861717224,
  'token': 13410,
  'token_str': 'judar',
  'sequence': 'Vad hjälper det att Paris har så otroligt mycket vackert och kul att uppleva, när det står araber och judar precis över allt.'},
 {'score': 0.0346917100250721,
  'token': 38381,
  'token_str': 'européer',
  'sequence': 'Vad hjälper det att Par

### Testing default masked LM

In [8]:
# Confirm which literal string this tokenizer uses for its mask token.
tokenizer.decode(tokenizer.mask_token_id)

'[MASK]'

In [9]:
# Encode one sentence containing a single mask token, returned as PyTorch tensors.
inputs = tokenizer("Jag hatar att kvinnor [MASK] så mycket.", return_tensors="pt")
inputs

{'input_ids': tensor([[    2,   361, 19134,    48,  1921,     4,   181,   408,     7,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
# Decode every row of the encoded batch back to text to check the round trip (special tokens included).
' '.join(tokenizer.decode(row) for row in inputs['input_ids'])

'[CLS] Jag hatar att kvinnor [MASK] så mycket. [SEP]'

In [11]:
# Locate the [MASK] token: torch.where on a 2-D boolean mask returns (row, column) index
# tensors, and the column tensor is the position inside the sequence.
is_mask = inputs["input_ids"] == tokenizer.mask_token_id
mask_token_index = torch.where(is_mask)[1]
mask_token_index

tensor([5])

In [12]:
# Forward pass for inspection only: disable autograd so no computation graph is built
# (and kept alive) for a tensor we never backpropagate through.
with torch.no_grad():
    token_logits = model_checkpoint(**inputs).logits
token_logits

tensor([[[-5.5930, -2.4002, -3.3570,  ..., -5.7172, -6.7367, -7.6456],
         [-3.4565, -1.4567, -3.5432,  ..., -1.2682, -6.0981, -2.3997],
         [-2.4656,  2.2299, -5.2629,  ..., -1.9827, -0.9353, -2.6023],
         ...,
         [-9.2317,  0.7863, -3.6915,  ..., -6.0307, -7.7294, -9.9168],
         [-2.9332,  2.2197, -0.8976,  ..., -2.6394, -2.0116, -3.4200],
         [-5.9556, -2.4276, -2.0761,  ..., -5.2691, -6.6226, -7.1704]]],
       grad_fn=<ViewBackward0>)

In [13]:
# Take the single sequence in the batch (index 0) at the masked position(s), keeping all vocabulary logits.
mask_token_logits = token_logits[0, mask_token_index, :]
mask_token_logits

tensor([[-4.1517,  2.8392, -2.8566,  ..., -2.6504, -3.2629, -6.0682]],
       grad_fn=<IndexBackward0>)

In [14]:
# Keep the five vocabulary ids with the highest logits at the [MASK] position,
# then print the sentence once per candidate, best first.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
for token_id in top_5_tokens:
    mask_word = tokenizer.decode([token_id])
    print(f"Jag hatar att kvinnor {mask_word} så mycket.")

Jag hatar att kvinnor bråkar så mycket.
Jag hatar att kvinnor pratar så mycket.
Jag hatar att kvinnor gråter så mycket.
Jag hatar att kvinnor dricker så mycket.
Jag hatar att kvinnor lider så mycket.


In [5]:
def test_masked_lm(text, top_k=5):
    """Print the model's most likely one-token completions of ``text``.

    A single mask token followed by a full stop is appended to ``text``; the
    prompt is run through the notebook-level ``model_checkpoint`` and one line
    per candidate is printed, best first, formatted as ``"<text> <candidate>."``.

    Args:
        text: Sentence prefix to complete. It should not contain the mask token
            itself: only the first masked position is reported.
        top_k: Number of candidates to print. Defaults to 5.

    Returns:
        None. The candidates are printed.
    """
    prompt = f"{text} {tokenizer.mask_token}."
    # Keep the inputs on the model's device so this also works after the
    # Trainer has moved the model to a GPU.
    inputs = tokenizer(prompt, return_tensors="pt").to(model_checkpoint.device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        token_logits = model_checkpoint(**inputs).logits
    # Column indices of the mask token inside the (single) sequence.
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    mask_token_logits = token_logits[0, mask_token_index, :]
    top_tokens = torch.topk(mask_token_logits, top_k, dim=1).indices[0].tolist()
    for token_id in top_tokens:
        print("%s %s." % (text, tokenizer.decode([token_id])))

In [6]:
test_masked_lm("Jag hatar att kvinnor")

Jag hatar att kvinnor gråter.
Jag hatar att kvinnor drabbas.
Jag hatar att kvinnor ljuger.
Jag hatar att kvinnor dricker.
Jag hatar att kvinnor slåss.


In [7]:
test_masked_lm("Jag hatar att män")

Jag hatar att män ljuger.
Jag hatar att män bråkar.
Jag hatar att män gråter.
Jag hatar att män slåss.
Jag hatar att män dricker.


In [8]:
test_masked_lm("Invandrare bör")

Invandrare bör skyddas.
Invandrare bör utvisas.
Invandrare bör prioriteras.
Invandrare bör avvisas.
Invandrare bör uppmuntras.


### Create 🤗 dataset

In [4]:
# Shuffle with a fixed seed so that the train/test split derived from this frame is
# identical after every Restart & Run All. Note the cell reassigns its own input:
# running it twice in one session shuffles the already-shuffled frame again.
SHUFFLE_SEED = 42
unannotated_df = unannotated_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [5]:
# Positional split of the shuffled frame: the first 10,000 rows are held out for
# evaluation and the following 100,000 rows are used for training.
masked_lm_test_df = unannotated_df.iloc[:10000].reset_index(drop=True)
masked_lm_train_df = unannotated_df.iloc[10000:110000].reset_index(drop=True)

In [6]:
# Convert each pandas split to a 🤗 Dataset, then bundle them under the usual split names.
train_split = Dataset.from_pandas(masked_lm_train_df)
test_split = Dataset.from_pandas(masked_lm_test_df)
masked_lm_dataset = DatasetDict({"train": train_split, "test": test_split})

masked_lm_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})

#### Convert dataset to chunks

In [7]:
def tokenize_function(document):
    """Tokenize a batch of documents without truncation or padding.

    Args:
        document: Batch mapping whose "text" entry holds the raw strings.

    Returns:
        The tokenizer output. When the tokenizer is a "fast" one, a "word_ids"
        entry is added with one list per sequence, as returned by
        ``word_ids(i)`` for sequence ``i``.
    """
    encoding = tokenizer(document["text"])
    if not tokenizer.is_fast:
        # word_ids() is only requested for fast tokenizers.
        return encoding
    n_sequences = len(encoding["input_ids"])
    encoding["word_ids"] = [encoding.word_ids(idx) for idx in range(n_sequences)]
    return encoding

In [8]:
tokenizer.model_max_length

1000000000000000019884624838656

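This is not a real limit: it is the very large placeholder value 🤗 Transformers reports when the tokenizer's configuration does not specify a maximum length. I'll therefore set it explicitly to 512 tokens, the maximum sequence length of BERT: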

In [9]:
# Use BERT's 512-token limit both as the tokenizer's maximum length and as the
# size of the chunks the corpus is cut into.
chunk_size = 512
tokenizer.model_max_length = chunk_size
chunk_size

512

In [10]:
# Tokenize every split batch-wise and drop the raw columns, which are no longer needed.
raw_columns = ["text", "label"]
tokenized_datasets = masked_lm_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_datasets



  0%|          | 0/100 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1180 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/10 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 10000
    })
})

In [13]:
# Take the first 10 tokenized training rows (a mapping of column name -> list of rows)
# to try out the concatenate-and-chunk logic on a small scale.
tokenized_samples = tokenized_datasets["train"][:10]

In [14]:
# Total number of tokens across the individual samples ...
sample_lengths = [len(ids) for ids in tokenized_samples["input_ids"]]
print("Sum: %d" % sum(sample_lengths))

# ... compared with the length after joining each column's rows into one flat list.
concatenated_examples = {}
for column in tokenized_samples.keys():
    concatenated_examples[column] = sum(tokenized_samples[column], [])
total_length = len(concatenated_examples["input_ids"])
print(f"Concatenated length: {total_length}")

Sum: 929
Concatenated length: 929


These are identical, which is what we want: concatenating the samples neither drops nor duplicates any tokens.

In [15]:
# Cut every concatenated column at the same offsets; the last chunk may be shorter than chunk_size.
chunk_starts = range(0, total_length, chunk_size)
chunks = {}
for column, tokens in concatenated_examples.items():
    chunks[column] = [tokens[start : start + chunk_size] for start in chunk_starts]

In [16]:
# Show how long each resulting chunk of input ids is.
input_id_chunks = chunks["input_ids"]
for chunk in input_id_chunks:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 512'
'>>> Chunk length: 417'


In [11]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized texts and re-cut it into equal-sized chunks.

    Intended for ``Dataset.map(..., batched=True)``: every column of the batch is
    flattened into one long list, which is then sliced into consecutive chunks.
    The trailing remainder that does not fill a whole chunk is dropped.

    Args:
        examples: Batch mapping of column name -> list of per-example lists.
            Must contain "input_ids".
        block_size: Chunk length. Defaults to the notebook-level ``chunk_size``.

    Returns:
        A dict with the same columns, each a list of ``block_size``-long chunks,
        plus a "labels" column that is a copy of the "input_ids" chunk list.
    """
    from itertools import chain  # local import keeps this cell self-contained

    if block_size is None:
        block_size = chunk_size
    # Flatten in linear time; sum(list_of_lists, []) re-copies the growing
    # accumulator on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts (taken from the first column)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than block_size
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [12]:
# Re-chunk every split batch-wise. group_texts drops the incomplete chunk at the end of
# each batch it is given, so the resulting row counts are not simply tokens // chunk_size.
lm_dataset = tokenized_datasets.map(group_texts, batched=True)
lm_dataset

  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 17371
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1711
    })
})

In [None]:
# Decode one training chunk back to text to eyeball what a training example looks like.
tokenizer.decode(lm_dataset["train"][10]["input_ids"])

### Training

#### 🤗 Trainer

In [13]:
# Collator for the masked-LM objective, configured with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Collect two training chunks and strip "word_ids", which is not a model input,
# before handing them to the collator.
samples = []
for row_idx in range(2):
    sample = lm_dataset["train"][row_idx]
    _ = sample.pop("word_ids")
    samples.append(sample)

# Decode the collated input ids to see where mask tokens were inserted.
collated_batch = data_collator(samples)
for chunk in collated_batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [15]:
model_name = "bert-base-swedish-cased"
batch_size = 12
# Number of batches in one pass over the training set, so the training loss is
# reported once per epoch.
logging_steps = len(lm_dataset["train"]) // batch_size

training_args = TrainingArguments(
    # Output location and checkpointing
    output_dir=f"{model_name}-finetuned-flashback",
    overwrite_output_dir=True,
    save_steps=5000,
    # Optimisation
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    # Evaluation and logging
    evaluation_strategy="epoch",
    per_device_eval_batch_size=batch_size,
    eval_accumulation_steps=1,  # slower, but less prone to overflow CUDA memory
    logging_steps=logging_steps,
)

In [16]:
# Tie together the model, the training configuration, the chunked train/test splits
# and the masking collator. Note that `model_checkpoint` is the model object itself,
# so training updates it in place.
trainer = Trainer(
    model=model_checkpoint,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

##### Training using trainer

In [18]:
# Evaluate on the test split and report perplexity, i.e. the exponential of the evaluation loss.
# Placed before the training cell, this serves as the reference value to compare against.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1711
  Batch size = 12


>>> Perplexity: 36.28


In [17]:
# Run fine-tuning with the arguments configured above (evaluation runs once per epoch).
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 17318
  Num Epochs = 15
  Instantaneous batch size per device = 12
  Total train batch size (w. parallel, distributed & accumulation) = 12
  Gradient Accumulation steps = 1
  Total optimization steps = 21660


Epoch,Training Loss,Validation Loss
1,2.3935,2.163418
2,2.2723,2.12742
3,2.22,2.094575
4,2.1722,2.067284
5,2.1422,2.061248
6,2.1188,2.04888
7,2.0889,2.01977
8,2.074,2.025061
9,2.0642,2.003407
10,2.0455,2.004086


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1727
  Batch size = 12
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1727
  Batch size = 12
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1727
  Batch size = 12
Saving model checkpoint to bert-base-swedish-cased-finetuned-flashback/ch

TrainOutput(global_step=21660, training_loss=2.110331358341629, metrics={'train_runtime': 16431.4322, 'train_samples_per_second': 15.809, 'train_steps_per_second': 1.318, 'total_flos': 6.838851728120832e+16, 'train_loss': 2.110331358341629, 'epoch': 15.0})

In [18]:
# Re-evaluate on the test split after training and report perplexity
# (the exponential of the evaluation loss) for comparison with the earlier value.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1727
  Batch size = 12


>>> Perplexity: 7.34


In [19]:
# Persist the trained model (config, weights and tokenizer files) to a local directory.
trainer.save_model("flashback_lm_model_larger_batch_size")

Saving model checkpoint to flashback_lm_model_larger_batch_size
Configuration saved in flashback_lm_model_larger_batch_size/config.json
Model weights saved in flashback_lm_model_larger_batch_size/pytorch_model.bin
tokenizer config file saved in flashback_lm_model_larger_batch_size/tokenizer_config.json
Special tokens file saved in flashback_lm_model_larger_batch_size/special_tokens_map.json
