In [16]:

## Python 3.8


In [17]:

from datasets import load_dataset
# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
datasets = load_dataset(path="wikitext", name="wikitext-2-raw-v1")



In [18]:

from transformers import AutoTokenizer
    
from transformers import AutoConfig, AutoModelForCausalLM



In [19]:

# Checkpoint whose configuration is loaded later to define the model architecture.
model_checkpoint = "bert-base-cased"
# Checkpoint from which the tokenizer is loaded (via AutoTokenizer) further down.
tokenizer_checkpoint = "sgugger/bert-like-tokenizer"


In [20]:

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook-level ``tokenizer``.

    ``tokenizer`` is looked up at call time, so it only has to exist before
    this function is first invoked, not when it is defined.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    raw_texts = examples["text"]
    encoded = tokenizer(raw_texts)
    return encoded


In [21]:

# Load the tokenizer published under `tokenizer_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenize the dataset in batches using 4 worker processes; the raw "text"
# column is removed from the result.
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)


In [22]:

# Number of tokens per chunk; read as a global by `group_texts`.
# Disabled alternative: take the length from the tokenizer instead of a fixed value.
# block_size = tokenizer.model_max_length
block_size = 128


In [23]:

def group_texts(examples):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size blocks.

    Each column of ``examples`` is flattened into one long list and then cut
    into consecutive chunks of ``block_size`` items, where ``block_size`` is
    the notebook-level setting read at call time. Items at the end that do
    not fill a complete block are dropped.

    Args:
        examples: Mapping from column name to a list of per-example
            sequences. It must contain an ``"input_ids"`` column.

    Returns:
        dict: The same columns, each as a list of ``block_size``-long lists,
        plus a ``"labels"`` column that is a (shallow) copy of the chunked
        ``"input_ids"``.
    """
    # Local import so this cell does not depend on an import made elsewhere.
    from itertools import chain

    # Flatten every column in linear time. ``sum(rows, [])`` rebuilds the
    # accumulated list on each addition (quadratic in the batch size) and only
    # works when every row is a list.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


In [24]:

# Regroup the tokenized texts into fixed-size blocks with `group_texts`,
# processing 1000 examples per batch across 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)


In [28]:

from transformers import AutoConfig, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments

# Load the configuration of `model_checkpoint`, then build the masked-LM model
# from that configuration via `from_config`.
# NOTE(review): `from_config` presumably creates freshly initialized weights
# rather than loading the checkpoint's pretrained weights — confirm.
config = AutoConfig.from_pretrained(model_checkpoint)
model  = AutoModelForMaskedLM.from_config(config)


In [30]:

# Training configuration: outputs go to the directory below, evaluation uses
# the "epoch" strategy, and nothing is pushed to the Hub.
training_args = TrainingArguments(
    output_dir="/scratch/scholar/rcalix/test-clm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    ## push_to_hub_model_id=f"{model_checkpoint}-wikitext2",
)


In [32]:

from transformers import DataCollatorForLanguageModeling
# Collator handed to the Trainer to assemble language-modeling batches.
# NOTE(review): mlm_probability=0.15 is presumably the fraction of tokens
# selected for masking in each batch — confirm against the transformers docs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


In [33]:

# Wire the model, training arguments, data splits and collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    # "train" split is used for optimization, "validation" for evaluation.
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)


Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [34]:

# Run training; the per-epoch loss table and the returned TrainOutput appear
# in the cell output.
trainer.train()


Epoch,Training Loss,Validation Loss
1,7.1032,7.065444
2,6.8884,6.888522
3,6.8744,6.885355


TrainOutput(global_step=7038, training_loss=7.0453632785095754, metrics={'train_runtime': 1128.1728, 'train_samples_per_second': 49.889, 'train_steps_per_second': 6.238, 'total_flos': 3703423157830656.0, 'train_loss': 7.0453632785095754, 'epoch': 3.0})

In [36]:

import math


In [37]:

# Evaluate with the trainer (configured with the "validation" split as its
# eval dataset) and report perplexity as exp(eval_loss), to two decimals.
# NOTE(review): the loss presumably covers only the tokens masked by the
# collator, so this is a masked-LM perplexity — confirm.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


Perplexity: 960.64



The perplexity is still quite high because, for this demo, the model was built from its configuration alone (`from_config`) and trained on a small dataset for only a few epochs. For real LM training, you would need a much larger dataset and more epochs.
