In [None]:
from google.colab import drive
drive.mount("/content/drive")

!pip install datasets evaluate transformers[sentencepiece]

In [2]:
import logging
import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

import datasets
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_FOR_MASKED_LM_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    is_torch_tpu_available,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

In [8]:
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1") 



  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [10]:
raw_datasets["train"]

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [18]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [19]:
model = AutoModelForMaskedLM.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [20]:
column_names = raw_datasets["train"].column_names
column_names

['text']

In [21]:
text_column_name = "text" if "text" in column_names else column_names[0]
text_column_name

'text'

In [22]:
def tokenize_function(examples):
    return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

In [23]:
tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                desc="Running tokenizer on every text in dataset",
            )

Running tokenizer on every text in dataset:   0%|          | 0/5 [00:00<?, ?ba/s]

Running tokenizer on every text in dataset:   0%|          | 0/37 [00:00<?, ?ba/s]

Running tokenizer on every text in dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

In [42]:
max_seq_length = 256
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_seq_length:
        total_length = (total_length // max_seq_length) * max_seq_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result

In [43]:
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    desc=f"Grouping texts in chunks of {max_seq_length}",
)

Grouping texts in chunks of 256:   0%|          | 0/2 [00:00<?, ?ba/s]

Grouping texts in chunks of 256:   0%|          | 0/11 [00:00<?, ?ba/s]

Grouping texts in chunks of 256:   0%|          | 0/2 [00:00<?, ?ba/s]

In [44]:
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 5020
    })
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 42500
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 4456
    })
})

In [45]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

In [46]:
train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 42500
})

In [47]:
train_dataset["text"][0][0]

' '

In [48]:
def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        # Depending on the model and config, logits may contain extra tensors,
        # like past_key_values, but logits always come first
        logits = logits[0]
    return logits.argmax(dim=-1)

In [49]:
metric = evaluate.load("accuracy")

In [50]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # preds have the same shape as the labels, after the argmax(-1) has been calculated
    # by preprocess_logits_for_metrics
    labels = labels.reshape(-1)
    preds = preds.reshape(-1)
    mask = labels != -100
    labels = labels[mask]
    preds = preds[mask]
    return metric.compute(predictions=preds, references=labels)

In [51]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    pad_to_multiple_of=None,
)

In [52]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/HuggingFace/model_files/NLP/roberta-based", 
    evaluation_strategy="epoch"
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [53]:
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    )

In [54]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: text, special_tokens_mask. If text, special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 42500
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15939
  Number of trainable parameters = 124697433


Epoch,Training Loss,Validation Loss


RuntimeError: ignored