In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM
from datasets import load_dataset

# Load the tokenizer and the masked-language-model head from the same
# pretrained DistilBERT checkpoint so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

In [3]:
# Prompt containing a [MASK] placeholder that the model is asked to fill in.
text = "This is a great [MASK]"

In [4]:
import torch

inputs = tokenizer(text, return_tensors="pt")
# Pure inference: disable autograd so no computation graph is built or kept
# alive for the logits of this one-off prediction.
with torch.no_grad():
    token_logits = model(**inputs).logits

# Column indices (sequence positions) where the input holds the mask token.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Vocabulary ids of the 5 highest-scoring candidates for the first mask position.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Show the prompt with the mask placeholder replaced by each candidate token.
for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

2024-02-05 01:18:35.897557: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-05 01:18:36.278841: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'>>> This is a great !'
'>>> This is a great .'
'>>> This is a great deal'
'>>> This is a great adventure'
'>>> This is a great ;'


In [17]:
# Load the three plain-text splits with the generic "text" loader; each split
# ends up with a single 'text' column.
dataset = load_dataset(
    "text",
    data_files={
        "train": "../train/train",
        "test": "test",
        "dev": "../dev/dev",
    },
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 275317
    })
    test: Dataset({
        features: ['text'],
        num_rows: 77877
    })
    dev: Dataset({
        features: ['text'],
        num_rows: 40117
    })
})

In [18]:
def tokenize_function(examples):
    """Tokenize a batch of examples to a fixed length of 128 tokens.

    Args:
        examples: A batch whose "text" entry holds the raw strings to encode.

    Returns:
        The tokenizer output, padded/truncated to exactly 128 tokens per
        example. When the tokenizer is a fast tokenizer, a "word_ids" entry is
        added with, for every example, the word index of each token.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    if not tokenizer.is_fast:
        # Only fast tokenizers are asked for token -> word alignment.
        return encoded

    encoded["word_ids"] = [
        encoded.word_ids(row) for row, _ in enumerate(encoded["input_ids"])
    ]
    return encoded

# Tokenize every split, handing tokenize_function whole batches at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/275317 [00:00<?, ? examples/s]

Map:   0%|          | 0/77877 [00:00<?, ? examples/s]

Map:   0%|          | 0/40117 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 275317
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 77877
    })
    dev: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 40117
    })
})

In [19]:
# Use the unmasked token ids as the prediction targets for the train split;
# the data collator later decides which positions are actually scored.
labels = tokenized_datasets["train"]["input_ids"]
labeled_dataset_train = tokenized_datasets["train"].add_column(
    name="labels",
    column=labels,
)
labeled_dataset_train

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 275317
})

In [20]:
# Same as for the train split: the dev labels start as a copy of the token ids.
labels = tokenized_datasets["dev"]["input_ids"]
labeled_dataset_dev = tokenized_datasets["dev"].add_column(
    name="labels",
    column=labels,
)
labeled_dataset_dev

Dataset({
    features: ['text', 'input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 40117
})

In [21]:
import collections
import numpy as np

from transformers import default_data_collator

# Per-word masking probability: whole_word_masking_data_collator draws one
# Bernoulli sample with this probability for every word of an example.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate examples into a batch after masking randomly chosen whole words.

    Every word of an example (a run of consecutive tokens sharing the same
    non-None word id) is selected independently with probability
    ``wwm_probability``. All tokens of a selected word are replaced by the
    tokenizer's mask token id in ``input_ids`` and keep their original id in
    ``labels``; every other label position is set to -100.

    The incoming feature dicts and their ``input_ids`` lists are left
    untouched: the function works on copies, so the same list of samples can
    be collated repeatedly (each call draws a fresh random mask).

    Args:
        features: List of per-example mappings providing "input_ids",
            "labels" and "word_ids". In this notebook the token ids are plain
            Python lists as produced by the tokenized dataset.

    Returns:
        The batch built by ``default_data_collator`` from the masked copies
        (the "word_ids" entry is dropped before collation).
    """
    masked_features = []
    for feature in features:
        # Shallow copy so that dropping "word_ids" and swapping in the masked
        # ids/labels never alters the caller's dict.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy the ids before overwriting positions with the mask token so the
        # original example keeps its unmasked ids.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [22]:
# Collate the first two training examples and print the decoded, masked inputs.
samples = [labeled_dataset_train[row] for row in (0, 1)]
batch = whole_word_masking_data_collator(samples)

for token_ids in batch["input_ids"]:
    decoded = tokenizer.decode(token_ids)
    print(f"\n'>>> {decoded}'")


'>>> [CLS] normal value [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

'>>> [CLS] production capacity ( tonnes ) [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [28]:
from transformers import TrainingArguments

batch_size = 64

# logging_steps is counted in optimizer steps, not in examples. One epoch is
# roughly len(dataset) // batch_size steps, so this logs the training loss
# about once per epoch. Using len(dataset) directly would exceed the total
# number of steps of the whole run and the training loss would never be logged.
# max(1, ...) keeps the value valid for datasets smaller than one batch.
logging_steps = max(1, len(labeled_dataset_train) // batch_size)
model_name = "distilbert-finetuned-europarl"

training_args = TrainingArguments(
    output_dir=model_name,
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
    # Keep the extra "word_ids" column: the whole-word-masking collator needs it.
    remove_unused_columns=False
)

In [29]:
from transformers import Trainer

# Wire the model, both labeled splits and the whole-word-masking collator
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=whole_word_masking_data_collator,
    train_dataset=labeled_dataset_train,
    eval_dataset=labeled_dataset_dev,
)

In [30]:
# Sanity check: print the number of token ids in one tokenized training example.
print(len(labeled_dataset_train[9]['input_ids']))

128


In [31]:
import math

# Evaluate on the dev split before fine-tuning and report perplexity,
# i.e. the exponential of the evaluation loss.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f">>> Perplexity: {perplexity:.2f}")

KeyboardInterrupt: 

In [32]:
# Start fine-tuning with the Trainer built earlier in the notebook.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [33]:
# Export this notebook (bert.ipynb) to a plain Python script using nbconvert.
!jupyter nbconvert --to script bert.ipynb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[NbConvertApp] Converting notebook bert.ipynb to script
[NbConvertApp] Writing 4161 bytes to bert.py
