# Fine-tuning a masked language model (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.

You will need to set up git: adapt your email and name in the following cell.

In [None]:
# Set the global git author identity for this runtime.
# NOTE(review): these are one person's details; replace both with your own.
!git config --global user.email "jian6768@gmail.com"
!git config --global user.name "jian6768"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForMaskedLM

# Checkpoint used throughout the notebook. "distilbert-base-uncased" was the
# alternative tried earlier; swap the string to switch back.
model_checkpoint = "answerdotai/ModernBERT-base"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

In [None]:
# Report the size of whichever checkpoint is loaded. The label is derived from
# model_checkpoint so it stays correct when the checkpoint is swapped (the old
# hard-coded "DistilBERT" label was wrong for ModernBERT).
num_parameters_m = model.num_parameters() / 1_000_000
checkpoint_label = model_checkpoint.split("/")[-1]
print(f"'>>> {checkpoint_label} number of parameters: {round(num_parameters_m)}M'")
print("'>>> BERT number of parameters: 110M'")
# Old variable name kept as an alias in case another cell still reads it.
distilbert_num_parameters = num_parameters_m

'>>> DistilBERT number of parameters: 150M'
'>>> BERT number of parameters: 110M'


In [None]:
text = "This is a great [MASK]."


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with model_checkpoint, i.e. the same checkpoint
# the model was loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [None]:
# Tokenize the prompt and run it through the model. The displayed shape is that
# of the logits: (batch, sequence length, vocabulary size).
inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
token_logits.shape

torch.Size([1, 8, 50368])

In [None]:
import torch

inputs = tokenizer(text, return_tensors="pt")
# Inference only: skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    token_logits = model(**inputs).logits
# torch.where on the 2-D input_ids returns (row indices, column indices); the
# column indices are the [MASK] positions within the sequence.
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Keep the five vocabulary entries with the highest logits at the first [MASK].
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    # The decoded token carries its own leading space, which would otherwise
    # produce a doubled space once substituted into the prompt.
    candidate = tokenizer.decode([token]).strip()
    print(f"'>>> {text.replace(tokenizer.mask_token, candidate)}'")

'>>> This is a great  idea.'
'>>> This is a great  question.'
'>>> This is a great  example.'
'>>> This is a great  video.'
'>>> This is a great  post.'


In [None]:
inputs

{'input_ids': tensor([[50281,  1552,   310,   247,  1270, 50284,    15, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
torch.where(inputs["input_ids"] == tokenizer.mask_token_id)

(tensor([0]), tensor([5]))

In [None]:
inputs

{'input_ids': tensor([[50281,  1552,   310,   247,  1270, 50284,    15, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
#Note that inputs that are created by tokenizer has input_ids and attention masks.
inputs

{'input_ids': tensor([[50281,  1552,   310,   247,  1270, 50284,    15, 50282]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
inputs.input_ids.shape

torch.Size([1, 8])

In [None]:
from datasets import load_dataset

# Stack Overflow questions are used here in place of the IMDb dataset
# (load_dataset("imdb")) that this cell loaded before.
soqa_ds = load_dataset("pacovaldez/stackoverflow-questions")

# Rename "body" to "text" so the tokenization cell can read examples["text"].
soqa_ds = soqa_ds.rename_column("body", "text")
soqa_ds

README.md:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/19 [00:00<?, ?files/s]

(…)ost_questions_train_000000000000.parquet:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

(…)ost_questions_train_000000000001.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

(…)ost_questions_train_000000000002.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

(…)ost_questions_train_000000000003.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

(…)ost_questions_train_000000000004.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

(…)ost_questions_train_000000000005.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000006.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

(…)ost_questions_train_000000000007.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

(…)ost_questions_train_000000000008.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

(…)ost_questions_train_000000000009.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

(…)ost_questions_train_000000000010.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000011.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

(…)ost_questions_train_000000000012.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000013.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000014.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

(…)ost_questions_train_000000000015.parquet:   0%|          | 0.00/42.3M [00:00<?, ?B/s]

(…)ost_questions_train_000000000016.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

(…)ost_questions_train_000000000017.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

(…)ost_questions_train_000000000018.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

(…)uestions_validation_000000000000.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

(…)uestions_validation_000000000001.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

(…)uestions_validation_000000000002.parquet:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/19 [00:00<?, ?files/s]

post_questions_test_000000000000.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

post_questions_test_000000000001.parquet:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

post_questions_test_000000000002.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

post_questions_test_000000000003.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

post_questions_test_000000000004.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

post_questions_test_000000000005.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

post_questions_test_000000000006.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000007.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

post_questions_test_000000000008.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

post_questions_test_000000000009.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

post_questions_test_000000000010.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

post_questions_test_000000000011.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000012.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

post_questions_test_000000000013.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000014.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

post_questions_test_000000000015.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

post_questions_test_000000000016.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000017.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

post_questions_test_000000000018.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1572294 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/785098 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1570866 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 1572294
    })
    validation: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 785098
    })
    test: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 1570866
    })
})

In [None]:
soqa_ds

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 1572294
    })
    validation: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 785098
    })
    test: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 1570866
    })
})

In [None]:
# Work on a reproducible subset of the train split: 250,000 training rows and a
# held-out set one tenth that size.
train_size = 250_000
test_size = train_size // 10

soqa_ds_reduced = soqa_ds["train"].train_test_split(
    train_size=train_size,
    test_size=test_size,
    seed=42,
)
soqa_ds_reduced

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 250000
    })
    test: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 25000
    })
})

In [None]:
# Print the text and label of three shuffled training rows as a sanity check.
# The ">>> Review" prefix is a leftover label: the rows are question bodies.
sample = soqa_ds_reduced["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")


'>>> Review: <p>I want to execute a command in my viewmodel when the user presses enter in a TextBox.
The command works when bound to a button.</p>

<pre><code>&lt;Button Content="Add" Command="{Binding Path=AddCommand}" /&gt;
</code></pre>

<p>But I can't bring it to work from the TextBox.
I tried an Inputbinding, but it didn't work.</p>

<pre><code>&lt;TextBox.InputBindings&gt;
    &lt;KeyBinding Command="{Binding Path=AddCommand}" Key="Enter"/&gt;
&lt;/TextBox.InputBindings&gt;
</code></pre>

<p>I also tried to set the working button as default, but it doesn't get executed when enter is pressed.</p>

<p>Thanks for your help.</p>'
'>>> Label: 0'

'>>> Review: <p>On my laptop I have installed Win 7 and VS 2012.But I need to get ready for the Microsoft exam for the "windows store apps using HTML 5, css and javascript". So my idea is to install Win 8.1 and VS 2013 trial versions for get hands on practice for above exams.So my question is can I install Win 8.1 and VS 2013 trial versions

In [None]:
# sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

# for row in sample:
#     print(f"\n'>>> Review: {row['text']}'")
#     print(f"'>>> Label: {row['label']}'")

In [None]:
soqa_ds_reduced

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 250000
    })
    test: Dataset({
        features: ['title', 'text', 'label'],
        num_rows: 25000
    })
})

In [None]:
# Tokenize a batch of rows and attach per-token word indices.
def tokenize_function(examples):
    """Tokenize ``examples["text"]``, truncating each row to 8192 tokens.

    Args:
        examples: A batch of rows with a "text" column.

    Returns:
        The tokenizer's encoding for the batch. When the tokenizer is a fast
        one, a "word_ids" entry is added holding ``word_ids(i)`` for every row.
    """
    encoded = tokenizer(examples["text"], max_length=8192, truncation=True)
    if not tokenizer.is_fast:
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded


# batched=True hands tokenize_function many rows per call instead of one. The
# raw columns are removed so only the tokenizer's outputs remain.
tokenized_datasets = soqa_ds_reduced.map(
    tokenize_function, batched=True, remove_columns=["title","text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/250000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 250000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
})

In [None]:
# def tokenize_function(examples):
#     result = tokenizer(examples["text"])
#     if tokenizer.is_fast:
#         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
#     return result


# # Use batched=True to activate fast multithreading!
# tokenized_datasets = imdb_dataset.map(
#     tokenize_function, batched=True, remove_columns=["text", "label"]
# )
# tokenized_datasets

Map:   0%|          | 0/1572294 [00:00<?, ? examples/s]

Map:   0%|          | 0/785098 [00:00<?, ? examples/s]

Map:   0%|          | 0/1570866 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1572294
    })
    validation: Dataset({
        features: ['title', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 785098
    })
    test: Dataset({
        features: ['title', 'input_ids', 'attention_mask', 'word_ids'],
        num_rows: 1570866
    })
})

In [None]:
# Decode the second test example back to text to inspect what the tokenizer
# produced.
tokenizer.decode(tokenized_datasets['test'][1]["input_ids"])

'[CLS]<p>I\'m using <code>UICollectionView</code> in my swift class, it\'s placed on my <code>UIViewController</code>. I connected the collectionView to the outlet in my code, I set up <code>delegate</code> and <code>datasource</code> and I see the outcome in my app. Everything works besides the fact that when I click each cell - nothing happens.</p>\n\n<p>My code is as follows:</p>\n\n<pre><code>class UsersList: UIViewController, UICollectionViewDataSource, UICollectionViewDelegate {\n\n\n@IBOutlet weak var tview: UICollectionView!\n\noverride func viewWillAppear(animated: Bool) {\n    super.viewWillAppear(animated)\n\n    tview.backgroundColor = UIColor.whiteColor() //this works\n    tview.delegate = self\n    tview.dataSource = self\n}\n\nfunc collectionView(tview: UICollectionView, didSelectItemAtIndexPath indexPath: NSIndexPath) {\n    print("You selected cell #\\(indexPath.item)!")\n    //this does not appear in the console :(\n}\n</code></pre>\n\n<p>Is there anything else I coul

In [None]:
tokenizer.model_max_length

8192

In [None]:
chunk_size = 128

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 250000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
})

In [None]:
# Slicing the dataset returns a dict keyed by column (input_ids,
# attention_mask, word_ids); each value is a list holding one list per example.
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 175'
'>>> Review 1 length: 111'
'>>> Review 2 length: 312'


In [None]:
# Flatten every column whose entries are lists into one long list, so the three
# examples become a single continuous sequence. Columns whose entries are not
# lists are kept as they are.
concatenated_examples = {
    k: [item for sublist in tokenized_samples[k] for item in sublist] if isinstance(tokenized_samples[k][0], list) else tokenized_samples[k]
    for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 598'


In [None]:
print(tokenizer.decode(concatenated_examples['input_ids']))
print(len(concatenated_examples['input_ids']))
# The length is the sum of the three example lengths: the examples have been
# concatenated into one sequence.


[CLS]<p>Being new to ASP.NET I'm unsure of the best solution to my problem.  I have a line of code like:</p>

<pre><code>xDoc.Load("Templates/template1.cfg");
</code></pre>

<p>xDoc is an <code>XmlDocument</code>.  In my project, at the top level there is a directory called Templates.  When I run the project in debug mode, I get a <code>DirectoryNotFoundException</code>, and apparently it's looking for the Templates dir in <code>C:\Program Files\Common Files\Microsoft Shared\DevServer\10.0\Templates</code>.</p>

<p>How can correctly point to that directory without hardcoding it?</p>[SEP][CLS]<p>I am looking to create a web app written in PHP / MySQL.  It will be used to collect data on houses by staff walking around a neighbourhood.  There may be places where they have little to no cell signal, so I need a way to allow them to still view the web app when offline, as well as store the data they are submitting, to be inserted to the MySQL database once their phone has a signal / wi fi.</

In [None]:
# Cut the concatenated sequence into consecutive chunks of chunk_size tokens.
# The final chunk holds whatever is left over and may be shorter.
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 86'


In [None]:
# def group_texts(examples):
#     # Concatenate all texts
#     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
#     # Compute length of concatenated texts
#     total_length = len(concatenated_examples[list(examples.keys())[0]])
#     # We drop the last chunk if it's smaller than chunk_size
#     total_length = (total_length // chunk_size) * chunk_size
#     # Split by chunks of max_len
#     result = {
#         k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
#         for k, t in concatenated_examples.items()
#     }
#     # Create a new labels column
#     result["labels"] = result["input_ids"].copy()
#     return result

In [None]:
# Bundles the concatenate-then-chunk steps shown above into one function that
# can be passed to Dataset.map(..., batched=True).
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-cut it into equal chunks.

    Args:
        examples: Mapping from column name to a list with one entry per
            example. Columns whose entries are lists are flattened end to end;
            any other column is chunked as it is.
        block_size: Length of every output chunk. When omitted, the
            notebook-level ``chunk_size`` is used.

    Returns:
        A dict with the same keys as ``examples`` plus "labels". Every value is
        a list of chunks of exactly ``block_size`` items; the trailing remainder
        that does not fill a whole chunk is dropped. "labels" is a copy of the
        "input_ids" chunks, i.e. the unmasked ground truth.
    """
    size = chunk_size if block_size is None else block_size

    # Concatenate all texts. The length check guards the empty batch, where
    # looking at column[0] would raise IndexError.
    concatenated_examples = {}
    for key, column in examples.items():
        if len(column) > 0 and isinstance(column[0], list):
            concatenated_examples[key] = [item for row in column for item in row]
        else:
            concatenated_examples[key] = column

    # Compute length of concatenated texts (taken from the first column).
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last chunk if it is smaller than the chunk size.
    total_length = (total_length // size) * size

    # Split every column into chunks of `size` items.
    result = {
        key: [seq[start : start + size] for start in range(0, total_length, size)]
        for key, seq in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Apply group_texts batch by batch. The row count changes because each batch is
# re-cut into chunks of chunk_size tokens.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/250000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 932175
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 92612
    })
})

In [None]:
tokenizer.decode(lm_datasets['train'][0]['input_ids'])

'[CLS]<p>Being new to ASP.NET I\'m unsure of the best solution to my problem.  I have a line of code like:</p>\n\n<pre><code>xDoc.Load("Templates/template1.cfg");\n</code></pre>\n\n<p>xDoc is an <code>XmlDocument</code>.  In my project, at the top level there is a directory called Templates.  When I run the project in debug mode, I get a <code>DirectoryNotFoundException</code>, and apparently it\'s looking for the Templates dir in <code>'

In [None]:
from transformers import DataCollatorForLanguageModeling

# The collator masks tokens (not whole words) at random each time it builds a
# batch; mlm_probability is the fraction of tokens it selects.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    # Remove "word_ids" so only the remaining columns reach data_collator.
    _ = sample.pop("word_ids")

# Decode the collated input_ids to see which positions were altered.
for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS]<p>Being new to ASP.NET I'm unsure of the best solution to my problem.  I have a line of code like:</p>kim
<pre><code[MASK]xDoc.Load("Templates/template1.cfg");[MASK]</code></pre>

<p>xDoc is[MASK] <code>XmlDocument</code>.[MASK]In my project, at the top level there is a directory called Templates[MASK]  When I run[MASK] project in debug[MASK], I get a <code>DirectoryNotFoundException</code>, and apparently it's looking for thethaplates dir in <code>'

'>>> C[MASK]Program Files\Common Files016Microsoft Shared\DevServer\[MASK].0\Templates</code>.</p>
[MASK]<p>How can correctly[MASK] to that directory without[MASK]coding[MASK]?</p>[SEP][CLS]<p thymI am looking[MASK][MASK] a web app[MASK] in PHP / MySQL.  It will be used to collect data on houses by staff walking around a neighbourhood[MASK]  There may be places where they have[MASK] to no cell signal[MASK][MASK][MASK] need[MASK] way to ×[MASK] to still view the web app when[MASK], as well[MASK] store the data they are submitti

In [None]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate ``features`` into a batch, masking whole words instead of single tokens.

    Consecutive tokens that share a ``word_ids`` value form one word. Each word
    is selected independently with probability ``wwm_probability``; every token
    of a selected word is replaced by ``tokenizer.mask_token_id``. Labels keep
    the original token id at masked positions and are -100 everywhere else.

    Args:
        features: List of dicts with "input_ids", "labels" and "word_ids".
            The values are assumed to be plain Python lists, as obtained by
            indexing the dataset without a tensor format. The caller's dicts
            and lists are not modified.

    Returns:
        The batch built by ``default_data_collator`` from the masked features.
    """
    masked_features = []
    for feature in features:
        # Shallow copy: the caller's dict keeps its "word_ids", so the same
        # samples can be collated again without a KeyError.
        feature = dict(feature)
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # A token without a word id ends the current word, so equal
                # word ids on either side of it are never merged.
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        # Copy before writing mask ids so the caller's input_ids stay intact.
        input_ids = list(feature["input_ids"])
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["input_ids"] = input_ids
        feature["labels"] = new_labels
        masked_features.append(feature)

    return default_data_collator(masked_features)

In [None]:
# Run the whole-word collator on the first two chunks and decode the result.
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS][MASK][MASK]>Being[MASK] to[MASK].NET I'm[MASK] of the best solution[MASK] my[MASK][MASK]  I[MASK] a line of code like:</p>
[MASK]<pre[MASK]code>[MASK][MASK].Load("Templates/template1.cfg");
</code[MASK]pre>

<[MASK]>[MASK][MASK] is an <[MASK]>XmlDocument</code>.  In[MASK] project,[MASK] the top level there is a directory called Templates.  When I run the project in debug mode, I[MASK][MASK] <code>DirectoryNotFoundException</code>, and[MASK] it[MASK] looking for[MASK] Templates[MASK][MASK] <code>'

'>>> [MASK]:\Program Files\Common Files\Microsoft Shared\[MASK][MASK]\10.0[MASK]Templates</code>.</p>

<p>How can correctly point[MASK] that directory without hardcoding it?</p>[SEP][CLS][MASK]p[MASK]I am looking to create a web app written in[MASK] / MySQL.  It will be[MASK] to[MASK][MASK] on houses by staff walking around[MASK] neighbourhood.  There[MASK] be places[MASK] they[MASK] little to no cell signal, so I need a[MASK] to allow them to still view the web app when offline, 

In [None]:
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 932175
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 92612
    })
})

In [None]:
# No downsampling happens here: the full chunked dataset is simply bound to the
# name that the training cells read.
downsampled_dataset = lm_datasets

In [None]:
# train_size = 10_000
# test_size = int(0.1 * train_size)

# downsampled_dataset = lm_datasets["train"].train_test_split(
#     train_size=train_size, test_size=test_size, seed=42
# )
# downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments

batch_size = 64
# Log the training loss about once per epoch (one epoch is roughly
# len(train) / batch_size steps on a single device). max(1, ...) keeps the value
# valid when the training set is smaller than one batch.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-soqa-330k",
    overwrite_output_dir=True,
    # `eval_strategy` replaces the deprecated `evaluation_strategy` spelling.
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)



In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    # Random token masking is applied by the collator when each batch is built.
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # emitted a FutureWarning when this cell was run.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [None]:
import math

# Evaluate before calling trainer.train(); perplexity is exp of the eval loss.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mebitdaddy93[0m ([33mebitdaddy93-nus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


>>> Perplexity: 3.91


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Re-evaluate after trainer.train() and report perplexity again.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1742728619.23012d74c3d0.3738.0:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

events.out.tfevents.1742754047.23012d74c3d0.3738.1:   0%|          | 0.00/431 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/KJCHUA/distilbert-base-uncased-finetuned-soqa-330k/commit/8aab9c66199f00e5977e6fe0579a35d6216cf84e', commit_message='End of training', commit_description='', oid='8aab9c66199f00e5977e6fe0579a35d6216cf84e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/KJCHUA/distilbert-base-uncased-finetuned-soqa-330k', endpoint='https://huggingface.co', repo_type='model', repo_id='KJCHUA/distilbert-base-uncased-finetuned-soqa-330k'), pr_revision=None, pr_num=None)

In [None]:
def insert_random_mask(batch):
    """Run ``data_collator`` over a column-wise batch and prefix its outputs.

    Args:
        batch: Mapping from column name to a list of per-row values.

    Returns:
        A dict with one "masked_<name>" entry per collator output, each
        converted to a NumPy array.
    """
    column_names = list(batch)
    # Turn the dict of columns into a list of per-row dicts for the collator.
    rows = [dict(zip(column_names, values)) for values in zip(*batch.values())]
    masked_inputs = data_collator(rows)
    # Create a new "masked" column for each column the collator returned.
    masked_columns = {}
    for name, tensor in masked_inputs.items():
        masked_columns["masked_" + name] = tensor.numpy()
    return masked_columns

In [None]:
# Drop "word_ids" before the data is handed to the collators. The membership
# check keeps the cell re-runnable: without it a second run fails because the
# column is already gone.
if "word_ids" in downsampled_dataset["test"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
# Mask the evaluation split once, up front, so its inputs and labels are stored
# in the dataset instead of being re-drawn for every evaluation pass.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the masked columns back to the names the model expects.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/11826 [00:00<?, ? examples/s]

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
# Training batches go through data_collator, so masking is applied when each
# batch is built.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# eval_dataset already stores masked inputs and labels, so the default collator
# is used here and no further masking is applied.
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [None]:
from torch.optim import AdamW

# NOTE(review): `model` is whatever the earlier cells left in memory (including
# any Trainer fine-tuning), not a freshly loaded checkpoint. Confirm that this
# is intended.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
# prepare() returns the objects to use from here on; the four names are rebound
# to the returned values.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
# Measured on the dataloader returned by accelerator.prepare().
# NOTE(review): its length may differ from the unprepared loader when running
# on several processes; confirm.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule over all training steps, with no warmup steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import get_full_repo_name

# NOTE(review): this repository name still refers to DistilBERT and IMDb, while
# the model loaded above is ModernBERT trained on Stack Overflow questions.
# Confirm the intended target repository before pushing.
model_name = "distilbert-base-uncased-finetuned-imdb-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'KJCHUA/distilbert-base-uncased-finetuned-imdb-accelerate'

In [None]:
from huggingface_hub import Repository

# Clone the Hub repository into a local directory named after the model; the
# training loop saves its checkpoints into that directory.
# NOTE(review): this cell's output points to the git-vs-http guide, which
# suggests `Repository` is on a deprecation path. Verify before relying on it.
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/KJCHUA/distilbert-base-uncased-finetuned-imdb-accelerate into local empty directory.


Download file model.safetensors:   0%|          | 8.00k/256M [00:00<?, ?B/s]

Download file runs/Mar23_01-16-51_d7c9c28c3aa9/events.out.tfevents.1742693054.d7c9c28c3aa9.1306.0: 100%|######…

Download file runs/Mar23_01-16-51_d7c9c28c3aa9/events.out.tfevents.1742693379.d7c9c28c3aa9.1306.1: 100%|######…

Download file training_args.bin: 100%|##########| 5.24k/5.24k [00:00<?, ?B/s]

Download file runs/Mar23_02-34-37_f11d46f3d519/events.out.tfevents.1742697285.f11d46f3d519.1019.0: 100%|######…

Clean file runs/Mar23_01-16-51_d7c9c28c3aa9/events.out.tfevents.1742693054.d7c9c28c3aa9.1306.0:  14%|#4       …

Download file runs/Mar23_02-34-37_f11d46f3d519/events.out.tfevents.1742697624.f11d46f3d519.1019.1: 100%|######…

Clean file runs/Mar23_01-16-51_d7c9c28c3aa9/events.out.tfevents.1742693379.d7c9c28c3aa9.1306.1: 100%|#########…

Clean file training_args.bin:  19%|#9        | 1.00k/5.24k [00:00<?, ?B/s]

Clean file runs/Mar23_02-34-37_f11d46f3d519/events.out.tfevents.1742697285.f11d46f3d519.1019.0:  14%|#4       …

Clean file runs/Mar23_02-34-37_f11d46f3d519/events.out.tfevents.1742697624.f11d46f3d519.1019.1: 100%|#########…

Clean file model.safetensors:   0%|          | 1.00k/256M [00:00<?, ?B/s]

In [None]:
from tqdm.auto import tqdm
import torch
import math

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        # The backward pass goes through the accelerator, not loss.backward().
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        # Repeat the batch loss batch_size times before gathering, so the
        # collected tensor has one entry per example slot.
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    # Keep only as many entries as there are evaluation examples; repeating by
    # a fixed batch_size can produce more slots than examples.
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # math.exp overflows for very large losses; report infinity instead.
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    # Wait for every process, then save the unwrapped model into output_dir.
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        # The push to the Hub is disabled; checkpoints are only saved locally.
        # repo.push_to_hub(
        #     commit_message=f"Training in progress epoch {epoch}", blocking=False
        # )

  0%|          | 0/471 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 11.668111273784415
>>> Epoch 1: Perplexity: 11.299383676144934
>>> Epoch 2: Perplexity: 11.143049654917178


In [None]:
from transformers import pipeline

# NOTE(review): this loads a published DistilBERT/IMDb checkpoint from the Hub,
# not the model trained in the cells above, so the predictions below do not
# reflect this notebook's training run.
mask_filler = pipeline(
    "fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb"
)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Fill the [MASK] in the prompt and print each completed sentence returned by
# the pipeline.
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> this is a great film.
>>> this is a great movie.
>>> this is a great idea.
>>> this is a great deal.
>>> this is a great adventure.
