In [1]:
!pip install transformers datasets accelerate -U transformers[torch]

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Load Model Checkpoint
from transformers import AutoModelForMaskedLM

model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [3]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> DistilBERT number of parameters: 67M'
'>>> BERT number of parameters: 110M'


In [4]:
# Sample text to check tokenization
text = "I am feeling [MASK] since yesterday."

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits


#Check tokens, attention mask, raw model output and mask token id
print(f'Input Dict: \n{inputs}\nRaw Output : \n{token_logits} \nMask Token: {tokenizer.mask_token_id}')

Input Dict: 
{'input_ids': tensor([[ 101, 1045, 2572, 3110,  103, 2144, 7483, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Raw Output : 
tensor([[[ -5.5160,  -5.4974,  -5.5129,  ...,  -4.8846,  -4.7496,  -2.9866],
         [-10.2135, -10.0831, -10.1173,  ...,  -9.1217,  -8.8764,  -7.6046],
         [-11.5120, -11.3758, -11.3559,  ...,  -8.6382,  -9.3972,  -8.6519],
         ...,
         [-10.1551, -10.2914, -10.0809,  ...,  -7.0383,  -7.5913,  -8.2029],
         [-11.2972, -11.1966, -11.2800,  ...,  -8.9377,  -9.1618,  -7.3928],
         [-11.8467, -11.9511, -11.8622,  ..., -10.3753, -10.1234,  -7.9202]]],
       grad_fn=<ViewBackward0>) 
Mask Token: 103


In [7]:
#Check index of mask token
torch.where(inputs["input_ids"] == 103)

(tensor([0]), tensor([4]))

In [8]:
# Token logits for masked predictions
token_logits[0][4]

tensor([-3.3798, -3.3232, -3.3982,  ..., -2.8012, -3.5570, -2.2928],
       grad_fn=<SelectBackward0>)

In [9]:
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# Checking top 10 predictions
top_10_tokens = torch.topk(mask_token_logits, 10, dim=1).indices[0].tolist()
for token in top_10_tokens:
    print(f"'{text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'I am feeling better since yesterday.'
'I am feeling guilty since yesterday.'
'I am feeling sick since yesterday.'
'I am feeling lonely since yesterday.'
'I am feeling dizzy since yesterday.'
'I am feeling restless since yesterday.'
'I am feeling depressed since yesterday.'
'I am feeling awkward since yesterday.'
'I am feeling sleepy since yesterday.'
'I am feeling uneasy since yesterday.'


In [11]:
# Import dataset and convert to a hugging datasetdict
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
import json

file_path = '/content/sample_data/News_Category_Dataset_v3.json'
df = pd.read_json(file_path, lines=True).drop(columns = ['link','headline',	'authors',	'date']).rename(columns = {'category': 'label', 'short_description': 'text'})
dataset = Dataset.from_pandas(df)

In [12]:
dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 209527
})

In [13]:
df.loc[0]['text']

'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.'

In [14]:
df.loc[0]['label']

'U.S. NEWS'

In [15]:
train_test_split = dataset.train_test_split(test_size=0.2)
train_validation_split = train_test_split['train'].train_test_split(test_size=0.25)
dataset_dict = DatasetDict({
    'train': train_validation_split['train'],
    'validation': train_validation_split['test'],
    'test': train_test_split['test']
})

In [16]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 125715
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 41906
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 41906
    })
})

In [17]:
sample = dataset_dict["train"].shuffle(seed=42).select(range(10))

for row in sample:
    print(f"\n'Article: {row['text']}'")
    print(f"'Label: {row['label']}'")


'Article: Establishment-bashing Ted Cruz nabs four of six top donors.'
'Label: POLITICS'

'Article: Local flavor remains central to the enjoyment factor in Austin and for those who prefer to stay close to it, here are some of the best tucked-away and well-loved boutique options in town.'
'Label: TRAVEL'

'Article: Watch the video above (by Portland wedding videography company Moetic) to see their impressive moves. Groom Faheem and his'
'Label: WEDDINGS'

'Article: The WikiLeaks co-founder avoided extradition in the Ecuadorian embassy in London for 7 years.'
'Label: WORLD NEWS'

'Article: She's still unbreakable, y'all.'
'Label: ENTERTAINMENT'

'Article: You may have spent years studiously ignoring your mother's advice (and she was right about that guy in high school, admit it), but eventually all that bossing and nagging adds up to something truly useful. Here, Allure staffers share their mothers' wisest words about beauty.'
'Label: STYLE'

'Article: He also said the bill should be ca

In [18]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [19]:
def tokenize_function(examples):
    result = tokenizer(examples["text"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


tokenized_datasets = dataset_dict.map(
    tokenize_function, batched=True, remove_columns=["text", "label"]
)
tokenized_datasets

Map:   0%|          | 0/125715 [00:00<?, ? examples/s]

Map:   0%|          | 0/41906 [00:00<?, ? examples/s]

Map:   0%|          | 0/41906 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 125715
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 41906
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 41906
    })
})

In [20]:
tokenizer.model_max_length

512

In [21]:
chunk_size = 32

In [22]:
# View first 10 article token lengths
tokenized_samples = tokenized_datasets["train"][100:110]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'Article {idx + 1} length: {len(sample)}'")

'Article 1 length: 44'
'Article 2 length: 25'
'Article 3 length: 35'
'Article 4 length: 27'
'Article 5 length: 30'
'Article 6 length: 28'
'Article 7 length: 29'
'Article 8 length: 10'
'Article 9 length: 29'
'Article 10 length: 23'


In [23]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'Concatenated Articles length: {total_length}'")

'Concatenated Articles length: 280'


In [24]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'Chunk length: {len(chunk)}'")

'Chunk length: 32'
'Chunk length: 32'
'Chunk length: 32'
'Chunk length: 32'
'Chunk length: 32'
'Chunk length: 32'
'Chunk length: 32'
'Chunk length: 32'
'Chunk length: 24'


In [25]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/125715 [00:00<?, ? examples/s]

Map:   0%|          | 0/41906 [00:00<?, ? examples/s]

Map:   0%|          | 0/41906 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 106751
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 35716
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 35670
    })
})

In [26]:
tokenizer.decode(lm_datasets["train"][0]["input_ids"])

"[CLS] you can't explain to a 10 - year - old child that you can't call them because mommy went to court to prohibit it. while your"

In [27]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'anger may make you want to tell them, you know it is not in their best interests to possess that information. [SEP] [CLS] if you like to cook, you'

In [28]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [29]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] you can't explain to a 10 - [MASK] - old child that you can't [MASK] them because mommy went to court to prohibit [MASK]. while your'

'>>> anger may make you want [MASK] tell them, you know [MASK] is not in their best interests to possess that information. [SEP] [CLS] if you like to cook, you'


In [30]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [31]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] you can't [MASK] to [MASK] 10 - year - [MASK] child that you can'[MASK] [MASK] them because mommy went to court to prohibit [MASK]. while your'

'>>> [MASK] may make you want to tell them [MASK] you know it is [MASK] in their best interests to possess that [MASK]. [SEP] [CLS] if you like to [MASK], you'


In [33]:
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [34]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [37]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [38]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 43.42


In [39]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.3996,3.133706
2,3.1913,3.083951
3,3.1346,3.009249


TrainOutput(global_step=471, training_loss=3.2428497346849703, metrics={'train_runtime': 8618.8243, 'train_samples_per_second': 3.481, 'train_steps_per_second': 0.055, 'total_flos': 248552167680000.0, 'train_loss': 3.2428497346849703, 'epoch': 3.0})

In [45]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 21.81


In [40]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

events.out.tfevents.1719192083.967d78b23970.495.0:   0%|          | 0.00/6.76k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rohit5895/distilbert-base-uncased-finetuned-imdb/commit/f201b44dcdd9d36404b75e365d8f44a132746526', commit_message='End of training', commit_description='', oid='f201b44dcdd9d36404b75e365d8f44a132746526', pr_url=None, pr_revision=None, pr_num=None)

In [49]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb"
)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [50]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> i am feeling better since yesterday.
>>> i am feeling guilty since yesterday.
>>> i am feeling good since yesterday.
>>> i am feeling sick since yesterday.
>>> i am feeling bad since yesterday.


In [60]:
dataset_dict['test']['text'][1]

"Jimmy Fallon wouldn't let a Friday go by without sending out his thank you notes. And for that, we're pretty thankful. Watch"

In [61]:
test_text = "Jimmy Fallon wouldn't let a [MASK] go by without sending out his thank you notes. And for that, we're pretty thankful. Watch"
mask_filler(test_text)

[{'score': 0.05406205728650093,
  'token': 2265,
  'token_str': 'show',
  'sequence': "jimmy fallon wouldn't let a show go by without sending out his thank you notes. and for that, we're pretty thankful. watch"},
 {'score': 0.045068882405757904,
  'token': 2154,
  'token_str': 'day',
  'sequence': "jimmy fallon wouldn't let a day go by without sending out his thank you notes. and for that, we're pretty thankful. watch"},
 {'score': 0.0443171001970768,
  'token': 3371,
  'token_str': 'minute',
  'sequence': "jimmy fallon wouldn't let a minute go by without sending out his thank you notes. and for that, we're pretty thankful. watch"},
 {'score': 0.044280603528022766,
  'token': 2095,
  'token_str': 'year',
  'sequence': "jimmy fallon wouldn't let a year go by without sending out his thank you notes. and for that, we're pretty thankful. watch"},
 {'score': 0.043289586901664734,
  'token': 3185,
  'token_str': 'movie',
  'sequence': "jimmy fallon wouldn't let a movie go by without sending 