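Adapted from the Hugging Face NLP course, chapter 7, section 3 (fine-tuning a masked language model): https://huggingface.co/learn/nlp-course/en/chapter7/3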

In [2]:
from transformers import AutoModelForMaskedLM

# Checkpoint name reused by the tokenizer load and the training configuration below.
model_checkpoint = "distilbert-base-uncased"
# Load the checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [3]:
from transformers import AutoTokenizer

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [4]:
import torch

# Sanity check of the pretrained model: print the top-5 fillers for one [MASK].
text = "This is a great [MASK]."
inputs = tokenizer(text, return_tensors="pt")
# Inference only: skip autograd bookkeeping for this forward pass so no
# computation graph is kept alive alongside the logits.
with torch.no_grad():
    token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
# (.indices[0] keeps only the first [MASK] position if there are several)
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> This is a great deal.'
'>>> This is a great success.'
'>>> This is a great adventure.'
'>>> This is a great idea.'
'>>> This is a great feat.'


In [6]:
from tqdm.auto import tqdm
import datasets

# Fixed seed for the train/validation split so the held-out rows are the same
# on every run (an unseeded split changes the evaluation set each time).
SPLIT_SEED = 42

# Alternative input files are kept below (presumably other preprocessing
# variants of the same corpus - confirm); enable exactly one load.
raw_dataset = datasets.load_dataset('csv', data_files='sample.csv')
# raw_dataset = datasets.load_dataset('csv', data_files='NLPed_sample.csv')
# raw_dataset = datasets.load_dataset('csv', data_files='cleaned_sample.csv')

# Hold out 1% of the rows for validation; train_test_split exposes the
# held-out part under the "test" key.
dataset = raw_dataset['train'].train_test_split(test_size=0.01, seed=SPLIT_SEED)

# Access the train and validation sets
train_dataset = dataset['train']
validation_dataset = dataset['test']


print(f"found {len(dataset['train'])} data")
print(dataset)
print(dataset['train'][0])

found 9900 data
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 9900
    })
    test: Dataset({
        features: ['text'],
        num_rows: 100
    })
})
{'text': 'boldog születesnapot kivanok\ti also speak the language of mordor\tdo not utter it here\tenglish word language like words think french people speak know use means german re saying ve way spanish mean languages sounds actually heard correct speaking said wrong learn understand time thought pronounced sound right pronounce sense meaning makes grammar different thing literally instead native pretty sure dutch spelling meant yes says yeah phrase called term latin probably sentence italian thinking fuck context read common spell talking lol japanese letters maybe translate translation learning good person url example american weird learned pronunciation better accent trying speaks edit letter ll written things hard comes old similar write spoken speaker usually point kind'}


In [7]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch.

    When the tokenizer is a fast one, a "word_ids" entry is added holding,
    for every row of the batch, the result of ``word_ids(row_index)``.
    Returns the tokenizer output (with the extra entry, if any).
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # Slow tokenizers: return the encoding without a "word_ids" entry.
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded

# Tokenize every split of `dataset`. batched=True passes tokenize_function a
# batch of rows per call (which lets a fast tokenizer use multithreading);
# the raw "text" column is removed from the result.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)
tokenized_datasets

Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 9900
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 100
    })
})

In [8]:
# Illustration of the concatenation step on the first three training rows.
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

# Flatten every column's per-row lists into a single long list.
concatenated_examples = {
    column: [item for row in rows for item in row]
    for column, rows in tokenized_samples.items()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Review 0 length: 130'
'>>> Review 1 length: 123'
'>>> Review 2 length: 131'
'>>> Concatenated reviews length: 384'


In [9]:
from itertools import chain

# Number of tokens in every chunk produced by group_texts.
chunk_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-row sequences.
            Must contain an "input_ids" column; all columns are expected to
            have the same total length once concatenated.

    Returns:
        A dict with the same columns, each a list of chunks of exactly
        ``chunk_size`` items, plus a "labels" column that is a copy of the
        chunked "input_ids" list. The trailing remainder shorter than
        ``chunk_size`` is dropped.
    """
    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the accumulated list on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [10]:
# Apply group_texts batch-wise to every split; because rows are concatenated
# and re-split, the number of rows changes (see num_rows in the output).
chunked_datasets = tokenized_datasets.map(group_texts, batched=True)
chunked_datasets

Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 11981
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 119
    })
})

In [11]:
from transformers import DataCollatorForLanguageModeling

# Language-modeling collator built on the notebook's tokenizer with
# mlm_probability set to 0.15; this is the collator passed to the Trainer below.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [12]:
import collections
import numpy as np

from transformers import default_data_collator

# Probability with which each word (group of tokens) is selected for masking.
wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Mask whole words in each feature, then collate with default_data_collator.

    Each feature must provide "word_ids", "input_ids" and "labels". The
    features are modified in place: "word_ids" is removed, the tokens of every
    selected word are overwritten with the tokenizer's mask token id in
    "input_ids", and "labels" is replaced by a list that keeps the original
    label at masked positions and -100 everywhere else (-100 is presumably the
    value the loss ignores - confirm).
    """
    for feature in features:
        token_word_ids = feature.pop("word_ids")

        # Group token positions by word: a new group starts whenever the word
        # id changes; positions whose word id is None are skipped.
        word_groups = []
        previous_word = None
        for position, word_id in enumerate(token_word_ids):
            if word_id is None:
                continue
            if word_id != previous_word:
                previous_word = word_id
                word_groups.append([])
            word_groups[-1].append(position)

        # One Bernoulli draw per word decides whether that word is masked.
        selected = np.random.binomial(1, wwm_probability, (len(word_groups),))
        token_ids = feature["input_ids"]
        original_labels = feature["labels"]
        masked_labels = [-100] * len(original_labels)
        for group_index in np.where(selected)[0].tolist():
            for position in word_groups[group_index]:
                masked_labels[position] = original_labels[position]
                token_ids[position] = tokenizer.mask_token_id
        feature["labels"] = masked_labels

    return default_data_collator(features)

In [13]:
from transformers import TrainingArguments

batch_size = 32
# Log the training loss roughly once per epoch: this is about the number of
# training steps in one pass over the chunked training set (assuming a single
# device and no gradient accumulation). Clamp to 1 so that a training set
# smaller than one batch cannot produce logging_steps == 0.
logging_steps = max(1, len(chunked_datasets["train"]) // batch_size)
# Not referenced by the cells visible here; kept in case later cells use it.
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    # Alternative output directories, one per experiment variant:
    # output_dir="./origin",
    # output_dir="./nlp",
    output_dir="./priv",
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): this uploads the fine-tuned model to the Hub - confirm
    # that is intended for the "priv" run.
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
    # Save and evaluate on the same schedule (every epoch) so the best
    # checkpoint can be restored at the end of training.
    save_strategy="epoch",
    load_best_model_at_end=True,
    num_train_epochs=30,
)



In [12]:
# Interactive Hugging Face Hub login widget; training_args sets push_to_hub=True,
# so this presumably has to succeed before the Trainer is created - confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from transformers import Trainer

# Trainer over the chunked splits. Note that it uses `data_collator`
# (DataCollatorForLanguageModeling), not whole_word_masking_data_collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=chunked_datasets["train"],
    eval_dataset=chunked_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the settings in training_args (30 epochs, evaluated and saved every epoch).
trainer.train()

In [None]:
# Load a saved checkpoint and rebuild the Trainer around it so that
# trainer.evaluate() below scores the fine-tuned weights.
# NOTE(review): training_args writes to "./priv" but the active line loads from
# "./origin" - presumably switched by hand per experiment; confirm the directory
# and the step number (18750) match the run being evaluated.
model = AutoModelForMaskedLM.from_pretrained("./origin/checkpoint-18750")
# model = AutoModelForMaskedLM.from_pretrained("./nlp/checkpoint-18750")
# model = AutoModelForMaskedLM.from_pretrained("./priv/checkpoint-18750")

# Same Trainer configuration as before, now wrapping the reloaded model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=chunked_datasets["train"],
    eval_dataset=chunked_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [23]:
import math

# Evaluate on the held-out split of whichever model `trainer` currently wraps.
eval_results = trainer.evaluate()

# Perplexity is reported as exp(eval_loss).
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/4 [00:00<?, ?it/s]

>>> Perplexity: 2.09


In [19]:
import math

# Same evaluation as the previous cell; the execution counter (In [19]) shows it
# was run at a different point in the session, presumably with a different model
# in `trainer` - confirm which model each recorded perplexity belongs to.
eval_results = trainer.evaluate()
# Perplexity is reported as exp(eval_loss).
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  0%|          | 0/4 [00:00<?, ?it/s]

>>> Perplexity: 674.21


## Model Evaluation

In [16]:
from transformers import pipeline

import torch

# Checkpoint directory used for both the model and the tokenizer of the pipeline.
model_path = "./origin/checkpoint-18750"
# model_path = "./nlp/checkpoint-18750"
# model_path = "./priv/checkpoint-18750"

# Run on the GPU when one is available and fall back to the CPU otherwise,
# so this cell does not fail on machines without CUDA.
fill_mask = pipeline(
    "fill-mask",
    model=model_path,
    tokenizer=model_path,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

fill_mask("This is a great [MASK].")

[{'score': 0.11428414285182953,
  'token': 2801,
  'token_str': 'idea',
  'sequence': 'this is a great idea.'},
 {'score': 0.09483437240123749,
  'token': 3185,
  'token_str': 'movie',
  'sequence': 'this is a great movie.'},
 {'score': 0.0480717308819294,
  'token': 2518,
  'token_str': 'thing',
  'sequence': 'this is a great thing.'},
 {'score': 0.041025906801223755,
  'token': 2265,
  'token_str': 'show',
  'sequence': 'this is a great show.'},
 {'score': 0.02726903185248375,
  'token': 3066,
  'token_str': 'deal',
  'sequence': 'this is a great deal.'}]

In [None]:
import math

# Initialize a list to store the perplexity values
# Perplexity (exp of eval_loss) of each evaluation run.
# Presumably the runs differ because the collator masks tokens at random - confirm.
perplexities = []

# Run the evaluation 10 times
for _ in range(10):
    eval_results = trainer.evaluate()
    perplexity = math.exp(eval_results['eval_loss'])
    perplexities.append(perplexity)

# Arithmetic mean of the per-run perplexities (not exp of the mean loss).
average_perplexity = sum(perplexities) / len(perplexities)

print(f">>> Average Perplexity over 10 runs: {average_perplexity:.2f}")


In [None]:
import math

# Initialize a list to store the perplexity values
# NOTE(review): this cell repeats the previous averaging cell verbatim -
# presumably re-run after pointing `trainer` at another checkpoint; confirm.
# Perplexity (exp of eval_loss) of each evaluation run.
perplexities = []

# Run the evaluation 10 times
for _ in range(10):
    eval_results = trainer.evaluate()
    perplexity = math.exp(eval_results['eval_loss'])
    perplexities.append(perplexity)

# Arithmetic mean of the per-run perplexities (not exp of the mean loss).
average_perplexity = sum(perplexities) / len(perplexities)

print(f">>> Average Perplexity over 10 runs: {average_perplexity:.2f}")
