Ref:

https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=LTXXutqeDzPi

In [1]:
!pip install transformers datasets



In [2]:
import os
from datasets import load_dataset, concatenate_datasets
from transformers import (
    BertTokenizer,
    BertTokenizerFast,
    BertConfig,
    BertModel,
    BertForMaskedLM, 
    Trainer, 
    TrainingArguments,
    LineByLineTextDataset,
    DataCollatorForLanguageModeling
)
import torch

## Configuration

In [3]:
RANDOM_SEED=37

#### Training

In [None]:
DATASET_LIMIT = 250_000
MODEL_MAX_LEN = 512

MLM_MASKING_PROB = .15
MLM_EPOCHS = 5

MLM_TRAIN_DATESET_PATH = 'mlm_train.txt'
MLM_TEST_DATESET_PATH = 'mlm_test.txt'

MODEL_NAME = "bert-base-uncased"
# MODEL_NAME = "bert-base-multilingual-uncased"

VOCAB_NAME = 'bert-base-uncased'
# VOCAB_NAME = "bert-base-multilingual-uncased"

### Drive

In [None]:
# MOUNTING DRIVE TO ACCESS DATASET
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# MOUNT PATH
DRIVE_PATH = os.path.join('drive','MyDrive','collab','research', 'bert_scratch')
VOCAB = 'eng'
MODEL_SAVE_PATH = os.path.join(DRIVE_PATH, f"{MODEL_NAME.replace('-','_')}_{VOCAB}_wiki_mlm")

## Load Dataset

In [4]:
wiki = load_dataset("wikipedia", "20200501.en", split="train")
# bookcorpus = load_dataset("bookcorpus", split="train")
# print(wiki.column_names, bookcorpus.column_names)
# # ['title', 'text'] ['text']

# wiki.remove_columns_("title")
# bert_dataset = concatenate_datasets([wiki, bookcorpus])


# dataset = load_dataset("cc_news", split="train")

bert_dataset = wiki

Reusing dataset wikipedia (/root/.cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475)


In [5]:
bert_dataset

Dataset({
    features: ['title', 'text'],
    num_rows: 6078422
})

In [6]:
# split the dataset into training (90%) and testing (10%)
dataset_split = bert_dataset.train_test_split(test_size=0.1)

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475/cache-2ccb569d81cbadc4.arrow and /root/.cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475/cache-f88d4876eeac61dd.arrow


In [7]:
# if you want to train the tokenizer from scratch (especially if you have custom
# dataset loaded as datasets object), then run this cell to save it as files
# but if you already have your custom data as text files, there is no point using this

def dataset_to_text(dataset, output_filename="data.txt"):
    """Utility function to save dataset text to disk,
    useful for using the texts to train the tokenizer 
    (as the tokenizer accepts files)"""
    with open(output_filename, "w") as f:
        for t in dataset["text"]:
           print(t.replace('\n', ''), file=f)

In [8]:
# save the training set to train.txt
dataset_to_text(dataset_split["train"][:DATASET_LIMIT], MLM_TRAIN_DATESET_PATH)
dataset_to_text(dataset_split["test"][:DATASET_LIMIT//4], MLM_TEST_DATESET_PATH)

In [9]:
# with open(MLM_TRAIN_DATESET_PATH, 'r') as f:
#     text = f.read()
#     lines = text.split('\n')
#     print(lines[2])

## Tokenizer

In [10]:
# Save the slow pretrained tokenizer
# bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

bert_tokenizer = BertTokenizerFast.from_pretrained(VOCAB_NAME, max_len=MODEL_MAX_LEN)

In [11]:
bert_tokenizer.encode("This is a test.")

[101, 2023, 2003, 1037, 3231, 1012, 102]

## Build Dataset For Training

In [12]:
dataset = LineByLineTextDataset(
    tokenizer=bert_tokenizer,
    file_path=MLM_TRAIN_DATESET_PATH,
    block_size=256,
)



In [13]:
# eval_dataset = LineByLineTextDataset(
#     tokenizer=bert_tokenizer,
#     file_path=MLM_TEST_DATESET_PATH,
#     block_size=512,
# )

In [14]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tokenizer, mlm=True, mlm_probability=MLM_MASKING_PROB
)

## Train

In [15]:
# Check that PyTorch sees it
torch.cuda.is_available()

True

Configuring Model from Scratch using config

In [16]:
config = BertConfig(
    num_hidden_layers=4, 
    num_attention_heads=4, 
    vocab_size= bert_tokenizer.vocab_size
)

model = BertForMaskedLM(config=config)

### GPU

In [17]:
device = torch.device('cuda')# and move our model over to the selected device
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

### Trainer

In [18]:
training_args = TrainingArguments(
    output_dir=MODEL_NAME,
    overwrite_output_dir=True,
    num_train_epochs=MLM_EPOCHS,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    # eval_dataset=eval_dataset
)

In [19]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 250017
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 78135
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,7.4932
1000,7.0004
1500,6.8446
2000,6.7046
2500,6.6302
3000,6.5463
3500,6.5019
4000,6.4486
4500,6.3862
5000,6.3528


Saving model checkpoint to bert-base-uncased/checkpoint-10000
Configuration saved in bert-base-uncased/checkpoint-10000/config.json
Model weights saved in bert-base-uncased/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to bert-base-uncased/checkpoint-20000
Configuration saved in bert-base-uncased/checkpoint-20000/config.json
Model weights saved in bert-base-uncased/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to bert-base-uncased/checkpoint-30000
Configuration saved in bert-base-uncased/checkpoint-30000/config.json
Model weights saved in bert-base-uncased/checkpoint-30000/pytorch_model.bin
Deleting older checkpoint [bert-base-uncased/checkpoint-10000] due to args.save_total_limit
Saving model checkpoint to bert-base-uncased/checkpoint-40000
Configuration saved in bert-base-uncased/checkpoint-40000/config.json
Model weights saved in bert-base-uncased/checkpoint-40000/pytorch_model.bin
Deleting older checkpoint [bert-base-uncased/checkpoint-20000] due to args.s

CPU times: user 4h 51min 50s, sys: 1min 1s, total: 4h 52min 52s
Wall time: 4h 51min 16s


TrainOutput(global_step=78135, training_loss=3.778586981100794, metrics={'train_runtime': 17476.5866, 'train_samples_per_second': 71.529, 'train_steps_per_second': 4.471, 'total_flos': 5.563704967505849e+16, 'train_loss': 3.778586981100794, 'epoch': 5.0})

In [57]:
trainer.save_model(MODEL_SAVE_PATH)

Saving model checkpoint to drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki_mlm
Configuration saved in drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki_mlm/config.json
Model weights saved in drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki_mlm/pytorch_model.bin


## Test Model

### Fill Mask

In [None]:
from transformers import pipeline

In [25]:
fill_mask = pipeline(
    "fill-mask",
    # model="./bert_uncased_based_wiki",
    model=MODEL_SAVE_PATH,
    tokenizer=bert_tokenizer
)

loading configuration file drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki/config.json
Model config BertConfig {
  "_name_or_path": "drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.14.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading configuration file drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki/config.json
Model config BertConfig {
  "_name_or_path": "drive/MyDrive/collab/research/be

In [31]:
fill_mask("What a nice [MASK]")

[{'score': 0.07262233644723892,
  'sequence': 'what a nice "',
  'token': 1000,
  'token_str': '"'},
 {'score': 0.0201098769903183,
  'sequence': 'what a nice people',
  'token': 2111,
  'token_str': 'people'},
 {'score': 0.018932225182652473,
  'sequence': 'what a nices',
  'token': 2015,
  'token_str': '##s'},
 {'score': 0.01620546169579029,
  'sequence': 'what a nice page',
  'token': 3931,
  'token_str': 'page'},
 {'score': 0.013936794362962246,
  'sequence': 'what a nice is',
  'token': 2003,
  'token_str': 'is'}]

### Test [CLS] Embeddings

In [33]:
model = BertModel.from_pretrained(MODEL_SAVE_PATH)

loading configuration file drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.14.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki/pytorch_model.bin
Some weights of the model checkpoint at drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki were not used when initializing BertModel: ['cls.predictions

In [44]:
sequence_list = ['this is a test', 'this is another test']
encoded_input = bert_tokenizer(
        sequence_list, padding=True, truncation=True, return_tensors="pt"
    )
with torch.no_grad():
    pt_output = model(**encoded_input)

In [45]:
a = pt_output["last_hidden_state"][0][0]

In [42]:
b = pt_output["last_hidden_state"][0][0]

### Compare similarity between embeddings

In [46]:
from scipy import spatial

1 - spatial.distance.cosine(a, b)

1.0

### Tensorflow Embeddings similarity computation

In [52]:
from transformers import TFBertModel

tf_model = TFBertModel.from_pretrained(MODEL_SAVE_PATH, from_pt=True)

encoded_input_tf = bert_tokenizer(
        sequence_list, padding=True, truncation=True, return_tensors="tf"
    )

tf_output = tf_model(encoded_input_tf)

loading configuration file drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.14.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki/pytorch_model.bin
Loading PyTorch weights from /content/drive/MyDrive/collab/research/bert_scratch/bert_base_uncased_eng_wiki/pytorch_model.bin
PyTorch checkpoint contains 76,283,252 param

In [51]:
tf_a = tf_output[0][0][0]

In [53]:
tf_b = tf_output[0][0][0]

In [54]:
1 - spatial.distance.cosine(tf_a, tf_b)

1.0

In [55]:
1 - spatial.distance.cosine(a, tf_a)

1.0