# MonteBERT
Downstream masked Language Modeling task using a Transformer, based on DistilBERT and trained from scratch.

## Import Libraries

In [40]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import os
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

## Get Dataset

In [3]:
paths = [str(x) for x in Path(".").glob("**/*.txt")]
print(paths)

['data\\cristo.txt']


## Train a Tokenizer
I don't use a pretrained tokenizer instead I use bytelevel tokenizer to break monte cristo string or word down into a sub-string or sub-word. "smaller" and smallest" become "small," "er," and "est.". WorkPiece
level encoding will allow strings classified as unkown (unk_token) to disappear.
* " the tokenizer" turns into "'Ġthe', 'Ġtoken', 'izer'" where Ġ means blank space
* Each token is given an indice value 'Ġthe' = 150, 'Ġtoken'=5430, 'izer'=4712
* merges.txt, which contains the merged tokenized sub-strings
* vocab.json, which contains the indices of the tokenized sub-strings

In [5]:
tokenizer = ByteLevelBPETokenizer() #initialize tokenizer
#customise training
tokenizer.train(files=paths, #path to the dataset.
                vocab_size=52_000, #the size of our tokenizer's model length.
                min_frequency=2, #the minimum frequency threshold.
                special_tokens=[
    "<s>",#a start token
    "<pad>",# a padding token
    "</s>",# an end token
    "<unk>", #an unknown token
    "<mask>", #the mask token for language modeling
])

## Save files to disk

In [11]:
token_dir = './content/MonteBERT'
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model(token_dir)

['./content/MonteBERT\\vocab.json', './content/MonteBERT\\merges.txt']

## load trained tokenizer

In [3]:
tokenizer = ByteLevelBPETokenizer(
    './content/MonteBERT/vocab.json',
    './content/MonteBERT/merges.txt',
)

In [12]:
t = tokenizer.encode("Dantès had learned how to wait; he had waited fourteen years for his liberty," +
                       "and now he was free he could wait at least six months or a year for wealth")
print(t.tokens) #the tokens
print(t) # meta info like # of tokens (e.g 33)

['DantÃ¨s', 'Ġhad', 'Ġlearned', 'Ġhow', 'Ġto', 'Ġwait', ';', 'Ġhe', 'Ġhad', 'Ġwaited', 'Ġfourteen', 'Ġyears', 'Ġfor', 'Ġhis', 'Ġliberty', ',', 'and', 'Ġnow', 'Ġhe', 'Ġwas', 'Ġfree', 'Ġhe', 'Ġcould', 'Ġwait', 'Ġat', 'Ġleast', 'Ġsix', 'Ġmonths', 'Ġor', 'Ġa', 'Ġyear', 'Ġfor', 'Ġwealth']
Encoding(num_tokens=33, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [13]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
    )
tokenizer.enable_truncation(max_length=512)

In [14]:
t = tokenizer.encode("Dantès had learned how to wait; he had waited fourteen years for his liberty," +
                       "and now he was free he could wait at least six months or a year for wealth")
print(t.tokens) #the tokens
print(t) # meta info like # of tokens (now 35 due to start and end tokens)

['<s>', 'DantÃ¨s', 'Ġhad', 'Ġlearned', 'Ġhow', 'Ġto', 'Ġwait', ';', 'Ġhe', 'Ġhad', 'Ġwaited', 'Ġfourteen', 'Ġyears', 'Ġfor', 'Ġhis', 'Ġliberty', ',', 'and', 'Ġnow', 'Ġhe', 'Ġwas', 'Ġfree', 'Ġhe', 'Ġcould', 'Ġwait', 'Ġat', 'Ġleast', 'Ġsix', 'Ġmonths', 'Ġor', 'Ġa', 'Ġyear', 'Ġfor', 'Ġwealth', '</s>']
Encoding(num_tokens=35, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


## check GPU availability

In [20]:
!nvidia-smi

Sun Nov 26 20:57:03 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.01                 Driver Version: 546.01       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1060      WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   58C    P0              24W /  78W |    701MiB /  6144MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [21]:
"torch sees CUDA" if torch.cuda.is_available() else "torch doesnt CUDA" #torch sees CUDA?

'torch sees CUDA'

## configuration of the Model

In [28]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



## Reload Tokenizer in Transformer

In [29]:
tokenizer = RobertaTokenizer.from_pretrained("./content/MonteBERT", max_length=512)

## Import RoBERTa model

In [32]:
model = RobertaForMaskedLM(config=config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [33]:
print(model.num_parameters()) #83.5 million parameters

83504416


In [34]:
LP=list(model.parameters())
lp=len(LP)
print(lp)#106 matrices and vectors

106


In [35]:
for p in range(0,lp):
    print(LP[p])
#number of parameters is calculated by taking all parameters in the model and adding them up
#52k vocab x 768 dimensions; 768dimensions/12heads = 64 dimensions per head

Parameter containing:
tensor([[-0.0092, -0.0243, -0.0164,  ..., -0.0135,  0.0142, -0.0039],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0263,  0.0276, -0.0013,  ...,  0.0035,  0.0210, -0.0021],
        ...,
        [-0.0138,  0.0098, -0.0406,  ..., -0.0328, -0.0539,  0.0080],
        [ 0.0146,  0.0008,  0.0026,  ..., -0.0055,  0.0398, -0.0367],
        [-0.0127, -0.0127,  0.0032,  ...,  0.0338,  0.0120, -0.0231]],
       requires_grad=True)
Parameter containing:
tensor([[-0.0205,  0.0112,  0.0201,  ..., -0.0187, -0.0416,  0.0376],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0330, -0.0033,  0.0144,  ...,  0.0425, -0.0170,  0.0115],
        ...,
        [ 0.0266,  0.0420,  0.0043,  ...,  0.0221,  0.0056, -0.0017],
        [-0.0249,  0.0075, -0.0005,  ..., -0.0118, -0.0035, -0.0232],
        [-0.0208,  0.0070, -0.0171,  ..., -0.0090,  0.0038, -0.0019]],
       requires_grad=True)
Parameter containing:
tensor([[ 8.

In [36]:
#count the number of parameters of each tensor
np=0
for p in range(0,lp):#number of tensors
    PL2=True
    try:
        L2=len(LP[p][0]) #2d check
    except:
        L2 = 1 #1d
        PL2=False
    L1=len(LP[p])
    L3=L1*L2
    np+=L3 #params per tensor
    if PL2==True:
        print(p,L1,L2,L3) # size of params
    if PL2==False:
        print(p,L1,L3)
print(np) #total # of params

0 52000 768 39936000
1 514 768 394752
2 1 768 768
3 768 768
4 768 768
5 768 768 589824
6 768 768
7 768 768 589824
8 768 768
9 768 768 589824
10 768 768
11 768 768 589824
12 768 768
13 768 768
14 768 768
15 3072 768 2359296
16 3072 3072
17 768 3072 2359296
18 768 768
19 768 768
20 768 768
21 768 768 589824
22 768 768
23 768 768 589824
24 768 768
25 768 768 589824
26 768 768
27 768 768 589824
28 768 768
29 768 768
30 768 768
31 3072 768 2359296
32 3072 3072
33 768 3072 2359296
34 768 768
35 768 768
36 768 768
37 768 768 589824
38 768 768
39 768 768 589824
40 768 768
41 768 768 589824
42 768 768
43 768 768 589824
44 768 768
45 768 768
46 768 768
47 3072 768 2359296
48 3072 3072
49 768 3072 2359296
50 768 768
51 768 768
52 768 768
53 768 768 589824
54 768 768
55 768 768 589824
56 768 768
57 768 768 589824
58 768 768
59 768 768 589824
60 768 768
61 768 768
62 768 768
63 3072 768 2359296
64 3072 3072
65 768 3072 2359296
66 768 768
67 768 768
68 768 768
69 768 768 589824
70 768 768
71 768 768

## Build the dataset

In [38]:
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./data/cristo.txt",
    block_size=128,#limits the length of an example
)



## Create a data collector

In [41]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

## Initalise the Trainer

In [43]:
training_args = TrainingArguments(
    output_dir="./content/MonteBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`