## Train a BERT model (from scratch)

In [25]:
import torch
from transformers import BertConfig, BertForMaskedLM
from transformers import Trainer, TrainingArguments
from tokenizers import BertWordPieceTokenizer

from _dataset import BERT16SDataset
from _collator import DataCollatorForBertWordPieceTokenizer

#### Check resources

In [2]:
# Check that we have a GPU. The leading `!` hands the command to the system
# shell; on a machine without the NVIDIA driver tools the shell reports
# "command not found" instead of a device table.
!nvidia-smi

/bin/sh: nvidia-smi: command not found


In [3]:
# Check that PyTorch sees it: True means torch can use a CUDA device from this
# kernel, False means CUDA is not available to torch here.
torch.cuda.is_available()

False

### Prepare model configurations

In [4]:
# Vocabulary size handed to the model configuration; chosen to be parallel to
# k=8 in classic k-mers (for this corpus).
vocab_size = 15_621

In [18]:
# Hyper-parameters for a small BERT encoder.
config = BertConfig(
    # Vocabulary / sequence limits.
    vocab_size=vocab_size,
    max_position_embeddings=512,  # same 512 used for block_size and tokenizer truncation
    # Depth and attention layout.
    num_hidden_layers=4,
    num_attention_heads=4,
    # Layer widths.
    hidden_size=64,
    intermediate_size=1024,
)

### Create BERT model

In [26]:
# Seed PyTorch's RNG before building the model. The model is created from a
# config only (no pretrained weights), so its parameters are initialised
# randomly; without a fixed seed every "Restart & Run All" would start training
# from different weights.
# NOTE(review): 42 mirrors the documented default of TrainingArguments.seed —
# confirm against the installed transformers version.
torch.manual_seed(42)

# Untrained BERT with a masked-language-modelling head, built from `config`.
model = BertForMaskedLM(config=config)

# Report the parameter count in millions.
print(f"BERT model has {model.num_parameters()/10**6}M parameters")

BERT model has 1.653061M parameters


### Create Dataset of the 16S corpus

In [27]:
# Relative paths to the tokenizer vocabulary file and the 16S corpus table.
vocab_path, data_path = "vocab.txt", "SILVA_parsed_V2.tsv"

In [28]:
# Build the 16S training dataset from the corpus and vocabulary files.
# block_size=512 is the same length used for max_position_embeddings and for
# the tokenizer's truncation/padding in this notebook.
# NOTE(review): the exact meaning of block_size is defined in _dataset.py —
# presumably the per-example token length; confirm there.
dataset = BERT16SDataset(
    data_path=data_path,
    vocab_path=vocab_path,
    block_size=512,
)

I0718 13:19:06.994493 4627736000 _dataset.py:23] Loading BERT tokenizer using vocab file vocab.txt
I0718 13:19:07.010282 4627736000 _dataset.py:31] Loading 16S dataset file at SILVA_parsed_V2.tsv...
  if (await self.run_code(code, result,  async_=asy)):
I0718 13:19:12.394495 4627736000 _dataset.py:33] 16S corpus is of shape (432033, 13)


### Create a Data Collator object

In [29]:
# WordPiece tokenizer loaded from the vocabulary file, with text normalisation
# switched off (no lower-casing, no Chinese-character handling).
tokenizer = BertWordPieceTokenizer(
    vocab_path,
    lowercase=False,
    handle_chinese_chars=False,
    # Special tokens, spelled out explicitly.
    cls_token="[CLS]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    unk_token="[UNK]",
    mask_token="[MASK]",
)

# Bound every encoding at 512 tokens: truncate longer inputs, pad shorter ones.
# NOTE(review): newer `tokenizers` releases appear to name the padding keyword
# `length` rather than `max_length` — confirm against the installed version
# before upgrading.
tokenizer.enable_truncation(512)
tokenizer.enable_padding(max_length=512)

In [30]:
# Sanity check: the tokenizer's vocabulary size should equal `vocab_size`
# (15621), the value given to BertConfig.
len(tokenizer.get_vocab())

15621

In [31]:
# Collator that turns dataset examples into masked-language-modelling batches.
# NOTE(review): mlm_probability=0.15 is presumably the fraction of tokens
# selected for masking — confirm in _collator.py.
data_collator = DataCollatorForBertWordPieceTokenizer(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

### Configure Training

In [32]:
# Settings for a single-epoch training run.
# NOTE(review): recent transformers releases rename `per_gpu_train_batch_size`
# to `per_device_train_batch_size` — confirm against the installed version
# before switching.
training_args = TrainingArguments(
    # Where outputs go (an existing directory is overwritten).
    output_dir="./output",
    overwrite_output_dir=True,
    # Run length and batch size.
    per_gpu_train_batch_size=64,
    num_train_epochs=1,
    # Checkpointing: every 10,000 steps, keeping at most two checkpoints.
    save_total_limit=2,
    save_steps=10_000,
)

In [33]:
# Tie the model, the dataset, the collator, and the run settings together.
trainer = Trainer(
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
    prediction_loss_only=True,
)

I0718 13:19:14.260339 4627736000 training_args.py:159] PyTorch: setting up devices
W0718 13:19:14.264362 4627736000 trainer.py:208] You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it.
I0718 13:19:14.265350 4627736000 trainer.py:214] You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


## Train!

In [None]:
%%time
trainer.train()