In [26]:

from argparse import Namespace
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    DataCollatorForLanguageModeling
)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

args = Namespace()
args.train = "medium.txt"
args.max_len = 128
args.epochs = 1
args.batch_size = 4
args.token_path = 'georgian-vocab.txt'

In [27]:
from tokenizers import BertWordPieceTokenizer

# initialize
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False
)
# and train
tokenizer.train(files=args.train, vocab_size=30_000, min_frequency=2,
                limit_alphabet=1000, wordpieces_prefix='##',
                special_tokens=[
                    '[PAD', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

# Save files to disk
tokenizer.save_model(".", "georgian")






['./georgian-vocab.txt']

In [28]:

tok = BertTokenizer(
    args.token_path
)
# and trai

In [29]:
tok('გამარჯობა როგორ ხარ მებადური ზღვაშიშევარდნილი')

{'input_ids': [2, 25401, 289, 2306, 9937, 13489, 155, 12375, 927, 11255, 1393, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [30]:
class MaskedLMDataset(Dataset):
    def __init__(self, file, tokenizer):
        self.tokenizer = tokenizer
        self.lines = self.load_lines(file)
        self.ids = self.encode_lines(self.lines)
        
    def load_lines(self, file):
        with open(file) as f:
            lines = [
                line
                for line in f.read().splitlines()
                if (len(line) > 0 and not line.isspace())
            ]
        return lines
    
    def encode_lines(self, lines):
        batch_encoding = self.tokenizer(
            lines, add_special_tokens=True, truncation=True, max_length=args.max_len
        )
        return batch_encoding["input_ids"]

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        return torch.tensor(self.ids[idx], dtype=torch.long)
        
train_dataset = MaskedLMDataset(args.train, tok)

In [31]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tok, mlm=True, mlm_probability=0.15
)

train_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    collate_fn=data_collator
)

In [32]:
config = BertConfig(vocab_size=30_000,
    max_position_embeddings=514,
    num_attention_heads=6,
    num_hidden_layers=6)

In [40]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
bert = BertForMaskedLM(config).to(device)
device

device(type='cuda', index=0)

In [41]:
model_size = sum(t.numel() for t in bert.parameters())
model_size

66587184

In [42]:
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="model",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=64,
    save_steps=5000,  
)

trainer = Trainer(
    model=bert,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [43]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
trainer.train()

***** Running training *****
  Num examples = 200000
  Num Epochs = 20
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 62500


Step,Training Loss
500,9.2558
1000,8.7841
1500,8.5307
2000,8.3699
2500,8.2356
3000,8.0432
3500,7.9143
4000,7.7965
4500,7.6877
5000,7.5747


Saving model checkpoint to model/checkpoint-5000
Configuration saved in model/checkpoint-5000/config.json
Model weights saved in model/checkpoint-5000/pytorch_model.bin
