In [3]:
#!pip install pynvml==11.5.0

In [1]:
#!pip install numpy --pre torch torchvision torchaudio --force-reinstall --index-url https://download.pytorch.org/whl/nightly/cu117

In [5]:
# !pip install git+https://github.com/huggingface/transformers
# !pip list | grep -E 'transformers|tokenizers'

In [1]:
from pynvml import *


def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, in whole MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device that was previously hard-coded.

    NVML is initialised only for the duration of the query and shut down
    again afterwards, so repeated calls from different cells do not leave
    NVML sessions open.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit() with nvmlShutdown(), even if the query raises.
        nvmlShutdown()
    # The value is integer-divided by 1024**2 and reported as MB.
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    """Report runtime and throughput of a finished run, then GPU memory use.

    `result` must expose a `metrics` mapping containing the keys
    'train_runtime' and 'train_samples_per_second'.
    """
    runtime_line = f"Time: {result.metrics['train_runtime']:.2f}"
    print(runtime_line)
    throughput_line = f"Samples/second: {result.metrics['train_samples_per_second']:.2f}"
    print(throughput_line)
    # Finish with the current GPU memory footprint.
    print_gpu_utilization()

In [2]:
import pynvml
print(pynvml.__version__)

11.5.0


In [3]:
print_gpu_utilization()

GPU memory occupied: 5290 MB.


In [4]:
import random
import os
import torch
import numpy as np

In [5]:
print(torch.__version__)

2.0.0.dev20230125+cu117


In [6]:
import transformers
print(transformers.__version__)

4.27.1


In [7]:
from platform import python_version

print(python_version())

3.7.12


In [8]:
torch.cuda.empty_cache()

In [9]:
torch.backends.cudnn.enabled

True

In [10]:
torch.cuda.is_available()

True

In [11]:
print(torch.version.cuda)

11.7


In [12]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [13]:
!nvidia-smi

Wed Mar 22 21:32:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.142.00   Driver Version: 450.142.00   CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM3...  On   | 00000000:36:00.0 Off |                    0 |
| N/A   33C    P0    63W / 350W |   6921MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [14]:
seed=40

In [15]:
# Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs with the shared `seed`.
os.environ["PYTHONHASHSEED"] = str(seed)
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(seed)
# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

In [16]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')

In [17]:
# Size the embedding table from the tokenizer loaded in the previous cell
# instead of a hand-copied literal (previously 83828), so the model's
# vocabulary can never drift out of sync with the tokenizer's.
# len(tokenizer) also counts added tokens, which the model must be able to embed.
VOCAB_SIZE = len(tokenizer)

In [18]:
from transformers import BertConfig

# Architecture hyper-parameters that are set explicitly; every field not
# listed here keeps its BertConfig default.
bert_hparams = dict(
    vocab_size=VOCAB_SIZE,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=2,
)
config = BertConfig(**bert_hparams)

In [25]:
from transformers import BertForMaskedLM

# Build the masked-LM model from `config` alone: no from_pretrained() call is
# made here, so no pretrained checkpoint is loaded into this model.
model = BertForMaskedLM(config=config)

In [26]:
from datasets import load_dataset
# Load the local corpus file with the generic "text" loader; the output below
# shows the resulting 'train' split has a single 'text' column.
data = load_dataset('text', data_files="preprocessed_ru_convers.txt")

Found cached dataset text (/home/jovyan/.cache/huggingface/datasets/text/default-d55e9e324edd0a62/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [27]:
data['train']

Dataset({
    features: ['text'],
    num_rows: 26508030
})

In [28]:
#data['train']['text'][:6]

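Reference (Hugging Face forum thread on building a language-modeling dataset as with the old `TextDataset`): https://discuss.huggingface.co/t/help-understanding-how-to-build-a-dataset-for-language-as-with-the-old-textdataset/5870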

In [29]:
# Truncation length used when tokenizing raw lines in this cell.
# NOTE: `max_length` is rebound to 128 in the next cell. `group_texts` reads
# the global at call time, so calling it after that cell (without re-running
# this one) would truncate at 128 instead of 512.
max_length = 512
import multiprocessing

# One worker process per CPU reported by multiprocessing.cpu_count().
num_proc = multiprocessing.cpu_count()

def group_texts(examples):
    """Tokenize a batch of raw lines (despite the name, nothing is grouped here).

    Reads `examples["text"]`, truncates each entry to the module-level
    `max_length`, and asks the tokenizer to also return the special-tokens
    mask. The tokenizer output is returned unchanged.
    """
    return tokenizer(
        examples["text"],
        return_special_tokens_mask=True,
        truncation=True,
        max_length=max_length,
    )


# Tokenize the whole 'train' split in batches across `num_proc` workers; the
# raw "text" column is dropped so only the tokenizer outputs remain.
train_dataset = data["train"].map(
    group_texts,
    batched=True,
    remove_columns=["text"],
    num_proc=num_proc,
)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/text/default-d55e9e324edd0a62/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-03e084708e3dce57_*_of_00096.arrow


In [30]:
from itertools import chain
max_length = 128


def group_texts2(examples):
    """Concatenate every column of a tokenized batch and re-cut it into chunks.

    Each column in `examples` is a list of token lists. All lists of a column
    are joined end to end, then sliced into consecutive pieces of
    `max_length` items. When the joined length is at least `max_length`, the
    trailing remainder that does not fill a whole piece is dropped; a batch
    shorter than `max_length` is kept as one short piece. The cut points are
    computed from the first column and applied to every column.
    """
    column_names = list(examples.keys())

    flattened = {}
    for column in column_names:
        flattened[column] = list(chain.from_iterable(examples[column]))

    usable = len(flattened[column_names[0]])
    if usable >= max_length:
        # Round down to a whole number of chunks.
        usable -= usable % max_length

    chunked = {}
    for column, tokens in flattened.items():
        pieces = []
        for start in range(0, usable, max_length):
            pieces.append(tokens[start : start + max_length])
        chunked[column] = pieces
    return chunked

# Re-chunk the tokenized corpus into fixed-length training blocks.
train_dataset2 = train_dataset.map(
    group_texts2,
    batched=True,
    num_proc=num_proc,
)

# Token estimate: number of blocks times the block length.
print(f"the dataset contains in total {len(train_dataset2)*max_length} tokens")

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/text/default-d55e9e324edd0a62/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2/cache-2098b2a5eac88d3f_*_of_00096.arrow


the dataset contains in total 466934400 tokens


In [31]:
#len(train_dataset2['input_ids'][0])

In [32]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for masked-language-model training (mlm=True) using the
# shared `tokenizer`, with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [33]:
torch.cuda.is_available()

True

In [37]:
from transformers import Trainer, TrainingArguments

# Run configuration. Training length is bounded by a step count (max_steps).
training_args = TrainingArguments(
    # Checkpoints and outputs go here; the same path is reused by save_model().
    output_dir="./ourBERTF",
    max_steps = 200000,
    per_device_train_batch_size=32,
    # Mixed-precision (fp16) training.
    fp16=True,
    # Checkpoint every 30000 steps. NOTE: 200000 is not a multiple of 30000,
    # so the last periodic checkpoint falls at step 180000; the final state
    # is written by the explicit trainer.save_model() call later on.
    save_strategy='steps',
    save_steps = 30000,
    # Reuse the notebook-wide seed.
    seed=seed,
    # Report the training loss every 1000 steps.
    logging_steps=1000,
    prediction_loss_only=True,
    dataloader_num_workers=4)

In [39]:
#!pip install bitsandbytes==0.37.1

In [39]:
import bitsandbytes



In [40]:
import bitsandbytes as bnb
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names


# Names of parameters that should receive weight decay: those returned by
# get_parameter_names() with nn.LayerNorm excluded, minus any whose name
# contains "bias".
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
# Set copy for O(1) membership tests while partitioning the parameters below
# (`decay_parameters` itself stays a list, as before).
decay_parameter_names = set(decay_parameters)

# Split the model's parameters into the two groups in a single pass.
decay_params, no_decay_params = [], []
for name, param in model.named_parameters():
    if name in decay_parameter_names:
        decay_params.append(param)
    else:
        no_decay_params.append(param)

optimizer_grouped_parameters = [
    {
        "params": decay_params,
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": no_decay_params,
        "weight_decay": 0.0,
    },
]

# Single source of truth for the Adam hyper-parameters, all taken from
# `training_args`. Previously this dict was built but never used, and the
# same values were repeated by hand in the optimizer call.
optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
    "lr": training_args.learning_rate,
}
adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    **optimizer_kwargs,
)

In [41]:
# Wire model, arguments, collator and the chunked dataset together.
# `optimizers` is an (optimizer, scheduler) pair: the custom 8-bit Adam is
# supplied and the scheduler slot is None.
# NOTE(review): presumably Trainer then builds its default LR scheduler — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset2,
    optimizers=(adam_bnb_optim, None)
)

In [42]:
torch.cuda.is_available()

True

In [43]:
print_gpu_utilization()

GPU memory occupied: 8524 MB.


In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  else: return ct.c_void_p(A.data.storage().data_ptr())


Step,Training Loss
1000,8.0448


In [None]:
# Persist the final model into the same directory that was configured as
# `output_dir` in TrainingArguments.
trainer.save_model("./ourBERTF")

In [None]:
from transformers import pipeline

# Fill-mask pipeline over the model directory written by save_model(); the
# in-memory `tokenizer` is passed explicitly rather than loaded from that
# directory.
fill_mask = pipeline(
    "fill-mask",
    model="./ourBERTF",
    tokenizer=tokenizer
)

In [None]:
fill_mask('Достоевский написал несколько хороших [MASK].')

In [None]:
fill_mask('Я [MASK] в школу.')