## Model training

In [None]:
from transformers import DebertaV2Tokenizer, AutoTokenizer, DebertaV2ForMaskedLM
from transformers import PreTrainedTokenizerFast
from datasets import load_from_disk
from tqdm.auto import tqdm

### Load data

In [None]:
# Load the dataset previously saved with the `datasets` library. The path is
# relative to the notebook's working directory.
# NOTE(review): the directory name suggests the data is already tokenized --
# confirm it contains the columns the MLM collator expects (e.g. input_ids).
dataset = load_from_disk("../data/c4ai-wik-tokenized-aux")

### Load model and tokenizer

In [None]:
# Load the locally saved tokenizer that the model will be trained with.
tokenizer = AutoTokenizer.from_pretrained("../models/deberta_pt_tokenizer")

# Size the embedding matrix with len(tokenizer) rather than
# tokenizer.vocab_size: vocab_size only counts the base vocabulary, so any
# added token (DeBERTa tokenizers commonly register [MASK] this way) would
# receive an id outside the embedding table and break the forward pass.
# ignore_mismatched_sizes=True lets the checkpoint load even though the
# vocabulary-sized weights no longer match the pretrained shapes.
# NOTE(review): the "Train model" cell reloads `model` from
# "microsoft/deberta-v3-xsmall", so this base-sized model is never the one
# handed to the Trainer -- confirm which checkpoint is intended.
model = DebertaV2ForMaskedLM.from_pretrained(
    "microsoft/deberta-v3-base",
    vocab_size=len(tokenizer),
    ignore_mismatched_sizes=True,
)

### Split off a small dev subset for training

In [None]:
# Reproducible split (fixed seed). An integer test_size is an absolute number
# of examples, so the "test" side holds exactly 5 rows.
# NOTE(review): assumes `dataset` is a single Dataset, not a DatasetDict.
dev_train = dataset.train_test_split(seed=42, test_size=5)

### Trainer Parameters

In [None]:
from transformers import TrainingArguments

# Hyperparameters for the run, kept in one place so they can be inspected or
# logged independently of the TrainingArguments object.
params = dict(
    # where checkpoints are written and how the run is sized
    output_dir="model/model_deberta",
    per_device_train_batch_size=2,
    learning_rate=1e-3,
    seed=42,
    max_steps=10000,
    # logging: one record every 100 optimizer steps
    logging_dir="model/logs",
    logging_strategy="steps",
    logging_steps=100,
    # checkpointing: every 5000 steps, keeping at most 2 checkpoints
    save_strategy="steps",
    save_steps=5000,
    save_total_limit=2,
    # recorded here but deliberately NOT forwarded below
    report_to="tensorboard",
    # pretraining
    ddp_find_unused_parameters=False,
    warmup_steps=2,
)

# Forward every entry except "report_to", so TrainingArguments falls back to
# its own default for the reporting integrations. Hub upload options
# (push_to_hub, hub_strategy, hub_model_id, hub_token) are not configured.
# Mixed precision is enabled for both training and evaluation.
training_args = TrainingArguments(
    **{name: value for name, value in params.items() if name != "report_to"},
    fp16=True,
    fp16_full_eval=True,
)

### Data collator

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for language modeling: selects 15% of the tokens for the
# masking objective and pads every batch to a length that is a multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer,
    pad_to_multiple_of=8,
    mlm_probability=0.15,
)

### Train model

In [None]:
from transformers import Trainer

# Load the checkpoint that is actually trained. The embedding matrix is sized
# with len(tokenizer) (base vocabulary plus added tokens) so that every id the
# tokenizer can emit -- including added special tokens -- has an embedding row.
# ignore_mismatched_sizes=True allows loading despite the vocabulary-sized
# weights differing in shape from the pretrained checkpoint.
model = DebertaV2ForMaskedLM.from_pretrained(
    "microsoft/deberta-v3-xsmall",
    vocab_size=len(tokenizer),
    ignore_mismatched_sizes=True,
)

# NOTE(review): both train_dataset and eval_dataset point at the 5-example
# "test" split, which looks like a deliberate smoke-test setup -- switch
# train_dataset to dev_train["train"] for a full run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dev_train["test"],
    eval_dataset=dev_train["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
    # preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

# Run the optimization loop; without this call the model saved afterwards
# would still hold its untrained, freshly initialized vocabulary weights.
trainer.train()

In [None]:
# Persist the model weights and config, and store the tokenizer next to them
# so the directory can be reloaded on its own with from_pretrained().
# NOTE(review): TrainingArguments writes checkpoints to "model/model_deberta"
# (no leading "../") -- confirm the two locations are meant to differ.
model.save_pretrained("../model/model_deberta")
tokenizer.save_pretrained("../model/model_deberta")