## Preparing the dataset

In [34]:
import os
import comet_ml

from accelerate import notebook_launcher
from accelerate.utils import set_seed
from datasets import load_dataset, ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML
from transformers import AutoTokenizer
import torch
import gc
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling

In [2]:
dataset = load_dataset("text", data_dir='./', split='train')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
dataset = dataset.train_test_split(test_size=0.2)

In [4]:
def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [5]:
show_random_elements(dataset['train'])

Unnamed: 0,text
0,"» Ачуы килсә, ни эшләргә уйлагандыр, анысын сәхия әйтеп бетермәде, һич көтмәгәндә күкле-яшелле тавыш белән тыштагыларга җырлап җавап кайтарды: әллә, дустым, укыдыңмы Каргалар курсысында; Карга кебек карылдыйсың Тәрәзә турысында."
1,"""дигән сорауга каршы тавыш бирүгә чакырып, республиканы төрле плакат-листовкалар өеме эчендә калдырдылар."
2,"Кояшта тән пешкәндә Бер аш кашыгы үсемлек мае, ике аш кашыгы каймак һәм бер йомырка сарысы кирәк."
3,"Алар мине “өф” ләп үстермәде, Булсын диеп безгә яраклы."
4,Шуннан соң үзенең чыгышында Русия җәмәгать пулатының комиссия башлыгы бүгенге җәмгыятьтәге әхлакый нормаларга таянырга кирәклеге турында сөйләде.
5,"Симез Мәче йортның юлларын белә иде, тиз бер якка сыенды."
6,"әллә кайчан кулын кыскартырга иде инде,."
7,"әмма председательлекне дә кулына алган секретарьлар кемнәр алар, барысы да лаек кешеләрме?"
8,Судан моны израильнең хәрби операциясе булмагае дип шикләнә.
9,Шуның белән җәләл хәзрәт ал да гөл булыр.


## Causal Language modeling

In [6]:
os.environ["COMET_LOG_ASSETS"] = "True"
model_checkpoint = "distilgpt2"

In [7]:
old_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
tokenizer = old_tokenizer.train_new_from_iterator(dataset, 52_000, length=1024)
tokenizer.save_pretrained("tatar_tokenizer-gpt2")






('tatar_tokenizer/tokenizer_config.json',
 'tatar_tokenizer/special_tokens_map.json',
 'tatar_tokenizer/vocab.json',
 'tatar_tokenizer/merges.txt',
 'tatar_tokenizer/added_tokens.json',
 'tatar_tokenizer/tokenizer.json')

In [9]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [10]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=8, remove_columns=["text"])

Map (num_proc=8):   0%|          | 0/400828 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1113 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1245 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1080 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1588 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1732 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Map (num_proc=8):   0%|          | 0/100207 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1057 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1110 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1039 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1191 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1147 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

In [11]:
# block_size = tokenizer.model_max_length
block_size = 128
BATCH_SIZE = 128

In [12]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=BATCH_SIZE,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/400828 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/100207 [00:00<?, ? examples/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 400828
    })
    test: Dataset({
        features: ['text'],
        num_rows: 100207
    })
})

In [15]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'ртына мин, иң беренче чиратта, нәрсә тәкъдим итәр идем? Мәсьәләне спон�'

In [16]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

### GPU runtime

In [17]:
torch.cuda.empty_cache()
gc.collect()

74

In [18]:
!nvidia-smi

Fri Sep 15 13:25:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   28C    P0    25W / 250W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:02:00.0 Off |                    0 |
| N/A   31C    P0    27W / 250W |      0MiB / 32768MiB |      0%      Default |
|       

### Training

In [19]:
comet_ml.init( project_name = "TatNlp", experiment_name = "TatNlp-distill-gp2")

In [20]:
def training_function():
    model_name = model_checkpoint.split("/")[-1]
    training_args = TrainingArguments(
        f"{model_name}-finetuned-tatar_nlp_1",
        evaluation_strategy = "epoch",
        overwrite_output_dir=True, 
        num_train_epochs=5,
        per_device_train_batch_size=BATCH_SIZE,
        save_steps=500, 
        save_total_limit=2,
        do_train=True,
    )
    set_seed(42)
    torch.manual_seed(7)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["test"],
    )
    trainer.train()
    return trainer

In [21]:
trainer = notebook_launcher(training_function, num_processes=2, mixed_precision='fp16')

Launching training on 2 GPUs.


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/gumaonelove/tatnlp/9344357af5074c9c9fb969f019781ce1



Epoch,Training Loss,Validation Loss
1,0.9922,0.878971
2,0.8819,0.82271
3,0.8453,0.797918
4,0.8269,0.786087
5,0.8198,0.782212


[1;38;5;196mCOMET ERROR:[0m We failed to read file /home/asr/projects/speach/research/NLP/distilgpt2-finetuned-tatar_nlp_1/checkpoint-9500/optimizer.pt for uploading.
Please double-check the file path and permissions
[1;38;5;196mCOMET ERROR:[0m We failed to read file /home/asr/projects/speach/research/NLP/distilgpt2-finetuned-tatar_nlp_1/checkpoint-9500/rng_state_0.pth for uploading.
Please double-check the file path and permissions
[1;38;5;196mCOMET ERROR:[0m We failed to read file /home/asr/projects/speach/research/NLP/distilgpt2-finetuned-tatar_nlp_1/checkpoint-9500/trainer_state.json for uploading.
Please double-check the file path and permissions
[1;38;5;196mCOMET ERROR:[0m We failed to read file /home/asr/projects/speach/research/NLP/distilgpt2-finetuned-tatar_nlp_1/checkpoint-9500/scheduler.pt for uploading.
Please double-check the file path and permissions
[1;38;5;196mCOMET ERROR:[0m We failed to read file /home/asr/projects/speach/research/NLP/distilgpt2-finetuned-ta

[1;38;5;39mCOMET INFO:[0m     args/fsdp_transformer_layer_cls_to_wrap : None
[1;38;5;39mCOMET INFO:[0m     args/full_determinism                   : False
[1;38;5;39mCOMET INFO:[0m     args/gradient_accumulation_steps        : 1
[1;38;5;39mCOMET INFO:[0m     args/gradient_checkpointing             : False
[1;38;5;39mCOMET INFO:[0m     args/greater_is_better                  : None
[1;38;5;39mCOMET INFO:[0m     args/group_by_length                    : False
[1;38;5;39mCOMET INFO:[0m     args/half_precision_backend             : auto
[1;38;5;39mCOMET INFO:[0m     args/hub_always_push                    : False
[1;38;5;39mCOMET INFO:[0m     args/hub_model_id                       : None
[1;38;5;39mCOMET INFO:[0m     args/hub_private_repo                   : False
[1;38;5;39mCOMET INFO:[0m     args/hub_strategy                       : HubStrategy.EVERY_SAVE
[1;38;5;39mCOMET INFO:[0m     args/hub_token                          : None
[1;38;5;39mCOMET INFO:[0m     

[1;38;5;39mCOMET INFO:[0m     config/cross_attention_hidden_size      : None
[1;38;5;39mCOMET INFO:[0m     config/decoder_start_token_id           : None
[1;38;5;39mCOMET INFO:[0m     config/diversity_penalty                : 0.0
[1;38;5;39mCOMET INFO:[0m     config/do_sample                        : False
[1;38;5;39mCOMET INFO:[0m     config/early_stopping                   : False
[1;38;5;39mCOMET INFO:[0m     config/embd_pdrop                       : 0.1
[1;38;5;39mCOMET INFO:[0m     config/encoder_no_repeat_ngram_size     : 0
[1;38;5;39mCOMET INFO:[0m     config/eos_token_id                     : 50256
[1;38;5;39mCOMET INFO:[0m     config/exponential_decay_length_penalty : None
[1;38;5;39mCOMET INFO:[0m     config/finetuning_task                  : None
[1;38;5;39mCOMET INFO:[0m     config/forced_bos_token_id              : None
[1;38;5;39mCOMET INFO:[0m     config/forced_eos_token_id              : None
[1;38;5;39mCOMET INFO:[0m     config/id2label       

[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 877.74 MB/937.61 MB, Throughput 497.94 KB/s, ETA ~1806s
[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 869.41 MB/937.61 MB, Throughput 568.86 KB/s, ETA ~1566s
[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 861.73 MB/937.61 MB, Throughput 524.08 KB/s, ETA ~1684s
[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 851.55 MB/937.61 MB, Throughput 694.13 KB/s, ETA ~1257s
[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 841.44 MB/937.61 MB, Throughput 689.84 KB/s, ETA ~1250s
[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 831.48 MB/937.61 MB, Throughput 680.26 KB/s, ETA ~1252s
[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 821.89 MB/937.61 MB, Throughput 654.15 KB/s, ETA ~1287s
[1;38;5;39mCOMET INFO:[0m Still uploading 2 asset(s), remaining 814.18 MB/937.61 MB, Throughput 526.20 KB/s, ETA ~1585s
[1;38;5;39mCOMET INFO:

KeyboardInterrupt: 

In [22]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

NameError: name 'trainer' is not defined

### Pipeline API

In [None]:
from transformers import pipeline

text_generator = pipeline(
    "text-generation", 
    "Rocketknight1/distilgpt2-finetuned-wikitext2",
    framework="tf",
)

In [None]:
text_generator(test_sentence)

## Masked language modeling

In [39]:
model_checkpoint = "distilroberta-base"
BATCH_SIZE = 192

In [24]:
old_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
tokenizer = old_tokenizer.train_new_from_iterator(dataset, 52_000, length=1024)
tokenizer.save_pretrained("tatar_tokenizer-distill-bert")

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]






('tatar_tokenizer-distill-bert/tokenizer_config.json',
 'tatar_tokenizer-distill-bert/special_tokens_map.json',
 'tatar_tokenizer-distill-bert/vocab.json',
 'tatar_tokenizer-distill-bert/merges.txt',
 'tatar_tokenizer-distill-bert/added_tokens.json',
 'tatar_tokenizer-distill-bert/tokenizer.json')

In [25]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=8, remove_columns=["text"])

Map (num_proc=8):   0%|          | 0/400828 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (713 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (609 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (539 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (731 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Map (num_proc=8):   0%|          | 0/100207 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (935 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [41]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=BATCH_SIZE,
    num_proc=8,
)

Map (num_proc=8):   0%|          | 0/400828 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/100207 [00:00<?, ? examples/s]

In [30]:
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Downloading model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
comet_ml.init( project_name = "TatNlp", experiment_name = "TatNlp-distill-bert")

In [42]:
def training_function():
    model_name = model_checkpoint.split("/")[-1]
    training_args = TrainingArguments(
        f"{model_name}-finetuned-tatar_nlp_2",
        evaluation_strategy = "epoch",
        overwrite_output_dir=True, 
        num_train_epochs=5,
        per_device_train_batch_size=BATCH_SIZE,
        save_steps=500, 
        save_total_limit=2,
        do_train=True,
    )


    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    set_seed(42)
    torch.manual_seed(7)
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["test"],
        data_collator=data_collator,
    )
    trainer.train()
    return trainer

In [None]:
trainer = notebook_launcher(training_function, num_processes=2, mixed_precision='fp16')

Launching training on 2 GPUs.


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/gumaonelove/tatnlp/61a822b5e1ac47489ee22194071faef0

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


## Inference

In [None]:
from transformers import pipeline

# You can of course use your own model checkpoint here instead of mine
mask_filler = pipeline(
    "fill-mask", 
    "Rocketknight1/distilroberta-base-finetuned-wikitext2",
    framework="tf",
)

In [None]:
mask_filler("The most common household pets are <mask> and dogs.", top_k=1)

In [None]:
mask_filler("The Gulf War was a conflict that took place in <mask> in 1990-1991.", top_k=3)