In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets
!pip install transformers torch
!pip install accelerate
!apt install git-lfs

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
import torch
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.cuda.empty_cache()

from typing import Dict, Tuple
from datasets import list_datasets, load_dataset, DatasetDict,Dataset
from collections import Counter
from typing import List, Dict, Union, Callable, Any
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pprint import pprint
import torch
import torch.nn as nn


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)     

cuda


In [None]:
# dataset: DatasetDict = load_dataset("Sree1994/babylm_childstories")

ds_train = load_dataset("Sree1994/blm_strict_small", split="train")
ds_valid = load_dataset("Sree1994/blm_strict_small", split="valid")

raw_datasets = DatasetDict(
    {
        "train": ds_train,
        "valid": ds_valid
    }
)

raw_datasets



DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 20000
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 5000
    })
})

In [None]:
from transformers import RobertaTokenizer

context_length = 128
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
vocab_size = tokenizer.vocab_size

outputs = tokenizer(
    raw_datasets["train"]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
    pad_to_max_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['attention_mask']}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length <= context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets



DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 20000
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 5000
    })
})

In [None]:
from transformers import AutoTokenizer, RobertaForMaskedLM, AutoConfig
import torch

config = AutoConfig.from_pretrained(
    "roberta-base",
    vocab_size=len(tokenizer),
    is_decoder=False,
    random_init=True,
    no_deprecation_warning=True,
)
print(len(tokenizer))
# model = RobertaForCausalLM.from_pretrained("roberta-base", is_decoder=True, vocab_size=10_000)
model = RobertaForMaskedLM(config).to(device)
# model.init_weights()
model_size = sum(t.numel() for t in model.parameters())
print(f"RoBERTa size: {model_size/1000**2:.1f}M parameters")
# print(config)

50265
RoBERTa size: 124.7M parameters


In [None]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15)
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [None]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Baby_Lm/BLM_Roberta_Baseline_MLM",
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',    
    do_train=True,
    do_eval=True,
    do_predict=True,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=8,
    num_train_epochs=15,
    weight_decay=0.01,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=1000,
    fp16=True,
    push_to_hub=False,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    # compute_metrics=my_compute_metrics,
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,9.618752
1,No log,8.481133
2,No log,7.558644
3,No log,7.223675
4,No log,7.08587
5,No log,7.015002
6,No log,6.939378
7,No log,6.795554
8,No log,6.772528
9,No log,6.684867


TrainOutput(global_step=585, training_loss=7.2539855435363245, metrics={'train_runtime': 3135.602, 'train_samples_per_second': 95.675, 'train_steps_per_second': 0.187, 'total_flos': 1.974273780712781e+16, 'train_loss': 7.2539855435363245, 'epoch': 15.0})

In [None]:
# trn = trainer.train()
# model = trainer.model  # make sure to load_best_model_at_end=True!

# run a final evaluation on the test set
val = trainer.evaluate(metric_key_prefix="test", eval_dataset=tokenized_datasets["valid"])
valid_loss = val.get("test_loss")
# print(f"Training Loss: {trn.training_loss}")
print(f"Validation Loss: {valid_loss}")
print(f"Validation Perplexity: {torch.exp(torch.tensor(valid_loss))}")

Validation Loss: 6.185730457305908
Validation Perplexity: 485.7676696777344


In [None]:
print(f"Best Validation Perplexity: {torch.exp(torch.tensor(5.40))}")

Best Validation Perplexity: 221.40643310546875


In [None]:
# trn.metrics

In [None]:
# trainer.push_to_hub()

#Let's try some Predictions 

In [None]:
# import torch
# from transformers import pipeline

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# pipe = pipeline(
#     "text-generation", model="Sree1994/BLM_Roberta_Baseline", device=device
# )

In [None]:
# import torch
# from transformers import pipeline

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# pipe = pipeline("text-generation", model=model, device=device, tokenizer=tokenizer)

In [None]:
# pipe.predict("</s>")

In [None]:
# text = "who is Brother Lustig?"
# print(pipe(text, num_return_sequences=1)[0]["generated_text"])

#Let's collect some graphs