In [None]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets
!pip install transformers torch
!pip install accelerate
!apt install git-lfs

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the training output_dir used later
# in this notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
# NOTE(review): push_to_hub is False in the training arguments below, so this
# is presumably only needed if the dataset repo requires authentication — confirm.
notebook_login()

In [None]:
import torch
import os
# NOTE(review): presumably a debugging aid (synchronous kernel launches make
# CUDA errors point at the failing call); it is expected to slow training —
# confirm it is still wanted for full runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# Ask torch to release any GPU memory it is still caching from earlier runs
# in this kernel.
torch.cuda.empty_cache()

from typing import Dict, Tuple
from datasets import list_datasets, load_dataset, DatasetDict,Dataset
from collections import Counter
from typing import List, Dict, Union, Callable, Any
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pprint import pprint
import torch
import torch.nn as nn


In [None]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:


# Load the "train" and "valid" splits of the BabyLM corpus (in that order)
# and bundle them into one DatasetDict keyed by split name.
ds_train, ds_valid = (
    load_dataset("Sree1994/Babylm_100M", split=split_name)
    for split_name in ("train", "valid")
)
raw_datasets = DatasetDict({"train": ds_train, "valid": ds_valid})

raw_datasets

In [None]:
from transformers import RobertaTokenizer

# Maximum number of tokens per training sequence.
context_length = 128
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
vocab_size = tokenizer.vocab_size

# Sanity-check the tokenizer on a couple of examples only. Tokenizing and
# padding the entire training split here would hold every padded sequence in
# memory as Python lists and then print all of them; the real tokenization
# of the full corpus happens in the batched `map` call further down.
sample_texts = raw_datasets["train"][:2]["text"]
outputs = tokenizer(
    sample_texts,
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
    # `padding="max_length"` is the current spelling of the deprecated
    # `pad_to_max_length=True`.
    padding="max_length",
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
# What is printed is the attention mask, so label it as such.
print(f"Attention masks: {outputs['attention_mask']}")

In [None]:
def tokenize(element):
    """Tokenize a batch of examples for `Dataset.map(batched=True)`.

    Args:
        element: batch mapping whose "text" entry holds the raw strings.

    Returns:
        A dict with a single "input_ids" key: the token-id sequences whose
        reported length does not exceed the global `context_length`.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Keep only the sequences that fit inside the context window.
    kept_ids = [
        ids
        for n_tokens, ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens <= context_length
    ]
    return {"input_ids": kept_ids}

# Tokenize every split in batches. The original columns are dropped so that
# only the "input_ids" produced by `tokenize` remain.
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

tokenized_datasets

In [None]:
from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
import torch

# Take the roberta-base configuration, size the vocabulary to the tokenizer,
# and flag the model as a decoder for causal language modelling.
config = AutoConfig.from_pretrained(
    "roberta-base",
    vocab_size=len(tokenizer),
    is_decoder=True,
    random_init=True,
)
print(len(tokenizer))

# The model is built from the config (not via from_pretrained), so no
# pretrained checkpoint weights are loaded here.
model = RobertaForCausalLM(config)
model = model.to(device)

# Total number of parameters, reported in millions.
model_size = sum(weights.numel() for weights in model.parameters())
print(f"RoBERTa size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import DataCollatorForLanguageModeling

# Only borrow EOS as the padding token when the tokenizer has none. The
# RoBERTa tokenizer could already pad in the inspection cell above, so it has
# its own pad token; overwriting it with EOS would make the causal-LM collator
# (which sets labels at pad-token positions to -100) ignore every real
# end-of-sequence token in the loss as well.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# mlm=False -> causal language modelling: labels are built from input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Smoke test: collate the first five training examples and show tensor shapes.
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
!pip install evaluate

In [None]:
import evaluate
from evaluate import TextClassificationEvaluator, Metric, EvaluationModuleInfo
class Cal_Perplexity(Metric):
    """
    Custom metric that turns a language-modelling loss into perplexity,
    i.e. ``exp(loss)``.
    """
    # Metadata is borrowed from the hub "perplexity" metric (loaded once, when
    # the class body is executed).
    # NOTE(review): the borrowed info describes that metric's own inputs;
    # confirm it is compatible with a `loss` input before going through
    # `compute()`.
    pp_metric_info: EvaluationModuleInfo = evaluate.load("perplexity")._info()

    def _info(self) -> EvaluationModuleInfo:
        """Return the metric metadata stored on the class."""
        return Cal_Perplexity.pp_metric_info

    def _compute(self, loss) -> Dict[str, Any]:
        """Exponentiate `loss` to obtain perplexity.

        Args:
            loss: a scalar or array-like of loss values (anything
                `torch.as_tensor` accepts).

        Returns:
            ``{"perplexity": value}`` where value is a Python float for a
            single loss, otherwise a tensor of element-wise perplexities.
        """
        pp = torch.exp(torch.as_tensor(loss, dtype=torch.float))
        # `numel()` counts elements; `Tensor.size` is a method, so comparing it
        # to 1 would never be true.
        return {"perplexity": float(pp) if pp.numel() == 1 else pp}

In [None]:
from transformers import Trainer, TrainingArguments

# # PP: Cal_Perplexity = Cal_Perplexity()
# my_evaluation: Cal_Perplexity = Cal_Perplexity()

# def Cal_Perplexity(eval_pred: EvalPrediction) -> Dict[str, float]:
#         logits, labels = eval_pred.predictions, eval_pred.label_ids
#         predictions: Tensor = logits.argmax(axis=-1)
#         return my_evaluation.compute(predictions=predictions, references=labels)

# Training configuration. Checkpoints are written to the mounted Google Drive.
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Baby_Lm",
    overwrite_output_dir=True,
    # Evaluate once per epoch.
    # NOTE(review): with an epoch-based strategy `eval_steps` below is
    # presumably ignored, and recent transformers releases spell this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    do_train=True,
    do_eval=True,
    do_predict=True,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    eval_steps=5_000,
    logging_steps=5_000,
    # 64 sequences x 8 accumulation steps = 512 sequences per optimizer step
    # on each device.
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.01,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=1000,
    # Mixed precision needs a CUDA device; this notebook falls back to CPU when
    # none is available, so only enable fp16 when a GPU is actually present.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    # Keep only the most recent checkpoint on disk.
    save_total_limit=1,
)

# Wire the model, training arguments, data splits and collator into a Trainer.
# No compute_metrics hook is supplied; perplexity is derived from the
# evaluation loss in a later cell.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Run the training loop configured by `args` on the tokenized train split.
trainer.train()

In [None]:
# trn = trainer.train()
# model = trainer.model  # make sure to load_best_model_at_end=True!

# run a final evaluation on the test set
# Score the current model on the validation split. The "test" prefix only
# shapes the metric key names (the loss is read back as "test_loss").
val = trainer.evaluate(
    eval_dataset=tokenized_datasets["valid"],
    metric_key_prefix="test",
)
valid_loss = val.get("test_loss")

# Perplexity is the exponential of the evaluation loss.
print(f"Validation Loss: {valid_loss}")
print(f"Validation Perplexity: {torch.exp(torch.tensor(valid_loss))}")

In [None]:
# NOTE(review): `trainer.train()` is also called in an earlier cell, so running
# the notebook top to bottom trains the model twice — confirm the second run is
# intended. The return value is kept in `trn` for the loss readout below.
trn = trainer.train()
# NOTE(review): load_best_model_at_end is not set in the TrainingArguments
# above, so this is presumably the model as of the last training step.
model = trainer.model  # make sure to load_best_model_at_end=True!

# Final evaluation. Despite the "test" key prefix, the data scored here is the
# "valid" split.
val = trainer.evaluate(metric_key_prefix="test", eval_dataset=tokenized_datasets["valid"])

In [None]:
# Training loss reported by the `trainer.train()` call that produced `trn`.
trn.training_loss

In [None]:
# Metrics returned by `trainer.evaluate`; keys carry the "test" prefix.
val

In [None]:
# Validation perplexity = exp(evaluation loss).
valid = val.get("test_loss")
torch.tensor(valid).exp()

In [None]:
import torch
from transformers import pipeline

# Build a text-generation pipeline around the trained model, on the GPU when
# one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [None]:
# Generate a continuation from the bare "</s>" string as the prompt.
# NOTE(review): presumably "</s>" is the tokenizer's end-of-sequence token,
# i.e. this samples an "unprompted" continuation — confirm.
pipe.predict("</s>")

In [None]:
# Prompt the model and print the single generated continuation.
text = "who is Brother Lustig?"
generations = pipe(text, num_return_sequences=1)
print(generations[0]["generated_text"])

# Input text for prediction
# input_text = "who is Brother Lustig?"

# # Encode input text
# input_ids = tokenizer.encode(input_text, add_special_tokens=False, return_tensors='pt')

# # Generate next token predictions
# next_token_logits = model(input_ids).logits[:, -1, :]
# next_token_id = next_token_logits.argmax().item()
# next_token = tokenizer.decode([next_token_id])

# # Print next predicted token
# print(f"Next token prediction: {next_token}")