# Imports

In [1]:
%%capture
!pip install nvidia-ml-py3==7.352.0 pytorch-lightning==2.0.1.post0 transformers==4.28.0 torchvision==0.15.1 rouge-score==0.1.2 tensorboardx==2.6 accelerate==0.18.0 deepspeed==0.9.0 peft==0.2.0

In [12]:
import pytorch_lightning as pl
from transformers import T5ForConditionalGeneration
from transformers import AutoTokenizer
from rouge_score import rouge_scorer
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, AdamW
import os
from peft import get_peft_model, LoraConfig, TaskType
import torch
import argparse
from pytorch_lightning.loggers import TensorBoardLogger
import os, json
from torch.utils.data import Dataset
from torch.utils.data import default_collate

# Import Helper Functions

In [3]:
# Fetch the companion repository and make it the working directory; the next cell
# imports `gpu_utilities` and `generate_data` from the current directory.
# NOTE(review): not idempotent - re-running this cell clones again from inside the
# checkout and changes directory a second time; confirm whether a guard is wanted.
!git clone https://github.com/shaankhosla/NLP_with_LLMs/
%cd "NLP_with_LLMs"

Cloning into 'NLP_with_LLMs'...
remote: Enumerating objects: 132, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (98/98), done.[K
remote: Total 132 (delta 77), reused 75 (delta 32), pack-reused 0[K
Receiving objects: 100% (132/132), 127.05 KiB | 14.12 MiB/s, done.
Resolving deltas: 100% (77/77), done.
/content/NLP_with_LLMs


In [4]:
import gpu_utilities, generate_data

# Assumptions

In [5]:
# Hugging Face checkpoint name used for both the tokenizer and the model in the cells below.
MODEL_NAME = "t5-small"

# Generate Data

In [6]:
# Build the synthetic number -> words dataset (1000 training / 200 validation examples).
# Later cells read the examples from ./data/train and ./data/val.
generate_data.main(num_train=1000, num_val=200)

Generating synthetic dataset (1000 train, 200 val)...




In [7]:
# Peek at one generated example to see its JSON fields ("number" and "words").
!cat './data/train/0.json'

{"number": 30702498, "words": "thirty million seven hundred two thousand four hundred ninety eight"}

In [10]:
class StreamingDataset(Dataset):
    """Map-style dataset that reads one ``<idx>.json`` example from disk per item.

    Each file is expected to hold ``{"number": <int>, "words": <str>}``. The number
    (converted to a string) is the model input and the words are the target.

    Args:
        path: Directory containing the example files ``0.json``, ``1.json``, ...
        max_input_length: Token length every input is padded/truncated to.
        max_target_length: Token length every target is padded/truncated to.
            Targets longer than this are truncated, so raise it if the spelled-out
            numbers do not fit.
    """

    def __init__(self, path, max_input_length=16, max_target_length=16):
        self.path = path
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    def __len__(self):
        """Return the number of ``.json`` example files in ``self.path``."""
        # Count only example files: stray entries (e.g. `.ipynb_checkpoints`) would
        # otherwise inflate the length and make __getitem__ open a missing file.
        return sum(1 for name in os.listdir(self.path) if name.endswith(".json"))

    def encode_text(self, text_input, text_ouput):
        """Tokenize one (input, target) pair into fixed-length tensors.

        Args:
            text_input: Source text fed to the encoder.
            text_ouput: Target text the model should produce.

        Returns:
            Dict with 1-D tensors ``input_ids``, ``attention_mask`` and ``labels``.
            Padding positions in ``labels`` are set to -100 so the loss ignores them.
        """
        inputs = self.tokenizer(
            text_input,
            max_length=self.max_input_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        targets = self.tokenizer(
            text_ouput,
            max_length=self.max_target_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; drop the batch dimension.
        labels = targets.input_ids[0]
        # Mask padding using the tokenizer's own pad id instead of a hard-coded 0.
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)
        return {
            "input_ids": inputs["input_ids"][0],
            "attention_mask": inputs["attention_mask"][0],
            "labels": labels,
        }

    def __getitem__(self, idx):
        """Load ``<idx>.json`` and return its encoded tensors (see ``encode_text``)."""
        file_path = os.path.join(self.path, str(idx) + ".json")
        with open(file_path, "r") as infile:
            data = json.load(infile)
        number, words = str(data["number"]), data["words"]
        return self.encode_text(number, words)

In [11]:
# One dataset per split; each instance loads its own tokenizer for MODEL_NAME on construction.
train_data = StreamingDataset("./data/train/")
val_data = StreamingDataset("./data/val")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
# Show the first three encoded training examples, each followed by a blank line.
for idx in range(3):
    print(train_data[idx], "\n")

(tensor([[ 460, 4327, 1755, 4867,    1,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, 

# Set up Lightning training module

In [29]:
class T5Finetuner(pl.LightningModule):
    """LightningModule that fine-tunes the ``MODEL_NAME`` T5 checkpoint.

    Args:
        args: Dict with the keys ``"batch_size"``, ``"epochs"`` and ``"lr"``.
        train_data: Dataset whose items are dicts of tensors that can be passed to
            the model as keyword arguments (``input_ids``, ``attention_mask``,
            ``labels``).
        val_data: Dataset of the same form, used for validation.
    """

    def __init__(self, args, train_data, val_data):
        super().__init__()
        # Records the constructor arguments in the checkpoint; the export cell calls
        # `load_from_checkpoint(ckpt_path)` without passing them again.
        self.save_hyperparameters()
        self.args = args
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
        self.train_data, self.val_data = train_data, val_data

    def forward(self, batch, batch_idx):
        """Run the wrapped model on one batch; ``batch_idx`` is unused."""
        return self.model(**batch)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        # With `labels` in the batch, the first element of the model output is the loss.
        loss = self(batch, batch_idx)[0]
        # A "log" key in the returned dict is ignored by Lightning 2.x; log explicitly.
        self.log("train_loss", loss, prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self(batch, batch_idx)[0]
        self.log("val_loss", loss, prog_bar=True)
        return {"loss": loss}

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by both splits."""
        return DataLoader(
            dataset,
            batch_size=self.args["batch_size"],
            shuffle=shuffle,
            # os.cpu_count() may return None; prefetch_factor needs at least one worker.
            num_workers=os.cpu_count() or 1,
            pin_memory=True,
            collate_fn=default_collate,
            prefetch_factor=50,
        )

    def train_dataloader(self):
        """Return the training DataLoader (reshuffled every epoch)."""
        return self._make_loader(self.train_data, shuffle=True)

    def val_dataloader(self):
        """Return the validation DataLoader (fixed order)."""
        return self._make_loader(self.val_data, shuffle=False)

    def configure_optimizers(self):
        """Create AdamW plus a linear warmup/decay schedule stepped per optimizer step."""
        optimizer = AdamW(
            self.trainer.model.parameters(), lr=self.args["lr"], weight_decay=0.01
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=5,
            # Integer count of optimizer steps; accounts for accumulate_grad_batches.
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        # The warmup schedule starts at lr=0 and is defined in optimizer steps.
        # Lightning steps schedulers once per *epoch* unless told otherwise, which
        # would leave the learning rate at 0 for the entire first epoch.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

# Vanilla Training

In [30]:
# Baseline run: default Trainer settings apart from the epoch count.
args = dict(batch_size=4, epochs=1, lr=1e-4)
pl_model = T5Finetuner(args, train_data, val_data)
trainer = pl.Trainer(max_epochs=args["epochs"])
trainer.fit(pl_model)
# Report GPU memory once fitting has finished.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 1105 MB.


# Gradient Accumulation

In [None]:
# Same model and data as the baseline, but gradients from 4 batches are
# accumulated before each optimizer step.
args = dict(batch_size=4, epochs=1, lr=1e-4)
pl_model = T5Finetuner(args, train_data, val_data)

trainer_options = {"max_epochs": args["epochs"], "accumulate_grad_batches": 4}
trainer = pl.Trainer(**trainer_options)
trainer.fit(pl_model)
# Report GPU memory once fitting has finished.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 403 MB.


# Mixed Precision

In [None]:
# Gradient accumulation plus 16-bit automatic mixed precision.
args = dict(batch_size=4, epochs=1, lr=1e-4)
pl_model = T5Finetuner(args, train_data, val_data)

trainer_options = {
    "max_epochs": args["epochs"],
    "precision": "16-mixed",
    "accumulate_grad_batches": 4,
}
trainer = pl.Trainer(**trainer_options)
trainer.fit(pl_model)
# Report GPU memory once fitting has finished.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 403 MB.


# LoRA: Low-Rank Adaptation of Large Language Models


In [None]:
class T5Finetuner(pl.LightningModule):
    """LightningModule that fine-tunes the ``MODEL_NAME`` T5 checkpoint with LoRA adapters.

    The base model is wrapped with PEFT in ``__init__`` so that only the adapter
    weights are trainable.

    Args:
        args: Dict with the keys ``"batch_size"``, ``"epochs"`` and ``"lr"``.
        train_data: Dataset whose items are dicts of tensors that can be passed to
            the model as keyword arguments (``input_ids``, ``attention_mask``,
            ``labels``).
        val_data: Dataset of the same form, used for validation.
    """

    def __init__(self, args, train_data, val_data):
        super().__init__()
        # Records the constructor arguments in the checkpoint; the export cell calls
        # `load_from_checkpoint(ckpt_path)` without passing them again.
        self.save_hyperparameters()
        self.args = args
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
        self.train_data, self.val_data = train_data, val_data
        self.get_peft()

    def get_peft(self):
        """Replace ``self.model`` with its LoRA-wrapped version and print the trainable-parameter summary."""
        self.model.enable_input_require_grads()
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_2_SEQ_LM,
            inference_mode=False,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
        )
        self.model = get_peft_model(self.model, peft_config)
        self.model.print_trainable_parameters()

    def forward(self, batch, batch_idx):
        """Run the wrapped model on one batch; ``batch_idx`` is unused."""
        return self.model(**batch)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        # With `labels` in the batch, the first element of the model output is the loss.
        loss = self(batch, batch_idx)[0]
        # A "log" key in the returned dict is ignored by Lightning 2.x; log explicitly.
        self.log("train_loss", loss, prog_bar=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self(batch, batch_idx)[0]
        self.log("val_loss", loss, prog_bar=True)
        return {"loss": loss}

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by both splits."""
        return DataLoader(
            dataset,
            batch_size=self.args["batch_size"],
            shuffle=shuffle,
            # os.cpu_count() may return None; prefetch_factor needs at least one worker.
            num_workers=os.cpu_count() or 1,
            pin_memory=True,
            collate_fn=default_collate,
            prefetch_factor=50,
        )

    def train_dataloader(self):
        """Return the training DataLoader (reshuffled every epoch)."""
        return self._make_loader(self.train_data, shuffle=True)

    def val_dataloader(self):
        """Return the validation DataLoader (fixed order)."""
        return self._make_loader(self.val_data, shuffle=False)

    def configure_optimizers(self):
        """Create AdamW plus a linear warmup/decay schedule stepped per optimizer step."""
        optimizer = AdamW(
            self.trainer.model.parameters(), lr=self.args["lr"], weight_decay=0.01
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=5,
            # Integer count of optimizer steps; accounts for accumulate_grad_batches.
            num_training_steps=self.trainer.estimated_stepping_batches,
        )
        # The warmup schedule starts at lr=0 and is defined in optimizer steps.
        # Lightning steps schedulers once per *epoch* unless told otherwise, which
        # would leave the learning rate at 0 for the entire first epoch.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step"},
        }

In [None]:
# LoRA run: same Trainer settings as the mixed-precision cell, using the
# adapter-wrapped T5Finetuner defined above.
args = dict(batch_size=4, epochs=1, lr=1e-4)
pl_model = T5Finetuner(args, train_data, val_data)

trainer_options = {
    "max_epochs": args["epochs"],
    "precision": "16-mixed",
    "accumulate_grad_batches": 4,
}
trainer = pl.Trainer(**trainer_options)
trainer.fit(pl_model)
# Report GPU memory once fitting has finished.
gpu_utilities.print_gpu_utilization()

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                  | Params
------------------------------------------------
0 | model | PeftModelForSeq2SeqLM | 60.8 M
------------------------------------------------
294 K     Trainable params
60.5 M    Non-trainable params
60.8 M    Total params
243.206   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


Device 0 : b'Tesla T4'
GPU memory occupied: 403 MB.


# Inference

In [44]:
# Ask the Trainer's checkpoint callback for the file it wrote instead of rebuilding
# the path by hand from the logger version, epoch and step.
ckpt_path = trainer.checkpoint_callback.best_model_path
# `load_from_checkpoint` is a classmethod, so call it on the class rather than on an instance.
training_model = T5Finetuner.load_from_checkpoint(ckpt_path)
# Export model and tokenizer in Hugging Face format so they can be reloaded without Lightning.
# NOTE(review): if `pl_model` is the LoRA variant, `save_pretrained` presumably writes
# only the adapter weights, and the plain `from_pretrained` in the next cell would not
# pick up the fine-tuned weights - confirm before relying on this export for LoRA.
training_model.model.save_pretrained("digit_conversion")
training_model.tokenizer.save_pretrained("digit_conversion")

# Run `huggingface-cli login` or `poetry run huggingface-cli login`
# training_model.model.push_to_hub("digit_conversion")
# training_model.tokenizer.push_to_hub("digit_conversion")

('digit_conversion/tokenizer_config.json',
 'digit_conversion/special_tokens_map.json',
 'digit_conversion/tokenizer.json')

In [45]:
# Reload the exported model and tokenizer from the local "digit_conversion" directory
# through the plain transformers API.
model = T5ForConditionalGeneration.from_pretrained("digit_conversion")
tokenizer = AutoTokenizer.from_pretrained("digit_conversion")

In [46]:
# Tokenize a single number; only the token ids are passed to generation.
encoded = tokenizer("123", return_tensors="pt")
input_ids = encoded.input_ids

# Sampling-based decoding (top-k with temperature), so repeated runs may print different text.
sampling_options = dict(do_sample=True, max_length=16, top_k=3, temperature=0.7)
generated_ids = model.generate(input_ids, **sampling_options)

# Drop the batch dimension before decoding and strip special tokens from the text.
flat_ids = generated_ids.squeeze()
prediction = tokenizer.decode(flat_ids, skip_special_tokens=True)
print(prediction)

123
