- Step 1 â€” Loading LaMini-instruction dataset using load_dataset from huggingface
- Step 2 â€” Loading Dolly Tokenizer and Model using huggingface (again!)
- Step 3 â€” Data Preparation â€” Tokenize, split dataset and prepare for batch processing
- Step 4 â€” Configuring LoRA and getting the PEFT model
- Step 5 â€” Training the model and saving
- Step 6 â€” Prediction with the finetuned model

### Importing libraries

In [1]:
from typing import Dict, List
from datasets import Dataset, load_dataset, disable_caching 
disable_caching()
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
from torch.utils.data import Dataset
from IPython.display import Markdown

  from .autonotebook import tqdm as notebook_tqdm


### Data Loading

In [2]:
dataset = load_dataset('MBZUAI/LaMini-instruction', split='train')
small_dataset = dataset.select(list(range(200)))
print(small_dataset[0])

{'instruction': 'List 5 reasons why someone should learn to code', 'response': '1. High demand for coding skills in the job market\n2. Increased problem-solving and analytical skills\n3. Ability to develop new products and technologies\n4. Potentially higher earning potential\n5. Opportunity to work remotely and/or freelance', 'instruction_source': 'alpaca'}


In [3]:
prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: {instruction}\n Response:"""
answer_template = """{response}"""

def _add_text(rec):
    instruction = rec['instruction']
    response = rec['response']
    
    if not instruction:
        raise ValueError(f"Expected an instruction in {rec}")
    if not response:
        raise ValueError(f"Expected a response in {rec}")
    
    rec['prompt'] = prompt_template.format(instruction=instruction)
    rec['answer'] = answer_template.format(response=response)
    rec['text'] = rec['prompt'] + rec['answer']
    return rec

small_dataset = small_dataset.map(_add_text)
print(small_dataset[0])

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [00:00<00:00, 6568.07 examples/s]

{'instruction': 'List 5 reasons why someone should learn to code', 'response': '1. High demand for coding skills in the job market\n2. Increased problem-solving and analytical skills\n3. Ability to develop new products and technologies\n4. Potentially higher earning potential\n5. Opportunity to work remotely and/or freelance', 'instruction_source': 'alpaca', 'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: List 5 reasons why someone should learn to code\n Response:', 'answer': '1. High demand for coding skills in the job market\n2. Increased problem-solving and analytical skills\n3. Ability to develop new products and technologies\n4. Potentially higher earning potential\n5. Opportunity to work remotely and/or freelance', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Instruction: List 5 reasons why someone should learn to code\n Response:




### Tokenizer and Model Load

In [4]:
# model_id = 'mistralai/Mistral-7B-v0.1'
model_id = 'EleutherAI/pythia-2.8b'
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # load_in_8bit=True,
    dtype=torch.float16,
).to('cuda')

Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 388/388 [00:02<00:00, 186.34it/s, Materializing param=gpt_neox.layers.31.post_attention_layernorm.weight] 


In [6]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50277, 2560)

### Data Preparation

In [7]:
from functools import partial
import copy
from transformers import DataCollatorForSeq2Seq

MAX_LENGTH = 512


def _preprocess_batch(batch: Dict[str, List]):
    model_inputs = tokenizer(
        batch["text"],
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = copy.deepcopy(model_inputs["input_ids"])
    return model_inputs


_preprocessing_function = partial(_preprocess_batch)

encoded_small_dataset = small_dataset.map(
    _preprocessing_function,
    batched=True,
    remove_columns=["instruction", "response", "prompt", "answer"],
)
processed_dataset = encoded_small_dataset.filter(
    lambda rec: len(rec["input_ids"]) <= MAX_LENGTH
)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [00:00<00:00, 2559.42 examples/s]
Filter: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 200/200 [00:00<00:00, 2409.62 examples/s]


In [8]:
split_dataset = processed_dataset.train_test_split(test_size=14, seed=0)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction_source', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 186
    })
    test: Dataset({
        features: ['instruction_source', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
})


In [9]:
# takes a list of samples from a Dataset and collate them into a batch, as a dictionary of PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    pad_to_multiple_of=8,
    padding="max_length",
)

### Coniguring LoRA

In [10]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

LORA_R = 512
LORA_ALPHA = 1024
LORA_DROPOUT = 0.05
TARGET_MODULES = ["query_key_value"]

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type="CAUSAL_LM",
    bias="none",
    target_modules=TARGET_MODULES,
)

# model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 167,772,160 || all params: 2,942,842,880 || trainable%: 5.7010


### Training Args

In [None]:
features = ["instruction_source", "text", "input_ids", "attention_mask", "labels"]

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./lora-pythia-2.8b",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=1,
    fp16=True,
    eval_strategy="epoch",  # ðŸ‘ˆ important
    logging_steps=10,
    save_strategy="epoch",
    report_to="none",
)

In [32]:
import math
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds

    # Shift for causal LM
    shift_logits = logits[..., :-1, :].reshape(-1, logits.shape[-1])
    shift_labels = labels[..., 1:].reshape(-1)

    # Compute loss manually
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(
        torch.tensor(shift_logits),
        torch.tensor(shift_labels),
    )

    perplexity = math.exp(loss.item())
    return {
        "eval_loss": loss.item(),
        "perplexity": perplexity,
    }

In [33]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    # compute_metrics=compute_metrics,
)

In [34]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.312354




TrainOutput(global_step=6, training_loss=0.16954052448272705, metrics={'train_runtime': 259.5356, 'train_samples_per_second': 0.717, 'train_steps_per_second': 0.023, 'total_flos': 1607973517393920.0, 'train_loss': 0.16954052448272705, 'epoch': 1.0})

## Save model

In [None]:
model.save_pretrained("lora-pythia-2.8b")
tokenizer.save_pretrained("lora-pythia-2.8b")

### loading

In [None]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

model = PeftModel.from_pretrained(base_model, "lora-pythia-2.8b")