# PEFT BLOOM




## Packages

- bitsandbytes
- datasets
- accelerate
- loralib
- transformers
- peft
- torch
- huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Open the interactive Hugging Face Hub login widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Import packages

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

In [4]:
from transformers import BitsAndBytesConfig

checkpoint = "bigscience/bloomz-560m"

# Request 8-bit weights through an explicit quantization config: passing the
# bare `load_in_8bit=True` keyword to `from_pretrained` is deprecated in recent
# transformers releases in favour of `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)

# The tokenizer comes from the same checkpoint so its vocabulary matches the model.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

### Freezing the original weights


In [2]:
# Freeze every weight of the loaded model so that none of them receives gradients.
for param in model.parameters():
    # 1-D parameters (presumably layer-norm weights and biases) are upcast to
    # float32; everything else keeps the dtype it was loaded with.
    if param.ndim == 1:
        param.data = param.data.float()
    param.requires_grad = False

# Enable gradient checkpointing, and make the input embeddings' outputs require
# grad so a backward pass is still possible with all base weights frozen.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    """Sequential container that always returns its output as float32."""

    def forward(self, x):
        """Run the wrapped modules on ``x`` and cast the result to float32."""
        wrapped_output = super().forward(x)
        return wrapped_output.to(torch.float32)
  
# Wrap the LM head so its output is returned as float32. The isinstance guard
# keeps this cell idempotent: re-running it would otherwise nest the head inside
# a second CastOutputToFloat container.
if not isinstance(model.lm_head, CastOutputToFloat):
    model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRA adapters

In [12]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints a single line with the trainable element count, the total element
    count and the trainable percentage. A model without any parameters is
    reported as 0.0% instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Avoid dividing by zero when the model has no parameters at all.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}"
    )

In [15]:
# Print the dotted name of every submodule, one per line, to locate the layers
# that the LoRA adapters should target.
print("\n".join(module_name for module_name, _ in model.named_modules()))


transformer
transformer.word_embeddings
transformer.word_embeddings_layernorm
transformer.h
transformer.h.0
transformer.h.0.input_layernorm
transformer.h.0.self_attention
transformer.h.0.self_attention.query_key_value
transformer.h.0.self_attention.dense
transformer.h.0.self_attention.attention_dropout
transformer.h.0.post_attention_layernorm
transformer.h.0.mlp
transformer.h.0.mlp.dense_h_to_4h
transformer.h.0.mlp.gelu_impl
transformer.h.0.mlp.dense_4h_to_h
transformer.h.1
transformer.h.1.input_layernorm
transformer.h.1.self_attention
transformer.h.1.self_attention.query_key_value
transformer.h.1.self_attention.dense
transformer.h.1.self_attention.attention_dropout
transformer.h.1.post_attention_layernorm
transformer.h.1.mlp
transformer.h.1.mlp.dense_h_to_4h
transformer.h.1.mlp.gelu_impl
transformer.h.1.mlp.dense_4h_to_h
transformer.h.2
transformer.h.2.input_layernorm
transformer.h.2.self_attention
transformer.h.2.self_attention.query_key_value
transformer.h.2.self_attention.dense
tr

In [20]:
from peft import LoraConfig, get_peft_model

# LoRA settings: adapters are attached to the fused query/key/value projection
# of every attention block (see the module names listed above).
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["self_attention.query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the frozen base model with the adapters, then report how small the
# trainable fraction is.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1572864 || all params: 560787456 || trainable%: 0.2804741766549072


## Data

In [28]:
import transformers
from datasets import load_dataset

# Load the English quotes dataset from the Hugging Face Hub.
data = load_dataset(path="Abirate/english_quotes")


In [29]:
# Index the row first: `data['train'][0]` reads a single example, whereas
# `data['train']['quote']` loads the whole column before `[0]` picks one value.
data['train'][0]['quote']

'“Be yourself; everyone else is already taken.”'

In [30]:
# Index the row first: `data['train'][0]` reads a single example, whereas
# `data['train']['author']` loads the whole column before `[0]` picks one value.
data['train'][0]['author']

'Oscar Wilde'

In [31]:
# Index the row first: `data['train'][0]` reads a single example, whereas
# `data['train']['tags']` loads the whole column before `[0]` picks one value.
data['train'][0]['tags']

['be-yourself',
 'gilbert-perreira',
 'honesty',
 'inspirational',
 'misattributed-oscar-wilde',
 'quote-investigator']

In [32]:
def merge_columns(example):
    """Add a ``prediction`` field joining the quote and its tags.

    The field has the form ``<quote> ->: <str(tags)>``. ``example`` is updated
    in place and the same mapping is returned.
    """
    quote_text = example["quote"]
    tags_text = str(example["tags"])
    example["prediction"] = " ->: ".join((quote_text, tags_text))
    return example

# Add the merged `prediction` column to every training example.
data['train'] = data['train'].map(merge_columns)
# Preview five rows. Slicing the rows first reads only those examples instead of
# loading the entire `prediction` column and then slicing it.
data['train'][:5]["prediction"]

["“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']",
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” ->: ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” ->: ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']",
 "“So many books, so little time.” ->: ['books', 'humor']",
 "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"]

In [33]:
# Show the first training example, now including the `prediction` field.
data['train'][0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator'],
 'prediction': "“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']"}

In [34]:
# Tokenize the merged `prediction` text in batches; the tokenizer's outputs
# are added to the dataset as new columns.
data = data.map(
    lambda batch: tokenizer(batch['prediction']),
    batched=True,
)

In [35]:
# Display the DatasetDict summary (splits, column names and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [37]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the shuffle,
# and therefore which rows land in train vs. test, reproducible across kernel
# restarts.
splited_data = data['train'].train_test_split(test_size=0.2, seed=42)
splited_data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2006
    })
    test: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 502
    })
})

### Training

In [40]:
# Hyper-parameters for the fine-tuning run; no max_steps cap is set.
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    fp16=True,
    logging_steps=25,
)

# Language-modeling collator with masked-LM mode switched off.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=splited_data['train'],
    data_collator=causal_lm_collator,
)

# Turn the cache off while training to silence the warnings; switch it back on
# before using the model for inference.
model.config.use_cache = False
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
25,3.11
50,2.9481
75,2.8726
100,2.796
125,2.6842
150,2.6792
175,2.6527
200,2.6384
225,2.6459
250,2.6178


TrainOutput(global_step=375, training_loss=2.6953421732584637, metrics={'train_runtime': 267.0894, 'train_samples_per_second': 22.532, 'train_steps_per_second': 1.404, 'total_flos': 1271476231913472.0, 'train_loss': 2.6953421732584637, 'epoch': 2.99})

## Evaluate

In [42]:
# Compute the evaluation metrics on the held-out test split.
trainer.evaluate(eval_dataset=splited_data['test'])

{'eval_loss': 2.5623464584350586,
 'eval_runtime': 7.9176,
 'eval_samples_per_second': 63.403,
 'eval_steps_per_second': 7.957,
 'epoch': 2.99}