## Fine-tune large models using 🤗 `peft` adapters (LoRA), `transformers` & `bitsandbytes`

In [1]:
!pip install -U -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print('Memory:', model.get_memory_footprint())
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    
def print_params_shape(model):
    import pandas as pd
    params = []
    for p in model.parameters():
        params.append((p.shape, p.dtype))
    df8 = pd.DataFrame(params).transpose()
    return df8

def prepare_model_for_training(model):
    for param in model.parameters():
        param.requires_grad = False  # freeze the model - train adapters later
        if param.ndim == 1:
            # cast the small parameters (e.g. layernorm) to fp32 for stability
            param.data = param.data.to(torch.float32)

    model.gradient_checkpointing_enable()  # reduce number of stored activations
    model.enable_input_require_grads()

    class CastOutputToFloat(nn.Sequential):
        def forward(self, x): return super().forward(x).to(torch.float32)

    model.lm_head = CastOutputToFloat(model.lm_head)
    return model

# maybe gradient_checkpoint needs to be specified again. Can check if inputs have grad enabled

### Model loading

Here let's load the `opt-6.7b` model, its weights in half-precision (float16) are about 13GB on the Hub! If we load them in 8-bit we would require around 7GB of memory instead.

In [7]:
base_model = "facebook/opt-6.7b"

In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(base_model)

In [9]:
bit8_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    device_map='auto',
)

print_trainable_parameters(bit8_model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin.index.json:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Memory: 6874497024
trainable params: 214843392 || all params: 6658473984 || trainable%: 3.22661607623997


In [10]:
bit4_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_4bit=True,
    device_map='auto',
)
print_trainable_parameters(bit4_model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Memory: 3653271552
trainable params: 214843392 || all params: 3437248512 || trainable%: 6.250446869056642


### Post-processing on the model

Finally, we need to apply some post-processing on the 8-bit model to enable training, let's freeze all our layers, and cast the layer-norm in `float32` for stability. We also cast the output of the last layer in `float32` for the same reasons.

In [11]:
print(bit8_model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 4096, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (out_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear8bitLt(in_features=4096, out_features=16384, bias=True)
          (fc2): Linear8bitLt(in_features=16384, out_features=4096, bias=True

In [12]:
print(bit4_model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 4096, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
            (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
            (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
            (out_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear4bit(in_features=4096, out_features=16384, bias=True)
          (fc2): Linear4bit(in_features=16384, out_features=4096, bias=True)
          

In [13]:
print_params_shape(bit8_model)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,506,507,508,509,510,511,512,513,514,515
0,"(50272, 4096)","(2050, 4096)","(4096,)","(4096,)","(4096, 4096)","(4096,)","(4096, 4096)","(4096,)","(4096, 4096)","(4096,)",...,"(4096, 4096)","(4096,)","(4096,)","(4096,)","(16384, 4096)","(16384,)","(4096, 16384)","(4096,)","(4096,)","(4096,)"
1,torch.float16,torch.float16,torch.float16,torch.float16,torch.int8,torch.float16,torch.int8,torch.float16,torch.int8,torch.float16,...,torch.int8,torch.float16,torch.float16,torch.float16,torch.int8,torch.float16,torch.int8,torch.float16,torch.float16,torch.float16


In [14]:
print_params_shape(bit4_model)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,506,507,508,509,510,511,512,513,514,515
0,"(50272, 4096)","(2050, 4096)","(4096,)","(4096,)","(8388608, 1)","(4096,)","(8388608, 1)","(4096,)","(8388608, 1)","(4096,)",...,"(8388608, 1)","(4096,)","(4096,)","(4096,)","(33554432, 1)","(16384,)","(33554432, 1)","(4096,)","(4096,)","(4096,)"
1,torch.float16,torch.float16,torch.float16,torch.float16,torch.uint8,torch.float16,torch.uint8,torch.float16,torch.uint8,torch.float16,...,torch.uint8,torch.float16,torch.float16,torch.float16,torch.uint8,torch.float16,torch.uint8,torch.float16,torch.float16,torch.float16


In [15]:
bit8_model = prepare_model_for_training(bit8_model)
bit8_model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 4096, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (out_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear8bitLt(in_features=4096, out_features=16384, bias=True)
          (fc2): Linear8bitLt(in_features=16384, out_features=4096, bias=True

In [16]:
bit4_model = prepare_model_for_training(bit4_model)
bit4_model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 4096, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
            (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
            (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
            (out_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear4bit(in_features=4096, out_features=16384, bias=True)
          (fc2): Linear4bit(in_features=16384, out_features=4096, bias=True)
          

### Apply LoRA

In [17]:
print_trainable_parameters(bit8_model)
print_trainable_parameters(bit4_model)

Memory: 6877921280
trainable params: 0 || all params: 6658473984 || trainable%: 0.0
Memory: 3656695808
trainable params: 0 || all params: 3437248512 || trainable%: 0.0


In [18]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [19]:
# just testing 4-bit

# bit8_model = get_peft_model(bit8_model, config)
# print_trainable_parameters(bit8_model)
# print(bit8_model)
print('=========================================')
bit4_model = get_peft_model(bit4_model, config)
print_trainable_parameters(bit4_model)
print(bit4_model)

Memory: 3690250240
trainable params: 8388608 || all params: 3445637120 || trainable%: 0.2434559330496184
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 4096, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
          (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (layers): ModuleList(
            (0-31): 32 x OPTDecoderLayer(
              (self_attn): OPTAttention(
                (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
                (v_proj): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(i

In [None]:
# print_params_shape(bit8_model)

In [20]:
print_params_shape(bit4_model)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,634,635,636,637,638,639,640,641,642,643
0,"(50272, 4096)","(2050, 4096)","(4096,)","(4096,)","(8388608, 1)","(4096,)","(8388608, 1)","(4096,)","(16, 4096)","(4096, 16)",...,"(8388608, 1)","(4096,)","(4096,)","(4096,)","(33554432, 1)","(16384,)","(33554432, 1)","(4096,)","(4096,)","(4096,)"
1,torch.float16,torch.float16,torch.float32,torch.float32,torch.uint8,torch.float32,torch.uint8,torch.float32,torch.float32,torch.float32,...,torch.uint8,torch.float32,torch.float32,torch.float32,torch.uint8,torch.float32,torch.uint8,torch.float32,torch.float32,torch.float32


### Training

In [21]:
# !pip install -U datasets

In [10]:
import transformers
from datasets import load_dataset
data = load_dataset("Abirate/english_quotes")
data

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})

In [11]:
data['train'][0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator']}

In [12]:
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)
data['train'][0]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator'],
 'input_ids': [2, 17, 48, 9325, 2512, 131, 961, 1493, 16, 416, 551, 4, 17, 46],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=50,
        max_steps=100,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

## Share adapters on the 🤗 Hub

In [9]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
model.push_to_hub("shahzebnaveed/opt-6.7b-4bit-lora", use_auth_token=True)



README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shahzebnaveed/opt-6.7b-4bit-lora/commit/8334a33fdab25d1476d6f85877cfc0b7bfefa5f6', commit_message='Upload model', commit_description='', oid='8334a33fdab25d1476d6f85877cfc0b7bfefa5f6', pr_url=None, pr_revision=None, pr_num=None)

## Load adapters from the Hub

You can also directly load adapters from the Hub using the commands below:

### Load to resume re-training

In [15]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "shahzebnaveed/opt-6.7b-4bit-lora"
config = PeftConfig.from_pretrained(peft_model_id)

config.base_model_name_or_path = base_model
config.inference_mode = False 

model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_4bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model weights into the model
model = PeftModel.from_pretrained(model, peft_model_id)

# Update config for re-training
from peft import get_peft_model
model = get_peft_model(model, config)
model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): OPTForCausalLM(
          (model): OPTModel(
            (decoder): OPTDecoder(
              (embed_tokens): Embedding(50272, 4096, padding_idx=1)
              (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
              (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
              (layers): ModuleList(
                (0-31): 32 x OPTDecoderLayer(
                  (self_attn): OPTAttention(
                    (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
                    (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
                    (q_proj): lora.Linear4bit(
                      (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=True)
                      (lora_dropout): ModuleDict(
                        (default): Dropout(p=0.05, inplace=

## Inference

You can then directly use the trained model or the model that you have loaded from the 🤗 Hub for inference as you would do it usually in `transformers`.

In [11]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "shahzebnaveed/opt-6.7b-4bit-lora"
config = PeftConfig.from_pretrained(peft_model_id)
config.base_model_name_or_path = base_model
#config.inference_mode = True

model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_4bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model weights into the model
model = PeftModel.from_pretrained(model, peft_model_id)
model

adapter_config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 4096, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
          (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (layers): ModuleList(
            (0-31): 32 x OPTDecoderLayer(
              (self_attn): OPTAttention(
                (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
                (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=True)
                (q_proj): lora.Linear4bit(
                  (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out

In [12]:
batch = tokenizer("Two things are infinite: ", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))





 Two things are infinite:  1. The universe.  2. The number of things that are infinite.                                  


As you can see by fine-tuning for few steps we have almost recovered the quote from Albert Einstein that is present in the [training data](https://huggingface.co/datasets/Abirate/english_quotes).