In [None]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main 
!pip install -q git+https://github.com/huggingface/peft.git

## summary

- bigscience/bloom-7b1
- lora fine-tune bloom: 可插拔式的（plugin/adapter）
    - freeeze original weights
    - plugin lora adapters (peft)
- huggingface transformers 库
    - trainer.train 的参数及过程；
    - mlm 与 clm 的差异：（都是 unsupervised learning，都可以自动地构建 input/labels）
        - mlm：bert
        - clm：gpt（bloom）
    - pipeline
        - dataset/tasks
        - tokenizer
        - training (fine-tune base lora)
        - inference

## base model & lora adapters

In [8]:
import torch
import torch.nn as nn
import bitsandbytes as bnb 
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model 

In [9]:
%load_ext watermark

In [10]:
%watermark --iversions

bitsandbytes: 0.43.1
sys         : 3.8.12 | packaged by conda-forge | (default, Sep 29 2021, 19:52:28) 
[GCC 9.4.0]
apex        : 0.1
torch       : 2.3.0+cu121



In [11]:
from watermark import watermark
print(watermark(packages='peft,torch,loralib,transformers,accelerate,datasets'))

peft        : 0.11.1
torch       : 2.3.0+cu121
loralib     : 0.1.2
transformers: 4.42.0.dev0
accelerate  : 0.30.1
datasets    : 2.19.1



In [47]:
model = AutoModelForCausalLM.from_pretrained(
    "/wangzh/code/share/20240520-lora/personal_chatgpt-main/tutorials/huggingface_models/Meta-Llama-3-8B", 
    load_in_8bit=True, 
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("/wangzh/code/share/20240520-lora/personal_chatgpt-main/tutorials/huggingface_models/Meta-Llama-3-8B")

### Error 1
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# model.config
AutoConfig.from_pretrained("/wangzh/code/share/20240520-lora/personal_chatgpt-main/tutorials/huggingface_models/Meta-Llama-3-8B")

LlamaConfig {
  "_name_or_path": "/wangzh/code/share/20240520-lora/personal_chatgpt-main/tutorials/huggingface_models/Meta-Llama-3-8B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.0.dev0",
  "use_cache": true,
  "vocab_size": 128256
}

In [14]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear8bitLt(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [15]:
# model.transformer.word_embeddings
model.get_input_embeddings()

Embedding(128256, 4096)

In [16]:
tokenizer

PreTrainedTokenizerFast(name_or_path='/wangzh/code/share/20240520-lora/personal_chatgpt-main/tutorials/huggingface_models/Meta-Llama-3-8B', vocab_size=128000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|reserved_special_token_2|>", rstrip=False, lstrip=False, sin

### freeze original weights

In [17]:
list(model.parameters())[0].dtype

torch.float16

In [18]:
for i, param in enumerate(model.parameters()):
    param.requires_grad = False  # freeze the model - train adapters later
#     print(i, 'param.requires_grad = False')
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)
#         print(i, 'ndim == 1, torch.float16 to torch.float32')

In [19]:
# reduce number of stored activations
model.gradient_checkpointing_enable()  
model.enable_input_require_grads()

In [20]:
class CastOutputToFloat(nn.Sequential):
    def forward(self, x): 
        return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

### LoRa Adapters

In [21]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [22]:
from peft import LoraConfig, get_peft_model 
config = LoraConfig(
    r=16, #low rank
    lora_alpha=32, #alpha scaling， scale lora weights/outputs
    # target_modules=["q_proj", "v_proj"], #if you know the 
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

In [23]:
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 6815744 || all params: 8037076992 || trainable%: 0.08480376642881861


In [24]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out_features=1024,

## pipeline

### data

In [25]:
import transformers
from datasets import load_dataset
# dataset = load_dataset("Abirate/english_quotes")

In [26]:
dataset = load_dataset('json', data_files='/wangzh/code/share/20240520-lora/personal_chatgpt-main/tutorials/huggingface_models/test_data/test_data/quotes.jsonl')

In [27]:
dataset

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})

In [28]:
dataset['train']

Dataset({
    features: ['quote', 'author', 'tags'],
    num_rows: 2508
})

In [29]:
dataset['train'].to_pandas()

Unnamed: 0,quote,author,tags
0,“Be yourself; everyone else is already taken.”,Oscar Wilde,"[be-yourself, gilbert-perreira, honesty, inspi..."
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe,"[best, life, love, mistakes, out-of-control, t..."
2,“Two things are infinite: the universe and hum...,Albert Einstein,"[human-nature, humor, infinity, philosophy, sc..."
3,"“So many books, so little time.”",Frank Zappa,"[books, humor]"
4,“A room without books is like a body without a...,Marcus Tullius Cicero,"[books, simile, soul]"
...,...,...,...
2503,“Morality is simply the attitude we adopt towa...,"Oscar Wilde,","[morality, philosophy]"
2504,“Don't aim at success. The more you aim at it ...,"Viktor E. Frankl,","[happiness, success]"
2505,"“In life, finding a voice is speaking and livi...",John Grisham,[inspirational-life]
2506,"“Winter is the time for comfort, for good food...",Edith Sitwell,"[comfort, home, winter]"


In [30]:
dataset['train']['quote'][:4]

['“Be yourself; everyone else is already taken.”',
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”",
 '“So many books, so little time.”']

In [31]:
dataset['train']['author'][:4]

['Oscar Wilde', 'Marilyn Monroe', 'Albert Einstein', 'Frank Zappa']

In [32]:
dataset['train'][:4]

{'quote': ['“Be yourself; everyone else is already taken.”',
  "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”",
  "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”",
  '“So many books, so little time.”'],
 'author': ['Oscar Wilde', 'Marilyn Monroe', 'Albert Einstein', 'Frank Zappa'],
 'tags': [['be-yourself',
   'gilbert-perreira',
   'honesty',
   'inspirational',
   'misattributed-oscar-wilde',
   'quote-investigator'],
  ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst'],
  ['human-nature',
   'humor',
   'infinity',
   'philosophy',
   'science',
   'stupidity',
   'universe'],
  ['books', 'humor']]}

In [33]:
str(dataset['train']['tags'][0])

"['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']"

In [34]:
def merge(row):
    row['prediction'] = row['quote'] + ' ->: ' + str(row['tags'])
    return row
dataset['train'] = dataset['train'].map(merge)

In [35]:
dataset['train']['prediction'][:5]

["“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']",
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” ->: ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” ->: ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']",
 "“So many books, so little time.” ->: ['books', 'humor']",
 "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"]

In [36]:
dataset['train'][4]

{'quote': '“A room without books is like a body without a soul.”',
 'author': 'Marcus Tullius Cicero',
 'tags': ['books', 'simile', 'soul'],
 'prediction': "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"}

In [37]:
tokenizer(dataset['train']['prediction'][:4])

{'input_ids': [[128000, 2118, 3513, 6261, 26, 5127, 775, 374, 2736, 4529, 2029, 1492, 25, 2570, 1395, 53246, 726, 518, 364, 61887, 9339, 17453, 78342, 518, 364, 71, 36224, 88, 518, 364, 1354, 5682, 1697, 518, 364, 34965, 45213, 2844, 83528, 7063, 2695, 35376, 518, 364, 3022, 3502, 7164, 58182, 663], [128000, 10378, 2846, 45743, 11, 74797, 323, 264, 2697, 62945, 13, 358, 1304, 21294, 11, 358, 1097, 704, 315, 2585, 323, 520, 3115, 2653, 311, 3790, 13, 2030, 422, 499, 649, 956, 3790, 757, 520, 856, 12047, 11, 1243, 499, 2771, 439, 15123, 1541, 956, 23528, 757, 520, 856, 1888, 2029, 1492, 25, 2570, 16241, 518, 364, 14789, 518, 364, 31153, 518, 364, 80024, 2094, 518, 364, 412, 8838, 4565, 518, 364, 59677, 518, 364, 50810, 267, 663], [128000, 2118, 11874, 2574, 527, 24746, 25, 279, 15861, 323, 3823, 88637, 26, 323, 358, 2846, 539, 2771, 922, 279, 15861, 2029, 1492, 25, 2570, 26380, 5392, 1598, 518, 364, 28400, 269, 518, 364, 81116, 518, 364, 46185, 11597, 88, 518, 364, 40657, 518, 364, 267, 

### tokenize

In [38]:
dataset = dataset.map(lambda samples: tokenizer(samples['prediction']), batched=True)

In [39]:
# 'input_ids', 'attention_mask'
dataset

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

### training

In [43]:
## warning
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [44]:
### error torch._six
import sys
sys.path.insert(0, "/wangzh/code/开源code/apex-master")

In [45]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [46]:
trainer = Trainer(
    model=model, 
    train_dataset=dataset['train'],
    args=TrainingArguments(
        per_device_train_batch_size=4, 
        gradient_accumulation_steps=4,
        warmup_steps=100, 
        max_steps=200, 
        learning_rate=2e-4, 
        fp16=True,
        logging_steps=1, 
        output_dir='outputs'
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  
trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

### inference

In [None]:
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
 
batch = tokenizer("“An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains.” ->: ", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
trainer.data_collator