In [1]:
%load_ext autoreload
%autoreload 0

In [2]:
# !huggingface-cli login --token {TOKEN}

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)


model_id = "meta-llama/Llama-2-7b-hf"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({"pad_token":"<pad>"})
tokenizer.padding_side = 'left'

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

In [5]:
print(model.model.layers[0].self_attn.q_proj)

print("\n---- PARAMS ----")
print(model.model.layers[0].self_attn.q_proj.weight)

# 4-bit params packed into 8bit dtype; 8388608 = 4096 * 4096 / 2
print(model.model.layers[0].self_attn.q_proj.weight.shape)

print("\n---- QUANT STATE ---")
absmax, shape, dtype, blocksize, compressed_stats, quant_type, data_type = model.model.layers[0].self_attn.q_proj.weight.quant_state

print(absmax)               # absmax
print(shape)                # shape or original weight
print(dtype)                # original weight dtype
print(blocksize)            # group size for quantization
print(compressed_stats)     # data to compress / uncompress
print(quant_type)           # nf4 or fp4
print(data_type)            # nf4 values

Linear4bit(in_features=4096, out_features=4096, bias=False)

---- PARAMS ----
Parameter containing:
Parameter(Params4bit([[ 83],
            [103],
            [ 74],
            ...,
            [114],
            [108],
            [181]], device='cuda:0', dtype=torch.uint8))
torch.Size([8388608, 1])

---- QUANT STATE ---
tensor([ 77,  70,  58,  ..., 199,  84,  83], device='cuda:0',
       dtype=torch.uint8)
torch.Size([4096, 4096])
torch.float16
64
[tensor(0.0510, device='cuda:0'), [tensor([0.1990, 0.1961, 0.1902,  ..., 0.1433, 0.1922, 0.1824], device='cuda:0'), tensor([-9.9297e-01, -9.7891e-01, -9.6484e-01, -9.5078e-01, -9.3672e-01,
        -9.2266e-01, -9.0859e-01, -8.9453e-01, -8.8047e-01, -8.6641e-01,
        -8.5234e-01, -8.3828e-01, -8.2422e-01, -8.1016e-01, -7.9609e-01,
        -7.8203e-01, -7.6797e-01, -7.5391e-01, -7.3984e-01, -7.2578e-01,
        -7.1172e-01, -6.9766e-01, -6.8359e-01, -6.6953e-01, -6.5547e-01,
        -6.4141e-01, -6.2734e-01, -6.1328e-01, -5.9922e-01, -5.

In [6]:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (v_proj): Linear4bit(
             

In [7]:
print("---- FULL LAYER ----")
print(model.base_model.model.model.layers[0].self_attn.q_proj)

print("\n\n---- WEIGHTS ----")
print(model.base_model.model.model.layers[0].self_attn.q_proj.weight)
print("requires_grad: ", end="")
print(model.base_model.model.model.layers[0].self_attn.q_proj.weight.requires_grad)

print("\n\n---- LORA A ----")
print(model.base_model.model.model.layers[0].self_attn.q_proj.lora_A["default"].weight)
print("requires_grad: ", end="")
print(model.base_model.model.model.layers[0].self_attn.q_proj.lora_A["default"].weight.requires_grad)

print("\n\n---- LORA B ----")
print(model.base_model.model.model.layers[0].self_attn.q_proj.lora_B["default"].weight)
print("requires_grad: ", end="")
print(model.base_model.model.model.layers[0].self_attn.q_proj.lora_B["default"].weight.requires_grad)

---- FULL LAYER ----
Linear4bit(
  in_features=4096, out_features=4096, bias=False
  (lora_dropout): ModuleDict(
    (default): Identity()
  )
  (lora_A): ModuleDict(
    (default): Linear(in_features=4096, out_features=16, bias=False)
  )
  (lora_B): ModuleDict(
    (default): Linear(in_features=16, out_features=4096, bias=False)
  )
  (lora_embedding_A): ParameterDict()
  (lora_embedding_B): ParameterDict()
)


---- WEIGHTS ----
Parameter containing:
Parameter(Params4bit([[ 83],
            [103],
            [ 74],
            ...,
            [114],
            [108],
            [181]], device='cuda:0', dtype=torch.uint8))
requires_grad: False


---- LORA A ----
Parameter containing:
tensor([[-0.0005,  0.0007, -0.0090,  ...,  0.0069, -0.0020,  0.0116],
        [-0.0050,  0.0035,  0.0052,  ...,  0.0057, -0.0082, -0.0133],
        [ 0.0064,  0.0115,  0.0155,  ...,  0.0064, -0.0047,  0.0118],
        ...,
        [ 0.0111,  0.0038, -0.0144,  ..., -0.0099,  0.0009, -0.0066],
        [

In [None]:
from trl import SFTTrainer
from datasets import load_dataset

dataset = load_dataset("timdettmers/openassistant-guanaco")

# Set training parameters
training_arguments = TrainingArguments(
    output_dir="./training-run",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    logging_steps=20,
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train model
trainer.train()