# Training LoRA (low-rank adapter)

Dependencies

In [13]:
#Recomended- conda environment: 
    #install miniconda
    #conda install jupyter
#Required
    #pip install accelerate peft bitsandbytes transformers trl

from datasets import load_dataset
import os
from peft import (
    PeftModel,
    LoraConfig)
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    QuantoConfig,
    pipeline,
    logging,
)
from trl import (
    SFTTrainer, 
    setup_chat_format
)

Many models and datasets are available from hugging face hub, including meta llama 3. There are often download instructions in the repositories

In [2]:
# Model from Hugging Face hub
base_model = "NousResearch/Llama-2-7b-chat-hf"

# New instruction dataset
_dataset = "wikimedia/wikipedia"

#Fine-tuned adapter model
new_model = "llama-2-lora"

#Merged model
merged_model = "llama-2-lora-merged"

Load dataset, quant_config, model, tokenizer...

In [4]:
from datasets import load_dataset

#file is optional
file= "20231101.ab"
dataset = load_dataset(_dataset, file, split="train")

Downloading readme:   0%|          | 0.00/131k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6152 [00:00<?, ? examples/s]

In [4]:
import torch
from transformers import BitsAndBytesConfig

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [6]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto"
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

PEFT = Parameter-Efficient Fine-Tuning, these are the learning parameters

In [8]:
from peft import LoraConfig

peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [9]:
from transformers import TrainingArguments

training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=100,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [11]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/6152 [00:00<?, ? examples/s]

If trainer.train() crashes quickly, may have to adjust max_seq_length in the cell above to not overflow vram

In [12]:
trainer.train()
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

Step,Training Loss
100,2.0151
200,1.0161
300,0.9154
400,0.8924
500,0.5516
600,0.5364
700,0.6537
800,0.6307
900,0.6645
1000,0.5041




('llama-2-lora/tokenizer_config.json',
 'llama-2-lora/special_tokens_map.json',
 'llama-2-lora/tokenizer.json')

merge trained adapter with base model

In [13]:
from peft import PeftModel
from transformers import AutoModelForCausalLM
from trl import setup_chat_format

base_model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto"
)

base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Merge adapter with base model
model = PeftModel.from_pretrained(base_model_reload, new_model)
model = model.merge_and_unload()
model.save_pretrained(merged_model)
tokenizer.save_pretrained(merged_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



('llama-2-lora-merged/tokenizer_config.json',
 'llama-2-lora-merged/special_tokens_map.json',
 'llama-2-lora-merged/tokenizer.json')

test merged (unquantized) model. It will be very slow 

In [5]:

from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gc

model = AutoModelForCausalLM.from_pretrained(
    merged_model,
    quantization_config=quant_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(merged_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)

user_input = "Who is Michael Jordan?"
result = pipe(f"<s>[INST] {user_input} [/INST]")
print(result[0]['generated_text'])

while user_input != "Goodbye":
    user_input = input()
    result = pipe(f"<s>[INST] {user_input} [/INST]")
    print(result[0]['generated_text'])

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<s>[INST] Who is Michael Jordan? [/INST]  Michael Jordan (born February 17, 1963) is a former American professional basketball player and entrepreneur who is widely regarded as one of the greatest basketball players of all time.

Jordan was born in Brooklyn, New York, and grew up in Wilmington, North Carolina. He began playing basketball at a young age and was heavily influenced by his father, who was a basketball coach. Jordan attended the University of North Carolina, where he won a national championship in 1982. He was drafted by the Chicago Bulls in 1984 and played for the team from 1984 to 1993, leading them to six NBA championships.

Jordan is known for his incredible scoring ability, defensive prowess, and iconic shoes, which have become a cultural phenomenon. He was
<s>[INST] How many 3 points did Michael Jordan score? [/INST]  Michael Jordan scored 5,937 points in his NBA career. obviously, this is a large number, but to put it into perspective, he scored an average of 30.12 p

The end for now. Here's one way to quantize it

In [None]:
import torch
from unsloth import FastLanguageModel

compute_dtype = getattr(torch, "float16")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "llama-2-lora",
    max_seq_length = 1024,
    dtype = compute_dtype,
    load_in_4bit = True,
)
model.save_pretrained_gguf("llama-2-lora-quantized", tokenizer, quantization_method = "f16")