# Experimenting with Fine-tuning a Base Language Model for Instruction Following
I.e. a chatbot

https://wandb.ai/capecape/alpaca_ft/reports/How-to-Fine-tune-an-LLM-Part-3-The-HuggingFace-Trainer--Vmlldzo1OTEyNjMy#results-of-the-lora-training


## Requirements

In [1]:
import os
os.environ['HF_HOME'] = '/home/stefanwebb/models/hf'
import torch
torch.random.manual_seed(0)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
from datasets import load_dataset
from trl import SFTTrainer

if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
else:
  compute_dtype = torch.float16

## Load Pretrained Model
Using Gemma 2B from Google

In [2]:
use_4bit = True
bnb_4bit_quant_type = "nf4"
use_double_quant = True

compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'

# 'target_modules' is a list of the modules that should be targeted by LoRA.
target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
target_modules = ["all_linear"]

bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_double_quant,
)

In [3]:
MODEL_ID = "/home/stefanwebb/models/llm/google_gemma-2b"
NEW_MODEL_NAME = "stefans-gemma-2b-instruct-third-attempt"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map='auto',
    torch_dtype="auto",
    quantization_config=bnb_config,
    attn_implementation=attn_implementation
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.padding_side = 'right' # not sure if this is necessary...
# tokenizer.pad_token_id = tokenizer.eos_token_id

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
print(model)

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaFlashAttention2(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
  

## Load Data
https://huggingface.co/datasets/tatsu-lab/alpaca

In [6]:
DATASET_NAME = "tatsu-lab/alpaca"
SPLIT = "train"
MAX_SEQ_LENGTH = 2048 # 8192 # 2048
EOS_TOKEN = tokenizer.eos_token_id

# train is the only data split for this dataset
dataset = load_dataset(DATASET_NAME, split="train")
print(dataset)

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})


In [7]:
"""
Fields of dataset

    instruction: describes the task the model should perform. Each of the 52K instructions is unique.
    input: optional context or input for the task. For example, when the instruction is "Summarize the following article", the input is the article. Around 40% of the examples have an input.
    output: the answer to the instruction as generated by text-davinci-003.
    text: the instruction, input and output formatted with the prompt template used by the authors for fine-tuning their models.

"""

'\nFields of dataset\n\n    instruction: describes the task the model should perform. Each of the 52K instructions is unique.\n    input: optional context or input for the task. For example, when the instruction is "Summarize the following article", the input is the article. Around 40% of the examples have an input.\n    output: the answer to the instruction as generated by text-davinci-003.\n    text: the instruction, input and output formatted with the prompt template used by the authors for fine-tuning their models.\n\n'

In [8]:
# Select a subset of the data for faster processing
# dataset = dataset.select(range(100))

In [9]:
# def formatting_prompts_func(example):
#     output_texts = []
#     for i in range(len(example['instruction'])):
#         text = f"{example['input'][i]} ### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
#         output_texts.append(text)
#     return output_texts
 
def formatting_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"<start_of_turn>system\nYou are Gemma.{' ' + example['input'][i] if len(example['input'][i]) else ''}<end_of_turn>\n<start_of_turn>user\n{example['instruction'][i]}<end_of_turn>\n<start_of_turn>assistant\n{example['output'][i]}<end_of_turn>\n<eos>"
        output_texts.append(text)
   
    return output_texts

In [10]:
# sample = dataset[0]
# sample = {k: [v] for k,v in sample.items()}

# print(formatting_func(sample))

In [11]:
# print([dataset[0]])

## Train the model

In [12]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=6,
    target_modules="all-linear",
    modules_to_save= ["embed_tokens", "lm_head"],
    bias="none",
    task_type="CAUSAL_LM",
    lora_alpha=12,  # 8,
    lora_dropout=0.1    # 0.05
)

In [13]:
"""
# Create a TrainingArguments object, which is used to define the parameters for model training.
args = TrainingArguments(
    # 'evaluation_strategy' is set to "steps", which means evaluation is done at each logging step.
    evaluation_strategy="steps",

    # 'per_device_train_batch_size' is set to 7, which means each training batch will contain 7 samples per device.
    per_device_train_batch_size=7,

    # 'gradient_accumulation_steps' is set to 4, which means gradients are accumulated for 4 steps before performing a backward/update pass.
    gradient_accumulation_steps=4,

    # 'gradient_checkpointing' is set to True, which means model gradients are stored in memory during training to reduce memory usage.
    gradient_checkpointing=True,

    # 'learning_rate' is set to 1e-4, which is the learning rate for the optimizer.
    learning_rate=1e-4,

    # 'fp16' is set to True if bfloat16 is not supported, which means the model will use 16-bit floating point precision for training if possible.
    fp16 = not torch.cuda.is_bf16_supported(),

    # 'bf16' is set to True if bfloat16 is supported, which means the model will use bfloat16 precision for training if possible.
    bf16 = torch.cuda.is_bf16_supported(),

    # 'max_steps' is set to -1, which means there is no maximum number of training steps.
    max_steps=-1,

    # 'num_train_epochs' is set to 3, which means the training process will go through the entire dataset 3 times.
    num_train_epochs=3,

    # 'save_strategy' is set to "epoch", which means the model is saved at the end of each epoch.
    save_strategy="epoch",

    # 'logging_steps' is set to 10, which means logging is done every 10 steps.
    logging_steps=10,

    # 'output_dir' is set to NEW_MODEL_NAME, which is the directory where the model and its configuration will be saved.
    output_dir=NEW_MODEL_NAME,

    # 'optim' is set to "paged_adamw_32bit", which is the optimizer to be used for training.
    optim="paged_adamw_32bit",

    # 'lr_scheduler_type' is set to "linear", which means the learning rate scheduler type is linear.
    lr_scheduler_type="linear"
)
"""

'\n# Create a TrainingArguments object, which is used to define the parameters for model training.\nargs = TrainingArguments(\n    # \'evaluation_strategy\' is set to "steps", which means evaluation is done at each logging step.\n    evaluation_strategy="steps",\n\n    # \'per_device_train_batch_size\' is set to 7, which means each training batch will contain 7 samples per device.\n    per_device_train_batch_size=7,\n\n    # \'gradient_accumulation_steps\' is set to 4, which means gradients are accumulated for 4 steps before performing a backward/update pass.\n    gradient_accumulation_steps=4,\n\n    # \'gradient_checkpointing\' is set to True, which means model gradients are stored in memory during training to reduce memory usage.\n    gradient_checkpointing=True,\n\n    # \'learning_rate\' is set to 1e-4, which is the learning rate for the optimizer.\n    learning_rate=1e-4,\n\n    # \'fp16\' is set to True if bfloat16 is not supported, which means the model will use 16-bit floating

In [14]:
args = TrainingArguments(
    output_dir=NEW_MODEL_NAME, # directory to save and repository id
    num_train_epochs=1, # 3,                     # number of training epochs
    per_device_train_batch_size=8,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant"           # use constant learning rate scheduler
    # report_to="tensorboard",                # report metrics to tensorboard
)

In [18]:
# Create an instance of the SFTTrainer class, which is used to fine-tune the model.

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=512,
    packing=True,
    dataset_kwargs={
        # "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [19]:
# 'device' is set to 'cuda', which means the CUDA device will be used for computations if available.
device = 'cuda'

# Import the 'gc' module, which provides an interface to the garbage collector.
import gc

# Import the 'os' module, which provides a way of using operating system dependent functionality.
import os

# Call the 'collect' method of the 'gc' module to start a garbage collection, which can help free up memory.
gc.collect()

# Call the 'empty_cache' method of 'torch.cuda' to release all unused cached memory from PyTorch so that it can be used by other GPU applications.
torch.cuda.empty_cache()

In [20]:
# Call the 'train' method of the 'trainer' object to start the training process.
# This method will fine-tune the model on the training dataset according to the parameters specified in the 'args' object.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,3.1856
20,2.1282
30,1.6012
40,1.5389
50,1.4775
60,1.3697
70,1.3495
80,1.2924
90,1.3426
100,1.3553




TrainOutput(global_step=3250, training_loss=1.2615441562946026, metrics={'train_runtime': 4856.3311, 'train_samples_per_second': 10.708, 'train_steps_per_second': 0.669, 'total_flos': 1.472031075560325e+17, 'train_loss': 1.2615441562946026, 'epoch': 0.9998461775111521})