# Fine-tuning pipeline for quantized LLMs

In [1]:
# Environment switches: the next cell expects exactly one of these to be True
# and picks the model/cache directory from them.
personal = False  # True -> store models under ~/models
idun = True       # True -> store models under /cluster/work/eliashk/models

In [2]:
import os

# Choose where models and the Hugging Face cache are stored, based on the
# environment flags. Exactly one of `personal` / `idun` must be True.
if personal and not idun:
    output_dir = os.path.expanduser('~/models')
elif idun and not personal:
    output_dir = os.path.expanduser('/cluster/work/eliashk/models')
else:
    # Without this branch `output_dir` stayed unassigned (NameError below), or
    # worse, a stale value from an earlier kernel run was silently reused.
    raise ValueError(
        'Exactly one of `personal` and `idun` must be True '
        f'(got personal={personal!r}, idun={idun!r}).'
    )

# Point the Hugging Face cache at the chosen directory. This is set before the
# Hugging Face imports that follow in this cell.
os.environ['HF_HOME'] = output_dir

from datasets import load_dataset

# Load a small sample file as a DatasetDict (no split is requested).
# NOTE(review): `raw_dataset` is assigned again further down from
# 'output_and_pddl_10_samples.jsonl', so this value is overwritten before any
# later cell reads it — confirm whether this load is still needed.
# NOTE(review): the recorded output of this cell is a ModuleNotFoundError for
# `datasets`; the install cell comes after this one.
tiny_dataset_path = 'single_output_10_samples.jsonl'

raw_dataset = load_dataset('json', data_files=tiny_dataset_path)

ModuleNotFoundError: No module named 'datasets'

### Install dependencies

In [3]:
!pip install "torch==2.1.2" tensorboard
!pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \

!pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade


Collecting tensorboard
  Using cached tensorboard-2.16.2-py3-none-any.whl.metadata (1.6 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Using cached grpcio-1.62.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Using cached Markdown-3.6-py3-none-any.whl.metadata (7.0 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Using cached protobuf-5.26.0-cp37-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Using cached tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Using cached werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)
Using cached tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
Using cached absl_py-2.1.0-py3-none-any.whl (133 kB)
Using cach

### Setup flash attention

In [4]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation



### Log in to the Hugging Face Hub (used for model versioning)

In [5]:
from huggingface_hub import notebook_login

# Non-interactive alternative, kept for reference. It is disabled, and `login`
# is not imported in this cell, so enabling it also requires importing it.
# Do not commit a real token here.
# login(
#     token='',
#     add_to_git_credential=True,
#     write_permission=True,
# )

# Interactive login widget. Hub credentials are needed because the training
# arguments further down set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load dataset and convert to instruction or conversational format

#### Conversational format
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

#### Instruction format

{"prompt": "prompt text", "completion": "ideal generated text"}

{"prompt": "prompt text", "completion": "ideal generated text"}

{"prompt": "prompt text", "completion": "ideal generated text"}


In [6]:
from datasets import load_dataset

# System prompt template; `{domain}` is replaced per sample with the contents
# of that sample's 'domain' field.
system_message = '''You are the robot TARS that translates a domain.pddl file and a natural language instruction into a collection of instances, predicates and goals in an exact format. Users will give you a natural language command and you will generate the correct output based on the provided DOMAIN.
DOMAIN:
{domain}'''

def create_conversation(sample):
    """Convert one raw sample into the conversational 'messages' format.

    Args:
        sample: Mapping with the keys 'domain', 'input' and 'output'.

    Returns:
        A dict with a single key 'messages' holding three turns, in order:
        system (prompt template filled with the domain), user (the input) and
        assistant (the output).

    Raises:
        KeyError: If one of the required keys is missing from ``sample``.
    """
    system_turn = {
        'role': 'system',
        'content': system_message.format(domain=sample['domain']),
    }
    user_turn = {'role': 'user', 'content': sample['input']}
    assistant_turn = {'role': 'assistant', 'content': sample['output']}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Raw samples. create_conversation reads the 'domain', 'input' and 'output'
# fields of each record, so the file is expected to provide them.
dataset_path = 'output_and_pddl_10_samples.jsonl'

# No split is requested here; the next cell accesses the result via ['train'].
raw_dataset = load_dataset('json', data_files=dataset_path)


In [7]:
# Apply create_conversation to each sample individually (batched=False), which
# adds the 'messages' field. No remove_columns argument is passed.
dataset = raw_dataset.map(create_conversation, batched=False)
# Export the train split as JSON records; later cells reload 'train.jsonl'.
dataset['train'].to_json('train.jsonl', orient='records')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

81508

### Load formatted dataset

In [16]:
# Reload the conversations exported to 'train.jsonl'. Because split='train' is
# requested, the result is used directly as the training dataset below.
formatted_dataset = load_dataset('json', data_files='train.jsonl', split='train')

### Configure model and tokenizer for Q-LoRA

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# Base checkpoint that is quantized and fine-tuned.
base_model_id = 'mistralai/Mistral-7B-Instruct-v0.2'
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with the quantization config above. The
# 'flash_attention_2' implementation relies on the flash-attn package that is
# installed in the setup cell earlier in the notebook.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map='auto',
    attn_implementation='flash_attention_2',
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = 'left' # left for Mistral

# NOTE(review): setup_chat_format presumably installs its own chat template
# (and special tokens) on top of the Instruct checkpoint's existing one —
# confirm this override is intended.
model, tokenizer = setup_chat_format(model, tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
from peft import LoraConfig

# LoRA adapter settings, targeting all linear layers ('all-linear').
# NOTE(review): the original note attributes these numbers to the QLoRA paper —
# confirm r=256 / lora_alpha=128 against the intended source.
peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=256,
    bias='none',
    target_modules='all-linear',
    task_type='CAUSAL_LM',
)

In [17]:
from transformers import TrainingArguments

# Training hyperparameters. Outputs are written to a 'qlora_finetuned'
# sub-directory of the `output_dir` chosen at the top of the notebook.
# NOTE(review): with lr_scheduler_type="constant" the warmup_ratio setting is
# likely ignored (warmup would need "constant_with_warmup") — confirm which
# behaviour is intended.
training_args = TrainingArguments(
    output_dir = os.path.join(output_dir, 'qlora_finetuned'),
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # batches accumulated before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub (needs the Hub login above)
    report_to="tensorboard",                # report metrics to tensorboard
)

### Create SFT Trainer

In [18]:
from trl import SFTTrainer

# Sequence length passed to the trainer (clamped to 4096 tokens below).
# Author's note: the maximum for Mistral is 4096⋅32.
# Sketch for deriving the value from the data instead.
# NOTE(review): it needs adapting before use — formatted_dataset is loaded with
# split='train', and each 'messages' entry is a list of turns:
# max_seq_len = max(len(tokenizer.encode(x['content'])) for x in formatted_dataset['train']['messages'])
max_seq_len = 4096

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    peft_config=peft_config,
    train_dataset=formatted_dataset,
    max_seq_length=min(max_seq_len, 4096),  # never exceed 4096 tokens
    packing=True,
    dataset_kwargs={
        'add_special_tokens': False,
        'append_concat_token': False,
    },
)

Generating train split: 0 examples [00:00, ? examples/s]



### Start training

In [19]:
# Run fine-tuning with the trainer configured above.
trainer.train()
# Save the trained model. The recorded output shows the adapter weights and
# TensorBoard event file being uploaded (push_to_hub=True).
trainer.save_model()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss




adapter_model.safetensors:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1711036910.idun-04-09.413524.0:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

In [None]:
# Release GPU memory held by the training objects.
import gc

del model
del trainer
# Collect reference cycles before emptying the cache: memory that is still
# referenced by not-yet-collected objects cannot be released by empty_cache().
gc.collect()
torch.cuda.empty_cache()