# Fine-tuning pipeline for quantized LLMs

In [1]:
# Execution-environment switches; exactly one of them should be True.
#   idun     -> models are stored under /cluster/work/eliashk/models
#   personal -> models are stored under ~/models
idun = False
personal = True

In [2]:
import os

# Choose where models and the Hugging Face cache live, based on the
# `personal` / `idun` switches from the configuration cell.
# Exactly one switch must be set: previously, setting both or neither left
# `output_dir` undefined and the cell died with an unrelated-looking NameError.
if bool(personal) == bool(idun):
    raise ValueError(
        "Set exactly one of 'personal' and 'idun' to True "
        f"(got personal={personal!r}, idun={idun!r})"
    )

if personal:
    output_dir = os.path.expanduser('~/models')
else:
    output_dir = os.path.expanduser('/cluster/work/eliashk/models')

# Point the Hugging Face cache at the chosen directory. This is done before
# the Hugging Face imports that follow in this cell.
os.environ['HF_HOME'] = output_dir

from datasets import load_dataset

# Small JSON-lines sample file, loaded through the generic 'json' builder.
tiny_dataset_path = 'single_output_10_samples.jsonl'
raw_dataset = load_dataset(path='json', data_files=tiny_dataset_path)

### Install dependencies

In [3]:
!pip install "torch==2.1.2" tensorboard
!pip install  --upgrade \
  "transformers==4.36.2" \
  "datasets==2.16.1" \
  "accelerate==0.26.1" \
  "evaluate==0.4.1" \
  "bitsandbytes==0.42.0" \

!pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
!pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade


Collecting git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e
  Cloning https://github.com/huggingface/trl (to revision a3c5b7178ac4f65569975efadc97db2f3749c65e) to /tmp/pip-req-build-xzoya3ac
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl /tmp/pip-req-build-xzoya3ac
  Running command git rev-parse -q --verify 'sha^a3c5b7178ac4f65569975efadc97db2f3749c65e'
  Running command git fetch -q https://github.com/huggingface/trl a3c5b7178ac4f65569975efadc97db2f3749c65e
  Running command git checkout -q a3c5b7178ac4f65569975efadc97db2f3749c65e
  Resolved https://github.com/huggingface/trl to commit a3c5b7178ac4f65569975efadc97db2f3749c65e
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f
  Cloning https://github.com/hugg

### Setup flash attention

In [None]:
import torch; assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

!pip install ninja packaging
!MAX_JOBS=4 pip install flash-attn --no-build-isolation

### Log in to the Hugging Face Hub (model versioning service)

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token in the notebook. The token that was
# previously hard-coded here has been exposed and must be revoked in the
# Hugging Face account settings.
# The token is read from the HF_TOKEN environment variable; when that is
# unset or empty, the user is prompted for it without echoing the input.
hf_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')

login(
    token=hf_token,
    add_to_git_credential=True,
    write_permission=True,
)

### Load dataset and convert to instruction or conversational format

#### Conversational format
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

#### Instruction format

{"prompt": "prompt text", "completion": "ideal generated text"}

{"prompt": "prompt text", "completion": "ideal generated text"}

{"prompt": "prompt text", "completion": "ideal generated text"}


In [7]:
from datasets import load_dataset

# System prompt template; `{domain}` is filled in per sample with that
# sample's 'domain' field.
system_message = """You are the robot TARS that translates a domain.pddl file and a natural language instruction into a collection of instances, predicates and goals in an exact format. Users will give you a natural language command and you will generate the correct output based on the provided DOMAIN.
DOMAIN:
{domain}"""


def create_conversation(sample):
    """Convert one raw record into the conversational training format.

    Args:
        sample: Mapping with the keys 'domain', 'input' and 'output'.

    Returns:
        A dict with a single key 'messages' holding three role/content
        dicts in the order system, user, assistant. The system message is
        `system_message` with the sample's domain substituted in.
    """
    system_prompt = system_message.format(domain=sample['domain'])
    turns = (
        ('system', system_prompt),
        ('user', sample['input']),
        ('assistant', sample['output']),
    )
    return {'messages': [{'role': role, 'content': text} for role, text in turns]}

# JSON-lines file to load. Note: this rebinds `raw_dataset`.
dataset_path = 'output_and_pddl_10_samples.jsonl'
raw_dataset = load_dataset(path='json', data_files=dataset_path)


{'domain': Value(dtype='string', id=None),
 'input': Value(dtype='string', id=None),
 'output': Value(dtype='string', id=None)}

In [10]:
# Apply create_conversation to every record individually (not in batches).
dataset = raw_dataset.map(function=create_conversation, batched=False)

# Write the train split to 'train.jsonl'. The call is left as the last
# expression so its return value is shown as the cell output.
dataset['train'].to_json('train.jsonl', orient='records')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

81508

### Load formatted dataset

In [15]:
# Reload the exported conversations from disk through the 'json' builder.
formatted_dataset = load_dataset(path='json', data_files='train.jsonl')

Generating train split: 0 examples [00:00, ? examples/s]

### Configure model and tokenizer for Q-LoRA

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format

# Hub id of the base checkpoint to fine-tune.
base_model_id = 'mistralai/Mistral-7B-Instruct-v0.2'

# Quantization settings: 4-bit loading with the 'nf4' quant type, double
# quantization enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model with the Flash Attention 2 implementation
# and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation='flash_attention_2',
    device_map='auto',
)

# Tokenizer of the same checkpoint, padding on the left.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.padding_side = 'left'  # left for Mistral

# TRL helper that prepares the pair for chat-formatted training; both
# objects are replaced by the versions it returns.
model, tokenizer = setup_chat_format(model, tokenizer)

RuntimeError: No GPU found. A GPU is needed for quantization.

In [12]:
from peft import LoraConfig

# LoRA adapter settings; the values are taken from the QLoRA paper
# (per the original author's note).
peft_config = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules='all-linear',
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias='none',
)

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- output, checkpoints, logging ---
    output_dir=os.path.join(output_dir, 'qlora_finetuned'),  # under the configured models directory
    save_strategy="epoch",                  # write a checkpoint at the end of every epoch
    logging_steps=10,                       # log every 10 steps
    report_to="tensorboard",                # send metrics to TensorBoard
    # push_to_hub=True,                     # push model to hub (disabled)
    # --- schedule and batch sizes ---
    num_train_epochs=3,                     # passes over the training data
    per_device_train_batch_size=3,          # samples per device per step
    gradient_accumulation_steps=2,          # micro-batches accumulated before each optimizer update
    # --- optimizer ---
    optim="adamw_torch_fused",              # fused AdamW implementation
    learning_rate=2e-4,                     # based on the QLoRA paper
    max_grad_norm=0.3,                      # gradient clipping, based on the QLoRA paper
    warmup_ratio=0.03,                      # based on the QLoRA paper
    # NOTE(review): the plain "constant" scheduler applies no warmup, so
    # warmup_ratio above has no effect; "constant_with_warmup" would use it.
    lr_scheduler_type="constant",           # constant learning rate
    # --- precision and memory ---
    gradient_checkpointing=True,            # trade compute for lower memory use
    bf16=True,                              # bfloat16 precision
    tf32=True,                              # tf32 precision (rejected on unsupported GPUs)
)

ValueError: --tf32 requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7

### Create SFT Trainer

In [16]:
from trl import SFTTrainer

# SFTTrainer needs a single split, not the DatasetDict from load_dataset.
train_split = formatted_dataset['train']

# Longest conversation in tokens. Each entry of the 'messages' column is a
# list of {'role', 'content'} dicts, so the whole conversation is rendered
# through the tokenizer's chat template and tokenized as one sequence.
# (Previously each conversation list was indexed with ['content'], which
# raises TypeError.)
max_seq_len = max(
    len(tokenizer.apply_chat_template(messages, tokenize=True))
    for messages in train_split['messages']
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    peft_config=peft_config,
    # The keyword is `max_seq_length`; `max_seq_len` is not accepted.
    # Sequences are capped at 4096 tokens.
    max_seq_length=min(max_seq_len, 4096),
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        'add_special_tokens': False,
        'append_concat_token': False,
    }
)

NameError: name 'tokenizer' is not defined

### Start training

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Save the trained model; no path is passed, so presumably it is written to
# the trainer's configured output directory — confirm.
trainer.save_model()

In [None]:
import gc

# Drop the notebook's references to the model and the trainer.
del model
del trainer

# Collect unreachable objects first: anything still held in a reference
# cycle keeps its GPU tensors alive, and empty_cache() can only release
# memory that is no longer in use.
gc.collect()
torch.cuda.empty_cache()