**Setup development environment**

The first step is to install the Hugging Face libraries and PyTorch, including:

- torch
- trl (transformer reinforcement learning)
- transformers
- peft (parameter-efficient fine-tuning)
- datasets
- accelerate
- bitsandbytes
- flash-attn

All required libraries are already installed. Just enable the conda environment called tuning (i.e., `conda activate tuning`).

In [None]:
import os
import torch
import bitsandbytes as bnb
import datetime
import json
import math
import matplotlib.pyplot as plt
from random import randrange
from typing import Optional, List, Dict
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from transformers.utils import is_accelerate_available, is_bitsandbytes_available
from transformers.trainer import TRAINER_STATE_NAME
from trl import SFTTrainer

In [None]:
print("Is accelerate available? ", is_accelerate_available())
print("Is bitsandbytes available? ", is_bitsandbytes_available())
!transformers-cli env

In [None]:
!nvidia-smi

**Create and prepare dataset**

There are several ways to create datasets for LLM fine-tuning, including:
- using existing open-source datasets
- using LLMs to create synthetic datasets
- using humans to create datasets 
- using a combination of the above methods

We use an already existing dataset from Hugging Face called: [CodeInstructions](https://huggingface.co/datasets/TokenBender/code_instructions_122k_alpaca_style)

In [None]:
# load existing dataset from Hugging Face with load_dataset()
dataset = #TODO

# shuffle dataset and select a sample set (e.g., 100 samples)
dataset = #TODO

# print dataset 
#TODO

# print the instruction, input, and output of a sample from the dataset 
# TODO


**Format dataset**

To use instruction fine-tuning with Mistral, we need to wrap our prompts in [INST] and [/INST] tokens. Additionally, the very first instruction must begin with a beginning-of-sequence token (`<s>`), and the assistant's generation must end with an end-of-sequence token (`</s>`).

In [None]:
MISTRAL_TEMPLATE = """<s>[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction: {} 

### Input: {} [/INST] 

{}</s>"""

# for each sample, return a dict with the keys 'text', 'instruction', 'input', and 'output', where 'text' is the final prompt
def format_data(sample: Dict):
    # TODO

# convert dataset using the format_data() method
dataset = # TODO

# split dataset into 80% training samples and 20% test samples
dataset = # TODO

# save train and test datasets to disk
# TODO
# TODO


**Load Model and Tokenizer**

1. Define quantization configuration
2. Load base model 
3. Load tokenizer
4. Inference with base model

In [None]:
# load bnb config with load in 4bits, double quantization, NF4 as quantization type, and torch.float16 as computation type
bnb_config = BitsAndBytesConfig(
    # TODO
)

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# load base model with QLoRA configuration, device_map, and flash-attention
base_model = AutoModelForCausalLM.from_pretrained(
    # TODO
)

# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.unk_token          # we want the pad_token to be different from the eos token
tokenizer.padding_side = 'right'                   # prevent warnings

# add special tokens to indicate the start and end of a prompt
#tokenizer.add_special_tokens({"bos_token": "<s>"})
#tokenizer.add_special_tokens({"eos_token": "</s>"})
#tokenizer.add_special_tokens({"unk_token": "<unk>"})

In [None]:
# Baseline check: query the base model (before fine-tuning) with a plain
# prompt string, i.e. without the [INST] ... [/INST] template.
prompt = """Print hello world in python c and c++."""

# switch to evaluation mode and disable gradient tracking for generation
base_model.eval()
with torch.no_grad():
    # tokenize the prompt and move the token ids to the GPU
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
    # pad_token_id is the unk token id, matching tokenizer.pad_token = tokenizer.unk_token above
    output = base_model.generate(input_ids=input_ids, max_new_tokens=1000, pad_token_id=tokenizer.unk_token_id)
    # decode the first (and only) sequence of the batch, dropping special tokens
    response = tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens=True)[0]

    print(response)

**Define LoRA Config**

Since we do not train all the parameters but only a small subset, we have to add the LoRA adapters to the model using `peft`.

1. Find trainable layers
2. Define LoRA Config
3. Get PEFT Model

In [None]:
# find all linear layers
def find_all_linear_names(model):
    """Collect the names of the 4-bit linear layers that LoRA can target.

    Walks ``model.named_modules()`` and, for every module that is an
    instance of ``bnb.nn.Linear4bit``, records the last component of its
    dotted name (e.g. ``"c"`` for ``"a.b.c"``).

    Args:
        model: Object exposing ``named_modules()`` that yields
            ``(name, module)`` pairs.

    Returns:
        A sorted list of the unique layer names found, with ``'lm_head'``
        excluded.
    """
    linear_cls = bnb.nn.Linear4bit
    # rpartition('.')[2] is the text after the last dot, or the whole name
    # when it contains no dot, so top-level modules need no special case.
    lora_module_names = {
        name.rpartition('.')[2]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    lora_module_names.discard('lm_head')  # needed for 16-bit

    # Sort so the result does not depend on set iteration order.
    return sorted(lora_module_names)


# collect the names of all 4-bit linear layers of the base model as LoRA targets
modules = find_all_linear_names(base_model)

# candidate layer names noted for this model:
# ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj", "lm_head"]
# NOTE: find_all_linear_names() removes "lm_head", so it will not appear in the printed list

print(modules)

In [None]:
# define LoRA config with r, lora_alpha, lora_dropout, target_modules, bias, and task_type
lora_config = LoraConfig(
    #TODO
)

In [None]:
# get PEFT model: wrap the base model according to lora_config
peft_model = get_peft_model(base_model, lora_config)

# report how many parameters are trainable compared to the total parameter count
trainable, total = peft_model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

**Fine-tune LLM**

To fine-tune an LLM, we first need to specify the hyperparameters. Hyperparameters can significantly impact the model performance.

1. Define training arguments
2. Define SFTTrainer
3. Start Fine-Tuning
4. Print Loss

In [None]:
# directory for checkpoints and the final model; created if it does not exist yet
output_dir = "../data/saved_models/mistral-instruct"
os.makedirs(output_dir, exist_ok=True)
# timestamp that is appended to the run name below
time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

# define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,                                # directory where checkpoints and outputs are written
    # NOTE(review): both max_steps and num_train_epochs are set; per the transformers
    # documentation a positive max_steps takes precedence -- confirm this is intended
    max_steps=20,                                         # total number of training steps to perform
    num_train_epochs=1,                                   # number of training epochs
    per_device_train_batch_size=3,                        # batch size per device during training
    gradient_accumulation_steps=2,                        # batches accumulated before each optimizer update (3 * 2 = 6 samples per device per update)
    gradient_checkpointing=True,                          # use gradient checkpointing to save memory
    optim="adamw_torch_fused",                            # use fused adamw optimizer
    logging_steps=10,                                     # log every 10 steps
    save_strategy="steps",                                # save checkpoints on a step interval (see save_steps)
    save_steps=10,                                        # number of update steps between two checkpoint saves
    save_total_limit=10,                                  # limit the total amount of checkpoints
    learning_rate=2e-4,                                   # learning rate, based on QLoRA paper
    # NOTE(review): the quantization cell above asks for torch.float16 as the
    # compute type while training uses bf16 -- confirm the dtypes are meant to differ
    bf16=True,                                            # use bfloat16 precision
    tf32=True,                                            # use tf32 precision
    max_grad_norm=0.3,                                    # max gradient norm based on QLoRA paper
    # NOTE(review): presumably the warmup ratio has no effect with the "constant"
    # scheduler ("constant_with_warmup" would apply it) -- confirm
    warmup_ratio=0.03,                                    # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",                         # use constant learning rate scheduler
    run_name="mistral-instruct_{time}".format(time=time), # run name including the timestamp from above
    report_to="mlflow",                                   # report metrics to mlflow
)

In [None]:
# load training dataset
train_dataset = # TODO

max_seq_length = 8128 # max sequence length for model and packing of the dataset

# define SFTTrainer with model, training arguments, train data, peft_config, max_seq_length, tokenizer, dataset_text_field, packing
trainer = SFTTrainer(
    # TODO
)

# start training and collect the metrics reported by the trainer
train_result = trainer.train()
metrics = train_result.metrics
# log and persist the training metrics, then store the trainer state
# (presumably the state file that plot_loss() later reads from output_dir -- confirm)
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
print(metrics) 

# save model and tokenizer side by side in <output_dir>/final_checkpoint/
trainer.model.save_pretrained(os.path.join(output_dir, "final_checkpoint/"))
trainer.tokenizer.save_pretrained(os.path.join(output_dir, "final_checkpoint/"))

In [None]:
def smooth(scalars: List[float]) -> List[float]:
    """Return an exponentially smoothed copy of ``scalars``.

    Each output value is ``previous * weight + (1 - weight) * value``, where
    ``previous`` is the preceding smoothed value (seeded with the first
    element). The weight is a scaled sigmoid of the series length, so longer
    series are smoothed more strongly; it stays below 0.9.

    Args:
        scalars: The raw values, in order.

    Returns:
        A new list with one smoothed value per input value. An empty input
        yields an empty list.
    """
    # Guard against empty input; indexing scalars[0] below would raise IndexError.
    if len(scalars) == 0:
        return []

    weight = 1.8 * (1 / (1 + math.exp(-0.05 * len(scalars))) - 0.5)  # a sigmoid function
    last = scalars[0]
    smoothed = []
    for next_val in scalars:
        last = last * weight + (1 - weight) * next_val
        smoothed.append(last)
    return smoothed


def plot_loss(save_directory: os.PathLike, keys: Optional[List[str]] = None):
    """Plot logged training metrics and save one PNG per metric.

    Reads the trainer state file (``TRAINER_STATE_NAME``) inside
    ``save_directory``, extracts every ``log_history`` entry that contains
    the requested key, and plots the raw and smoothed curves against the
    logged step. Each figure is written to
    ``<save_directory>/training_<key>.png``.

    Args:
        save_directory: Directory containing the trainer state file; the
            figures are saved there as well.
        keys: Names of the logged metrics to plot. Defaults to ``["loss"]``
            when omitted or ``None``.
    """
    # Resolve the default here instead of using a mutable default argument.
    if keys is None:
        keys = ["loss"]

    with open(os.path.join(save_directory, TRAINER_STATE_NAME), "r", encoding="utf-8") as f:
        data = json.load(f)

    for key in keys:
        steps, metrics = [], []
        for entry in data["log_history"]:
            if key in entry:
                steps.append(entry["step"])
                metrics.append(entry[key])

        if not metrics:
            print(f"No metric {key} to plot.")
            continue

        figure_path = os.path.join(save_directory, "training_{}.png".format(key))

        plt.figure()
        plt.plot(steps, metrics, alpha=0.4, label="original")
        plt.plot(steps, smooth(metrics), label="smoothed")
        plt.title("training {} of {}".format(key, save_directory))
        plt.xlabel("step")
        plt.ylabel(key)
        plt.legend()
        plt.savefig(figure_path, format="png", dpi=100)
        print("Figure saved:", figure_path)

# plot loss
plot_loss(save_directory=output_dir)

**Inference with the fine-tuned Model**

1. Load test dataset and prepare test prompt
2. Load base model and tokenizer
3. Merge weights
4. Run inference with fine-tuned model

In [None]:
# load test dataset
test_dataset = #TODO

MISTRAL_INFERENCE_TEMPLATE = """<s>[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction: {} 

### Input: {} [/INST]"""

# select a test sample
test_sample = #TODO

# create test prompt
prompt = #TODO

# print prompt
#TODO

In [None]:
# load base model
base_model = AutoModelForCausalLM.from_pretrained(
    #TODO
)

# load tokenizer
tokenizer = #TODO
tokenizer.pad_token = #TODO

In [None]:
# specify adapter path
adapter_path = #TODO

# load the LoRA adapter from adapter_path on top of the base model,
# then merge the LoRA weights into the base model
merged_model= PeftModel.from_pretrained(base_model, adapter_path)
merged_model = merged_model.merge_and_unload()

# save merged model and tokenizer together in <output_dir>/final-merged-checkpoint
merged_model.save_pretrained(os.path.join(output_dir, "final-merged-checkpoint"), safe_serialization=True, max_shard_size='4GB')
tokenizer.save_pretrained(os.path.join(output_dir, "final-merged-checkpoint"))

In [None]:
# Run inference with the fine-tuned model
#TODO