<a href="https://colab.research.google.com/github/satishbteli/Sparkov_Data_Generation/blob/master/Session%2B2_3%2B02_finetune_mistral_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective

Illustrate LoRA fine-tuning of Mistral 7b on a dataset of dialogue summarization.

# Setup

In [None]:
# Install Unsloth straight from its GitHub repository using the `colab-new` extra.
# NOTE(review): no tag or commit is pinned, so a fresh run may resolve to a newer
# Unsloth than the 2024.11.7 build shown in the captured outputs of this notebook.
!pip install -q "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install the required versions of additional libraries.
# `--no-deps` installs only the packages listed, without resolving their dependencies.

!pip install -q --no-deps xformers==0.0.27.post2 "trl<0.9.0" peft accelerate bitsandbytes
# Dataset loading plus evaluation-metric packages (ROUGE / BERTScore).
!pip install -q datasets==2.16.1 evaluate rouge_score bert_score
# Pinned Triton build; unlike the installs above, this one is not run with `-q`.
!pip install triton==3.0.0


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.9/111.9 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m17.7 MB/s[0m eta [36m0:0

In [None]:
import torch

from unsloth import FastLanguageModel
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


The functionalities we use from the above packages are:
- `transformers`, `datasets`: helpers to load models and datasets from the HuggingFace ecosystem
- `unsloth`: facilitates application of QLoRA in conjunction with peft on 4-bit quantized base models
- `trl`: abstractions to train the LoRA adapter

# Model

In [None]:
# Load the pre-quantized 4-bit Mistral-7B-Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length=2048,  # the same value is passed to SFTTrainer later; keep the two in sync
    dtype=None,  # no explicit dtype requested; presumably Unsloth picks one for the GPU (confirm in its docs)
    load_in_4bit=True  # weights load as 4-bit layers (Linear4bit in the model printout)
)

==((====))==  Unsloth 2024.11.7: Fast Mistral patching. Transformers = 4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [None]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): Mis

In [None]:
tokenizer

LlamaTokenizerFast(name_or_path='unsloth/mistral-7b-instruct-v0.2-bnb-4bit', vocab_size=32000, model_max_length=32768, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

By default, the tokenizer adds a beginning-of-sequence token but does not add an end-of-sequence token. We will need to explicitly add this during training.

In [None]:
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, False)

In [None]:
# End-of-sequence marker ('</s>' for this tokenizer). prompt_formatter appends it to
# every training prompt because the tokenizer itself does not add it (add_eos_token is False).
EOS_TOKEN = tokenizer.eos_token

# Prepare Data

Source: https://huggingface.co/datasets/knkarthick/dialogsum

In [None]:
dataset = load_dataset("knkarthick/dialogsum")

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating validation split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [None]:
# Work on a small, reproducible subset: shuffle each split with a fixed seed,
# then keep the first `train_size` / `validation_size` rows.
train_size, validation_size = 300, 50
training_dataset = dataset['train'].shuffle(seed=42).select(range(train_size))
validation_dataset = dataset['validation'].shuffle(seed=42).select(range(validation_size))

The Alpaca instruction prompt is a general purpose prompt template that can be adapted to any task.

In [None]:
# Template with three positional `{}` slots, filled in order: the instruction, the
# input (the dialogue), and the response (the reference summary when training, or an
# empty string when the model is expected to generate it).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
def prompt_formatter(example, prompt_template):
    """Turn one dialogue/summary record into a complete training prompt.

    Args:
        example: Mapping with a "dialogue" entry (the conversation text) and a
            "summary" entry (its reference summary).
        prompt_template: Format string with three positional ``{}`` slots, filled
            in order with the instruction, the dialogue, and the summary.

    Returns:
        A dict with the single key ``'formatted_prompt'`` holding the filled-in
        template followed by the module-level ``EOS_TOKEN``.
    """
    filled_template = prompt_template.format(
        'Write a concise summary of the following dialogue.',
        example["dialogue"],
        example["summary"],
    )

    # The end-of-sequence marker is appended by hand so each training text ends with it.
    return {'formatted_prompt': filled_template + EOS_TOKEN}

Notice how we are adding the end-of-sequence token to the prompt.

In [None]:
# Run prompt_formatter over every training row; its returned 'formatted_prompt'
# value becomes a column that the trainer later reads as the training text.
formatted_training_dataset = training_dataset.map(
    prompt_formatter,
    fn_kwargs={'prompt_template': alpaca_prompt}
)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Apply the identical formatting to the validation rows so that the evaluation
# loss is computed on prompts shaped exactly like the training prompts.
formatted_validation_dataset = validation_dataset.map(
    prompt_formatter,
    fn_kwargs={'prompt_template': alpaca_prompt}
)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Let us now inspect one sample.

In [None]:
formatted_training_dataset[0]

{'id': 'train_6155',
 'dialogue': "#Person1#: Hello, Anna speaking!\n#Person2#: Hey, Anna, this is Jason.\n#Person1#: Jason, where have you been hiding lately? You know it's been a long time since your last call. Have you been good?\n#Person2#: Yes. How are you, Anna?\n#Person1#: I am fine. What have you been doing?\n#Person2#: Working. I've been really busy these days. I got a promotion.\n#Person1#: That's great, congratulations!\n#Person2#: Thanks. I am feeling pretty good about myself too. You know, bigger office, a raise and even an assistant.\n#Person1#: That's good. So I guess I'll have to make an appointment to see you.\n#Person2#: You are kidding.\n#Person1#: How long have you been working there?\n#Person2#: A bit over two years. This is a fast-moving company, and seniority isn't the only factor in deciding promotions.\n#Person1#: How do you like your new boss?\n#Person2#: She is very nice and open-minded.\n#Person1#: Much better than the last one, huh?\n#Person2#: Yeah. He was

# Fine-Tuning

We now patch in the adapter modules to the base model using the `get_peft_model` method.

> Practical Tip: $r$ defines the rank (inner dimension) of the low-rank matrices, while $\alpha$ determines the scaling factor applied to the adapter weight matrices. It is common to fix $\alpha=16$ while varying $r = \alpha, \alpha/2, \alpha/4$, and to pick the value of $r$ that gives the lowest validation loss (note that we use the same loss used for the base model, e.g., perplexity or log loss).

In [None]:
# Attach LoRA adapters to the 4-bit base model. `model` is rebound to the wrapped
# model, so re-running this cell would pass an already-wrapped model back in.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # rank of the low-rank adapter matrices
    lora_alpha=16,  # LoRA scaling factor (alpha)
    lora_dropout=0,  # no adapter dropout (shows up as Identity() in the model printout)
    bias="none",
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
                      "gate_proj", "up_proj", "down_proj"],  # MLP projections
    use_gradient_checkpointing=True,
    random_state=42,
    loftq_config=None
)

Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

Notice how LoRA adapters are attached to the layers specified during instantiation.

For training, we use the following nuances borrowed from the broader deep learning discipline.

- Low learning rates for smooth parameter updates
- Early stopping based on the validation loss (negative log likelihood in this case)
- Checkpointing to enable resumption of training


In [None]:
# NOTE(review): no cell visible in this notebook imports `schedulefree`, and the trainer
# is configured with optim="adamw_hf" -- confirm that this install is still required.
!pip install schedulefree



In [None]:
# Supervised fine-tuning run over the LoRA-wrapped model: trains on the
# "formatted_prompt" column and evaluates on the formatted validation subset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_training_dataset,
    eval_dataset=formatted_validation_dataset,
    # Stop early once eval_loss has failed to improve for 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    dataset_text_field = "formatted_prompt",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False, # Packing (which increases efficiency for short sequences) is left disabled here.
    args = TrainingArguments(
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,  # 2 x 4 = effective batch of 8 ("Total batch size" in the training log)
        warmup_steps=5,
        num_train_epochs=10,  # upper bound; early stopping may end the run sooner
        # Evaluate and checkpoint once per epoch so that the checkpoint with the
        # lowest eval_loss can be reloaded when training ends.
        eval_strategy="epoch",
        save_strategy='epoch',
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
        greater_is_better=False,  # eval_loss: lower is better
        learning_rate=5e-5,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_hf",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=42,
        output_dir="outputs",  # checkpoints are written under this directory
        report_to = 'none'
    )
)

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
training_history = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 300 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 370
 "-____-"     Number of trainable parameters = 41,943,040


Epoch,Training Loss,Validation Loss
0,1.1309,1.195589
2,1.2016,1.170511
4,0.6545,1.336014


Epoch,Training Loss,Validation Loss
0,1.1309,1.195589
2,1.2016,1.170511
4,0.6545,1.336014
6,0.6484,1.462559


As we can see above, the early stopping callback ensured that training stopped when validation loss was not reducing.

In [None]:
training_history

TrainOutput(global_step=225, training_loss=0.9714556219842699, metrics={'train_runtime': 2325.6923, 'train_samples_per_second': 1.29, 'train_steps_per_second': 0.159, 'total_flos': 2.995596024594432e+16, 'train_loss': 0.9714556219842699, 'epoch': 6.0})

# Inference

In [None]:
test_dataset = dataset['test']

In [None]:
# Same instruction text that prompt_formatter uses when building the training prompts.
instruction='Write a concise summary of the following dialogue.'
# First test example: its dialogue is the model input, its summary the reference to compare against.
test_dialogue = test_dataset[0]['dialogue']
test_summary = test_dataset[0]['summary']

In [None]:
FastLanguageModel.for_inference(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

> Reminder: At this stage, we have the model + adapters patched in!

In [None]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

In [None]:
# Build the inference prompt from the same Alpaca template used for training,
# tokenize it as a batch of one, and move the resulting tensors to the GPU.
inputs = tokenizer(
[
    alpaca_prompt.format(
        instruction,
        test_dialogue,
        "", # leave output blank for generation
    )
], return_tensors="pt").to("cuda")

In [None]:
# Generate at most 128 new tokens for the prompt; the EOS id is also used as the padding id.
outputs = model.generate(
    **inputs,
    max_new_tokens=128,
    use_cache=True,
    pad_token_id=tokenizer.eos_token_id
)

In [None]:
# Show only the generated continuation: the slice drops the first
# `inputs.input_ids.shape[-1]` positions, i.e. the tokens of the prompt itself.
print(
    tokenizer.decode(
        outputs[0][inputs.input_ids.shape[-1]:],
        skip_special_tokens=True,
        # The keyword is spelled `clean_up_tokenization_spaces` (as in the tokenizer
        # repr); any other spelling is not a decode() parameter and has no effect.
        clean_up_tokenization_spaces=True
    )
)

#Person1# asks Ms. Dawson to take a dictation for him. #Person1# tells Ms. Dawson to write a memo that restricts all office communications to email correspondence and official memos. #Person1# also tells Ms. Dawson that any employee who persists in using Instant Messaging will face termination.


In [None]:
test_summary

'Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.'

Now that we have a fine-tuned model, we can save the model to disk.

# Save Trained Model

In [None]:
# @title Setup to enable bash commands
import locale

def getpreferredencoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, regardless of the process locale.

    Installed below as a replacement for ``locale.getpreferredencoding``; per this
    cell's title, this is what lets the shell commands in later cells run.

    Args:
        do_setlocale: Accepted only so the override matches the standard-library
            signature ``getpreferredencoding(do_setlocale=True)``; callers that
            pass it (e.g. ``getpreferredencoding(False)``) keep working. Ignored.

    Returns:
        The string ``"UTF-8"``.
    """
    return "UTF-8"

# Monkeypatch: every later locale.getpreferredencoding(...) call now returns "UTF-8".
locale.getpreferredencoding = getpreferredencoding

In [None]:
lora_model_name = "dialogue-summarizer-mistral"

In [None]:
# Write the fine-tuned model into the `lora_model_name` directory; the directory
# listing in the next cell shows that only adapter files are produced.
# NOTE(review): the tokenizer is not saved here -- presumably it is reloaded from the
# base checkpoint at inference time; confirm.
model.save_pretrained(lora_model_name)

In [None]:
!ls -lh {lora_model_name}

total 161M
-rw-r--r-- 1 root root  741 Nov 19 11:27 adapter_config.json
-rw-r--r-- 1 root root 161M Nov 19 11:27 adapter_model.safetensors
-rw-r--r-- 1 root root 5.0K Nov 19 11:27 README.md


As we can see from the output above, we save only the adapter
(since we can load the base model on-demand). In order to enable inference, we can export the saved model to a remote, secure location (in this case, Google Drive).

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r {lora_model_name} /content/drive/MyDrive