# Module 2 Project 3 Part 2: Finetuning

- Finetune a model on instruction-formatted data using LoRA via the PEFT library. Find a dataset from HuggingFace to make this easier.
- Perform supervised finetuning on this dataset.
- Must be run on GPU

## STEP 1: IMPORTS
- We want `torch`, `numpy`, `transformers`, and `datasets` for our data
- We will also be using [LoRA](https://arxiv.org/abs/2106.09685) and [SFT](https://huggingface.co/docs/trl/main/en/sft_trainer) here for the supervised finetune trainer

In [None]:
import torch
import torch.nn as nn
import numpy as np

from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM

from functools import partial

import os

from datasets import load_dataset
from datasets import DatasetDict
from transformers import LlamaTokenizer

import re
import random
from multiprocessing import cpu_count

from transformers import BitsAndBytesConfig

from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

## STEP 2: LOAD THE DATA
- We want to load [ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) dataset from HuggingFace to use for our finetune
- Separate the data into train and test pieces
- Display a sample of the messages used in this dataset

In [None]:
# Download every split of UltraChat 200k from the Hugging Face Hub.
raw_datasets = load_dataset("HuggingFaceH4/ultrachat_200k")

# Only the first 100 rows of each SFT split are kept.
indices = range(0,100)

dataset_dict = {
    split: raw_datasets[f"{split}_sft"].select(indices)
    for split in ("train", "test")
}

raw_datasets = DatasetDict(dataset_dict)
print(raw_datasets)

# Look at the first training row and the fields it carries.
example = raw_datasets["train"][0]
print(example.keys())

# Print the conversation one turn per line, role padded to 20 characters.
messages = example["messages"]
for message in messages:
    print(f"{message['role']:20}:  {message['content']}")

## STEP 3: MODEL AND TOKENIZER SETUP
- Using [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) as model choice here
- Our tokenizer will be the LlamaTokenizer
- Set default chat template and create a method to apply it to a batch of data
- Apply chat template to all data in the training and testing sets

In [None]:
# Checkpoint used for both the tokenizer and the model.
model_id = "mistralai/Mistral-7B-v0.1"

tokenizer = LlamaTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(model_id)

# Reuse the EOS token for padding when the tokenizer has no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Replace an extremely large reported max length with 2048.
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 2048

# Jinja chat template: each turn is rendered as a role tag, the content, and
# the EOS token; an assistant tag is appended when a generation prompt is
# requested. The adjacent literals below concatenate into one string.
DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}\n"
    "{% if message['role'] == 'user' %}\n"
    "{{ '<|user|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'system' %}\n"
    "{{ '<|system|>\n' + message['content'] + eos_token }}\n"
    "{% elif message['role'] == 'assistant' %}\n"
    "{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n"
    "{% endif %}\n"
    "{% if loop.last and add_generation_prompt %}\n"
    "{{ '<|assistant|>' }}\n"
    "{% endif %}\n"
    "{% endfor %}"
)
tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE

def apply_chat_template(example, tokenizer):
    """Render one conversation into a single training string.

    An empty system message is prepended when the conversation does not
    already start with one (including when it has no turns at all).

    Args:
        example: Mapping with a "messages" entry holding a list of
            ``{"role": ..., "content": ...}`` dicts.
        tokenizer: Object exposing ``apply_chat_template(messages, tokenize=False)``.

    Returns:
        The same ``example`` mapping, with "messages" set to the (possibly
        system-prefixed) turn list and "text" set to the rendered string.
    """
    # Work on a copy so the list object passed in by the caller is never
    # modified in place.
    messages = list(example["messages"])
    # `not messages` guards the index below against an empty conversation.
    if not messages or messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["messages"] = messages
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

# Columns present before mapping; they are all removed by the map call.
column_names = list(raw_datasets["train"].features)
raw_datasets = raw_datasets.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=cpu_count(),
    remove_columns=column_names,
    desc="Applying chat template",
)
train_dataset, eval_dataset = raw_datasets["train"], raw_datasets["test"]

# Print three randomly chosen rendered training samples.
for index in random.sample(range(len(train_dataset)), 3):
    sample_text = train_dataset[index]["text"]
    print(f"Sample {index} of the processed training set:\n\n{sample_text}")

## STEP 4: CONFIG AND ARGS
- Set up our quantization config to load the model in 4-bit (necessary for running on a single RTX 3080 GPU)
- Set device map to 'cuda' for GPU access
- Set model arguments to use the quantization config and `flash_attention_2`

In [None]:
# 4-bit NF4 quantization settings for loading the base model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # Pass the dtype object itself. String values are resolved as attribute
    # names on the torch module, so "torch.bfloat16" (with the module prefix)
    # is not a valid value.
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Place the whole model on the current CUDA device when one is available.
device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

# Keyword arguments forwarded to the model loader by the trainer.
model_kwargs = dict(
    attn_implementation="flash_attention_2",
    torch_dtype="auto",
    use_cache=False,
    device_map=device_map,
    quantization_config=quantization_config,
)

## STEP 5: CREATE LORA LAYER
- We will manually create a LoRA layer to finetune this model with SFT (not necessary but just a conceptual exercise)
- We set our A and B values to approximate our weight update matrix
- We build a LinearWithLoRA layer to be used in place of the linear layers in our model

In [None]:
# Freeze every existing parameter of the base model.
model.requires_grad_(False)

class LoRALayer(nn.Module):
    """Low-rank adapter that computes ``alpha * (x @ A @ B)``.

    ``A`` has shape ``(in_dim, rank)`` and is drawn from a standard normal
    scaled by ``1 / sqrt(rank)``. ``B`` has shape ``(rank, out_dim)`` and
    starts at zero, so a freshly constructed layer outputs zeros.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.alpha = alpha
        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * init_scale)
        self.B = nn.Parameter(torch.zeros(rank, out_dim))

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        projected = torch.matmul(torch.matmul(x, self.A), self.B)
        return self.alpha * projected

class LinearWithLoRA(nn.Module):
    """Wrap a linear layer and add a LoRA branch to its output.

    The adapter is sized from the wrapped layer's ``in_features`` and
    ``out_features``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            in_dim=linear.in_features,
            out_dim=linear.out_features,
            rank=rank,
            alpha=alpha,
        )

    def forward(self, x):
        """Return the wrapped layer's output plus the adapter's output."""
        base_out = self.linear(x)
        adapter_out = self.lora(x)
        return base_out + adapter_out

# Print the model's module tree; no LoRA wrappers are attached in this cell.
print(model)

## STEP 6: ASSIGN LORA LAYERS TO MODEL
- Go through the layers in our model
- Again, this is just for a mental exercise to understand what happens with LoRA

In [None]:
# LoRA hyperparameters for the hand-rolled adapters.
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05  # NOTE: not consumed here; LinearWithLoRA takes only rank and alpha
# Switches selecting which linear layers get wrapped.
lora_query = True
lora_key = False
lora_value = True
lora_projection = False
lora_mlp = False
lora_head = False

layers = []  # unused in this cell; kept in case other cells reference it

# Factory that wraps a linear layer with the rank/alpha chosen above.
assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)

# `model` is the Mistral causal LM loaded from `model_id`, so the decoder
# blocks are reached through `model.model.layers` and the attention
# projections use the same q_proj/k_proj/v_proj/o_proj names that the PEFT
# config targets. (DistilBERT-style names such as `q_lin` do not exist here.)
for layer in model.model.layers:
    if lora_query:
        layer.self_attn.q_proj = assign_lora(layer.self_attn.q_proj)
    if lora_key:
        layer.self_attn.k_proj = assign_lora(layer.self_attn.k_proj)
    if lora_value:
        layer.self_attn.v_proj = assign_lora(layer.self_attn.v_proj)
    if lora_projection:
        layer.self_attn.o_proj = assign_lora(layer.self_attn.o_proj)
    if lora_mlp:
        layer.mlp.gate_proj = assign_lora(layer.mlp.gate_proj)
        layer.mlp.up_proj = assign_lora(layer.mlp.up_proj)
        layer.mlp.down_proj = assign_lora(layer.mlp.down_proj)
if lora_head:
    # A causal LM has a single output projection instead of a classifier pair.
    model.lm_head = assign_lora(model.lm_head)

# Print the module tree again so the wrapped layers are visible.
print(model)

## STEP 7: TRAINING ARGUMENTS
- Set up our training arguments with 128 gradient accumulation steps and a LR of 2e-05
- Create a [Parameter Efficient Finetuning](https://github.com/huggingface/peft) config to use with LoRA SFT. This defines the LoRA parameters like alpha
- Set up the SFTTrainer using our datasets and our model
- Start finetune loop

In [None]:
# Directory the fine-tuned weights are written to. It must be defined before
# TrainingArguments is built; the inference cell loads from the same path.
output_dir = "model_output_location"

training_args = TrainingArguments(
    fp16=True, # specify bf16=True instead when training on GPUs that support bf16
    do_eval=True,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=128,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=1,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    save_strategy="no",
    save_total_limit=None,
    seed=42,
)

# LoRA settings applied by PEFT to the attention projections.
peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# The trainer receives the checkpoint name and loads the model itself using
# `model_kwargs`.
trainer = SFTTrainer(
        model=model_id,
        model_init_kwargs=model_kwargs,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        tokenizer=tokenizer,
        packing=True,
        peft_config=peft_config,
        max_seq_length=tokenizer.model_max_length,
    )

trainer.train()
trainer.save_state()
# save_strategy="no" writes no checkpoints and save_state() stores only the
# trainer state, so persist the trained weights explicitly for later loading.
trainer.save_model(output_dir)

## STEP 8: DISPLAY OUTPUT
- To confirm our finetune worked, grab the model path output from the last save
- We will then create a new tokenizer and model instance, and then an initial message template
- Then we generate the output at max 256 new tokens and a temperature of 0.7

In [None]:
# Directory holding the fine-tuned model and tokenizer.
output_dir = "model_output_location"

# AutoTokenizer is not imported at the top of the file, so import it here
# alongside the model class.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

# Conversation used to probe the fine-tuned model.
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "What year did the London Bridge cross the ocean a second time?"},
]

# Render the chat with a trailing generation prompt and move the ids to the
# device the model was placed on, instead of assuming a fixed "cuda" device.
input_ids = tokenizer.apply_chat_template(
    messages,
    truncation=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Sample up to 256 new tokens.
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])