# Supervised Fine-Tuning of a Language Model

## Part 1 Install and import libraries

In [None]:
!pip install datasets
!pip install bitsandbytes
!pip install --upgrade transformers

In [None]:
# Python built-in libraries
import os

# Hugging face libraries
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)

# Pytorch libraries
import torch

# Other libraries

## Part 2 Set global parameters

In [None]:
# Baseline model
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Datasets
dataset_name = "GBaker/MedQA-USMLE-4-options"

# Output dir (passed to the Trainer as its `output_dir`)
output_path = "output_models"
# `exist_ok=True` keeps this cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(output_path, exist_ok=True)

## Part 3 Load model

In [None]:
# Tokenizer belonging to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# bitsandbytes settings: 4-bit weights, "nf4" quantization type,
# float16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the quantized base model and hand it straight to PEFT's
# k-bit training preparation helper
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
    )
)

In [None]:
# LoRA settings: the adapters are attached to the projection layers named below
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "up_proj",
    "down_proj",
]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the prepared base model with the LoRA adapters
model = get_peft_model(model, lora_config)

In [None]:
# This cell is for test: sanity-check the LoRA wrapping by printing the number
# of trainable parameters, the total parameter count and the trainable percentage.
model.print_trainable_parameters()

trainable params: 6,881,280 || all params: 1,550,595,584 || trainable%: 0.4438


## Part 4 Load dataset

In [None]:
# Define dataset preprocess function
def preprocess(data):
    """
    Convert one raw record into a single chat-formatted training string.

    Args:
        data: dict with the keys "question" (str), "answer_idx" (the answer
            letter) and "options" (dict mapping option letter -> option text).

    Returns:
        dict with one key, "text", holding a user turn (instruction, question
        and options) followed by an assistant turn that contains only the
        answer letter, each wrapped in <|im_start|> / <|im_end|> markers.
    """
    question = data["question"]
    answer = data["answer_idx"]

    # One "<letter>. <text>" line per option, in the dict's own order
    option_lines = []
    for letter, text in data["options"].items():
        option_lines.append(letter + ". " + text)
    options = "\n".join(option_lines)

    # User-facing instruction. The wording is part of the training data format
    # and is therefore reproduced verbatim.
    instruction = "".join([
        "Please answering the following question ",
        "by selecting the correct answer.\n\n",
        f"Question:\n {question}\n\n",
        f"Options: {options}\n\n",
        "Provide only the letter of the correct answer.",
    ])

    # Wrap instruction and answer as a user turn and an assistant turn
    chat_text = (
        "<|im_start|>user\n"
        f"{instruction} <|im_end|>\n"
        "<|im_start|>assistant\n"
        f"{answer} <|im_end|>\n"
    )

    return {"text": chat_text}


# Define dataset tokenization function
def tokenize(data):
    """
    Tokenize the "text" field of `data` with the notebook-level `tokenizer`.

    Truncation is enabled and padding is set to "max_length", both with a
    maximum length of 1024. The tokenizer's result is returned as is.
    """
    # Note from the original author: the longest input sequence length is 4424
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 1024,
    }
    return tokenizer(data["text"], **encode_options)

In [None]:
# Fetch the dataset from the hub
dataset = load_dataset(dataset_name)

# Replace the raw columns with the single chat-formatted "text" column
column_names = dataset["train"].column_names
dataset = dataset.map(preprocess, remove_columns=column_names)

# Tokenize both splits in batches; the "text" column is dropped afterwards
tokenize_map_kwargs = {"batched": True, "remove_columns": ["text"]}
train_dataset = dataset["train"].map(tokenize, **tokenize_map_kwargs)
test_dataset = dataset["test"].map(tokenize, **tokenize_map_kwargs)

# Collator for causal language modeling (masked-LM mode switched off)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

README.md:   0%|          | 0.00/654 [00:00<?, ?B/s]

phrases_no_exclude_train.jsonl:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

phrases_no_exclude_test.jsonl:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1273 [00:00<?, ? examples/s]

Map:   0%|          | 0/10178 [00:00<?, ? examples/s]

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

Map:   0%|          | 0/10178 [00:00<?, ? examples/s]

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

In [None]:
# This cell is for test: print the first tokenized training example to check
# the result of the preprocessing and tokenization steps.
print(train_dataset[0])

{'input_ids': [151644, 872, 198, 5501, 35764, 279, 2701, 3405, 553, 26301, 279, 4396, 4226, 382, 14582, 510, 362, 220, 17, 18, 4666, 6284, 20280, 5220, 518, 220, 17, 17, 5555, 12743, 367, 18404, 448, 19675, 5193, 4335, 2554, 13, 2932, 5302, 432, 3855, 220, 16, 1899, 4134, 323, 702, 1012, 92305, 8818, 16163, 803, 3015, 323, 4633, 69537, 15357, 8649, 13, 2932, 5937, 11074, 1632, 323, 374, 8110, 553, 264, 10668, 369, 1059, 19636, 13, 6252, 9315, 374, 220, 24, 22, 13, 22, 58472, 320, 18, 21, 13, 20, 30937, 701, 6543, 7262, 374, 220, 16, 17, 17, 14, 22, 22, 9465, 39, 70, 11, 27235, 374, 220, 23, 15, 44173, 11, 32415, 804, 525, 220, 16, 24, 44173, 11, 323, 23552, 49743, 374, 220, 24, 23, 4, 389, 3054, 3720, 13, 27379, 7006, 374, 27190, 369, 458, 19265, 315, 2783, 1975, 665, 41643, 9210, 8376, 28568, 323, 264, 89554, 84556, 13, 15920, 315, 279, 2701, 374, 279, 1850, 6380, 369, 419, 8720, 1939, 3798, 25, 362, 13, 53687, 292, 60497, 198, 33, 13, 356, 823, 376, 685, 87, 603, 198, 34, 13, 3155, 8

## Part 5 Config training arguments

In [None]:
# Config training arguments

# Batch geometry, named once so the step arithmetic below cannot drift away
# from the values handed to TrainingArguments.
per_device_batch_size = 8
gradient_accumulation_steps = 8

# Number of optimizer steps in one pass over the training set (single-device
# arithmetic, as in the original 8 * 8 divisor). Used to evaluate and save
# once per epoch; clamped to 1 so a tiny dataset does not yield 0.
steps_per_epoch = max(
    1, len(train_dataset) // (per_device_batch_size * gradient_accumulation_steps)
)

training_args = TrainingArguments(
    output_dir=output_path,
    do_eval=True,
    # The evaluation strategy defaults to "no", in which case `eval_steps` is
    # ignored and no evaluation runs during training; enable it explicitly.
    eval_strategy="steps",
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.05,
    logging_steps=10,
    eval_steps=steps_per_epoch,
    save_steps=steps_per_epoch,
    # load_best_model_at_end=True,
    save_total_limit=2,
    # The PEFT wrapper hides the base model's input arguments, so Trainer
    # cannot detect the label column on its own; name it explicitly so the
    # evaluation loss can be computed.
    label_names=["labels"],
    report_to="tensorboard"
)

## Part 6 Train

In [None]:
# Assemble the Trainer from the model, arguments, datasets and collator
# prepared in the previous parts
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning; the value returned by train() is shown as the cell output
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,2.1111
20,1.714
30,1.4224
40,1.3494
50,1.3325
60,1.3134
70,1.3083
80,1.2876
90,1.2878
100,1.3038


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=477, training_loss=1.2611491545191351, metrics={'train_runtime': 18981.2823, 'train_samples_per_second': 1.609, 'train_steps_per_second': 0.025, 'total_flos': 2.460599149532283e+17, 'train_loss': 1.2611491545191351, 'epoch': 2.9866457187745485})

In [None]:
# Make Google Drive available under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Write the model and the tokenizer into the same Drive folder
final_model_dir = "/content/drive/MyDrive/final_model"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('/content/drive/MyDrive/final_model/tokenizer_config.json',
 '/content/drive/MyDrive/final_model/special_tokens_map.json',
 '/content/drive/MyDrive/final_model/vocab.json',
 '/content/drive/MyDrive/final_model/merges.txt',
 '/content/drive/MyDrive/final_model/added_tokens.json',
 '/content/drive/MyDrive/final_model/tokenizer.json')

## Part 7 Test

In [None]:
# Mount Google Drive at /content/drive, the same mount point used when the
# model was saved, so the saved files resolve under /content/drive/MyDrive/.
# Mounting at '/content/drive/MyDrive' would nest the Drive one level deeper
# and no longer match the path the model was written to.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
