<a target="_blank" href="https://colab.research.google.com/github/segmed/gemma_mistral/blob/main/notebooks/mistral_sft.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

inspired by:
*   https://adithyask.medium.com/a-beginners-guide-to-fine-tuning-gemma-0444d46d821c
*   https://www.kaggle.com/code/lucamassaron/fine-tune-gemma-7b-it-for-sentiment-analysis

In [1]:
!nvidia-smi

Sat Feb 24 23:35:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install -qq "transformers==4.38.0" --upgrade
!pip -qq install bitsandbytes accelerate datasets peft trl wandb
!pip -qq install "torch>=2.1.1"
# !pip -qq install flash-attn --no-build-isolation

In [3]:
# if you are using google colab

import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
if os.environ["HF_TOKEN"] == None:
    from huggingface_hub import notebook_login
    notebook_login()

In [4]:
# if you are using google colab

import os
from google.colab import userdata
os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')
if os.environ["WANDB_API_KEY"] == None:
    !wandb.login()

# import wandb
# wandb.init(project='gemma_sft')

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto",
                                            #  torch_dtype=torch.float16,
                                            #  attn_implementation="flash_attention_2", # could not get working
                                             quantization_config=quantization_config)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
from datasets import load_dataset
dataset = load_dataset('medmcqa', split='train').shuffle(seed=42).train_test_split(test_size=0.01)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 180993
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 1829
    })
})

In [7]:
import numpy as np

choice_map = np.array(["(a)", "(b)", "(c)", "(d)"])

def get_content(row):
    question = row['question']
    cop = row['cop']
    opa = row['opa']
    opb = row['opb']
    opc = row['opc']
    opd = row['opd']
    exp = row['exp']
    topic_name = row['topic_name']
    optiona = f"(a) {opa}"
    optionb = f"(b) {opb}"
    optionc = f"(c) {opc}"
    optiond = f"(d) {opd}"
    exp = row['exp']
    answer = choice_map[cop]

    system_prompt_choice = """Analyze the question and determine which choice is the correct one and return the answer as one or more of the following "(a)", "(b)", "(c)", "(d)" followed by an explanation:"""
    return f"{system_prompt_choice}\n{question}\n{optiona} {optionb} {optionc} {optiond}", answer, exp

def get_test_sample(row):
    content, answer, exp = get_content(row)
    chat = [
        { "role": "user", "content": content },
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True, add_special_tokens=False)
    return prompt

def get_test_sample_answer(row):
    content, answer, exp = get_content(row)
    return answer, exp

def get_training_sample(row):
    content, answer, exp = get_content(row)
    chat = [
        { "role": "user", "content": content },
        { "role": "assistant", "content": f"{answer}\n{exp}" },
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False, add_special_tokens=False)
    return prompt

In [8]:
import pandas as pd
df_train = pd.DataFrame(dataset['train'])
df_test = pd.DataFrame(dataset['test'])

In [9]:
df_train['text'] = df_train.apply(get_training_sample, axis=1)
df_test['text'] = df_test.apply(get_test_sample, axis=1)

In [10]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(pd.DataFrame(df_train, columns=['text']))
test_dataset = Dataset.from_pandas(pd.DataFrame(df_test, columns=['text', 'exp', 'cop', 'topic_name']))

In [11]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [12]:
import bitsandbytes as bnb
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names: # needed for 16-bit
            lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [13]:
modules = find_all_linear_names(model)
print(modules)

['k_proj', 'gate_proj', 'o_proj', 'v_proj', 'down_proj', 'q_proj', 'up_proj']


In [14]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [15]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 167772160 | total: 7409504256 | Percentage: 2.2643%


In [16]:
import transformers
import torch

from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()
output_dir = "outputs_100_steps"
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="text",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        # max_steps=15,
        # per_device_train_batch_size=1,
        # gradient_accumulation_steps=1,
        # warmup_steps=0.03,
        # learning_rate=2e-4,
        # weight_decay=0.001,
        # bf16=True,
        # max_grad_norm=0.3,
        # logging_steps=5,
        # output_dir=output_dir,
        # optim="paged_adamw_8bit",
        # save_strategy="epoch",
        # report_to="wandb",
        # evaluation_strategy="steps", # Evaluate the model every logging step
        # eval_steps=5,               # Evaluate and save checkpoints every 100 steps
        # do_eval=True,

        # num_train_epochs=3,
        max_steps=2000,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        warmup_steps=0.03,
        learning_rate=2e-4,
        weight_decay=0.001,
        bf16=True,
        max_grad_norm=0.3,
        logging_steps=100,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        save_strategy="steps",
        report_to="wandb",
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=100,               # Evaluate and save checkpoints every 100 steps
        do_eval=True,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



Map:   0%|          | 0/180993 [00:00<?, ? examples/s]

Map:   0%|          | 0/1829 [00:00<?, ? examples/s]



In [17]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train(resume_from_checkpoint = False)

[34m[1mwandb[0m: Currently logged in as: [33mfoobar8675[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
100,1.3096,1.010862
200,1.1593,1.002138
300,1.1509,0.998863
400,1.1436,0.990574
500,1.1324,0.992441
600,1.1282,0.995001
700,1.1382,1.001463
800,1.1197,0.973816
900,1.1088,1.000481
1000,1.1092,0.988934




TrainOutput(global_step=2000, training_loss=1.1111650619506837, metrics={'train_runtime': 9909.0191, 'train_samples_per_second': 1.615, 'train_steps_per_second': 0.202, 'total_flos': 1.6557819082717594e+17, 'train_loss': 1.1111650619506837, 'epoch': 0.09})

In [18]:
new_model = "mistral-mcqa" #Name of the model you will be pushing to huggingface model hub

In [19]:
trainer.model.save_pretrained(new_model)

In [20]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [21]:
# Push the model and tokenizer to the Hugging Face Model Hub
merged_model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/matthewchung74/mistral-mcqa/commit/5d1cf6ab98684a99fe150039c329484bc5ab675f', commit_message='Upload tokenizer', commit_description='', oid='5d1cf6ab98684a99fe150039c329484bc5ab675f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from google.colab import runtime
runtime.unassign()