<a href="https://colab.research.google.com/github/stewart-lab/kmGPT/blob/fine-tuning/Unsloth_Lora_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install accelerate
!pip install peft
!pip install wandb
!pip install trl
!pip install bitsandbytes
!pip install scikit-learn
!pip install "unsloth[cu118-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install "unsloth[cu118-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install xformers

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m225.3/302.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [2]:
!pip install flash-attn



In [3]:
from accelerate import Accelerator
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig
import wandb
import transformers
import torch
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import random
from unsloth import FastLanguageModel
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from torch import nn
import sys
import gc
from transformers import AdamW
from accelerate import notebook_launcher
from sklearn.model_selection import train_test_split
from accelerate import DistributedDataParallelKwargs
import time
import re
from transformers import get_cosine_schedule_with_warmup
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments, BitsAndBytesConfig
import accelerate
import json
from peft import IA3Config, IA3Model, LoraConfig
import jinja2
import math
import bitsandbytes as bnb
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import math
from trl import setup_chat_format
from peft import prepare_model_for_kbit_training

# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [4]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
!wandb login 4a376fd0ab1c0901b9d9886d0734a88b4794a7fd

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
class config:
    # General Configuration
    device_type = "gpus"
    model = "unsloth/Phi-3-mini-4k-instruct"

    # Training Configuration
    max_seq_length = 2048
    trust = True

    # Prompt Parameters
    ab_hypothesis = "There exists an interaction between the disease {a_term} and the gene {b_term}."
    bc_hypothesis = "There exists an interaction between the drug {c_term} and the gene {b_term}."
    ac_hypothesis = "The drug {c_term} has an interaction with the disease {a_term}."

    instr = "Classify this abstract as either 0 (Not Relevant) or 1 (Relevant) for evaluating the provided hypothesis."

In [7]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config.model,
    max_seq_length = config.max_seq_length,
    load_in_4bit = True,
    trust_remote_code = config.trust,
    attn_implementation = 'flash_attention_2',
    device_map = "auto",
)

model = FastLanguageModel.get_peft_model(
    model,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    r = 32,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_rslora = True,
    loftq_config = None
)

Unsloth: You passed in `unsloth/Phi-3-mini-4k-instruct` and `load_in_4bit = True`.
We shall load `unsloth/Phi-3-mini-4k-instruct-bnb-4bit` for 4x faster loading.


config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [8]:
def train_ans_prompt(hyp, abstract, instr, label, cot) -> str:
	return f"Abstract: {abstract}\nHypothesis: {hyp}\nInstructions: {instr}\nScore: {label}\nExplanation: {cot}"

def test_ans_prompt(hyp, abstract, instr, label) -> str:
	return f"Abstract: {abstract}\nHypothesis: {hyp}\nInstructions: {instr}\nScore: {label}"

def eval_ans_prompt(hyp, abstract, instr) -> str:
	return f"Abstract: {abstract}\nHypothesis: {hyp}\nInstructions: {instr}\nScore: "

In [10]:
train = pd.read_csv("filtered_synthetic_train.tsv", sep="\t")
test = pd.read_csv("test.tsv", sep="\t")

In [11]:
def processRowTrainText(row, prompt_fn):
    if pd.isnull(row["a_term"]):
        hypothesis = config.bc_hypothesis.format(c_term=row["c_term"], b_term=row["b_term"])
    elif pd.isnull(row["b_term"]):
        hypothesis = config.ac_hypothesis.format(c_term=row["c_term"], a_term=row["a_term"])
    elif pd.isnull(row["c_term"]):
        hypothesis = config.ab_hypothesis.format(a_term=row["a_term"], b_term=row["b_term"])
    return prompt_fn(hypothesis, row["abstract"], config.instr, int(row["label"]), row["cot"])

In [12]:
def processRowTestText(row, prompt_fn):
    if pd.isnull(row["a_term"]):
        hypothesis = config.bc_hypothesis.format(c_term=row["c_term"], b_term=row["b_term"])
    elif pd.isnull(row["b_term"]):
        hypothesis = config.ac_hypothesis.format(c_term=row["c_term"], a_term=row["a_term"])
    elif pd.isnull(row["c_term"]):
        hypothesis = config.ab_hypothesis.format(a_term=row["a_term"], b_term=row["b_term"])
    return prompt_fn(hypothesis, row["abstract"], config.instr, int(row["label"]))

In [13]:
def processRowPrompt(row, prompt_fn):
    if pd.isnull(row["a_term"]):
        hypothesis = config.bc_hypothesis.format(c_term=row["c_term"], b_term=row["b_term"])
    elif pd.isnull(row["b_term"]):
        hypothesis = config.ac_hypothesis.format(c_term=row["c_term"], a_term=row["a_term"])
    elif pd.isnull(row["c_term"]):
        hypothesis = config.ab_hypothesis.format(a_term=row["a_term"], b_term=row["b_term"])
    return prompt_fn(hypothesis, row["abstract"], config.instr)

In [14]:
train["text"] = train.apply(lambda row: processRowTrainText(row, train_ans_prompt), axis=1)
train["prompt"] = train.apply(lambda row: processRowPrompt(row, eval_ans_prompt), axis=1)
train = Dataset.from_pandas(train)

In [15]:
test["text"] = test.apply(lambda row: processRowTestText(row, test_ans_prompt), axis=1)
test["prompt"] = test.apply(lambda row: processRowPrompt(row, eval_ans_prompt), axis=1)
test = Dataset.from_pandas(test)

In [16]:
print(train["text"][69])

Abstract: Pharmacogenetics offers the potential to improve health outcomes by identifying individuals who are at greater risk of harm from certain medicines. Routine adoption of pharmacogenetic tests requires evidence of their cost effectiveness. The present review aims to systematically review published economic evaluations of pharmacogenetic tests that aim to prevent or reduce the incidence of ADRs. We conducted a systematic literature review of economic evaluations of pharmacogenetic tests aimed to reduce the incidence of adverse drug reactions. Literature was searched using Embase, MEDLINE and the NHS Economic Evaluation Database with search terms relating to pharmacogenetic testing, adverse drug reactions, economic evaluations and pharmaceuticals. Titles were screened independently by two reviewers. Articles deemed to meet the inclusion criteria were screened independently on abstract, and full texts reviewed. We identified 852 articles, of which 47 met the inclusion criteria. The

In [17]:
print(train["prompt"][69])

Abstract: Pharmacogenetics offers the potential to improve health outcomes by identifying individuals who are at greater risk of harm from certain medicines. Routine adoption of pharmacogenetic tests requires evidence of their cost effectiveness. The present review aims to systematically review published economic evaluations of pharmacogenetic tests that aim to prevent or reduce the incidence of ADRs. We conducted a systematic literature review of economic evaluations of pharmacogenetic tests aimed to reduce the incidence of adverse drug reactions. Literature was searched using Embase, MEDLINE and the NHS Economic Evaluation Database with search terms relating to pharmacogenetic testing, adverse drug reactions, economic evaluations and pharmaceuticals. Titles were screened independently by two reviewers. Articles deemed to meet the inclusion criteria were screened independently on abstract, and full texts reviewed. We identified 852 articles, of which 47 met the inclusion criteria. The

In [18]:
print(test["text"][69])

Abstract: The abnormal expression of SEC61G plays an important role in the development of various tumors. This study explored the effects of SEC61G on MAPK signaling pathway and proliferation of cervical cancer (CC) cells. shRNA was used to inhibit the expression of SEC61G and EdU to observe its effect on the proliferation of CC cell SiHa. The effect of SEC61G on invasion was evaluated by Transwell assay. TCGA database was used to analyze the influence of high or low SEC61G expression level on the overall survival of CC patients. Western blot was used to detect the expressions of SEC61G, p-RAF1, Raf1, p-MEK1/2, MEK1/2, and p-ERK1/2 in cells. SiHa cells overexpressing SEC61G (SiHa-SEC61G) and control group (SiHa-mock) were subcutaneously implanted in nude mice. The tumor growth curve was measured at the specified time points between SiHa-SEC61G and SiHa-mock. The inhibitory effect of gefitinib on SEC61G was further evaluated. In patients with CC, high SEC61G expression predicted poor pr

In [19]:
print(test["prompt"][69])

Abstract: The abnormal expression of SEC61G plays an important role in the development of various tumors. This study explored the effects of SEC61G on MAPK signaling pathway and proliferation of cervical cancer (CC) cells. shRNA was used to inhibit the expression of SEC61G and EdU to observe its effect on the proliferation of CC cell SiHa. The effect of SEC61G on invasion was evaluated by Transwell assay. TCGA database was used to analyze the influence of high or low SEC61G expression level on the overall survival of CC patients. Western blot was used to detect the expressions of SEC61G, p-RAF1, Raf1, p-MEK1/2, MEK1/2, and p-ERK1/2 in cells. SiHa cells overexpressing SEC61G (SiHa-SEC61G) and control group (SiHa-mock) were subcutaneously implanted in nude mice. The tumor growth curve was measured at the specified time points between SiHa-SEC61G and SiHa-mock. The inhibitory effect of gefitinib on SEC61G was further evaluated. In patients with CC, high SEC61G expression predicted poor pr

# Training

In [20]:
wandb.init(project="kmGPT", entity = "morgridge", group = "Fine Tuning", name = "Unslothed RSLora 32 & Neftune & (Filtered Labels + CoT) & Phi-3", reinit=True)

[34m[1mwandb[0m: Currently logged in as: [33mleoxu27[0m ([33mmorgridge[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
from transformers.integrations import WandbCallback
class LLMSampleCB(WandbCallback):
    def __init__(self, trainer, test_dataset):
        super().__init__()
        self.test = test_dataset
        self.y = torch.tensor(self.test["label"])
        self.model, self.tokenizer = trainer.model, trainer.tokenizer

    def get_metrics(self):
        FastLanguageModel.for_inference(trainer.model)
        y_hat = []
        for i in tqdm(range(len(test["prompt"]))):
            prompt = test["prompt"][i]
            prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
            out = trainer.model.generate(prompt_ids.cuda(), max_new_tokens = 1)[-1]
            response = tokenizer.decode(out)
            try:
                score = int(response[-1])
            except:
                score = 1 - self.y[i]
            y_hat.append(score)

        y_hat = torch.tensor(y_hat)

        acc = accuracy_score(self.y, y_hat)
        prec = precision_score(self.y, y_hat, average='weighted')
        recall = recall_score(self.y, y_hat, average='weighted')
        f1 = f1_score(self.y, y_hat, average='weighted')

        return acc, prec, recall, f1

    def on_evaluate(self, args, state, control,  **kwargs):
        super().on_evaluate(args, state, control, **kwargs)
        acc, prec, recall, f1 = self.get_metrics()
        self._wandb.log({"Running Validation Accuracy": acc})
        self._wandb.log({"Running Validation Precision": prec})
        self._wandb.log({"Running Validation Recall": recall})
        self._wandb.log({"Running Validation F1": f1})
        epoch = math.ceil(trainer.state.epoch)

        print(f"Epoch {epoch}:\n\tAccuracy: {acc:.3f}\n\tPrecision: {prec:.3f}\n\tRecall: {recall:.3f}\n\tF-1 Score: {f1:.3f}")


In [22]:
training_args = TrainingArguments(
    output_dir = "checkpoints",
    report_to = "wandb",
    learning_rate = 2e-4,
    warmup_ratio = 0.03,
    lr_scheduler_type = "cosine",
    num_train_epochs = 5,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 8,
    bf16 = True,
    optim = "paged_adamw_32bit",
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    logging_steps = 1,
    do_eval=True,
    load_best_model_at_end = True,
    save_total_limit = 2,
    neftune_noise_alpha = 5,
    weight_decay = 0.01,
)

In [23]:
trainer = SFTTrainer(
    args = training_args,
    model=model,
    # peft_config=peft_config,
    # data_collator=DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer = tokenizer),
    packing = True,
    train_dataset=train,
    eval_dataset=test,
    dataset_text_field="text",
    tokenizer=tokenizer,
)



Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [24]:
wandb_callback = LLMSampleCB(trainer, test)
trainer.add_callback(wandb_callback)

In [None]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 160 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 8
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 59,768,832


Epoch,Training Loss,Validation Loss


In [None]:
FastLanguageModel.for_inference(model)
prompt = test["prompt"][0]
prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
out = model.generate(prompt_ids.cuda(), max_new_tokens = 100)
response = tokenizer.decode(out[0])

In [None]:
print(response)

In [None]:
with torch.inference_mode():
    with torch.cuda.amp.autocast():
        y_hat = []
        for i in tqdm(range(len(test["prompt"]))):
            prompt = test["prompt"][i]
            prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
            out = trainer.model.generate(prompt_ids.cuda(), max_new_tokens = 1)[-1]
            response = tokenizer.decode(out)
            score = int(response[-1])
            y_hat.append(score)
            # print(score)

In [None]:
y = torch.tensor(test["label"])
y_hat = torch.tensor(y_hat)

In [None]:
wandb.log({"Validation Accuracy": accuracy_score(y, y_hat)})
wandb.log({"Validation Precision": precision_score(y, y_hat, average='weighted')})
wandb.log({"Validation Recall": recall_score(y, y_hat, average='weighted')})
wandb.log({"Validation F1-Score": f1_score(y, y_hat, average='weighted')})

In [None]:
accuracy_score(y, y_hat)

In [None]:
precision_score(y_hat, y)

In [None]:
recall_score(y_hat, y)

In [None]:
f1_score(y_hat, y)

In [None]:
wandb.log({f"Confusion Matrix": wandb.plot.confusion_matrix(y_true=y.tolist(), preds=y_hat.tolist(), class_names=["Irrelevant", "Relevant"], title = "Relevance Confusion Matrix")})

In [None]:
wandb.finish()