# 1) Setup

In [None]:
!pip install accelerate
!pip install peft
!pip install wandb
!pip install trl
!pip install bitsandbytes
!pip install scikit-learn
!pip install "unsloth[cu118-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install "unsloth[cu118-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
!pip install xformers

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl.metadata (18 kB)
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting safetensors>=0.3.1 (from accelerate)
  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting fsspec (from torch>=1.10.0->accelerate)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting tqdm>=4.42.1 (from huggingface-hub->accelerate)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K

In [None]:
!pip install flash-attn

In [None]:
from accelerate import Accelerator
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig
import wandb
import transformers
import torch
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import random
from unsloth import FastLanguageModel
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from torch import nn
import sys
import gc
from transformers import AdamW
from accelerate import notebook_launcher
from sklearn.model_selection import train_test_split
from accelerate import DistributedDataParallelKwargs
import time
import re
from transformers import get_cosine_schedule_with_warmup
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments, BitsAndBytesConfig
import accelerate
import json
from peft import IA3Config, IA3Model, LoraConfig
import jinja2
import math
import bitsandbytes as bnb
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import math
from trl import setup_chat_format
from peft import prepare_model_for_kbit_training

# From this Gist: https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [None]:
!huggingface-cli login --token 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

In [None]:
!wandb login 4a376fd0ab1c0901b9d9886d0734a88b4794a7fd

In [None]:
class config:
    # General Configuration
    device_type = "gpus"
    model = "unsloth/Phi-3-mini-4k-instruct"

    # Training Configuration
    max_seq_length = 2048
    trust = True

    # Porpoise One (Relevance Filtering Parameters)
    ab_hypothesis = "There exists an interaction between the disease {a_term} and the gene {b_term}."
    bc_hypothesis = "There exists an interaction between the drug {c_term} and the gene {b_term}."
    ac_hypothesis = "The drug {c_term} has an interaction with the disease {a_term}."

    rel_instr = "Classify this abstract as either 0 (Not Relevant) or 1 (Relevant) for evaluating the provided hypothesis."

    # Porpoise Two (Supports parameters)
    sup_instr = "Explain why (or why not) this biomedical abstract supports the provided statement. Give a score of 1 for supports and a score of 0 for does not support."

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = config.model,
    max_seq_length = config.max_seq_length,
    load_in_4bit = True,
    trust_remote_code = config.trust,
    attn_implementation = 'flash_attention_2',
    device_map = "auto",
)

model = FastLanguageModel.get_peft_model(
    model,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    r = 16,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_rslora = True,
    loftq_config = None
)

# 2) Data Preparation

### Relevance data prep

In [None]:
def train_ans_prompt(hyp, abstract, instr, label, cot) -> str:
	return f"Abstract: {abstract}\nHypothesis: {hyp}\nInstructions: {config.rel_instr}\nScore: {label}\nExplanation: {cot}"

def test_ans_prompt(hyp, abstract, instr, label) -> str:
	return f"Abstract: {abstract}\nHypothesis: {hyp}\nInstructions: {config.rel_instr}\nScore: {label}"

def eval_ans_prompt(hyp, abstract, instr) -> str:
	return f"Abstract: {abstract}\nHypothesis: {hyp}\nInstructions: {config.rel_instr}\nScore: "

In [15]:
train_rel = pd.read_csv("./data/Porpoise_1/same_dist_train.tsv", sep="\t")
test_rel = pd.read_csv("./data/Porpoise_1/same_dist_test.tsv", sep="\t")

In [16]:
def processRowTrainText(row, prompt_fn):
    if pd.isnull(row["a_term"]):
        hypothesis = config.bc_hypothesis.format(c_term=row["c_term"], b_term=row["b_term"])
    elif pd.isnull(row["b_term"]):
        hypothesis = config.ac_hypothesis.format(c_term=row["c_term"], a_term=row["a_term"])
    elif pd.isnull(row["c_term"]):
        hypothesis = config.ab_hypothesis.format(a_term=row["a_term"], b_term=row["b_term"])
    return prompt_fn(hypothesis, row["abstract"], config.rel_instr, int(row["label"]), row["cot"])

In [17]:
def processRowTestText(row, prompt_fn):
    if pd.isnull(row["a_term"]):
        hypothesis = config.bc_hypothesis.format(c_term=row["c_term"], b_term=row["b_term"])
    elif pd.isnull(row["b_term"]):
        hypothesis = config.ac_hypothesis.format(c_term=row["c_term"], a_term=row["a_term"])
    elif pd.isnull(row["c_term"]):
        hypothesis = config.ab_hypothesis.format(a_term=row["a_term"], b_term=row["b_term"])
    return prompt_fn(hypothesis, row["abstract"], config.rel_instr, int(row["label"]))

In [18]:
def processRowPrompt(row, prompt_fn):
    if pd.isnull(row["a_term"]):
        hypothesis = config.bc_hypothesis.format(c_term=row["c_term"], b_term=row["b_term"])
    elif pd.isnull(row["b_term"]):
        hypothesis = config.ac_hypothesis.format(c_term=row["c_term"], a_term=row["a_term"])
    elif pd.isnull(row["c_term"]):
        hypothesis = config.ab_hypothesis.format(a_term=row["a_term"], b_term=row["b_term"])
    return prompt_fn(hypothesis, row["abstract"], config.rel_instr)

In [19]:
train_rel["text"] = train_rel.apply(lambda row: processRowTrainText(row, train_ans_prompt), axis=1)
train_rel["prompt"] = train_rel.apply(lambda row: processRowPrompt(row, eval_ans_prompt), axis=1)

In [20]:
test_rel["text"] = test_rel.apply(lambda row: processRowTestText(row, test_ans_prompt), axis=1)
test_rel["prompt"] = test_rel.apply(lambda row: processRowPrompt(row, eval_ans_prompt), axis=1)

### Support data prep

In [21]:
def getText(row):
    return f'Abstract: {row["abstract"]}\nStatement: {row["statement"]}\nInstructions: {config.sup_instr}\nScore: {row["label"]}\nExplanation: {row["cot"]}'

def getPrompt(row):
    return f'Abstract: {row["abstract"]}\nStatement: {row["statement"]}\nInstructions: {config.sup_instr}\nScore: '

In [22]:
train_sup = pd.read_csv("./data/Porpoise_2/train.tsv", sep = "\t")
test_sup = pd.read_csv("./data/Porpoise_2/test.tsv", sep = "\t")

In [23]:
train_sup["text"] = train_sup.apply(lambda row: getText(row), axis = 1)
train_sup["prompt"] = train_sup.apply(lambda row: getPrompt(row), axis = 1)

In [24]:
test_sup["text"] = test_sup.apply(lambda row: getText(row), axis = 1)
test_sup["prompt"] = test_sup.apply(lambda row: getPrompt(row), axis = 1)

### Merging Data together

In [25]:
train_text = pd.concat([train_sup["text"], train_rel["text"]], ignore_index = True)
train_prompts = pd.concat([train_sup["prompt"], train_rel["prompt"]], ignore_index = True)

test_text = pd.concat([test_sup["text"], test_rel["text"]], ignore_index = True)
test_prompts = pd.concat([test_sup["prompt"], test_rel["prompt"]], ignore_index = True)

In [26]:
train = pd.DataFrame({"text": train_text, "prompt": train_prompts})
test = pd.DataFrame({"text": test_text, "prompt": test_prompts})

In [27]:
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

In [28]:
print(len(train))

140


In [29]:
print(len(test))

112


# 3. Training

In [65]:
wandb.init(project="kmGPT", entity = "morgridge", group = "Porpoise 2.0", name = "Debugging", reinit=True)

In [31]:
from transformers.integrations import WandbCallback
class LLMSampleCB(WandbCallback):
    def __init__(self, trainer, test_sup, test_rel):
        super().__init__()
        self.test_rel = test_rel
        self.test_sup = test_sup
        
        self.y_sup = torch.tensor(self.test_sup["label"])
        self.y_rel = torch.tensor(self.test_rel["label"])
        
        self.model, self.tokenizer = trainer.model, trainer.tokenizer

    def get_metrics(self, test_set, labels):
        FastLanguageModel.for_inference(trainer.model)
        y_hat = []
        for i in tqdm(range(len(test_set["prompt"]))):
            prompt = test_set["prompt"][i]
            prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
            out = trainer.model.generate(prompt_ids.cuda(), max_new_tokens = 1)[-1]
            response = tokenizer.decode(out)
            try:
                score = int(response[-1])
            except:
                score = 1 - labels[i]
            y_hat.append(score)

        y_hat = torch.tensor(y_hat)

        acc = accuracy_score(labels, y_hat)
        prec = precision_score(labels, y_hat, average='weighted')
        recall = recall_score(labels, y_hat, average='weighted')
        f1 = f1_score(labels, y_hat, average='weighted')

        return acc, prec, recall, f1

    def log(self, acc, prec, recall, f1, title):
        epoch = math.ceil(trainer.state.epoch)
        self._wandb.log({f"{title} Running Validation Accuracy": acc})
        self._wandb.log({f"{title} Running Validation Precision": prec})
        self._wandb.log({f"{title} Running Validation Recall": recall})
        self._wandb.log({f"{title} Running Validation F1": f1})
        print(f"*********** {title} RESULTS ***********")
        print(f"Epoch {epoch}:\n\tAccuracy: {acc:.3f}\n\tPrecision: {prec:.3f}\n\tRecall: {recall:.3f}\n\tF-1 Score: {f1:.3f}")

    def avg(self, num1, num2):
        return (num1 + num2) / 2.0
        
    def on_evaluate(self, args, state, control,  **kwargs):
        super().on_evaluate(args, state, control, **kwargs)
        acc_rel, prec_rel, recall_rel, f1_rel = self.get_metrics(self.test_rel, self.y_rel)
        acc_sup, prec_sup, recall_sup, f1_sup = self.get_metrics(self.test_sup, self.y_sup)

        self.log(acc_rel, prec_rel, recall_rel, f1_rel, "Relevance")
        self.log(acc_sup, prec_sup, recall_sup, f1_sup, "Support")

        acc_avg = self.avg(acc_rel, acc_sup)
        prec_avg = self.avg(prec_rel, prec_sup)
        recall_avg = self.avg(recall_rel, recall_sup)
        f1_avg = self.avg(f1_rel, f1_sup)

        self.log(acc_avg, prec_avg, recall_avg, f1_avg, "Average")

In [32]:
training_args = TrainingArguments(
    output_dir = "checkpoints",
    report_to = "wandb",
    learning_rate = 2e-4,
    warmup_ratio = 0.03,
    lr_scheduler_type = "cosine",
    num_train_epochs = 1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    bf16 = True,
    optim = "paged_adamw_8bit",
    evaluation_strategy="epoch",
    save_strategy = "epoch",
    logging_steps = 1,
    do_eval=True,
    neftune_noise_alpha = 5,
    weight_decay = 0.1,
)



In [33]:
trainer = SFTTrainer(
    args = training_args,
    model=model,
    # peft_config=peft_config,
    # data_collator=DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer = tokenizer),
    packing = True,
    train_dataset=train,
    eval_dataset=test,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=2048,
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [34]:
wandb_callback = LLMSampleCB(trainer, test_sup, test_rel)
trainer.add_callback(wandb_callback)

In [35]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 75 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 4
 "-____-"     Number of trainable parameters = 29,884,416


Epoch,Training Loss,Validation Loss
0,1.1908,1.226476


  0%|          | 0/72 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 72/72 [00:07<00:00,  9.79it/s]
100%|██████████| 40/40 [00:04<00:00,  8.94it/s]


*********** Relevance RESULTS ***********
Epoch 1:
	Accuracy: 0.750
	Precision: 0.741
	Recall: 0.750
	F-1 Score: 0.739
*********** Support RESULTS ***********
Epoch 1:
	Accuracy: 0.900
	Precision: 0.900
	Recall: 0.900
	F-1 Score: 0.900
*********** Average RESULTS ***********
Epoch 1:
	Accuracy: 0.825
	Precision: 0.820
	Recall: 0.825
	F-1 Score: 0.820


TrainOutput(global_step=4, training_loss=1.232761263847351, metrics={'train_runtime': 72.5252, 'train_samples_per_second': 1.034, 'train_steps_per_second': 0.055, 'total_flos': 2951057265131520.0, 'train_loss': 1.232761263847351, 'epoch': 0.8421052631578947})

# 4) Post Processing and Eval

In [50]:
def gather_predictions(test_set):
    with torch.inference_mode():
        with torch.cuda.amp.autocast():
            y_hat = []
            cots = []
            num_wrong = 0
            for i in tqdm(range(len(test_set["prompt"]))):
                prompt = test_set["prompt"][i]
                prompt_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
                out = trainer.model.generate(prompt_ids.cuda(), max_new_tokens = 1)
                response = tokenizer.decode(out[0])
                score = int(response[-1])
                cot = "Correct! So no explanation was given."
    
                if score != test_set["label"][i]:
                    rationale = trainer.model.generate(prompt_ids.cuda(), max_new_tokens = 10)
                    rationale = tokenizer.decode(rationale[0])
                    prompt, ans = rationale.split("Score: ")
                    cot = ans[1:]
                    num_wrong += 1
                    print("wrong")
                
                y_hat.append(score)
                cots.append(cot)
    return y_hat, cots

In [62]:
def pred_table(test_set, y_hat, y, cots, title):
    data = list(zip(test_set["prompt"], y_hat, y, cots))
    test_table = wandb.Table(columns = ["prompt", "y_hat", "y", "rationale"], data = data)
    wandb.log({f"{title}": test_table})

def conf_mat(y_hat, y, title, class_names):
    wandb.log({f"{title}": wandb.plot.confusion_matrix(y_true=y.tolist(), preds=y_hat.tolist(), class_names=class_names, title = title)})

def avg_scores(y_hat_rel, y_hat_sup, y_rel, y_sup):

    rel_acc = accuracy_score(y_rel, y_hat_rel)
    rel_prec = precision_score(y_rel, y_hat_rel, average='weighted')
    rel_recall = recall_score(y_rel, y_hat_rel, average='weighted')
    rel_f1 = f1_score(y_rel, y_hat_rel, average='weighted')

    sup_acc = accuracy_score(y_sup, y_hat_sup)
    sup_prec = precision_score(y_sup, y_hat_sup, average='weighted')
    sup_recall = recall_score(y_sup, y_hat_sup, average='weighted')
    sup_f1 = f1_score(y_sup, y_hat_sup, average='weighted')

    avg_acc = (rel_acc + sup_acc) / 2.0
    avg_prec = (rel_prec + sup_prec) / 2.0
    avg_recall = (rel_recall + sup_recall) / 2.0
    avg_f1 = (rel_f1 + sup_f1) / 2.0
    
    print(f"Average Validation Accuracy: {avg_acc}")
    print(f"Average Validation Precision: {avg_prec}")
    print(f"Average Validation Recall: {avg_recall}")
    print(f"Average Validation F1-Score: {avg_f1}")

In [44]:
y_hat_rel, cots_rel = gather_predictions(test_rel)
y_hat_sup, cots_sup = gather_predictions(test_sup)

y_rel, y_hat_rel = torch.tensor(test_rel["label"]), torch.tensor(y_hat_rel)
y_sup, y_hat_sup = torch.tensor(test_sup["label"]), torch.tensor(y_hat_sup)

  3%|▎         | 2/72 [00:00<00:23,  2.92it/s]

wrong


  8%|▊         | 6/72 [00:01<00:18,  3.64it/s]

wrong


 17%|█▋        | 12/72 [00:03<00:19,  3.10it/s]

wrong


 19%|█▉        | 14/72 [00:03<00:19,  3.00it/s]

wrong


 22%|██▏       | 16/72 [00:04<00:18,  2.96it/s]

wrong


 25%|██▌       | 18/72 [00:05<00:21,  2.57it/s]

wrong


 28%|██▊       | 20/72 [00:06<00:19,  2.73it/s]

wrong


 43%|████▎     | 31/72 [00:08<00:10,  3.95it/s]

wrong


 49%|████▊     | 35/72 [00:09<00:11,  3.22it/s]

wrong


 50%|█████     | 36/72 [00:09<00:15,  2.38it/s]

wrong


 53%|█████▎    | 38/72 [00:10<00:12,  2.62it/s]

wrong


 58%|█████▊    | 42/72 [00:11<00:08,  3.41it/s]

wrong


 61%|██████    | 44/72 [00:12<00:08,  3.15it/s]

wrong


 76%|███████▋  | 55/72 [00:14<00:04,  3.55it/s]

wrong


 79%|███████▉  | 57/72 [00:14<00:04,  3.18it/s]

wrong


 92%|█████████▏| 66/72 [00:16<00:01,  3.41it/s]

wrong


 94%|█████████▍| 68/72 [00:17<00:01,  3.05it/s]

wrong


 99%|█████████▊| 71/72 [00:18<00:00,  3.31it/s]

wrong


100%|██████████| 72/72 [00:18<00:00,  3.94it/s]
  8%|▊         | 3/40 [00:00<00:10,  3.54it/s]

wrong


 38%|███▊      | 15/40 [00:02<00:06,  4.00it/s]

wrong


 52%|█████▎    | 21/40 [00:04<00:04,  4.11it/s]

wrong


 90%|█████████ | 36/40 [00:06<00:00,  4.38it/s]

wrong


100%|██████████| 40/40 [00:06<00:00,  6.02it/s]


In [66]:
pred_table(test_rel, y_hat_rel, y_rel, cots_rel, "Relevance Predictions")
pred_table(test_sup, y_hat_sup, y_sup, cots_sup, "Support Predictions")

In [56]:
conf_mat(y_hat_rel, y_rel, "Relevance Confusion Matrix", ["Irrelevant", "Relevant"])
conf_mat(y_hat_sup, y_sup, "Support Confusion Matrix", ["Unsupportive", "Supportive"])

In [57]:
avg_scores(y_hat_rel, y_hat_sup, y_rel, y_sup)

Average Validation Accuracy: 0.825
Average Validation Precision: 0.8203703703703704
Average Validation Recall: 0.825
Average Validation F1-Score: 0.8197478991596638


In [64]:
wandb.finish()

VBox(children=(Label(value='0.745 MB of 0.751 MB uploaded (0.004 MB deduped)\r'), FloatProgress(value=0.992583…

In [None]:
# model.save_pretrained_merged("Porpoise2.0", tokenizer, save_method = "merged_16bit")
# model.push_to_hub_merged("hf/porpoise2.0", tokenizer, save_method = "merged_16bit")