In [1]:
import os
import time
import pickle
import numpy as np
from tqdm import tqdm
import torch
from bert_score import score
from sklearn.metrics import f1_score
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model, PeftModel, PeftConfig
from transformers import (AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          AutoTokenizer,
                          GenerationConfig,
                          pipeline)
from transformers.pipelines.pt_utils import KeyDataset


bin C:\ProgramData\Miniconda\envs\qlora\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
ckpt = 60  # 20 40 60 80 100
lora_rank = 128


In [3]:
dataset = load_from_disk('data/finetuning_llama')['test']


In [4]:
accum = {}
for i in dataset['answer']:
    accum[i] = accum.get(i, 0) + 1

accum


{'no': 50, 'yes': 50}

In [5]:
# model_path = "llms/vicuna-13b-v1.5"
model_path = "llms/Llama-2-13b-chat-hf"
adapter_path = f"checkpoints/llama_{lora_rank}/checkpoint-{ckpt}"

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="right",
    use_fast=False,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True,
    load_in_8bit=True
)

model = PeftModel.from_pretrained(model, adapter_path)
tokenizer.pad_token_id = model.config.eos_token_id


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
def keep_answer(text, marker="[/INST] "):
    if marker in text:
        start_index = text.index(marker) + len(marker)
        return text[start_index:].strip()
    else:
        return ""


def llm_batch_generate(prompt, max_token=100, top_p=0.1, temperature=0.1, without_prompt=True):
    inputs = tokenizer.batch_encode_plus(prompt, return_tensors="pt", padding=True).to("cuda")
    with torch.inference_mode():
        outputs = model.generate(
                **inputs,
                generation_config=GenerationConfig(
                    do_sample=True,
                    max_new_tokens=max_token,
                    top_p=top_p,
                    temperature=temperature,
                )
        )

    text = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    if without_prompt:
        if type(text)==list:
            return [keep_answer(i) for i in text]
        else:
            return keep_answer(text)

    return text


In [5]:
def generate_result(dataset, batch_size=2):
    pred_list = []
    for i in tqdm(range(0, len(dataset), batch_size), desc="Generating answers"):
        end_idx = min(i + batch_size, len(dataset))
        batch = dataset[i:end_idx]
        pred_list += llm_batch_generate(batch['conversation'])
    return pred_list


def binary_result(pred_list, true_list):
    result = [true_ans if true_ans in pred_ans.lower()[:10]
              else ('no' if true_ans=='yes' else 'yes')
              for pred_ans, true_ans in zip(pred_list, true_list)]
    return result


In [8]:
pred_long_answer = generate_result(dataset)


Generating answers: 100%|██████████| 50/50 [13:54<00:00, 16.69s/it]


In [9]:
with open(f'results/pred_long_llama_{lora_rank}_ckpt_{ckpt}.pkl', 'wb') as f:
    pickle.dump(pred_long_answer, f)


In [7]:
with open(f'results/pred_long_llama_{lora_rank}_ckpt_{ckpt}.pkl', 'rb') as f:
    pred_long_answer = pickle.load(f)


In [8]:
true_binary = dataset['answer']
pred_binary = binary_result(pred_long_answer, true_binary)
true_long_answer = dataset['full_answer']


In [9]:
short_ans_score = f1_score(true_binary, pred_binary, pos_label="yes")
print(f"f1 score: {short_ans_score:.3f}")


f1 score: 0.899


In [10]:
Precision, Recall, F1 = score(
    pred_long_answer, true_long_answer, lang='en',
    model_type='microsoft/deberta-xlarge-mnli',
    rescale_with_baseline=True)

print(f'Average BERTScore: {torch.mean(F1).item():.3f}')


Average BERTScore: 0.437


In [11]:
def print_pair(idx):
    print(f"True: {true_long_answer[idx]}\n\nPred: {pred_long_answer[idx]}\n\nBERTScore: {F1[idx].item():.3f}")


In [12]:
print_pair(0)

True: No. Preoperative pulmonary embolism is not associated with worse early mortality, recurrence or cancer specific survival in patients with renal cell carcinoma and tumor thrombus.

Pred: No. Preoperative pulmonary embolism is not associated with poor postoperative outcomes in patients with renal cell carcinoma and venous thrombus.

BERTScore: 0.721


In [13]:
print_pair(1)

True: Yes. These results indicate that a higher rate of eating is positively and independently associated with circulating IL-1β concentrations in Japanese men not being treated for metabolic diseases.

Pred: $}}% Yes. Our findings suggest that a higher rate of eating is associated with higher circulating IL-1β concentrations in Japanese men not being treated for metabolic diseases.

BERTScore: 0.828


In [14]:
print_pair(2)

True: No. Non-compliance to the guideline is more common in older patients and in patients with melanoma in the head and neck region. After adjusting for confounders, a significant effect of complying with the guidelines on overall survival could not be observed.

Pred: $}}% No. The results of this study suggest that non-compliance with the guideline for re-excision of CMM does not have a significant impact on survival.

BERTScore: 0.305


In [18]:
print_pair(3)

True: Yes. Partial inhibition of TGF-beta using alpha(v)beta6 integrin antibodies is effective in blocking murine pulmonary fibrosis without exacerbating inflammation. In addition, the elevated expression of alpha(v)beta6, an activator of the fibrogenic cytokine, TGF-beta, in human pulmonary fibrosis suggests that alpha(v)beta6 monoclonal antibodies could represent a promising new therapeutic strategy for treating pulmonary fibrosis.

Pred: Yes. Inhibition of alpha(v)beta6-mediated TGF-beta activation may be a useful therapeutic strategy for pulmonary fibrosis, as it may be possible to inhibit TGF-beta at sites of alpha(v)beta6 up-regulation without affecting other homeostatic roles of TGF-beta.

BERTScore: 0.352


In [19]:
print_pair(4)

True: Yes. DDR appears feasible and acceptable to minority youth. DDR may increase moderate-vigorous physical activity and improve physical fitness in at-risk populations.

Pred: Yes. This study suggests that DDR may be a feasible and acceptable way to increase physical fitness in minority elementary school youth.

BERTScore: 0.570


In [20]:
# r128@020 step: f1 - 0.333; BERTScore - 0.392
# r128@040 step: f1 - 0.881; BERTScore - 0.411
# r128@060 step: f1 - 0.899; BERTScore - 0.437
# r128@080 step: f1 - 0.918; BERTScore - 0.417
# r128@100 step: f1 - 0.922; BERTScore - 0.350

# r064@020 step: f1 - 0.462; BERTScore - 0.402
# r064@040 step: f1 - 0.862; BERTScore - 0.419
# r064@060 step: f1 - 0.923; BERTScore - 0.427
# r064@080 step: f1 - 0.896; BERTScore - 0.411
# r064@100 step: f1 - 0.931; BERTScore - 0.360

# base           f1 - 0.400; BERTScore - 0.209
