In [1]:
import os
import time
import pickle
import numpy as np
from tqdm import tqdm
import torch
from bert_score import score
from sklearn.metrics import f1_score
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          AutoTokenizer,
                          GenerationConfig,
                          pipeline)
from transformers.pipelines.pt_utils import KeyDataset


bin C:\ProgramData\Miniconda\envs\qlora\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll


In [2]:
dataset = load_from_disk('data/finetuning_llama')['test']


In [3]:
accum = {}
for i in dataset['answer']:
    accum[i] = accum.get(i, 0) + 1

accum


{'no': 50, 'yes': 50}

In [4]:
# model_path = "llms/vicuna-13b-v1.5"
model_path = "llms/Llama-2-13b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    load_in_8bit=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=GenerationConfig(
                max_new_tokens=100,
                do_sample=True,
                top_p=0.1,
                temperature=0.1,
    )
)

pipe.tokenizer.pad_token_id = model.config.eos_token_id


In [6]:
def generate_result(pipeline, dataset):
    pred_list = []
    with torch.inference_mode():
        for out in tqdm(pipe(KeyDataset(dataset, 'conversation'), batch_size=2), total=len(dataset)):
            pred_list += out
    clean_pred = [ans['generated_text'].replace(prompt, "").strip()
                  for ans, prompt in zip(pred_list, dataset['conversation'])]
    return clean_pred


def binary_result(pred_list, true_list):
    result = [true_ans if true_ans in pred_ans.lower()[:10]
              else ('no' if true_ans=='yes' else 'yes')
              for pred_ans, true_ans in zip(pred_list, true_list)]
    return result


In [7]:
pred_long_answer = generate_result(pipe, dataset)


100%|██████████| 100/100 [13:09<00:00,  7.90s/it]


In [8]:
with open('results/llama_base.pkl', 'wb') as f:
    pickle.dump(pred_long_answer, f)

# with open('pred_llama_original.pkl', 'rb') as f:
#     pred_long_answer = pickle.load(f)


In [9]:
true_binary = dataset['answer']
pred_binary = binary_result(pred_long_answer, true_binary)
true_long_answer = dataset['full_answer']


In [10]:
short_ans_score = f1_score(true_binary, pred_binary, pos_label="yes")
print(f"f1 score: {short_ans_score:.3f}")


f1 score: 0.753


In [11]:
Precision, Recall, F1 = score(
    pred_long_answer, true_long_answer, lang='en', 
    model_type='microsoft/deberta-xlarge-mnli',
    rescale_with_baseline=True)

print(f'Average BERTScore: {torch.mean(F1).item():.3f}')


Average BERTScore: 0.238


In [12]:
def print_pair(idx):
    print(f"True: {true_long_answer[idx]}\n\nPred: {pred_long_answer[idx]}\n\nBERTScore: {F1[idx].item():.3f}")


In [13]:
print_pair(0)


True: No. Preoperative pulmonary embolism is not associated with worse early mortality, recurrence or cancer specific survival in patients with renal cell carcinoma and tumor thrombus.

Pred: No. Preoperative pulmonary embolism does not predict poor postoperative outcomes in patients with renal cell carcinoma and venous thrombus.

Based on the study's findings, there was no significant difference in 90-day mortality, recurrence rate, or cancer-specific survival between patients with and without preoperative pulmonary embolism. Additionally, preoperative pulmonary embolism was not

BERTScore: 0.445


In [14]:
print_pair(1)


True: Yes. These results indicate that a higher rate of eating is positively and independently associated with circulating IL-1β concentrations in Japanese men not being treated for metabolic diseases.

Pred: $}}% Yes, a higher rate of eating is associated with higher circulating interleukin-1β concentrations in Japanese men not being treated for metabolic diseases.

Based on the provided context, the study found a significant positive association between the rate of eating and circulating IL-1β concentrations in Japanese men. The Spearman correlation coefficient was 0.250, indicating a strong positive correlation between the two variables. Additionally, the association

BERTScore: 0.378


In [15]:
print_pair(2)


True: No. Non-compliance to the guideline is more common in older patients and in patients with melanoma in the head and neck region. After adjusting for confounders, a significant effect of complying with the guidelines on overall survival could not be observed.

Pred: $}}% Yes, non-compliance with the re-excision guidelines for cutaneous melanoma in The Netherlands does influence survival. Based on the provided context, the study found that after adjusting for age, gender, subsite, and Breslow thickness, there was no significant difference in overall survival between the compliance group and the non-compliance group. This suggests that non-compliance with the re-excision guidelines may have a

BERTScore: 0.267


In [16]:
print_pair(3)

True: Yes. Partial inhibition of TGF-beta using alpha(v)beta6 integrin antibodies is effective in blocking murine pulmonary fibrosis without exacerbating inflammation. In addition, the elevated expression of alpha(v)beta6, an activator of the fibrogenic cytokine, TGF-beta, in human pulmonary fibrosis suggests that alpha(v)beta6 monoclonal antibodies could represent a promising new therapeutic strategy for treating pulmonary fibrosis.

Pred: Based on the provided context, the answer is: YES.

The study found that low doses of the monoclonal antibody that blocks alpha(v)beta6-mediated TGF-beta activation attenuated collagen expression without increasing alveolar inflammatory cell populations or macrophage activation markers. This suggests that partial inhibition of integrin alpha(v)beta6 can prevent pulmonary fibrosis without exacerb

BERTScore: 0.324


In [17]:
print_pair(4)

True: Yes. DDR appears feasible and acceptable to minority youth. DDR may increase moderate-vigorous physical activity and improve physical fitness in at-risk populations.

Pred: No, based on the provided context, there is no direct correlation between pilot study of an active screen time game and improved physical fitness in minority elementary school youth. The study found that the participants' physical fitness levels improved, but this was not due to the active screen time game itself, but rather due to the increased movement to music that the participants maintained during the study. The study found that participants averaged 1.12 hours/day of increased movement to music, which led

BERTScore: 0.091
