In [1]:

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Resolve the fine-tuned checkpoint directory to an absolute path. The path is
# relative, so the result depends on the notebook's current working directory.
model_name = os.path.abspath(
    "../../../../projects/sciences/computing/sheju347/MedicalQA/train/saved_models/fine_tuned_model_entire_UltraMedical_batch_4"
)
print(model_name)

# Load strictly from local files and never execute code shipped with the checkpoint.
load_kwargs = dict(local_files_only = True, trust_remote_code = False)
tokenizer = AutoTokenizer.from_pretrained(model_name, **load_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, **load_kwargs)

# Prefer the GPU when one is visible; otherwise stay on the CPU.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")
model.to(device)

print("cuda available:", cuda_ok)
print("device:", device)
print("done")


  from .autonotebook import tqdm as notebook_tqdm


/projects/sciences/computing/sheju347/MedicalQA/train/saved_models/fine_tuned_model_entire_UltraMedical_batch_4


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.27it/s]


cuda available: True
device: cuda
done


In [2]:
from llama_index.core import Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

def set_llm(temperature = 0.0000001):
    """Wrap the already-loaded model/tokenizer in a HuggingFaceLLM and install it as ``Settings.llm``.

    Args:
        temperature: Sampling temperature forwarded to ``generate``. The tiny
            default makes sampling effectively greedy. ``None`` or a value
            ``<= 0`` selects true greedy decoding (``do_sample=False``), because
            transformers rejects non-positive temperatures when sampling.

    Returns:
        The ``HuggingFaceLLM`` instance that was assigned to ``Settings.llm``.
    """
    if temperature is None or temperature <= 0:
        # Sampling with a non-positive temperature is invalid; decode greedily instead.
        generate_kwargs = {"do_sample": False}
    else:
        generate_kwargs = {"temperature": temperature, "do_sample": True}

    llm = HuggingFaceLLM(
        model=model,
        tokenizer=tokenizer,
        context_window=4096,  # Phi-3 Mini typically has a 4k context window (assumes a Phi-3 base model -- confirm)
        max_new_tokens=1024,   # Max tokens to generate per response
        generate_kwargs=generate_kwargs,
        # system_prompt="You are a helpful and friendly AI assistant.",
        # NOTE(review): this template is only applied on the chat/messages path;
        # confirm whether the query engine goes through it or through plain completion.
        messages_to_prompt=lambda messages: tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        ),
        # You might need to set `device_map` here as well if not set in AutoModelForCausalLM
        # device_map="cuda" if torch.cuda.is_available() else "cpu",
    )

    Settings.llm = llm
    return llm

# Install the default LLM configuration into llama_index's global Settings.
set_llm()

# Embedding model registered in the global Settings.
# Alternatives noted earlier: "sentence-transformers/all-MiniLM-L6-v2" and
# "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext".
EMBEDDING_MODEL_NAME = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)
Settings.embed_model = embed_model

print("done")

No sentence-transformers model found with name microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract. Creating a new one with mean pooling.


done


In [3]:
from llama_index.core import StorageContext, load_index_from_storage

# Reload the previously persisted index from disk instead of rebuilding it.
PERSIST_DIR = "/projects/sciences/computing/sheju347/RAG/pubmed_index_100k"
storage_context = StorageContext.from_defaults(persist_dir = PERSIST_DIR)
index = load_index_from_storage(storage_context)

print(index)


<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x14d104c12220>


In [6]:
from datasets import load_dataset
import torch
import re
import time
import logging

# Send INFO-and-above records, timestamped, to a log file that is appended to
# (never truncated) across runs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='notebook_log_test_fine_tuned.txt',
    filemode='a',
)

class TestPerformance():
    """Spot-check and score the retrieval-augmented pipeline on MedQA multiple-choice questions.

    The class relies on names defined elsewhere in the notebook: the global
    ``index`` (queried through ``index.as_query_engine()``), ``set_llm`` (used to
    configure the LLM before an accuracy run) and ``load_dataset``.
    """

    # A single capital letter wrapped in () or [], e.g. "(A)" or "[B]".
    _BRACKETED_LETTER = re.compile(r"[\(\[]([A-Z])[\)\]]")
    # "the answer is (A)" style statements: the letter must be a capital and must be
    # delimited by non-word characters, so "the answer is not ..." is not mistaken
    # for an answer letter.
    _EXPLICIT_ANSWER = re.compile(r"the answer is \W([A-Z])\W")

    # Prompt layouts are kept exactly as each method originally used them.
    _RESPONSE_PROMPT = "\n        {question} \n\n        {choices}\n        "
    _ACCURACY_PROMPT = "\n{question}\n{choices}\n"

    # Built-in spot-check item. Other items used before for manual checks, all from
    # GBaker/MedQA-USMLE-4-options: train index 0 (correct D), train index 1
    # (correct A), test index 0 (correct B). Pass them via ``examples=`` if needed.
    _DEFAULT_EXAMPLES = (
        {"question": "A mother brings her 3-week-old infant to the pediatrician's office because she is concerned about his feeding habits. He was born without complications and has not had any medical problems up until this time. However, for the past 4 days, he has been fussy, is regurgitating all of his feeds, and his vomit is yellow in color. On physical exam, the child's abdomen is minimally distended but no other abnormalities are appreciated. Which of the following embryologic errors could account for this presentation?",
        "choices": {
        "A": "Abnormal migration of ventral pancreatic bud",
        "B": "Complete failure of proximal duodenum to recanalize",
        "C": "Abnormal hypertrophy of the pylorus",
        "D": "Failure of lateral body folds to move ventrally and fuse in the midline"
        },
        "correct": "A",
        "source": "Train data index 2 (GBaker/MedQA-USMLE-4-options)"
        },
    )

    def __init__(self, model, tokenizer, device, temperature = 0.0000001):
        """Store the model handles and the temperature used for single-answer accuracy runs."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.temperature = temperature

    @staticmethod
    def _format_choices(choices):
        """Render an option mapping as one ``[key] : text`` line per option."""
        return "\n".join(f"[{key}] : {text}" for key, text in choices.items())

    @classmethod
    def _extract_answer(cls, response):
        """Return the answer letter found in ``response``, or ``"[invalid]"`` when none is found.

        An explicit "the answer is (X)" statement wins; otherwise the last
        bracketed capital letter in the text is used.
        """
        explicit = cls._EXPLICIT_ANSWER.search(response)
        if explicit:
            return explicit.group(1)
        bracketed = cls._BRACKETED_LETTER.findall(response)
        if bracketed:
            return bracketed[-1]
        return "[invalid]"

    def test_MedQA_response(self, examples = None):# Test apply_chat_template // https://huggingface.co/docs/transformers/main/en/chat_templating
        """Run a few MedQA items through the query engine and print the answer with its retrieved sources.

        Args:
            examples: Optional iterable of dicts with the keys ``question``,
                ``choices`` (letter -> text), ``correct`` and ``source``.
                Defaults to the built-in spot-check item.

        The LLM currently installed in ``Settings`` is used as-is; this method
        does not reconfigure it.
        """
        print(self.model.name_or_path)

        if examples is None:
            examples = self._DEFAULT_EXAMPLES

        # One engine is enough for all examples.
        query_engine = index.as_query_engine()

        for example in examples:
            model_prompt = self._RESPONSE_PROMPT.format(
                question = example["question"],
                choices = self._format_choices(example["choices"]),
            )

            response = query_engine.query(model_prompt)

            # Print the retrieved source nodes
            for i, node in enumerate(response.source_nodes, start = 1):
                print(f"--- Source {i} ---")
                print(node.text)
                print(f"Score: {node.score}")  # Optional: similarity score
                print()

            print("model output:", response.response)
            print("\nsource: ", example["source"])
            print("correct: ", example["correct"])
            print("\n\n")

    def test_MedQA_test_data_accuracy(self, is_ensemble = False, split = "test", data_range = None, ensemble_runs = 5):
        """Score the pipeline on GBaker/MedQA-USMLE-4-options and return the accuracy.

        Args:
            is_ensemble: When True, each question is asked ``ensemble_runs`` times
                at temperature 0.7 and the most frequent letter is taken (ties go
                to the letter seen first). When False, each question is asked once
                at the temperature given to the constructor.
            split: Dataset split to evaluate.
            data_range: Optional indices passed to ``Dataset.select`` to evaluate
                only part of the split; ``None`` evaluates everything.
            ensemble_runs: Number of samples per question in ensemble mode (>= 1).

        Returns:
            Fraction of questions answered correctly, or 0.0 when no question was evaluated.

        Raises:
            ValueError: If ensemble mode is requested with ``ensemble_runs < 1``.
        """
        if is_ensemble and ensemble_runs < 1:
            raise ValueError("ensemble_runs must be at least 1")

        # Voting needs diverse samples, hence the higher temperature; otherwise
        # honour the temperature this tester was constructed with.
        set_llm(temperature = 0.7 if is_ensemble else self.temperature)

        # Built once, after set_llm(), and reused for every question.
        query_engine = index.as_query_engine()

        def ask_for_letter(content):
            reply = query_engine.query(content).response
            # Keep only what follows the last assistant tag, if the tag is present.
            reply = reply.split("<|assistant|>")[-1]
            letter = self._extract_answer(reply)

            print(f"answer: {letter}")
            logging.info(f"answer: {letter}")

            return letter

        def majority_vote(content):
            votes = {}
            for _ in range(ensemble_runs):
                letter = ask_for_letter(content)
                votes[letter] = votes.get(letter, 0) + 1
            return max(votes, key = votes.get)

        # Load MedQA dataset
        med_qa = load_dataset("GBaker/MedQA-USMLE-4-options", trust_remote_code = True)

        print(f"model: {self.model.name_or_path}")
        logging.info(f"model: {self.model.name_or_path}")

        start_time = time.time()

        data_list = med_qa[split]
        if data_range is not None:
            data_list = data_list.select(data_range)
        total = len(data_list)

        count = 0
        correct_count = 0
        for data in data_list:
            model_prompt = self._ACCURACY_PROMPT.format(
                question = data["question"],
                choices = self._format_choices(data["options"]),
            )

            if is_ensemble:
                answer = majority_vote(model_prompt)
            else:
                answer = ask_for_letter(model_prompt)

            correct_answer = data["answer_idx"]
            is_correct = (answer == correct_answer)
            if is_correct:
                correct_count += 1
            count += 1

            print(f"question {count}/{total} answer:{answer} correct_answer:{correct_answer} {is_correct}")
            logging.info(f"question {count}/{total} answer:{answer} correct_answer:{correct_answer} {is_correct}")

        # An empty selection would otherwise divide by zero.
        accuracy = correct_count / count if count else 0.0
        print(f"Total questions: {count}, correct: {correct_count}, accuracy: {accuracy}")
        logging.info(f"Total questions: {count}, correct: {correct_count}, accuracy: {accuracy}")

        finish_time = time.time()
        elapse_time = finish_time - start_time
        print(f"elapse_time: {elapse_time}")
        logging.info(f"elapse_time: {elapse_time}")

        return accuracy

In [7]:
# Build the tester around the objects loaded in the earlier cells.
test = TestPerformance(model = model, tokenizer = tokenizer, device = device)

# Single-answer accuracy run (is_ensemble defaults to False).
# Alternatives: test.test_MedQA_response() for a printed spot check, or
# test.test_MedQA_test_data_accuracy(is_ensemble = True) for majority voting.
test.test_MedQA_test_data_accuracy()

model: /projects/sciences/computing/sheju347/MedicalQA/train/saved_models/fine_tuned_model_entire_UltraMedical_batch_4
--- Source 1 ---
The limits of informed consent. The patient, a 59-year-old man, was referred to a psychiatric hospital with what appeared initially to be the signs and symptoms of mental disorder. In hospital a lesion of the brain was diagnosed and surgery was proposed to relieve the condition. The patient, however, during this and subsequent admissions to hospital, refused operation. His refusal to consent was regarded as valid as he seemed to have good insight into his condition. Finally, under section 26 of the Mental Health Act, he was treated surgically. Unfortunately the patient died six weeks later of intracranial haemorrhage. Three comments are made on this case - two by psychiatrists, Dr K Davison and Dr Ashley Robin, the other by a professor of Christian ethics, Professor F C Blackie. Both psychiatrists argue that when a patient's mind is affected by mental 

KeyboardInterrupt: 