In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "KrithikV/MedMobile"

# Decide on the execution device once; the GPU is used whenever CUDA is visible.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

# Load tokenizer and weights without executing repository-supplied code.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=False)
model.to(device)

print("cuda available:", has_cuda)
print("device:", device)

ModuleNotFoundError: No module named 'transformers'

In [None]:
# Smoke test: send one hand-written multiple-choice question through the
# already-loaded `model` / `tokenizer` and time the generation.
import time

print ("FOR TEST")

print ("model: ", model.name_or_path)


prompt = """
Question: A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?
Choices:
A: Disclose the error to the patient but leave it out of the operative report
B: Disclose the error to the patient and put it in the operative report
C: Tell the attending that he cannot fail to disclose this mistake
D: Report the physician to the ethics committee
E: Refuse to dictate the operative report
Given five answer candidates, A, B, C, D, and E, choose the best answer choice.
The answer is:
"""

print ("Prompt: ", prompt)
print ("--------------")

start_time = time.perf_counter()

inputs = tokenizer(prompt, return_tensors = "pt", padding = False).to(device)
prompt_length = inputs["input_ids"].shape[-1]

with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly so generate() does not have to guess it.
        attention_mask = inputs["attention_mask"],
        max_new_tokens = 50,
        # pad_token_id = tokenizer.eos_token_id,
        use_cache = False,  # For "microsoft/Phi-3-mini-4k-instruct"
        # do_sample = False,
    )

# A causal LM returns prompt tokens followed by the continuation. Slice the
# prompt off by token count instead of string-replacing the prompt text:
# decoding is not guaranteed to reproduce the prompt byte-for-byte, in which
# case str.replace() silently leaves the whole prompt inside the answer.
generated_ids = outputs[0][prompt_length:]
answer = tokenizer.decode(generated_ids, skip_special_tokens = True).strip()


finish_time = time.perf_counter()
elapse_time = finish_time - start_time
print("elapse_time: ", elapse_time)
print ("--------------")

print("Answer:", answer)


In [None]:
import os

def get_folder_size(folder_path):
    """Return the combined size, in bytes, of every file below ``folder_path``.

    The tree is traversed with ``os.walk``. Entries whose size cannot be read
    (dangling symlinks, files deleted while the walk is running, permission
    errors) are skipped instead of aborting the whole measurement. A missing
    ``folder_path`` yields ``0`` because ``os.walk`` produces nothing for it.
    """
    total_size = 0
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(dirpath, filename)
            try:
                # Ask for the size directly: a separate exists() check followed
                # by getsize() can still fail if the file vanishes in between.
                total_size += os.path.getsize(file_path)
            except OSError:
                continue
    return total_size

# Report the on-disk size of the cache directory.
folder_path = "/home/sheju347/.cache/"
# The byte count is divided by 1024**3, so the value is in GB (not MB, as the
# previous variable name suggested) -- matching the unit printed below.
size_in_gb = get_folder_size(folder_path) / (1024 ** 3)
print(f"Folder size: {size_in_gb:.2f} GB")

In [None]:
import shutil
import os

def remove_all(path="/home/sheju347/.cache/"):
    """Recursively delete the directory ``path`` and everything inside it.

    Args:
        path: Directory to remove. Defaults to the cache directory this
            notebook was written against, so ``remove_all()`` keeps its
            original behaviour.

    Errors raised during deletion (including a missing directory) are
    ignored because ``shutil.rmtree`` is called with ``ignore_errors=True``.
    """
    shutil.rmtree(path, ignore_errors=True)

# WARNING: destructive. Running this cell calls remove_all(), whose body runs
# shutil.rmtree on "/home/sheju347/.cache/" with ignore_errors=True.
remove_all()

In [1]:
# Per-entry disk usage of the current directory: `./*` covers the visible
# entries, `./.??*` covers hidden entries whose name has at least two
# characters after the leading dot (so `.` and `..` are not matched).
!du -sh ./* ./.??*

32K	./Desktop
9.6G	./Documents
32K	./Downloads
24K	./Flan-T5-Medical.ipynb
56K	./MedMobile.ipynb
64K	./Medical_QA_test.ipynb
198M	./Miniconda3-latest-Linux-x86_64.sh
32K	./Music
72K	./Phi3_medical.ipynb
32K	./Pictures
32K	./Public
32K	./Templates
80K	./Test_DL.ipynb
24K	./Test_fine_tuned.ipynb
72K	./Train_Phi_3_mini.ipynb
32K	./Videos
616K	./code
455M	./data
56K	./dialogsum_train.ipynb
39M	./ondemand
32K	./saved
24K	./.ICEauthority
24K	./.Xauthority
32K	./.apptainer
48K	./.bash_history
24K	./.bashrc
21G	./.cache
24K	./.condarc
2.5M	./.config
88K	./.dbus
64K	./.gnupg
24K	./.icons
488K	./.ipynb_checkpoints
2.2M	./.ipython
520K	./.jupyter
56K	./.krb5
5.6G	./.local
14M	./.mozilla
472K	./.npm
64K	./.nv
24K	./.python_history
104K	./.slurm-apptainer
56K	./.ssh
56K	./.vnc


In [4]:
# Disk usage of each sub-directory of .cache/huggingface/hub (path is relative
# to the current working directory).
!du -sh .cache/huggingface/hub/*/

ERROR: ld.so: object '/usr/lib64/libstdc++.so.6' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
ERROR: ld.so: object '/usr/lib64/libstdc++.so.6' from LD_PRELOAD cannot be preloaded (cannot open shared object file): ignored.
684M	.cache/huggingface/hub/datasets--TsinghuaC3I--UltraMedical/
14M	.cache/huggingface/hub/datasets--knkarthick--dialogsum/
9.6G	.cache/huggingface/hub/models--KrithikV--MedMobile/
956M	.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B-instruct/
1.3G	.cache/huggingface/hub/models--google--flan-t5-base/
9.5G	.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/


In [None]:
# Disk usage of every visible and hidden entry directly under .local/lib.
# NOTE(review): depending on the shell, the `.*` glob may also expand to `.`
# and `..`, which would add totals for .local/lib and .local -- confirm.
!du -sh .local/lib/* .local/lib/.*

In [None]:
# Run `du -sh` once for every directory, at any depth below the current one,
# whose name starts with a dot.
# NOTE(review): the starting point `.` itself presumably matches '.*' as well,
# which would also print the total for the whole tree -- confirm.
!find . -type d -name '.*' -exec du -sh {} \;

In [None]:
# Human-readable capacity and free space of the filesystem holding the
# current directory.
!df -h ./

In [None]:
import sys
# Print the version string of the Python interpreter running this kernel.
print(sys.version)

In [1]:
import re
import sys
import unicodedata
# from openai import OpenAI
import os
from datasets import load_dataset
import random
import json
import requests
# from vllm import LLM, SamplingParams

# Set openai key if using gpt4o as engine.
# os.environ['OPENAI_API_KEY'] = "OPEN AI KEY HERE"

def return_parted_rows(df, part_ind, part_ind_list):
    """Return the rows of ``df`` whose chapter number belongs to one part.

    Args:
        df: DataFrame with a ``current_chapter`` column of strings; the first
            run of digits in each string is taken as the chapter number.
        part_ind: Position of the requested part in ``part_ind_list``.
        part_ind_list: First chapter number of each part, indexed by part
            position (a sequence, or a mapping keyed by part position).

    Returns:
        A new DataFrame holding the rows with
        ``start <= chapter number <= end``. ``start`` is
        ``part_ind_list[part_ind]``; ``end`` is one less than the start of the
        following part, or the largest chapter number in ``df`` when
        ``part_ind`` is the last part. ``df`` itself is left unmodified.
    """
    # Keep the chapter numbers in a standalone Series instead of writing a
    # temporary column into the caller's DataFrame.
    chapter_num = df['current_chapter'].apply(lambda x: int(re.search(r'\d+', x).group()))

    start = part_ind_list[part_ind]

    # "Is there a following part?" is a question about positions. Testing
    # `part_ind + 1 in part_ind_list` on a list compares against the stored
    # chapter numbers instead and picks the wrong upper bound.
    next_ind = part_ind + 1
    if isinstance(part_ind_list, dict):
        has_next = next_ind in part_ind_list
    else:
        has_next = next_ind < len(part_ind_list)
    end = part_ind_list[next_ind] - 1 if has_next else chapter_num.max()

    return df[(chapter_num >= start) & (chapter_num <= end)]

def format_choices(choices):
    """Render a ``{label: text}`` mapping as one ``[label] : text`` line per option."""
    rendered = [f'[{label}] : {text}' for label, text in choices.items()]
    return "\n".join(rendered)

    
def format_examples(examples):
    """Join question/answer records into a newline-separated few-shot block.

    Each record is read through its ``"question"`` and ``"answer"`` keys.
    """
    return "\n".join(
        f'## Question {item["question"]} \n ## Answer {item["answer"]}'
        for item in examples
    )

def extract_samples(task, numShot, model_prompt):
    """Build ``numShot`` few-shot examples from the training split of ``task``.

    Distinct training items are drawn at random and each one is rendered
    through ``model_prompt.format`` with the ``question``, ``choices`` and
    ``answer`` fields filled in.
    """
    questions, answer_choices, correct_answers = task_load(task, 'train')
    picked = random.sample(range(len(questions)), numShot)
    return [
        model_prompt.format(
            question=questions[idx],
            choices=format_choices(answer_choices[idx]),
            answer=correct_answers[idx],
        )
        for idx in picked
    ]

def _load_medbullets(path, with_fifth_option):
    """Read a column-oriented Medbullets JSON file into parallel lists."""
    with open(path, 'r') as file:
        ds = json.load(file)

    option_columns = [("A", 'opa'), ("B", 'opb'), ("C", 'opc'), ("D", 'opd')]
    # NOTE(review): the five-option file is assumed to carry its extra option
    # in an 'ope' column; it is only used when that column is present.
    if with_fifth_option and 'ope' in ds:
        option_columns.append(("E", 'ope'))

    # Every column maps a row id to a value, so rows are enumerated through
    # the ids of one column -- not through len(ds), which counts columns.
    row_ids = list(ds['question'].keys())
    questions = [ds['question'][row_id] for row_id in row_ids]
    answer_choices = [
        {letter: ds[column][row_id] for letter, column in option_columns}
        for row_id in row_ids
    ]
    correct_answers = [ds['answer_idx'][row_id] for row_id in row_ids]
    return questions, answer_choices, correct_answers


def task_load(task, split, path=None):
    """Load one benchmark as ``(questions, answer_choices, correct_answers)``.

    Args:
        task: ``"medqa"``, ``"medmcqa"``, ``"medbullets_op4"``,
            ``"medbullets_op5"``, ``"pubmedqa"`` or ``"mmlu-<subset>"``.
        split: Dataset split passed to ``load_dataset`` (for ``medmcqa`` a
            request for ``"test"`` is served from ``"validation"``). Unused by
            the file-based tasks.
        path: Optional JSON file for the file-based tasks (medbullets,
            pubmedqa). When omitted, the placeholder path embedded below is
            used, as before.

    Returns:
        Three equally long lists: question strings, ``{letter: option text}``
        dicts, and the letter of the correct option.

    Raises:
        Exception: if ``task`` is not recognised.
    """
    if task=="medqa":
        ds = load_dataset("GBaker/MedQA-USMLE-4-options", split=split)
        # Fetch each row once instead of once per extracted field.
        rows = [ds[i] for i in range(len(ds))]
        questions = [row['question'] for row in rows]
        answer_choices = [row['options'] for row in rows]
        correct_answers = [row['answer_idx'] for row in rows]
        return questions, answer_choices, correct_answers

    elif task=="medmcqa":
        if split == 'test':
            split = 'validation'
        ds = load_dataset("openlifescienceai/medmcqa", split=split)
        rows = [ds[i] for i in range(len(ds))]
        questions = [row['question'] for row in rows]
        answer_choices = [{"A": row['opa'], "B": row['opb'], "C": row['opc'], "D": row['opd']} for row in rows]
        # 'cop' is a zero-based option index; chr(... + 65) maps 0 -> "A".
        correct_answers = [chr(row['cop']+65) for row in rows]
        return questions, answer_choices, correct_answers

    elif task=="medbullets_op4":
        if path is None:
            path = "ADD MEDBULLETS PATH HERE"
        return _load_medbullets(path, with_fifth_option=False)

    elif task=="medbullets_op5":
        if path is None:
            path = "ADD MEDBULLETS PATH HERE"
        return _load_medbullets(path, with_fifth_option=True)

    elif task=="pubmedqa":
        # This also contains context that is necessary for the question.
        if path is None:
            path = "ADD PATH FOR PUBMEDQA HERE"

        with open(path, 'r') as file:
            ds = json.load(file)
        ds = list(ds.values())
        answer_choice_dict = {'A': "yes", 'B': "no", 'C': "maybe"}
        # Reverse lookup (decision text -> letter), built once for all rows.
        rev_answer_choice_dict = dict((v,k) for k,v in answer_choice_dict.items())
        answer_choices = [answer_choice_dict]*len(ds)
        correct_answers = []
        questions = []
        for record in ds:
            question_context = "Context: " + "\nContext: ".join(record['CONTEXTS'])
            questions.append(question_context + "\n" + record['QUESTION'])
            correct_answers.append(rev_answer_choice_dict.get(record['final_decision']))
        return questions, answer_choices, correct_answers

    elif "mmlu" in task:
        # Task names look like "mmlu-<subset>"; the subset follows the first dash.
        subset = task.split("-", 1)[1]
        ds = load_dataset("cais/mmlu", subset, split=split)
        rows = [ds[i] for i in range(len(ds))]
        questions = [row['question'] for row in rows]
        answer_choices = [{"A": row['choices'][0], "B": row['choices'][1], "C": row['choices'][2], "D": row['choices'][3]} for row in rows]
        correct_answers = [chr(row['answer']+65) for row in rows]
        return questions, answer_choices, correct_answers

    else:
        raise Exception("TASK NOT FOUND")

def filterContext(context):
    """Cut ``context`` right after its first ``</end>`` tag.

    Text without the tag is returned unchanged.
    """
    end_tag = "</end>"
    cut = context.find(end_tag)
    if cut == -1:
        return context
    return context[:cut + len(end_tag)]

def run_inference(content, engine, temp=0.0001, max_tokens_output=200, tokenizer=None, model=None, local=False, vllm = False):
    """Generate a reply to ``content`` with a locally loaded chat model.

    Args:
        content: User message; it is wrapped in a single-turn chat template.
        engine: Engine name. Only referenced by the disabled OpenAI path.
        temp: Sampling temperature passed to ``model.generate``.
        max_tokens_output: Maximum number of newly generated tokens.
        tokenizer: Tokenizer providing ``apply_chat_template`` and
            ``batch_decode``. Required when ``local`` is true.
        model: Model providing ``generate``. Required when ``local`` is true.
        local: Run the local model. When false nothing is executed.
        vllm: Placeholder for a vLLM backend; currently has no effect.

    Returns:
        The decoded text after the last ``<|assistant|>`` marker when
        ``local`` is true, otherwise ``None`` (the vLLM and OpenAI backends
        are not implemented).

    Raises:
        ValueError: if ``local`` is true but ``tokenizer`` or ``model`` is
            missing.
    """
    if not local:
        # Neither remote backend is wired up; both paths return None.
        # Disabled OpenAI implementation kept for reference:
        # client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
        # messages = [{"role": "user", "content": f"{content}"}]
        # response = client.chat.completions.create(
        #     model=engine,
        #     messages=messages,
        #     temperature=temp,
        #     max_tokens=max_tokens_output,
        #     frequency_penalty=0.0
        # )
        # response_text = response.choices[0].message.content
        # return response_text
        return None

    if tokenizer is None or model is None:
        raise ValueError("local inference requires both `tokenizer` and `model`")

    messages = [{"role": "user", "content": f"{content}"}]
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    # Place the inputs wherever the model lives rather than assuming the first
    # GPU; 'cuda:0' stays the fallback for objects that expose no `device`.
    inputs = inputs.to(getattr(model, "device", 'cuda:0'))
    outputs = model.generate(inputs, max_new_tokens=max_tokens_output, do_sample = True, temperature=temp)
    text = tokenizer.batch_decode(outputs)[0]
    # The decoded sequence still contains the prompt; keep the assistant turn.
    return text.split("<|assistant|>")[-1]
    
class MultiChoiceFilter:
    """Extract the chosen option letter from free-form model output.

    Inspired by lmeval. An answer is first looked for in a
    "the answer is <X>" phrase; failing that, the last occurrence of
    ``regex_pattern`` (by default a capital letter in round or square
    brackets) is used.
    """

    # Explicit "the answer is (B)" style statement. The letter must be an
    # upper-case option label set off by non-word characters, so ordinary
    # prose such as "the answer is about ..." is not read as option "b".
    _ANSWER_IS = re.compile(r'the answer is \W([A-Z])\W')

    # Punctuation translation table shared by all instances. Building it
    # scans the whole Unicode range, so it is created once, on first use.
    _PUNCT_TBL = None

    def __init__(self, ignore_case=False, ignore_punctuation=False, regex_pattern=r"[\(\[]([A-Z])[\)\]]"):
        """Store the normalisation flags and compile the fallback pattern.

        Args:
            ignore_case: Lower-case text in ``filter_text``.
            ignore_punctuation: Strip punctuation in ``filter_text``.
            regex_pattern: Fallback pattern; its capture group is the answer.
        """
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.regex_pattern = regex_pattern
        self.regex = re.compile(regex_pattern)

    @property
    def punct_tbl(self):
        """``str.translate`` table that deletes every Unicode punctuation character."""
        if MultiChoiceFilter._PUNCT_TBL is None:
            # `+ 1` so that the final code point is inspected as well.
            MultiChoiceFilter._PUNCT_TBL = dict.fromkeys(
                i for i in range(sys.maxunicode + 1)
                if unicodedata.category(chr(i)).startswith("P")
            )
        return MultiChoiceFilter._PUNCT_TBL

    def filter_text(self, text):
        """Normalise ``text`` according to the case / punctuation flags."""
        if self.ignore_case:
            text = text.lower()
        if self.ignore_punctuation:
            text = text.translate(self.punct_tbl)
        return text

    def find_match(self, regex, resp, convert_dict=None):
        """Return the last match of ``regex`` in ``resp``.

        Args:
            regex: Compiled pattern, applied with ``findall``.
            resp: Text to search.
            convert_dict: Optional mapping applied to the stripped match.

        Returns:
            The stripped (and possibly converted) last match, or the empty
            ``findall`` result -- a falsy value -- when nothing matched.
        """
        matches = regex.findall(resp)
        if not matches:
            return matches
        match = matches[-1]
        if isinstance(match, tuple):
            # Several capture groups: take the first non-empty one, or ""
            # when every group is empty.
            match = next((group for group in match if group), "")
        match = match.strip()
        if match and convert_dict and match in convert_dict:
            match = convert_dict[match]
        return match

    def extract_answer(self, response, choices=None):
        """Return the answer as ``"(X)"``, or ``"[invalid]"`` if none is found.

        ``choices`` is accepted for interface compatibility and is not used.
        """
        stated = self._ANSWER_IS.search(response)
        if stated:
            return f"({stated.group(1)})"
        match = self.find_match(self.regex, response)
        if match:
            return f"({match})"
        return "[invalid]"

    def filter_responses(self, responses, choices):
        """Apply ``extract_answer`` to every response and return the list."""
        return [self.extract_answer(resp, choices) for resp in responses]


# Zero-shot prompt template: the question, a blank line, then the answer
# options. `{question}` and `{choices}` are filled in later via str.format().
prompt_eval_bare_fully = "\n{question} \n\n{choices}\n"

ModuleNotFoundError: No module named 'datasets'

In [None]:
from tqdm import tqdm
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datetime import datetime

# ---- Run configuration ----
ENGINE = "KrithikV/MedMobile"
# Tasks to evaluate. Options: "medqa", "mmlu-anatomy", "mmlu-professional_medicine",
# "mmlu-college_biology", "mmlu-college_medicine", "mmlu-clinical_knowledge",
# "mmlu-medical_genetics", "pubmedqa", "medmcqa"
TASK_LIST = ['medqa']
SPLIT = "test"
NUMBER_OF_ENSEMBLE = 5
# Several attempts per question are sampled at temperature 0.7; a single
# attempt uses a near-zero temperature instead.
ENGINE_TEMPERATURE = 0.7 if NUMBER_OF_ENSEMBLE > 1 else 0.000000001
MAX_TOKEN_OUTPUT = 1024
NSHOT = 0

TEST_QUESTION_NUM = 10000000 # for debug purpose, stop after testing this number of questions

OUTPUT_DIR = "./Documents/" #"OUTPUT DIRECTORY HERE"

# Settings recorded alongside the results of every task.
results_db = {
    "metadata": dict(
        model=ENGINE,
        temperature=ENGINE_TEMPERATURE,
        num_shot=NSHOT,
        number_of_ensemble=NUMBER_OF_ENSEMBLE,
        max_tokens=MAX_TOKEN_OUTPUT,
    )
}

runName = f'MedMobile ({ENGINE})'
if NUMBER_OF_ENSEMBLE > 1:
    runName += f' + Ensemble ({NUMBER_OF_ENSEMBLE})'

## DISPLAY HYPERPARAMETERS
for setting_name, setting_value in results_db['metadata'].items():
    print(f"{setting_name} : {setting_value}")

# `tokenizer`, `model` and `device` are not created in this cell (the loading
# code below is commented out); they have to exist in the kernel already.
# tokenizer = AutoTokenizer.from_pretrained(ENGINE, trust_remote_code = False)
# model = AutoModelForCausalLM.from_pretrained(ENGINE, trust_remote_code = False) # Setting trust_remote_code = True will cause AttributeError: 'DynamicCache' object has no attribute 'get_max_length'

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

print("cuda available:", torch.cuda.is_available())
print("device:", device)

print("Model Running: " + ENGINE)
print("Run: " + runName)

mcf = MultiChoiceFilter(ignore_case=True, ignore_punctuation=True)

# Evaluate every task: sample NUMBER_OF_ENSEMBLE answers per question, take a
# majority vote, score it against the reference answer and dump everything to
# a JSON file below OUTPUT_DIR.
for task in TASK_LIST:
    question_list, answer_choices_list, correct_answer_list = task_load(task, SPLIT)
    # print("question_list", len(question_list), question_list[0])
    print(f"{task} loaded succesfully. Now conducting evaluation on {len(question_list)} samples.")

    # NOT use RAG, not use k-shot prompting (NSHOT == 0). Assigned before the
    # loop so the metadata below never picks up a stale `prompt` left in the
    # kernel by another cell when a task has no questions.
    prompt = prompt_eval_bare_fully

    start_time = time.time()
    model_db = []

    # tqdm: show progress bar. enumerate() has no length, so pass the expected
    # number of iterations explicitly.
    question_stream = enumerate(zip(question_list, answer_choices_list, correct_answer_list))
    expected_total = min(len(question_list), TEST_QUESTION_NUM)
    for i, (question, answer_choices, correct_answer) in tqdm(question_stream, total=expected_total):
        context = ""

        # `context` is not referenced by the zero-shot template; str.format
        # ignores the unused keyword.
        model_prompt = prompt.format(question=question, choices=format_choices(answer_choices), context = filterContext(context))

        # print(model_prompt)
        # print(correct_answer)

        D = {
            "query": question,
            "question_choices": answer_choices,
            "correct_answer": correct_answer,
            "attempts": [],
            "model_prompt": model_prompt,
        }

        for j in range(NUMBER_OF_ENSEMBLE):
            # Same steps as run_inference(local=True), but on `device`.
            messages = [{"role": "user", "content": f"{model_prompt}"}]
            inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(device) #.to('cuda:0')
            outputs = model.generate(inputs,
                                     max_new_tokens=MAX_TOKEN_OUTPUT,
                                     do_sample = True,
                                     temperature=ENGINE_TEMPERATURE,
                                     # use_cache = False, # TODO solve the problem
                                    )
            text = tokenizer.batch_decode(outputs)[0]
            # Keep only the assistant turn of the decoded conversation.
            text = text.split("<|assistant|>")[-1]

            D["attempts"].append({"id": ("attemp_" + str(j)), "COT": text})

            # print(j, ": ", text)

        model_db.append(D)

        if i >= TEST_QUESTION_NUM - 1:
            break

    end_time = time.time()
    elapsed = end_time - start_time

    total_num_questions = 0
    num_correct = 0
    num_invalid = 0
    for q in model_db:
        choices = q["question_choices"]
        letter_counts = {}
        for attempt in q["attempts"]:
            model_choice = mcf.extract_answer(attempt["COT"], choices)
            attempt["model_choice"] = model_choice
            letter_counts[model_choice] = letter_counts.get(model_choice, 0) + 1

        # Majority vote; max() returns the first key with the highest count,
        # so ties go to the answer that appeared first. With no attempts at
        # all the question counts as invalid.
        if letter_counts:
            q["ensemble_answer"] = max(letter_counts, key=letter_counts.get)
        else:
            q["ensemble_answer"] = "[invalid]"

        print("Ansemble answer: ", q["ensemble_answer"], " Correct answer: ", q["correct_answer"])

        total_num_questions += 1
        if q["ensemble_answer"].strip("()") == q["correct_answer"]:
            num_correct += 1
        elif q["ensemble_answer"] == "[invalid]":
            num_invalid += 1

    # Guard every ratio: an empty task or an all-invalid run must not raise
    # ZeroDivisionError after the expensive generation has finished.
    num_valid = total_num_questions - num_invalid
    true_accuracy = num_correct / total_num_questions if total_num_questions else 0.0
    eff_accuracy = num_correct / num_valid if num_valid else 0.0
    run_time_per_iteration = elapsed / total_num_questions if total_num_questions else 0.0

    print("Number of correct answer: " + str(num_correct))
    print("Total number of questions: " + str(total_num_questions))
    print("Model accuracy: " + str(true_accuracy))

    # One timestamp for both the metadata and the file name.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    # Build a fresh metadata dict per task. dict.copy() is shallow, so writing
    # into results_db.copy()['metadata'] would modify the shared dict and leak
    # one task's values into the next.
    results_db_task = {
        **results_db,
        "metadata": {
            **results_db["metadata"],
            "informal_run_name": runName,
            "task": task,
            "timestamp": timestamp,
            "prompt": prompt,
            "number_of_invalids": num_invalid,
            "number_of_questions": total_num_questions,
            "true_accuracy": true_accuracy,
            "eff_accuracy": eff_accuracy,
            "run_time": elapsed,
            "run_time_per_iteration": run_time_per_iteration,
        },
        "model_results": model_db,
    }

    filename = os.path.join(OUTPUT_DIR, task, f"query_database_{timestamp}.json")
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as file:
        json.dump(results_db_task, file, indent=4)
    
