In [1]:
import os

# Hugging Face libraries resolve their cache directories from the environment
# when they are imported, so these variables must be set BEFORE any
# huggingface module is imported.
HF_CACHE_ROOT = "/projects/sciences/computing/sheju347/.cache/huggingface"

os.environ["HF_HOME"] = HF_CACHE_ROOT
# HF_HUB_CACHE is the supported variable for the model/dataset hub cache;
# TRANSFORMERS_CACHE is deprecated in recent transformers releases. The value
# is the same "<HF_HOME>/hub" directory that was configured before.
os.environ["HF_HUB_CACHE"] = os.path.join(HF_CACHE_ROOT, "hub")

# Now import huggingface modules
from huggingface_hub import constants

# Sanity check: confirm the library picked up the directories set above.
print("HF_HOME:", constants.HF_HOME)
print("HF_HUB_CACHE:", constants.HF_HUB_CACHE)

HF_HOME: /projects/sciences/computing/sheju347/.cache/huggingface
HF_HUB_CACHE: /projects/sciences/computing/sheju347/.cache/huggingface/hub


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint used for the baseline (no retrieved context) answer.
model_name = "Junyi-Shen/Junyi-Phi3-mini-UltraMedical"

# Check is_available() first so the bf16 probe is only made when a CUDA device
# exists (some PyTorch versions query the current device inside
# is_bf16_supported()). Falls back to float16 when bf16 is not supported.
is_bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16 if is_bf16_supported else torch.float16,
    device_map="auto",
)

# Plain multiple-choice prompt: the question followed by the answer options.
prompt_format_normal = '''
{question}

{choices}
'''

question = """A 6-year-old boy is brought to the physician by his mother because of a 2-day history of dysuria and increased urinary frequency. Vital signs are within normal limits. Urinalysis shows cloudy, red urine. This patient's clinical presentation is best explained by an infection with a virus with which of the following features?"""

formated_choices = """
[A] : Non-enveloped with linear, single-stranded DNA
[B] : Non-enveloped with linear, single-stranded RNA
[C] : Enveloped with linear, single-stranded RNA
[D] : Non-enveloped with linear, double-stranded DNA"""
prompt = prompt_format_normal.format(question=question, choices=formated_choices)

messages = [{"role": "user", "content": prompt}]

# Ask for a dict so the attention mask is returned alongside the token ids;
# passing both to generate() by keyword avoids relying on the mask being
# inferred from the ids.
encoded = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
# `inputs` is kept as the input-id tensor, as before.
inputs = encoded["input_ids"]

# Greedy decoding (do_sample=False) with at most 1024 new tokens.
outputs = model.generate(
    input_ids=inputs,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=1024,
    do_sample=False,
)

# Prints prompt + completion, special tokens included, so the chat template
# markers stay visible in the output.
print(tokenizer.decode(outputs[0], skip_special_tokens=False))

2025-12-14 23:37:28,512 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.17it/s]


<|user|> A 6-year-old boy is brought to the physician by his mother because of a 2-day history of dysuria and increased urinary frequency. Vital signs are within normal limits. Urinalysis shows cloudy, red urine. This patient's clinical presentation is best explained by an infection with a virus with which of the following features?


[A] : Non-enveloped with linear, single-stranded DNA
[B] : Non-enveloped with linear, single-stranded RNA
[C] : Enveloped with linear, single-stranded RNA
[D] : Non-enveloped with linear, double-stranded DNA
<|end|><|assistant|> The clinical presentation of dysuria and increased urinary frequency, along with cloudy, red urine, suggests a urinary tract infection (UTI). In children, the most common cause of UTI is a bacterial infection, typically by Escherichia coli. However, viral infections can also cause UTIs, particularly in the case of adenovirus or polyomavirus.

Adenoviruses are non-enveloped viruses with linear, double-stranded DNA, and they can cau

In [8]:
from context_retriever import ContextRetriever
from peft import PeftModel, PeftConfig

# LoRA adapter applied on top of the base model loaded in the previous cell.
model_name_lora = "Junyi-Shen/Junyi-Phi3-mini-context-lora-adapter"
# NOTE(review): PEFT wraps the existing `model` object rather than copying it,
# so `model` presumably no longer behaves as the plain base model after this
# line -- confirm before re-running the baseline cell in the same kernel.
model_lora = PeftModel.from_pretrained(model, model_name_lora)

# Retrieval pipeline (project-local helper). The top-K values below are the
# per-stage cut-offs; judging by the logged "RAG data" the stages appear to be
# BM25 search -> SPLADE -> MonoT5 cross-encoder -- TODO confirm against
# context_retriever. With topK_crossEncoder = 1 a single document is kept.
context_retriever = ContextRetriever(model.device)
context_retriever.set_params(
    topK_searchEngine=150,
    topK_SPLADE=30,
    topK_denseEmbedding=0,
    topK_crossEncoder=1,
    topK_LLM=0,
)
context = context_retriever.get_RAG_context(question, formated_choices)

# Same multiple-choice prompt as the baseline, prefixed with retrieved context.
prompt_format_RAG = '''
Context:
{context}

Question:
{question}

{choices}
'''

prompt = prompt_format_RAG.format(context=context, question=question, choices=formated_choices)

messages = [{"role": "user", "content": prompt}]

# Ask for a dict so the attention mask is returned alongside the token ids;
# keyword arguments are also the safest way to call generate() on a PEFT
# wrapper.
encoded = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
# `inputs` is kept as the input-id tensor, as before.
inputs = encoded["input_ids"]

# Greedy decoding (do_sample=False) with at most 1024 new tokens.
outputs = model_lora.generate(
    input_ids=inputs,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=1024,
    do_sample=False,
)

# Prints prompt + completion, special tokens included.
print(tokenizer.decode(outputs[0], skip_special_tokens=False))


2025-12-14 23:41:56,773 - INFO - Use pytorch device_name: cuda:0
2025-12-14 23:41:56,773 - INFO - Load pretrained SparseEncoder: naver/splade-v3


Template validated successfully!
Successfully created pointwise inference handler!


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Token indices sequence length is longer than the specified maximum sequence length for this model (712 > 512). Running this sequence through the model will result in indexing errors
2025-12-14 23:42:17,844 - INFO - RAG data: [{'docId': 'pubmed23n1009_265', 'BM25_score': 50.3242, 'BM25_ranking': 41, 'SPLADE_score': 57.06997299194336, 'SPLADE_ranking': 13, 'MonoT5_score': 0.4637843906295596, 'MonoT5_ranking': 1}]


RAG data: [{'docId': 'pubmed23n1009_265', 'BM25_score': 50.3242, 'BM25_ranking': 41, 'SPLADE_score': 57.06997299194336, 'SPLADE_ranking': 13, 'MonoT5_score': 0.4637843906295596, 'MonoT5_ranking': 1}]
<|user|> Context:
Accidental ketosis-induced polyuria in a toddler: a case report. In the pediatric population, parental concern of recent onset frequent or large volume urination in young children is common. A 2-year-old male with no significant past medical history and unremarkable family history was brought to his pediatrician by his mother who reports that the child had been "soaking through his diapers" for the previous two to 3 days. Mother states that patient has not had an appreciable change in the number of wet diapers per day, just the perceived weight/volume of each diaper. The patient's mother denied any recent illness, apparent abdominal pain, dysuria, or recent changes in his bowel movements. She similarly denied polydipsia, polyphagia, or gross hematuria in the patient. Pati