In [None]:
! pip install -r /Users/rohansingh/Documents/medicalQNA/requirements.txt

In [None]:
from datasets import load_dataset

# Load the CSV as a DatasetDict with a single "train" split and print its
# first record as a quick sanity check of the columns.
dataset = load_dataset(path="csv", data_files=dict(train="../data/medquad.csv"))
first_record = dataset["train"][0]
print(first_record)


In [None]:
! pip install transformers accelerate datasets bitsandbytes peft sentencepiece

In [None]:
from datasets import load_dataset

# Path is relative to the notebook directory (same location the first load
# cell reads from) so the notebook is not tied to one user's home folder.
dataset = load_dataset(
    "csv",
    data_files={"train": "../data/medquad.csv"},
)

# Keep only the "train" split; later cells operate on a plain Dataset.
dataset = dataset["train"]
dataset


## Qwen Preprocessing Format

The following chat-style template is used to prepare instruction–answer pairs for Qwen training:

```text
<|im_start|>user
You are a safe and helpful medical assistant.
Question: {question}
<|im_end|>
<|im_start|>assistant
{answer}
<|im_end|>
```


In [None]:
def format_chat(example):
    """Attach a chat-formatted training string to ``example`` under ``"text"``.

    The string is a single user turn (a fixed instruction sentence followed by
    ``Question: <question>``) and a single assistant turn holding the answer,
    each wrapped in ``<|im_start|>`` / ``<|im_end|>`` markers.

    Args:
        example: mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        The same ``example`` object, mutated to include the ``"text"`` key.
    """
    pieces = [
        "<|im_start|>user\n",
        "You are a safe and helpful medical assistant.\n",
        f"Question: {example['question']}\n",
        "<|im_end|>\n",
        "<|im_start|>assistant\n",
        f"{example['answer']}\n",
        "<|im_end|>",
    ]
    example["text"] = "".join(pieces)
    return example

SPLIT_SEED = 42  # fixed so the train/test partition is identical on every run

# Drop rows with a missing/empty question or answer first: the f-strings in
# format_chat would otherwise render a missing value as the literal text "None"
# and the model would be trained on it.
dataset = dataset.filter(
    lambda row: bool(row["question"]) and bool(row["answer"])
)

# Add the chat-formatted "text" column, then hold out 2% of rows as "test".
# NOTE: this cell rebinds `dataset` from a Dataset to a DatasetDict, so it must
# be preceded by the load cell whenever it is re-run.
dataset = dataset.map(format_chat)
dataset = dataset.train_test_split(test_size=0.02, seed=SPLIT_SEED)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# 4-bit loading is requested through an explicit BitsAndBytesConfig; passing a
# bare `load_in_4bit=True` keyword to from_pretrained is deprecated.
# The compute dtype is set to float16 to match `torch_dtype` below, so the
# quantized linear layers do not fall back to float32 matmuls.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
)



In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the quantized base model for training before attaching adapters
# (freezes base weights, enables input gradients / gradient checkpointing).
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    # Declare the task so PEFT wraps the model with its causal-LM class.
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

# Sanity check: only the LoRA adapter weights should be trainable.
model.print_trainable_parameters()



In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

MAX_SEQ_LENGTH = 512  # examples are truncated to this many tokens to bound memory; tune as needed


def tokenize_text(batch):
    """Tokenize the chat-formatted "text" column, truncating to MAX_SEQ_LENGTH tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


# Trainer consumes token ids, not raw strings. `dataset` is a DatasetDict with
# "train"/"test" splits at this point, so tokenize both splits and drop the
# original string columns, which the model's forward() does not accept.
tokenized_dataset = dataset.map(
    tokenize_text,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# The collator pads each batch, so a pad token must exist.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# mlm=False -> causal-LM objective: labels are a copy of input_ids with
# padding positions masked out of the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir="./qwen-small-medquad",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    max_steps=800,       # small training
    learning_rate=2e-4,
    fp16=True,
    logging_steps=20,
    save_steps=200,
)

trainer = Trainer(
    model=model,
    args=args,
    # Pass the individual splits; the DatasetDict itself is not a valid dataset.
    train_dataset=tokenized_dataset["train"],
    # Not evaluated automatically with these arguments; available to trainer.evaluate().
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
from peft import PeftModel

ADAPTER_DIR = "./qwen2.5-1.5B-medquad-adapter"
MERGED_DIR = "./qwen2.5-1.5B-medquad"

# Save the LoRA adapter on its own first (save_pretrained on a PEFT-wrapped
# model writes only the adapter weights).
model.save_pretrained(ADAPTER_DIR)

# Merging directly into the 4-bit quantized base is lossy (rounding) and would
# save a quantized checkpoint. Reload the base model in half precision and
# merge the adapter into that instead, so the saved model is a regular fp16
# checkpoint.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR).merge_and_unload()

model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)


## Test Inference

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Directory written by the merge/save cell; both the weights and the tokenizer
# are reloaded from disk so inference uses exactly what was saved.
model_path = "./qwen2.5-1.5B-medquad"

model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

def ask_medical(question, max_new_tokens=256):
    """Generate an answer to a medical question with the fine-tuned model.

    The prompt reproduces the user turn used for the training examples (the
    fixed instruction sentence plus the ``Question:`` prefix) and ends with an
    open assistant turn for the model to complete.

    Args:
        question: the question text to ask.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The generated answer with special tokens removed and surrounding
        whitespace stripped.
    """
    prompt = (
        "<|im_start|>user\n"
        "You are a safe and helpful medical assistant.\n"
        f"Question: {question}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Greedy decoding. No `temperature` is passed: it only applies when
    # sampling and is ignored (with a warning) under do_sample=False.
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on the word "assistant" would cut off any answer that contains that word.
    prompt_length = inputs["input_ids"].shape[1]
    answer_ids = output[0][prompt_length:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()


In [None]:
# Sample query: symptoms of a specific condition.
anemia_answer = ask_medical("What are the symptoms of iron deficiency anemia?")
print(anemia_answer)

In [None]:
# Sample query: causes of a chronic condition.
kidney_answer = ask_medical("What causes chronic kidney disease in adults?")
print(kidney_answer)

In [None]:
# Sample query: plain-language explanation.
blood_pressure_answer = ask_medical("Explain high blood pressure in simple words.")
print(blood_pressure_answer)