In [None]:
! pip install transformers accelerate datasets bitsandbytes peft sentencepiece

In [11]:
from datasets import load_dataset

# Read the MedQuAD CSV as a single "train" split and show its first record.
csv_splits = {"train": "../data/medquad.csv"}
dataset = load_dataset("csv", data_files=csv_splits)

first_record = dataset["train"][0]
print(first_record)


{'question': 'What is (are) Glaucoma ?', 'answer': "Glaucoma is a group of diseases that can damage the eye's optic nerve and result in vision loss and blindness. While glaucoma can strike anyone, the risk is much greater for people over 60. How Glaucoma Develops  There are several different types of glaucoma. Most of these involve the drainage system within the eye. At the front of the eye there is a small space called the anterior chamber. A clear fluid flows through this chamber and bathes and nourishes the nearby tissues. (Watch the video to learn more about glaucoma. To enlarge the video, click the brackets in the lower right-hand corner. To reduce the video, press the Escape (Esc) button on your keyboard.) In glaucoma, for still unknown reasons, the fluid drains too slowly out of the eye. As the fluid builds up, the pressure inside the eye rises. Unless this pressure is controlled, it may cause damage to the optic nerve and other parts of the eye and result in loss of vision. Ope

In [12]:
from datasets import load_dataset

# Use the notebook-relative path (the same one the first cell loads from)
# instead of a user-specific absolute path, so the cell runs on any checkout.
# NOTE(review): assumes ../data/medquad.csv is the same file the absolute path
# pointed at — confirm against the repository layout.
# `split="train"` returns the Dataset directly, so no separate indexing step.
dataset = load_dataset(
    "csv",
    data_files={"train": "../data/medquad.csv"},
    split="train",
)
dataset


Dataset({
    features: ['question', 'answer', 'source', 'focus_area'],
    num_rows: 16412
})

## Qwen Preprocessing Format

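The cell below prepares question–answer pairs for Qwen fine-tuning using the following chat-style template (the code additionally prepends a short instruction line and a `Question:` prefix to the user turn):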

```text
<|im_start|>user
{question}
<|im_end|>
<|im_start|>assistant
{answer}
<|im_end|>
```


In [None]:
def format_chat(example):
    """Attach a chat-formatted training string to one question/answer record.

    Args:
        example: Mapping with ``"question"`` and ``"answer"`` entries.

    Returns:
        The same ``example`` object, with a new ``"text"`` entry holding the
        user turn (instruction line plus question) followed by the assistant
        turn (the answer), each wrapped in ``<|im_start|>`` / ``<|im_end|>``.
    """
    user_turn = (
        "<|im_start|>user\n"
        "You are a safe and helpful medical assistant.\n"
        f"Question: {example['question']}\n"
        "<|im_end|>\n"
    )
    assistant_turn = (
        "<|im_start|>assistant\n"
        f"{example['answer']}\n"
        "<|im_end|>"
    )
    # The record is updated in place and handed back, as `Dataset.map` expects.
    example["text"] = user_turn + assistant_turn
    return example

# Add the chat-formatted "text" column to every record.
dataset = dataset.map(format_chat)
# Hold out 2% for evaluation; the fixed seed keeps the split identical
# across kernel restarts so runs are comparable.
dataset = dataset.train_test_split(test_size=0.02, seed=42)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Request 4-bit loading through an explicit BitsAndBytesConfig: passing
# `load_in_4bit=True` directly to `from_pretrained` is deprecated. Only the
# flag is set, so every other quantization option keeps its default.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.float16,
)



In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# The base model was loaded in 4-bit, so run PEFT's k-bit preparation step
# before attaching adapters (the documented recipe for training on top of a
# quantized model).
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # adapt only the attention query/value projections
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",  # tell PEFT this is a causal-LM fine-tune
)

model = get_peft_model(model, lora_config)



In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Upper bound on tokens per example; longer question/answer pairs are truncated.
MAX_SEQ_LEN = 1024


def tokenize_batch(batch):
    """Tokenize the chat-formatted ``text`` column, truncating to MAX_SEQ_LEN."""
    return tokenizer(batch["text"], truncation=True, max_length=MAX_SEQ_LEN)


# `dataset` is a DatasetDict ("train"/"test") after train_test_split and still
# holds raw strings. Trainer needs token ids, so tokenize both splits and drop
# the original text columns.
tokenized = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

args = TrainingArguments(
    output_dir="./qwen-small-medquad",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    max_steps=800,       # small training
    learning_rate=2e-4,
    fp16=True,
    logging_steps=20,
    save_steps=200,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # mlm=False -> causal-LM collation: pads the batch and derives `labels`
    # from `input_ids`, which Trainer needs to compute a loss.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class`; switch when upgrading.
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# Merge the adapter into the wrapped model, then write the model and the
# tokenizer into the same directory.
model = model.merge_and_unload()
for artifact in (model, tokenizer):
    artifact.save_pretrained("./qwen2.5-1.5B-medquad")


## Test Inference

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Directory the merge-and-save cell wrote to.
model_path = "./qwen2.5-1.5B-medquad"     # saved model

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)

# device_map="auto" leaves device placement to the library.
model_load_kwargs = {"device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_path, **model_load_kwargs)

def ask_medical(question):
    """Generate the fine-tuned model's reply to one medical question.

    Args:
        question: Free-text question placed in the user turn of the prompt.

    Returns:
        The generated assistant reply as a string, with special tokens
        removed and surrounding whitespace stripped.
    """
    # Mirror the user turn that `format_chat` builds for training (instruction
    # line + "Question: " prefix) so inference matches what the model was tuned on.
    prompt = (
        "<|im_start|>user\n"
        "You are a safe and helpful medical assistant.\n"
        f"Question: {question}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sampling is disabled, so no `temperature` is passed: it has no effect
    # when do_sample=False and only triggers a configuration warning.
    output = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
    )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the word "assistant" would cut off any answer that
    # itself contains that word (e.g. "physician assistant").
    prompt_len = inputs["input_ids"].shape[1]
    reply_ids = output[0][prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()


In [None]:
anemia_answer = ask_medical("What are the symptoms of iron deficiency anemia?")
print(anemia_answer)

In [None]:
kidney_answer = ask_medical("What causes chronic kidney disease in adults?")
print(kidney_answer)

In [None]:
blood_pressure_answer = ask_medical("Explain high blood pressure in simple words.")
print(blood_pressure_answer)

In [3]:
! pip install gradio

Collecting gradio
  Downloading gradio-5.49.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting audioop-lts<1.0 (from gradio)
  Downloading audioop_lts-0.2.2-cp313-abi3-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading brotli-1.2.0-cp313-cp313-macosx_10_13_universal2.whl.metadata (6.1 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.121.2-py3-none-any.whl.metadata (28 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-1.0.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.13.3 (from gradio)
  Downloading gradio_client-1.13.3-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.11.4-cp313-cp313-macosx_15_0_arm64.whl.metadata (41 kB)
Collecting pydub (from grad

In [10]:
! pip install --upgrade bitsandbytes



In [11]:
import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# ---- CHANGE THIS ----
# Directory of the merged fine-tuned checkpoint. Set the MEDQA_MODEL_PATH
# environment variable to point elsewhere without editing the notebook; the
# fallback is the original hard-coded location.
MODEL_PATH = os.environ.get(
    "MEDQA_MODEL_PATH",
    "/Users/rohansingh/Documents/medicalQNA/project/content/qwen-medquad-output/merged",
)
# ----------------------

# quantization config (matches your config.json)
# Loading with this config raised ImportError on a machine without CUDA, so
# 4-bit quantization is only requested when CUDA is present; otherwise
# `bnb_config` is None and `from_pretrained` loads without it.
# NOTE(review): if the checkpoint's own config.json carries a
# quantization_config, loading may still require a working bitsandbytes
# backend — confirm on the target machine.
bnb_config = (
    BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    if torch.cuda.is_available()
    else None
)

# Tokenizer and model are both read from MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# Keyword arguments for the model load, gathered in one place.
model_load_kwargs = dict(
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, **model_load_kwargs)

def generate_answer(question):
    """Generate the model's reply to one question; used as the Gradio callback.

    Args:
        question: Text entered by the user, placed in the user turn.

    Returns:
        The generated assistant reply as a string, with special tokens
        removed and surrounding whitespace stripped.
    """
    # NOTE(review): `format_chat` trains on a user turn that also carries an
    # instruction line and a "Question: " prefix; confirm whether the
    # checkpoint at MODEL_PATH expects that same prefix here.
    prompt = (
        "<|im_start|>user\n"
        f"{question}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sampling is disabled, so no `temperature` is passed: it has no effect
    # when do_sample=False and only triggers a configuration warning.
    output = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
    )

    # Return only the assistant part: decode just the tokens generated after
    # the prompt. Splitting the decoded text on the word "assistant" would
    # truncate any answer that itself contains that word.
    prompt_len = inputs["input_ids"].shape[1]
    reply_ids = output[0][prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

# Gradio UI: one text box for the question, one for the answer, wired to
# generate_answer.
question_box = gr.Textbox(label="Ask a medical question")
answer_box = gr.Textbox(label="Answer")

ui = gr.Interface(
    fn=generate_answer,
    inputs=question_box,
    outputs=answer_box,
    title="Medical Q&A using Qwen2 (Fine-tuned)",
)

ui.launch()




ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.