<a href="https://colab.research.google.com/github/sheryar47/-Parameter-Efficient-Supervised-Fine-Tuning-of-LLaMA-3.2-3B-on-a-Medical-Chain-of-Thought-Dataset/blob/main/Parameter-Efficient%20Supervised%20Fine-Tuning%20of%20LLaMA%20%203.2%20(3B)%20on%20a%20Medical%20Chain-of-Thought%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install -q unsloth wandb huggingface_hub datasets evaluate rouge-score peft transformers trl accelerate

# Verify GPU availability
!nvidia-smi

# Import libraries
from unsloth import FastLanguageModel
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
import wandb
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from evaluate import load
import pandas as pd
import numpy as np

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
Sat Apr 26 12:34:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   64C    P0             28W /   70W |     102MiB /  15360MiB |      0%      

In [None]:
# Login to Weights & Biases
# No API key is passed in code; wandb.login() is called with its defaults.
wandb.login()

# Login to Hugging Face Hub
# notebook_login() takes no arguments here; in this notebook it rendered a
# widget (see the VBox output recorded for this cell).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

def prepare_dataset():
    """Load the medical reasoning dataset and return formatted train/val splits.

    The "en" configuration of ``FreedomIntelligence/medical-o1-reasoning-SFT``
    is loaded, 100 rows are held out for validation (``random_state=42``), and
    a ``text`` column containing the full training prompt is added to both
    splits.

    Returns:
        tuple: ``(train_dataset, val_dataset)`` as ``datasets.Dataset`` objects.
    """
    # Load dataset
    dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")

    # Convert to pandas DataFrame
    df = pd.DataFrame(dataset['train'])

    # Split dataset (100 samples for validation)
    train_df, val_df = train_test_split(df, test_size=100, random_state=42)

    # Formatting function
    def format_text(row):
        # The dataset's columns are Question / Complex_CoT / Response (as
        # printed by `df.columns` elsewhere in this notebook); the previous
        # keys ('input', 'rationale', 'output') do not exist and raised KeyError.
        return f"""Below is a medical query requiring step-by-step reasoning and a final response.

### Medical Query:
{row['Question']}

### Chain-of-Thought Reasoning:
<think>
{row['Complex_CoT']}
</think>

### Final Response:
<response>
{row['Response']}
</response>"""

    # Apply formatting.
    # assign() builds a new frame instead of writing into the split result
    # (avoids SettingWithCopyWarning); reset_index() drops the shuffled row
    # labels so they are not carried into the Dataset as an extra column.
    train_df = train_df.assign(
        text=train_df.apply(format_text, axis=1)
    ).reset_index(drop=True)
    val_df = val_df.assign(
        text=val_df.apply(format_text, axis=1)
    ).reset_index(drop=True)

    # Convert back to Dataset format
    return Dataset.from_pandas(train_df), Dataset.from_pandas(val_df)


In [None]:
def initialize_model(
    model_name="unsloth/llama-3-8B-bnb-4bit",
    max_seq_length=2048,
    lora_rank=16,
    random_state=42,
):
    """Load a 4-bit quantized base model and attach LoRA adapters to it.

    Args:
        model_name: Checkpoint passed to ``FastLanguageModel.from_pretrained``.
        max_seq_length: Maximum sequence length passed to the loader.
        lora_rank: LoRA rank ``r``; ``lora_alpha`` is set to the same value,
            matching the original 16/16 configuration.
        random_state: Seed passed to ``get_peft_model``.

    Returns:
        tuple: ``(model, tokenizer)`` with the LoRA-wrapped model.
    """
    # NOTE(review): the notebook title refers to LLaMA 3.2 (3B), but the
    # default checkpoint here is the LLaMA 3 8B build — confirm which one is
    # intended and pass `model_name` accordingly.
    # Load 4-bit quantized model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = None,
        load_in_4bit = True,
    )

    # Prepare LoRA configuration: adapters on every attention and MLP projection
    model = FastLanguageModel.get_peft_model(
        model,
        r = lora_rank,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha = lora_rank,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = True,
        random_state = random_state,
    )

    return model, tokenizer

model, tokenizer = initialize_model()

==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Unsloth 2025.4.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
def evaluate_model(model, tokenizer, dataset, num_samples=10,
                   question_column="Question", reference_column="Response"):
    """Generate answers for the first rows of ``dataset`` and score them with ROUGE-L.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Indexable dataset whose rows are mappings.
        num_samples: Maximum number of rows to evaluate.
        question_column: Row key holding the question. Defaults to the
            dataset's actual column name (the previous key ``'input'`` does
            not exist in this dataset).
        reference_column: Row key holding the reference answer (previously
            ``'output'``, which does not exist either).

    Returns:
        float: Aggregated ROUGE-L F-measure, or ``0.0`` if nothing was evaluated.
    """
    rouge = load("rouge")
    predictions = []
    references = []

    for i in range(min(num_samples, len(dataset))):
        example = dataset[i]

        # Prepare input
        prompt = f"""Below is a medical query requiring step-by-step reasoning and a final response.

### Medical Query:
{example[question_column]}

### Chain-of-Thought Reasoning:
<think>"""

        # Generate output
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            # temperature only has an effect when sampling is enabled, so
            # enable it explicitly instead of relying on the checkpoint's
            # generation config.
            do_sample=True,
            temperature=0.7,
        )

        # Process output: generate() returns prompt + continuation, so drop
        # the prompt tokens; otherwise the prompt text is scored against the
        # reference as well.
        prompt_length = inputs["input_ids"].shape[1]
        prediction = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        predictions.append(prediction)
        references.append(example[reference_column])

    if not predictions:
        return 0.0

    # Calculate ROUGE score. The `evaluate` ROUGE metric returns plain floats
    # (already the aggregated F-measure), not objects with `.mid.fmeasure`.
    scores = rouge.compute(
        predictions=predictions,
        references=references,
        rouge_types=["rougeL"],
    )
    return float(scores["rougeL"])

In [None]:
def setup_training():
    """Start a W&B run and build the ``TrainingArguments`` for fine-tuning.

    Returns:
        TrainingArguments: 3 epochs, per-device batch size 2 with 4 gradient
        accumulation steps, learning rate 2e-5, logging every 10 steps,
        checkpoints under ``./results``.
    """
    # The W&B run is opened before the arguments are constructed.
    wandb.init(project="medical-reasoning-llama3")

    # 'optim', 'fp16/bf16', 'evaluation_strategy', 'eval_steps',
    # 'save_strategy', 'save_steps' and 'report_to' are intentionally left at
    # their library defaults (older versions may not accept them directly).
    hyperparameters = dict(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        logging_steps=10,
    )
    return TrainingArguments(**hyperparameters)


# NOTE(review): no cell visible in this notebook passes `training_args` to a
# trainer — confirm that a training step actually runs before the model is saved.
training_args = setup_training()


In [None]:
# Inspect the dataset's column names.
# NOTE(review): `df` is not assigned at module level in any cell shown in this
# notebook (it is only a local inside prepare_dataset) — presumably it came
# from a cell that was later removed; confirm before re-running.
print(df.columns)

Index(['Question', 'Complex_CoT', 'Response'], dtype='object')


In [None]:
# Repeat of the column check in the preceding cell; relies on the same
# module-level `df`.
print(df.columns)

Index(['Question', 'Complex_CoT', 'Response'], dtype='object')


In [None]:
def format_text(row):
    """Render one dataset row as the full training prompt.

    The row is read using the dataset's actual column names (``Question``,
    ``Complex_CoT``, ``Response``). The names this function used previously
    (``instruction``, ``rationale``, ``answer``) are still accepted as a
    fallback so existing callers keep working.

    Args:
        row: Mapping-like row (``dict`` or pandas ``Series``).

    Returns:
        str: Prompt containing the query, the ``<think>`` reasoning section
        and the ``<response>`` section.

    Raises:
        KeyError: If the row has none of the accepted names for a field.
    """
    def pick(*candidates):
        # Return the value stored under the first candidate key the row has.
        for key in candidates:
            if key in row:
                return row[key]
        raise KeyError(f"row has none of the expected columns: {candidates}")

    question = pick("Question", "instruction")
    reasoning = pick("Complex_CoT", "rationale")
    answer = pick("Response", "answer")

    return f"""Below is a medical query requiring step-by-step reasoning and a final response.

### Medical Query:
{question}

### Chain-of-Thought Reasoning:
<think>
{reasoning}
</think>

### Final Response:
<response>
{answer}
</response>"""

In [None]:
def save_model(output_dir="llama3-medical-reasoning",
               repo_id="sheri57/llama3-medical-reasoning"):
    """Save the LoRA adapter locally, write a model card, and push to the Hub.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        output_dir: Local directory passed to ``save_pretrained_merged``.
        repo_id: Hugging Face Hub repository (``"<user-or-org>/<name>"``) that
            the model and tokenizer are pushed to.
    """
    # Save model (save_method="lora" is requested, i.e. adapter weights)
    model.save_pretrained_merged(output_dir, tokenizer, save_method="lora")

    # Create model card
    model_card = """---
license: apache-2.0
tags:
- medical
- reasoning
- llama-3
---

# Medical Reasoning LLaMA 3 Adapter

Fine-tuned for medical chain-of-thought reasoning.
"""

    # Save files. The card is written to the current working directory; this
    # function does not upload it explicitly. Explicit encoding keeps the
    # file identical regardless of the platform's default.
    with open("README.md", "w", encoding="utf-8") as f:
        f.write(model_card)

    # Upload to Hub
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)

save_model()

# Clean up
wandb.finish()

Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... Done.


README.md:   0%|          | 0.00/574 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/sheri57/llama3-medical-reasoning


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
from huggingface_hub import HfApi

# Check if model was uploaded successfully
# model_info() is queried for the repository; the two prints report the id and
# last-modified timestamp it returns.
api = HfApi()
model_info = api.model_info("sheri57/llama3-medical-reasoning")
print(f"Model successfully uploaded at: {model_info.id}")
# NOTE(review): newer huggingface_hub releases name this attribute
# `last_modified`; confirm `lastModified` is still available in the installed version.
print(f"Last modified: {model_info.lastModified}")

Model successfully uploaded at: sheri57/llama3-medical-reasoning
Last modified: 2025-04-26 13:00:03+00:00


In [None]:
from unsloth import FastLanguageModel

# Load your fine-tuned model
# Rebinds the module-level `model` and `tokenizer` to the checkpoint pulled
# from the Hub repository, loaded in 4-bit.
# NOTE(review): no max_seq_length is passed here, unlike the training-time
# load — confirm the library default is acceptable for inference.
model, tokenizer = FastLanguageModel.from_pretrained(
    "sheri57/llama3-medical-reasoning",
    load_in_4bit=True,
)

# Test inference
def generate_response(question, max_new_tokens=256, temperature=0.7):
    """Generate chain-of-thought text for a medical question.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        question: The medical question inserted into the prompt template.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        str: The decoded sequence, i.e. the prompt followed by the generated
        continuation, with special tokens removed.
    """
    prompt = f"""Below is a medical query requiring step-by-step reasoning and a final response.

### Medical Query:
{question}

### Chain-of-Thought Reasoning:
<think>"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # temperature only has an effect when sampling is enabled, so enable
        # it explicitly instead of relying on the checkpoint's generation config.
        do_sample=True,
        temperature=temperature,
        # Same padding choice as get_medical_response.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a medical question
# The printed text includes the prompt itself, because generate_response
# decodes the whole output sequence.
print(generate_response("What are the early symptoms of diabetes?"))

==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Below is a medical query requiring step-by-step reasoning and a final response.

### Medical Query:
What are the early symptoms of diabetes?

### Chain-of-Thought Reasoning:
<think> What are the early symptoms of diabetes? </think>
<examine> What are the early symptoms of diabetes? </examine>
<reason> What are the early symptoms of diabetes? </reason>
<infer> What are the early symptoms of diabetes? </infer>
<deduce> What are the early symptoms of diabetes? </deduce>
<infer> What are the early symptoms of diabetes? </infer>
<deduce> What 

In [None]:
import os

# Make sure the training output directory exists. exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating.
os.makedirs("./results", exist_ok=True)

In [None]:
import datetime
import os
import shutil

# Create timestamped folder
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
backup_dir = f"./training_artifacts_{timestamp}"
# The folder has to exist before anything is placed in it; previously it was
# never created, so the README copy became an extension-less *file* with this
# name and the archive landed next to it as "<backup_dir>.zip".
os.makedirs(backup_dir, exist_ok=True)

# Save important files
# make_archive appends ".zip" to its base name -> <backup_dir>/results.zip
shutil.make_archive(os.path.join(backup_dir, "results"), 'zip', "./results")  # Training logs
shutil.copy("README.md", backup_dir)
print(f"Training artifacts saved to: {backup_dir}")

Training artifacts saved to: ./training_artifacts_20250426_130804


In [None]:
from huggingface_hub import HfApi

# Upload the contents of ./results to the model repository on the Hub.
# NOTE(review): ./results is the TrainingArguments output_dir, while the
# adapter itself is saved to "llama3-medical-reasoning" by save_model — confirm
# this is the folder you intend to upload.
api = HfApi()
api.upload_folder(
    folder_path="./results", # or wherever your model is saved
    repo_id="sheri57/llama3-medical-reasoning",
    repo_type="model"
)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/sheri57/llama3-medical-reasoning/commit/4177830c5acacc49033caff4de061dee6b4680a3', commit_message='Upload folder using huggingface_hub', commit_description='', oid='4177830c5acacc49033caff4de061dee6b4680a3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sheri57/llama3-medical-reasoning', endpoint='https://huggingface.co', repo_type='model', repo_id='sheri57/llama3-medical-reasoning'), pr_revision=None, pr_num=None)

In [None]:
# Make model public (if desired)
# `update_repo_visibility` is deprecated in recent huggingface_hub releases in
# favour of `update_repo_settings`; prefer the new call and fall back only
# when the installed version does not provide it.
try:
    from huggingface_hub import update_repo_settings
except ImportError:
    from huggingface_hub import update_repo_visibility
    update_repo_visibility("sheri57/llama3-medical-reasoning", private=False)
else:
    update_repo_settings("sheri57/llama3-medical-reasoning", private=False)

{'private': False}

In [None]:
def sanitize_input(text):
    """Normalize user text: trim surrounding whitespace and lowercase it.

    Returns ``None`` when ``text`` is not a string. This is only a very basic
    clean-up, not a security filter.
    """
    return text.strip().lower() if isinstance(text, str) else None

def get_medical_response(disease_name):
    """Ask the model for symptoms and treatment options for a disease.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        disease_name: Name of the disease; cleaned with ``sanitize_input``.

    Returns:
        str: The text following the "### Medical Query:" heading in the
        decoded output (the question, the reasoning heading and the generated
        continuation). If the input is invalid or generation fails, a
        human-readable message is returned instead of raising.
    """
    # 1. Sanitize input (None and empty strings are both rejected)
    clean_input = sanitize_input(disease_name)
    if not clean_input:
        return "Please provide a valid disease name"

    # 2. Format prompt
    prompt = f"""Below is a medical query requiring step-by-step reasoning and a final response.

### Medical Query:
What are the symptoms and treatment options for {clean_input}?

### Chain-of-Thought Reasoning:
<think>"""

    try:
        # 3. Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")

        # 4. Generate
        outputs = model.generate(
            **inputs,
            max_new_tokens=250,
            # temperature/top_p only have an effect when sampling is enabled,
            # so enable it explicitly instead of relying on the checkpoint's
            # generation config.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

        # 5. Decode
        full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # 6. Extract relevant part
        return full_response.split("### Medical Query:")[-1]

    except Exception as e:
        # Deliberately broad: callers receive a message rather than a
        # traceback. Note this also reports set-up problems (e.g. a missing
        # `tokenizer`) as ordinary text.
        return f"Error processing {clean_input}: {str(e)}"

# Test it
# The recorded output for this cell shows "name 'tokenizer' is not defined":
# the module-level `model`/`tokenizer` must exist when these calls run.
print(get_medical_response("malaria"))
print(get_medical_response("type 2 diabetes"))

Error processing malaria: name 'tokenizer' is not defined
Error processing type 2 diabetes: name 'tokenizer' is not defined


In [None]:
# Free up GPU memory
import gc
del model
del tokenizer
# Collect garbage first so objects that are only reachable through reference
# cycles actually release their tensors; only then can empty_cache() hand the
# freed cached blocks back to the GPU.
gc.collect()
torch.cuda.empty_cache()

49270

In [None]:
import gradio as gr

def medical_assistant(question):
    """Return canned advice for the first known keyword found in the question.

    Matching is case-insensitive. Keywords are checked in a fixed order
    (fever, headache, diabetes); if none is present a generic referral
    message is returned.
    """
    lowered = question.lower()  # so "Fever", "fever" and "FEVER" all match

    # Order matters: the first keyword found decides the answer.
    canned_advice = (
        ("fever", "For fever, stay hydrated, rest, and consult a doctor if the fever persists."),
        ("headache", "For headaches, drink water, rest in a dark room, and consider over-the-counter pain relief if needed."),
        ("diabetes", "Diabetes management includes healthy eating, regular exercise, and monitoring blood sugar levels."),
    )
    for keyword, advice in canned_advice:
        if keyword in lowered:
            return advice

    return "I'm not sure about that. Please consult a qualified medical professional for accurate advice."

def safe_medical_assistant(question):
    """Call ``medical_assistant`` and turn any exception into a message string.

    Intended as the UI callback, so a failure is shown as text instead of
    propagating.
    """
    try:
        answer = medical_assistant(question)
    except Exception as exc:
        answer = f"Error occurred: {str(exc)}"
    return answer

# Text-in / text-out Gradio UI backed by the keyword-based assistant (not the
# fine-tuned model). In this notebook's recorded run, launch() exposed a
# public share link.
interface = gr.Interface(fn=safe_medical_assistant, inputs="text", outputs="text")
interface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7ae2963c2566f9024f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
!pip install gradio
import gradio as gr

def answer_question(question):
    # Gradio callback: forwards the question to generate_response and returns
    # its text unchanged.
    # NOTE(review): generate_response reads the module-level `model` and
    # `tokenizer`, which an earlier cell deletes (`del model` / `del tokenizer`)
    # — confirm they are loaded again before launching this UI.
    return generate_response(question)

# Gradio UI for the fine-tuned model: a 3-line textbox as input, plain text as
# output, served through answer_question.
iface = gr.Interface(
    fn=answer_question,
    inputs=gr.Textbox(lines=3, placeholder="Enter medical question..."),
    outputs="text",
    title="Medical Reasoning Assistant"
)
iface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://eb139f7aed43dfe7dc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


