In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
!pip install -q wandb rouge-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [3]:
# Login to Weights & Biases
import wandb, os
os.environ["WANDB_API_KEY"] = "b6c3453db684ba9f6b28cfa8da346cf14a8b759a"  # Replace with your key
wandb.init(project="llama3-medical-cot")

# Check GPU
import torch
print("GPU Available:", torch.cuda.is_available())

[34m[1mwandb[0m: Currently logged in as: [33msamad995542[0m ([33msamad995542-codealpha[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU Available: True


In [4]:
from datasets import load_dataset, Dataset
import pandas as pd
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
from datasets import load_dataset, Dataset
import pandas as pd

# Load dataset from Hugging Face
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en")
df = pd.DataFrame(dataset['train'])

# Verify dataset columns
print("Columns in dataset:", df.columns)
print("Sample data:", df[['Question', 'Complex_CoT', 'Response']].head(2))

# Format with Question, <think>, and <response> tags
df["formatted"] = df.apply(
    lambda row: f"Question: {row['Question']}\n<think>{row['Complex_CoT']}</think><response>{row['Response']}</response>",
    axis=1
)

# Handle null or non-string values
df["formatted"] = df["formatted"].astype(str).fillna("")

# Split into train and validation sets
val_df = df.sample(n=100, random_state=42)
train_df = df.drop(val_df.index)

# Convert to Dataset objects with "text" column
train_dataset = Dataset.from_dict({"text": train_df["formatted"].tolist()})
eval_dataset = Dataset.from_dict({"text": val_df["formatted"].tolist()})

# Verify dataset structure
print("Train dataset type:", type(train_dataset))
print("Train dataset columns:", train_dataset.column_names)
print("Train dataset sample:", train_dataset[0])
print("Eval dataset type:", type(eval_dataset))
print("Eval dataset columns:", eval_dataset.column_names)
print("Eval dataset sample:", eval_dataset[0])

README.md:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

Columns in dataset: Index(['Question', 'Complex_CoT', 'Response'], dtype='object')
Sample data:                                             Question  \
0  Given the symptoms of sudden weakness in the l...   
1  A 33-year-old woman is brought to the emergenc...   

                                         Complex_CoT  \
0  Okay, let's see what's going on here. We've go...   
1  Okay, let's figure out what's going on here. A...   

                                            Response  
0  The specific cardiac abnormality most likely t...  
1  In this scenario, the most likely anatomical s...  
Train dataset type: <class 'datasets.arrow_dataset.Dataset'>
Train dataset columns: ['text']
Train dataset sample: {'text': "Question: Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings

In [6]:
from unsloth import FastLanguageModel

# Load the model in 4-bit
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Apply LoRA adaptation (fixed version without `task_type`)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)


==((====))==  Unsloth 2025.5.8: Fast Llama patching. Transformers: 4.52.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.5.8 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
import os

# Critical for T4 GPU stability
os.environ["TRITON_DISABLE"] = "1"  # Disables problematic Triton compilation
os.environ["UNSLOTH_DISABLE_FLASH"] = "1"  # Disables flash attention

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # This enables evaluation
    dataset_text_field="text",
    max_seq_length=1024,  # Reduced from 2048 for T4 memory constraints
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        pad_to_multiple_of=8  # Helps with memory alignment
    ),
    dataset_num_proc=2,
    packing=False,  # Required for stability with Unsloth
    args=TrainingArguments(
        output_dir="outputs",
        per_device_train_batch_size=1,  # Reduced from 2 for T4
        gradient_accumulation_steps=8,
        warmup_steps=10,
        max_steps=200,
        learning_rate=1e-5,
        fp16=True,  # Force fp16 since T4 doesn't support bf16
        logging_steps=10,
        optim="adamw_8bit",  # Better for T4 memory
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        save_total_limit=2,
        eval_steps=50,  # Evaluation every 50 steps
        report_to="wandb" if "wandb" in globals() else "none",
        ddp_find_unused_parameters=False,
    ),
)

# Start training
trainer.train()

Unsloth: Tokenizing ["text"]:   0%|          | 0/19604 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 19,604 | Num Epochs = 1 | Total steps = 200
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856/3,000,000,000 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.8176
20,1.7934
30,1.7639
40,1.744
50,1.7527
60,1.7167
70,1.6716
80,1.6737
90,1.6494
100,1.6771


TrainOutput(global_step=200, training_loss=1.670088291168213, metrics={'train_runtime': 2273.9929, 'train_samples_per_second': 0.704, 'train_steps_per_second': 0.088, 'total_flos': 1.7016130970099712e+16, 'train_loss': 1.670088291168213})

In [16]:
# Clear GPU memory
torch.cuda.empty_cache()

# Ensure inference mode
model = FastLanguageModel.for_inference(model)

# Prepare inputs (aligned with training format)
question = "A 56-year-old patient presents with sudden chest pain radiating to the left arm. What is the most likely diagnosis?"
prompt = f"Question: {question}\n<think>"

# Tokenize
inputs = tokenizer(
    prompt,
    return_tensors="pt",
).to("cuda")

# Generate response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,  # Increased to allow full reasoning and answer
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nModel Answer:\n")
if "<response>" in response and "</response>" in response:
    print(response)
elif "<response>" in response:
    print(response + "</response>")
    print("\nWarning: Response may be incomplete (missing </response>). Consider increasing max_new_tokens.")
else:
    print(response)
    print("\nError: No <response> tag found. Response is incomplete. Consider increasing max_new_tokens or checking model training.")

# Clear GPU memory
torch.cuda.empty_cache()


Model Answer:

Question: A 56-year-old patient presents with sudden chest pain radiating to the left arm. What is the most likely diagnosis?
<think>Alright, let's break this down. This patient is 56 years old, and they're coming in with sudden chest pain that radiates to the left arm. Now, we need to think about what this could mean. 

First, the chest pain is a big clue. It's sudden, so it's probably not just a minor annoyance or something like indigestion. Sudden chest pain usually points towards something serious, like a heart issue. 

Radiating pain to the left arm is also a common symptom for a specific condition. It makes me think of angina pectoris, where the pain is usually in the chest, but it can also spread to the left arm because of the way the heart's nerves work.

Now, angina pectoris is usually caused by blockages in the coronary arteries, which supply blood to the heart. These blockages are often due to atherosclerosis, which is like plaque buildup in the arteries. It'

In [9]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

# Dummy predictions and references for demonstration
preds = val_df["formatted"].iloc[:5].tolist()
refs = val_df["formatted"].iloc[:5].tolist()  # Replace with actual model output later

# Compute average ROUGE-L
scores = [scorer.score(p, r)["rougeL"].fmeasure for p, r in zip(preds, refs)]
print("Average ROUGE-L:", sum(scores) / len(scores))


Average ROUGE-L: 1.0


In [10]:
model.save_pretrained("lora_adapter")
tokenizer.save_pretrained("tokenizer")

# Upload to Hugging Face (manual step, or use CLI if token added)
!huggingface-cli login  # Paste your token here when prompted

!huggingface-cli upload lora_adapter --repo-id abdulsamad99/medical-fine-tuning --type model
!huggingface-cli upload tokenizer --repo-id abdulsamad99/medical-fine-tuning --type model


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `medical fine tuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is:

In [11]:
from huggingface_hub import HfApi, ModelCard

# Replace with your actual Hugging Face token
api = HfApi(token="enter your token of huggingface")

# 1. Create a repository (if it doesn't exist)
api.create_repo(
    repo_id="abdulsamad99/medical-fine-tuning",  # No spaces!
    repo_type="model",
    exist_ok=True
)

# 2. Save your model and tokenizer locally first
model.save_pretrained("medical_model")
tokenizer.save_pretrained("medical_model")

# 3. Upload to Hub
api.upload_folder(
    folder_path="medical_model",
    repo_id="abdulsamad99/medical-fine-tuning",
    repo_type="model"
)

# 4. Create a model card (optional but recommended)
card = ModelCard.load("medical_model/README.md")  # Create this file first
card.push_to_hub("abdulsamad99/medical-fine-tuning")

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/abdulsamad99/medical-fine-tuning/commit/b33f7bfe94989a23c8d9b2650ee03cd77dafdc1d', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='b33f7bfe94989a23c8d9b2650ee03cd77dafdc1d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/abdulsamad99/medical-fine-tuning', endpoint='https://huggingface.co', repo_type='model', repo_id='abdulsamad99/medical-fine-tuning'), pr_revision=None, pr_num=None)

In [12]:
# Import unsloth first to apply optimizations
from unsloth import FastLanguageModel
import torch
from transformers import AutoTokenizer

# Clear GPU memory
torch.cuda.empty_cache()

# Check GPU
print("GPU Available:", torch.cuda.is_available())
print("GPU Name:", torch.cuda.get_device_name(0))

# Model configuration
model_id = "abdulsamad99/medical-fine-tuning"
max_seq_length = 512  # Reduced for memory

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model in 4-bit
model, _ = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
)

# Enable inference mode for memory efficiency
model = FastLanguageModel.for_inference(model)

# Move to GPU
model = model.to("cuda")

# Inference prompt (aligned with training format)
question = "A 65-year-old woman presents with slurred speech and weakness on one side of her body. What is the most likely diagnosis?"
prompt = f"Question: {question}\n<think>"

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate
with torch.no_grad():  # Reduce memory usage
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,  # Increased slightly to allow reasoning + answer
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nModel Answer:\n")
if "<response>" in decoded:
    print(decoded)
else:
    print(decoded + "</response>")

# Clear GPU memory
torch.cuda.empty_cache()

GPU Available: True
GPU Name: Tesla T4


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/3.83k [00:00<?, ?B/s]

==((====))==  Unsloth 2025.5.8: Fast Llama patching. Transformers: 4.52.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]


Model Answer:

Question: A 65-year-old woman presents with slurred speech and weakness on one side of her body. What is the most likely diagnosis?
<think>Okay, let's think about this case. We have a 65-year-old woman who's experiencing slurred speech and weakness on one side of her body. That sounds like a classic stroke scenario. The symptoms of slurred speech and unilateral weakness are very typical of an ischemic stroke.

Now, I want to consider the possible causes. There are a couple of possibilities here. The first one is an ischemic stroke. This could be due to a blood clot in a cerebral artery, which might</response>
