In [None]:
# Step 1: Setup Environment and Install Required Libraries
# Prerequisite: add two Colab secrets named HF_TOKEN and WANDB_API_TOKEN (Colab sidebar);
# later cells read them through google.colab.userdata.
# Install the basic Unsloth package from PyPI
# Unsloth helps in faster and memory-efficient fine-tuning of large language models (LLMs).
!pip install unsloth

# Force-reinstall the latest Unsloth version directly from GitHub
# - '--force-reinstall' replaces the copy installed by the command above.
# - '--no-cache-dir' prevents using any locally cached packages.
# - '--no-deps' leaves the dependencies that are already installed untouched.
# NOTE(review): neither install is version-pinned, so a later re-run may pull a different Unsloth build.
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git


Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-aqo27lif
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-aqo27lif
  Resolved https://github.com/unslothai/unsloth.git to commit 7a8f99e1890213cdd01a3ab6c3e13174a96e8220
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.4.1-py3-none-any.whl size=192657 sha256=b7e4393a33bbf608fcf8eda13c83a175028133614e249f39351e1793e359bb03
  Stored in directory: /tmp/pip-ephem-wheel-cache-hlakti8e/wheels/d1/17/05/850ab10c33284a4763b0595cd8ea9d01fce6e221cac24b3c01
Successfully built unsloth
Installing collected packages: unsloth
 

In [None]:
# Step 3: Import Necessary Libraries

# Import FastLanguageModel from Unsloth
# - Fast optimized wrapper for loading and handling LLMs with 4-bit quantization and flash attention.
from unsloth import FastLanguageModel

# Import torch
# - Core deep learning library for handling tensors and model operations.
import torch

# Import SFTTrainer
# - Trainer from TRL (Transformer Reinforcement Learning) for supervised fine-tuning (SFT) tasks.
from trl import SFTTrainer

# Import utility function to check if bfloat16 is supported on current hardware (for mixed precision training).
from unsloth import is_bfloat16_supported

# Import Hugging Face Hub login utility
# - Needed to authenticate and access models from HuggingFace Hub.
from huggingface_hub import login

# Import TrainingArguments
# - Class to define hyperparameters and settings for training a transformer model.
from transformers import TrainingArguments

# Import load_dataset
# - Used to load datasets from HuggingFace Datasets library (local or online).
from datasets import load_dataset

# Import Weights and Biases (wandb)
# - For experiment tracking, live visualization of training metrics (optional but useful).
import wandb


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Step 4: Authenticate with the Hugging Face Hub

# Colab's `userdata` module gives access to the secrets stored for this notebook.
from google.colab import userdata

# Read the secret named 'HF_TOKEN' (it has to be added to the Colab secrets beforehand).
hf_token = userdata.get("HF_TOKEN")

# Log this session in to the Hub with that token; the model-loading cell reuses `hf_token`.
login(token=hf_token)


In [None]:
# Optional: Check GPU Availability

# torch is imported again here so this cell can also be run on its own.
import torch

# Ask once whether CUDA is usable and reuse the answer for both report lines.
has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)

# Name of device 0 when a GPU is present (e.g. "Tesla T4"), otherwise the text "No GPU".
gpu_name = torch.cuda.get_device_name(0) if has_cuda else "No GPU"
print("GPU device:", gpu_name)


CUDA available: True
GPU device: Tesla T4


In [None]:
# Step 5: Load the Pretrained DeepSeek-R1 Distilled Model for Fine-Tuning

# Hugging Face repo id of the checkpoint (DeepSeek-R1 distilled into Llama 8B).
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Load the weights in 4-bit precision to cut GPU memory use.
load_in_4bit = True

# None leaves the choice of dtype (float16 / bfloat16) to the loader.
dtype = None

# Token budget per sequence; the trainer cell passes the same value as its max_seq_length.
max_sequence_length = 2048

# Unsloth returns the model together with its matching tokenizer.
# The Hub token obtained in the login cell is handed over for the download.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_sequence_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)


==((====))==  Unsloth 2025.4.1: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Step 6: Setup the Inference Prompt Template

# Plain-text prompt template used by the generation cells in this notebook.
# - It is an ordinary string with two positional `{}` slots that are filled with `str.format`:
#   1st slot = the question, 2nd slot = text placed directly after the opening <think> tag
#   (the generation cells pass "" so the model writes the reasoning itself).
# - The "### Answer:" heading doubles as the marker the post-processing cells split on.
# - The literal begins and ends with a newline because of where the triple quotes sit.
prompt_style = """
Below is a task description along with additional context provided in the input section. Your goal is to provide a well-reasoned response that effectively addresses the request.

Before crafting your answer, take a moment to carefully analyze the question. Develop a clear, step-by-step thought process to ensure your response is both logical and accurate.

### Task:
You are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.

### Query:
{}

### Answer:
<think>{}
"""


In [None]:
# Step 7: Run Inference on the Base Model (Before Fine-Tuning)

# Define a test question for the model
# - Medical diagnostic question used as a baseline; the same question is asked again after fine-tuning.
# - The literal spans three lines, so the embedded newlines and indentation are part of the prompt.
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or
              sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

# Switch the freshly loaded base model to Unsloth's inference mode
# - At this point no LoRA adapters have been attached and no training has happened.
FastLanguageModel.for_inference(model)

# Tokenize the input
# - Fills `prompt_style` with the question and an empty <think> seed.
# - Wraps the prompt in a one-element list and asks for PyTorch tensors.
# - Moves input tensors to GPU ("cuda").
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

# Generate a response
# - `max_new_tokens=1200` caps the generated continuation at 1200 tokens.
# - `use_cache=True` is passed through to generate().
outputs = model.generate(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True
)

# Decode the output tokens back into text
# - batch_decode returns a list with one string per sequence; special tokens are kept in the text.
response = tokenizer.batch_decode(outputs)

# Print the decoded list (prompt plus generated continuation)
print(response)


["<｜begin▁of▁sentence｜>\nBelow is a task description along with additional context provided in the input section. Your goal is to provide a well-reasoned response that effectively addresses the request.\n\nBefore crafting your answer, take a moment to carefully analyze the question. Develop a clear, step-by-step thought process to ensure your response is both logical and accurate.\n\n### Task:\nYou are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.\n\n### Query:\nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or\n              sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,\n              what would cystometry most likely reveal about her residual volume and detrusor contractions?\n\n### Answer:\n<think>\nOkay, so I'm trying to figure out what cystometry would show for this

In [None]:
# Post-processing: Extract and Print Only the Model's Answer

# - The decoded output contains the whole prompt followed by the generated text.
# - Split once at the "### Answer:" marker from the prompt template and keep what follows.
# - maxsplit=1 matters: without it, a second "### Answer:" emitted by the model itself
#   would cut the printed answer short.

print(response[0].split("### Answer:", 1)[1])



<think>
Okay, so I'm trying to figure out what cystometry would show for this 61-year-old woman. Let me start by breaking down the information given. She has a history of involuntary urine loss when she coughs or sneezes, but she doesn't leak at night. That makes me think about possible causes for her symptoms.

First, the Q-tip test was done during her gynecological exam. I remember that the Q-tip test is used to check for urethral obstruction. The provider inserts a Q-tip catheter into the urethra and measures the pressure. If the pressure is high, it suggests that the urethral opening is narrow, leading to difficulty in voiding and possibly causing urinary retention or other symptoms.

Now, considering her symptoms—involuntary leakage during activities like coughing or sneezing—this is classic for stress urinary incontinence (SUI). SUI typically occurs due to a weak pelvic floor muscle, which can't effectively support the urethra during activities that increase abdominal pressure. 

In [None]:
# Step 8: Setup Fine-Tuning

# Load the Medical Reasoning Dataset
# - Dataset repo: 'FreedomIntelligence/medical-o1-reasoning-SFT' on the Hugging Face Hub.
# - "en" selects the English configuration.
# - 'train[:500]' keeps only the first 500 training examples for a quick run.
# - trust_remote_code is intentionally not passed: the repo is served as plain data files,
#   so there is no reason to permit execution of repository-side Python code
#   (the option is also deprecated in recent `datasets` releases).
medical_dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split="train[:500]",
)


Generating train split:   0%|          | 0/19704 [00:00<?, ? examples/s]

In [None]:
# View a Single Example from the Dataset

# - Displays the item at index 1 (the second example) of the loaded subset.
# - The record is a dict; the preprocessing function further down reads its
#   'Question', 'Complex_CoT' and 'Response' fields.
# - Worth checking before fine-tuning so the prompt formatting matches the data.

medical_dataset[1]


{'Question': 'A 33-year-old woman is brought to the emergency department 15 minutes after being stabbed in the chest with a screwdriver. Given her vital signs of pulse 110/min, respirations 22/min, and blood pressure 90/65 mm Hg, along with the presence of a 5-cm deep stab wound at the upper border of the 8th rib in the left midaxillary line, which anatomical structure in her chest is most likely to be injured?',
 'Complex_CoT': "Okay, let's figure out what's going on here. A woman comes in with a stab wound from a screwdriver. It's in her chest, upper border of the 8th rib, left side, kind of around the midaxillary line. First thought, that's pretty close to where the lung sits, right?\n\nLet's talk about location first. This spot is along the left side of her body. Above the 8th rib, like that, is where a lot of important stuff lives, like the bottom part of the left lung, possibly the diaphragm too, especially considering how deep the screwdriver went.\n\nThe wound is 5 cm deep. Tha

In [None]:
# Define End-of-Sequence (EOS) Token

# - The EOS (End Of Sequence) token is read from the tokenizer that was loaded with the model.
# - The preprocessing function appends it to every formatted training example,
#   so each example ends with an explicit end-of-sequence marker.
# - Its spelling is tokenizer-specific, which is why it is looked up instead of hard-coded.

EOS_TOKEN = tokenizer.eos_token

# Display the EOS token to confirm what symbol it uses
EOS_TOKEN


'<｜end▁of▁sentence｜>'

In [None]:
# Step: Define the Training Prompt Style (Used Only to Build Training Examples)

# - Template for the fine-tuning texts; it has three positional `{}` slots filled with `str.format`:
#   1st = question, 2nd = chain of thought (placed between <think> and </think>), 3rd = final response.
# - The closing </think> tag marks where the reasoning ends and the final answer begins.
# - Unlike the inference template `prompt_style`, this one uses the headings
#   "### Instruction:", "### Question:" and "### Response:".

train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""


In [None]:
# Step: Prepare the Data for Fine-Tuning

# Define a function to preprocess input data from the dataset
def preprocess_input_data(examples):
    """Turn one batch of dataset columns into formatted training strings.

    Args:
        examples: Batch mapping from which the "Question", "Complex_CoT" and
            "Response" entries are read (presumably parallel lists of strings,
            as supplied by a batched ``map`` call).

    Returns:
        A dict with the single key "texts", holding one string per example:
        ``train_prompt_style`` filled with (question, chain of thought,
        response) and terminated with ``EOS_TOKEN``.
    """
    # Walk the three columns in lockstep; zip stops at the shortest one.
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])

    formatted = [
        train_prompt_style.format(question, reasoning, answer) + EOS_TOKEN
        for question, reasoning, answer in rows
    ]

    # "texts" is the column name the trainer is pointed at later on.
    return {"texts": formatted}


In [None]:
# Step: Apply Preprocessing to the Dataset

# Run the formatter over the whole loaded subset.
# - With batched=True the function receives batches of examples rather than single rows.
# - The resulting dataset gains a "texts" column holding the formatted training strings.

finetune_dataset = medical_dataset.map(
    function=preprocess_input_data,
    batched=True,
)


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Step: View a Preprocessed Example

# Show the formatted training string of the first example (row 0, column "texts").
# - Row-first indexing fetches just that one record.
# - Check here that question, reasoning and response landed in the right slots of the template.

finetune_dataset[0]["texts"]


"Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.\nPlease answer the following medical question.\n\n### Question:\nGiven the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?\n\n### Response:\n<think>\nOkay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg is sw

In [None]:
# Step 9: Setup and Apply LoRA Fine-Tuning to the Model

# Attach LoRA adapters to the base model through Unsloth's FastLanguageModel helper.
# The returned object (`model_lora`) is what the trainer and the post-training cells use.
model_lora = FastLanguageModel.get_peft_model(
    model = model,  # Base model loaded earlier
    r = 16,  # LoRA rank: inner dimension of the low-rank adapter matrices

    # Modules that receive LoRA adapters
    # - Attention projections (q_proj, k_proj, v_proj, o_proj)
    # - Feedforward network projections (gate_proj, up_proj, down_proj)
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],

    lora_alpha = 16,  # LoRA scaling factor (equal to the rank here)
    lora_dropout = 0,  # No dropout inside the adapters
    bias = "none",  # Bias terms are not trained
    use_gradient_checkpointing = "unsloth",  # Unsloth's gradient checkpointing mode, to save memory
    random_state = 3407,  # Same seed as TrainingArguments(seed = 3407) in the trainer cell (was 3047, a transposed digit)
    use_rslora = False,  # Plain LoRA scaling, no rank-stabilized variant
    loftq_config = None  # No LoftQ initialization
)


Unsloth 2025.4.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
# Important: Clean Up Model Before Creating Trainer

# - Looks for an attribute named '_unwrapped_old_generate' on the base `model` object
#   and deletes it if present; otherwise this cell does nothing.
# - NOTE(review): presumably the attribute is left behind by the earlier
#   FastLanguageModel.for_inference(model) call and can clash with the trainer —
#   confirm against the installed Unsloth version.
# - The check targets `model` (the object loaded in Step 5), not `model_lora`.

if hasattr(model, '_unwrapped_old_generate'):
    del model._unwrapped_old_generate


In [None]:
# Step: Setup the Trainer for Fine-Tuning

# Probe bf16 support once; exactly one of fp16 / bf16 ends up enabled below.
bf16_ok = is_bfloat16_supported()

# Hyperparameters for the run.
training_args = TrainingArguments(
    output_dir="outputs",             # Directory for checkpoints and logs
    seed=3407,                        # Random seed for reproducibility
    per_device_train_batch_size=2,    # 2 examples per device per forward/backward pass
    gradient_accumulation_steps=4,    # 2 x 4 = effective batch size of 8
    max_steps=60,                     # Stop after 60 optimizer steps
    num_train_epochs=1,               # Superseded by the positive max_steps above (60 steps x 8 < 500 examples)
    learning_rate=2e-4,               # Initial learning rate (0.0002)
    warmup_steps=5,                   # Warmup steps for the learning-rate scheduler
    lr_scheduler_type="linear",       # Linear learning-rate decay
    optim="adamw_8bit",               # 8-bit AdamW optimizer
    weight_decay=0.01,                # Weight-decay regularization
    fp16=not bf16_ok,                 # fp16 when bf16 is unavailable
    bf16=bf16_ok,                     # bf16 when the hardware supports it
    logging_steps=10,                 # Log metrics every 10 steps
)

# Supervised fine-tuning trainer wired to the LoRA model and the formatted dataset.
trainer = SFTTrainer(
    model=model_lora,                    # Model with LoRA adapters applied
    tokenizer=tokenizer,                 # Tokenizer loaded together with the base model
    train_dataset=finetune_dataset,      # Dataset carrying the "texts" column
    dataset_text_field="texts",          # Column that holds the training strings
    max_seq_length=max_sequence_length,  # Same sequence limit used when loading the model
    dataset_num_proc=1,                  # Single process for dataset preparation
    args=training_args,
)


Unsloth: Tokenizing ["texts"]:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Step: Setup Weights and Biases (WandB) for Experiment Tracking

# Colab secrets accessor (imported again so the cell also works on its own).
from google.colab import userdata

# Read the secret named 'WANDB_API_TOKEN' from the Colab secrets.
wnb_token = userdata.get("WANDB_API_TOKEN")

# Authenticate the wandb client with that key.
wandb.login(key=wnb_token)

# Open the run that will receive the training metrics.
# NOTE(review): anonymous="allow" presumably uses the logged-in account when there is one and
# only falls back to an anonymous run otherwise — confirm against the installed wandb version.
run = wandb.init(
    job_type="training",                                     # Label this run as a "training" job
    project="Fine-tune-DeepSeek-R1-on-Medical-CoT-Dataset",  # Project name in the WandB account
    anonymous="allow",
)


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msheryar-malik[0m ([33msheryar-malik-ayass-bioscience[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Step: Start the Fine-Tuning Process

# Run the training loop configured on the SFTTrainer above.
# - The value returned by train() is kept in `trainer_stats` for later inspection.
# - NOTE(review): metrics show up in the WandB run opened earlier; since no `report_to` is set,
#   this presumably relies on the Trainer's default reporting integrations — confirm.

trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.9312
20,1.4245
30,1.4067
40,1.365
50,1.3914
60,1.3694


In [None]:
# Step: Finish and Close the WandB Run

# - Ends the active Weights and Biases run that was started with wandb.init().
# - Call it once training is done so the run is marked as finished;
#   the run-history and run-summary tables shown below are printed by this call.

wandb.finish()


0,1
train/epoch,▁▂▄▅▇██
train/global_step,▁▂▄▅▇██
train/grad_norm,█▂▁▁▂▁
train/learning_rate,█▇▅▄▂▁
train/loss,█▂▂▁▁▁

0,1
total_flos,1.673729227431936e+16
train/epoch,0.96
train/global_step,60.0
train/grad_norm,0.25691
train/learning_rate,0.0
train/loss,1.3694
train_loss,1.48138
train_runtime,1064.0712
train_samples_per_second,0.451
train_steps_per_second,0.056


In [None]:
# Step 10: Testing the Model After Fine-Tuning

# Define a sample medical question to test the fine-tuned model
# - Same clinical scenario as the baseline run, so the two answers can be compared.
# - The line breaks sit at different positions than in the baseline literal, so the prompt text is not byte-identical.
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing
              but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

# Switch the LoRA model to Unsloth's inference mode
FastLanguageModel.for_inference(model_lora)

# Tokenize the test input
# - Formats the question with the inference template `prompt_style` (NOT `train_prompt_style`).
# - NOTE(review): `prompt_style` uses "### Task / ### Query / ### Answer" while the training texts were
#   built with "### Instruction / ### Question / ### Response" — confirm this mismatch is intended.
# - Converts the formatted text into token IDs and moves the tensors to GPU ("cuda").
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

# Generate a response from the model
# - 'max_new_tokens=1200' limits the maximum number of tokens the model can generate.
# - 'use_cache=True' is passed through to generate().
outputs = model_lora.generate(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True
)

# Decode the generated tokens back into text (a list with one string per sequence)
response = tokenizer.batch_decode(outputs)

# Print the decoded list (prompt plus generated continuation)
print(response)


["<｜begin▁of▁sentence｜>\nBelow is a task description along with additional context provided in the input section. Your goal is to provide a well-reasoned response that effectively addresses the request.\n\nBefore crafting your answer, take a moment to carefully analyze the question. Develop a clear, step-by-step thought process to ensure your response is both logical and accurate.\n\n### Task:\nYou are a medical expert specializing in clinical reasoning, diagnostics, and treatment planning. Answer the medical question below using your advanced knowledge.\n\n### Query:\nA 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing\n              but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,\n              what would cystometry most likely reveal about her residual volume and detrusor contractions?\n\n### Answer:\n<think>\nAlright, let's think about this. This woman is 61 and has been dealin

In [None]:
# Keep only the text after the first "### Answer:" marker; splitting once (maxsplit=1)
# ensures a repeated marker inside the generated text cannot truncate the printed answer.
print(response[0].split("### Answer:", 1)[1])


<think>
Alright, let's think about this. This woman is 61 and has been dealing with involuntary urine loss for a long time, especially when she coughs or sneezes. That's a classic sign of urinary incontinence, probably due to an overactive bladder. 

Now, she's done a Q-tip test, which is often used to check for urethral obstruction. If the Q-tip test is negative, it usually means there's no obstruction there. That's good news because if there were a blockage, we might need to do something about it.

Given that she's not leaking at night, it suggests that her bladder capacity isn't being exceeded during sleep. This is a good sign because it means her bladder can hold up to 500 ml without leaking. So, if we look at her bladder, it should be able to hold that volume comfortably.

Now, let's think about what cystometry might show. In cystometry, we measure how much her bladder can hold and how it reacts when we add pressure or stimulation. If her bladder is functioning well and she doesn

In [None]:
# Step: Test the Fine-Tuned Model with New Medical Questions

# Define a new prompt containing two related questions
# - Both ask about signaling pathways that are relevant to cancer biology.
question = """Describe the PI3K-AKT pathway and its role in cancer.
What genes are involved in the MAPK signaling pathway?"""

# Switch the LoRA model to Unsloth's inference mode
FastLanguageModel.for_inference(model_lora)

# Tokenize the input
# - Formats the question with the inference template `prompt_style` (not the training template).
# - Converts the formatted text into token IDs.
# - Moves the input tensors to the GPU ("cuda").
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

# Generate a response from the model
# - 'max_new_tokens=1200' allows up to 1200 new tokens.
# - 'use_cache=True' is passed through to generate().
outputs = model_lora.generate(
    input_ids = inputs.input_ids,
    attention_mask = inputs.attention_mask,
    max_new_tokens = 1200,
    use_cache = True
)

# Decode the generated output tokens back into text (a list with one string per sequence)
response = tokenizer.batch_decode(outputs)

# Post-process the output
# - Split once at the "### Answer:" marker and keep everything after it.
# - maxsplit=1 prevents a repeated marker in the generated text from truncating the answer.
print(response[0].split("### Answer:", 1)[1])



<think>
Okay, so I'm trying to understand this PI3K-AKT pathway and its role in cancer. Let's start by breaking down what PI3K and AKT are. PI3K stands for phosphatidylinositol 3-kinase. It sounds a bit complicated, but I remember that it's part of this signaling pathway that's really important for cell growth and survival. When PI3K is active, it basically takes a lipid called phosphatidylinositol and turns it into phosphatidylinositol 3-phosphate. This is like a signal that tells the cell to grow and survive.

Now, what does AKT do? AKT, which is also known as protein kinase B, is a key player in this pathway. It takes the signal from PI3K and modifies it in a way that tells the cell to live longer. AKT does this by phosphorylating other proteins, which in turn affects their activity. For example, it can turn on certain survival signals that keep the cell growing and prevent it from undergoing apoptosis, which is programmed cell death.

I've heard that mutations in the PI3K-AKT path

In [None]:
# Save the fine-tuned model and the tokenizer side by side in ./final_model.
# - `model_lora` is the PEFT-wrapped model that was trained and evaluated above, so the save
#   goes through it rather than through the base `model` object.
# - The tokenizer is written to the same folder so the directory can be reloaded as a unit.
model_lora.save_pretrained("final_model")
tokenizer.save_pretrained("final_model")


In [None]:
# Pack the saved ./final_model directory (recursively) into final_model.zip.
!zip -r final_model.zip final_model


In [None]:
# Hand final_model.zip to Colab's file helper so it is downloaded from the runtime.
from google.colab import files
files.download('final_model.zip')
