In [4]:
# %%capture
# %pip install -U bitsandbytes
# %pip install -U transformers
# %pip install -U peft
# %pip install -U accelerate
# !pip install -U trl 


### Notebook 1: Fine-Tuning a Medical Text Simplification Model
 This notebook sets up the environment for fine-tuning a pre-trained language model with LoRA
 (Low-Rank Adaptation) for a medical text simplification task. The steps include loading the dataset,
 configuring the base model, applying efficient fine-tuning with adapters, setting training hyperparameters,
 and finally saving and testing the model.

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from datasets import load_dataset
from trl import SFTTrainer
import pandas as pd
from datasets import Dataset

os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
# secret_wandb = user_secrets.get_secret("wandb")

In [3]:
# !huggingface-cli login --token $secret_hf

In [4]:
# Monitering the LLM
# wandb.login(key = secret_wandb)
# run = wandb.init(
#     project='Fine tuning mistral 12B', 
#     job_type="training", 
#     anonymous="allow"
# )

In [5]:
# dataset_name = "liliya-makhmutova/medical_texts_simplification"

In [6]:
# dataset_name

In [7]:
# dataset = load_dataset(dataset_name)


### Dataset Preparation
 
 Here we prepare the dataset by reading a CSV file containing original and simplified medical texts.
 We then combine the texts with special tokens to denote instruction format and convert the DataFrame into a Hugging Face dataset.


In [None]:
dataset = pd.read_csv("/root/med/med_texts_simplified.csv")

In [3]:
dataset

Unnamed: 0,file_number,line_number,original,human_simplification,chatgpt_simplification,images
0,1001,1,CC: Difficulty with word finding.,Main complaint: Difficulty with word finding.,Problem with finding the right words.,
1,1001,2,HX: This 27y/o RHF experienced sudden onset wo...,History: This 27-year-old patient with very we...,This 27-year-old woman suddenly had trouble fi...,
2,1001,3,"She denied any associated dysphagia, diplopia,...",She denied any associated swallowing difficult...,"She did not have difficulty swallowing, double...",
3,1001,4,She went to sleep with her symptoms on 2/19/96...,She went to sleep with her symptoms on 2/19/96...,She went to bed with these symptoms on Februar...,
4,1001,5,She also awoke with a headache (HA) and mild n...,She also awoke with a headache and mild neck s...,She also woke up with a headache and mild stif...,
...,...,...,...,...,...,...
792,1093,21,The aorta is normal in contour and caliber.,The major blood vessel that carries blood away...,"The main blood vessel in your body, the aorta,...",
793,1093,22,There is no adenopathy.,There are no large or swollen lymph nodes.,No swollen lymph nodes.,
794,1093,23,Degenerative changes are present in the lumbar...,Wearing down of spinal disks in the lower back...,Your lower back spine shows signs of wear and ...,
795,1093,24,IMPRESSION: Findings consistent with diverticu...,IMPRESSION: Findings consistent with a disease...,Summary: You likely have diverticulitis based ...,


In [None]:
# Create a new column 'text' by combining the original text and its simplification 
# with tokens that help the model understand instruction format.
dataset['text'] = '<s>[INST] ' + dataset['original'] + ' [/INST] ' + dataset['human_simplification'] + ' </s>'

In [5]:
# the base and new model names

base_model = "mistralai/Mistral-Nemo-Instruct-2407"
# dataset_name = "mlabonne/guanaco-llama2-1k"
new_model = "mistral_nemo_12b_medical_ft_v1"

In [6]:
dataset = Dataset.from_pandas(dataset)

In [7]:
#Importing the dataset
# dataset = load_dataset(dataset_name, split="train")
dataset["text"][10]

'<s>[INST] The episodes were not associated with any other symptoms. [/INST] These episodes did not have any other symptoms. </s>'

### Base Model Loading and Configuration

In this section, we configure and load the base language model. We set options for quantization (memory efficiency), precision,
and device placement. Additionally, we load the tokenizer corresponding to the base model.

In [8]:
# Load base model(Mistral 7B)
bnb_config = BitsAndBytesConfig(  
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.bfloat16,
    bnb_4bit_use_double_quant= False,
)
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        # load_in_4bit=True,
        # quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
        # device_map="cuda:0",
        device_map="auto",
    
        trust_remote_code=True,
)
model.config.use_cache = False # silence the warnings. Please re-enable for inference!
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True
# tokenizer.add_bos_token, tokenizer.add_eos_token


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

## Applying LoRA for Parameter-Efficient Fine-Tuning

Here we add LoRA adapters to the base model. LoRA is a method that allows fine-tuning using fewer additional parameters,
making the process more efficient. We define the adapter configuration and update the model accordingly.


In [9]:
#Adding the adapters in the layers
# model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=16, #scaling factor
    lora_dropout=0.05, # dropout rate to avoid overfit
    r=64, # Rank for low-rank matrices (controls additional parameters)
    bias="none",
    task_type="CAUSAL_LM", # Specify the task as causal language modeling
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"] 
)
model = get_peft_model(model, peft_config)

### Setting Up Training Hyperparameters and Trainer
We now define the training hyperparameters such as epochs, batch size, learning rate, and logging intervals.
The SFTTrainer (Supervised Fine-Tuning Trainer) is then initialized with the model, dataset, and training arguments.



In [10]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
#Hyperparamter
#Refine training arguments including hyperparameters and logging settings.

training_arguments = TrainingArguments(
    output_dir="./results",             # Directory to store model checkpoints
    num_train_epochs=2,                 # Total epochs to train over the dataset
    per_device_train_batch_size=4,      # Batch size for each GPU device
    gradient_accumulation_steps=1,      # Number of steps to accumulate gradients before an update
    optim="paged_adamw_32bit",          # Optimizer choice for efficient training
    save_steps=25,                      # Frequency (in steps) to save model checkpoints
    logging_steps=25,                   # Frequency (in steps) to log training information
    learning_rate=2e-4,                 # Learning rate for the optimizer
    weight_decay=0.001,                 # Weight decay to prevent overfitting
    fp16=False,                         # Do not use 16-bit floating point precision in this case
    bf16=True,                          # Use bfloat16 precision for efficiency
    max_grad_norm=0.3,                  # Maximum gradient norm for gradient clipping
    max_steps=-1,                       # Total training steps (-1 means no limit)
    warmup_ratio=0.03,                  # Ratio of warmup steps before full learning rate
    group_by_length=True,               # Group inputs of similar lengths to speed up training
    lr_scheduler_type="constant",       # Use a constant learning rate schedule after warmup
    report_to="wandb"                   # Report training metrics to Weights & Biases (WandB)
)


In [12]:
# Initialize the Supervised Fine-Tuning Trainer with the model, dataset, and training parameters.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_arguments,
    # Uncomment and set additional parameters if needed (e.g., tokenizer or sequence length)
    # max_seq_length= None,
    # dataset_text_field="text",
    # tokenizer=tokenizer,
    # packing= False,
)

Converting train dataset to ChatML:   0%|          | 0/797 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/797 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/797 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/797 [00:00<?, ? examples/s]

[2025-04-13 17:37:06,625] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
trainer.train()

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnischaychai[0m ([33malexdaoud-chain-store-age[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
25,2.3901
50,2.1977
75,1.8721
100,2.1308
125,1.8628
150,2.0724
175,1.8186
200,2.032
225,1.4031
250,1.6209


TrainOutput(global_step=400, training_loss=1.7950226879119873, metrics={'train_runtime': 128.9558, 'train_samples_per_second': 12.361, 'train_steps_per_second': 3.102, 'total_flos': 4668175426836480.0, 'train_loss': 1.7950226879119873})

In [14]:
# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
wandb.finish()
model.config.use_cache = True
model.eval()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
train/epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇███
train/grad_norm,▃▁▂▁▁▁▂█▂▂▁▂▂▁▂▅
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▇▄▆▄▆▄▅▁▃▁▃▁▃▁▃
train/mean_token_accuracy,▁▁▃▁▃▂▄▂█▇█▇▇▆█▆
train/num_tokens,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██

0,1
total_flos,4668175426836480.0
train/epoch,2.0
train/global_step,400.0
train/grad_norm,1.61188
train/learning_rate,0.0002
train/loss,1.6627
train/mean_token_accuracy,0.65645
train/num_tokens,63964.0
train_loss,1.79502
train_runtime,128.9558


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(131072, 5120)
        (layers): ModuleList(
          (0-39): 40 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=5120, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=5120, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lor

In [None]:
trainer.model.push_to_hub(new_model, use_temp_dir=False, token="replace_hf_token")


adapter_model.safetensors:   0%|          | 0.00/514M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shikhac30/mistral_nemo_12b_medical_ft_v1/commit/d662cbcf920e46cacd151d9c75e4845c7e258eb4', commit_message='Upload model', commit_description='', oid='d662cbcf920e46cacd151d9c75e4845c7e258eb4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shikhac30/mistral_nemo_12b_medical_ft_v1', endpoint='https://huggingface.co', repo_type='model', repo_id='shikhac30/mistral_nemo_12b_medical_ft_v1'), pr_revision=None, pr_num=None)

In [None]:
# Set logging level to CRITICAL to reduce verbosity in output.
logging.set_verbosity(logging.CRITICAL)

# Define a prompt with a medical scenario to test the model's domain-specific capabilities.

prompt = """Patient presents with acute onset of dyspnea and orthopnea;
"CXR reveals bilateral perihilar infiltrates suggestive of cardiogenic pulmonary edema.
BNP elevated, echocardiogram pending to evaluate for systolic dysfunction."""
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Patient presents with acute onset of dyspnea and orthopnea;
"CXR reveals bilateral perihilar infiltrates suggestive of cardiogenic pulmonary edema.
BNP elevated, echocardiogram pending to evaluate for systolic dysfunction. [/INST] The patient has difficulty breathing and shortness of breath when lying down. An X-ray shows inflammation in both lungs, which might be due to heart failure. A blood test shows high levels of a certain protein, and a heart test is planned to check for heart function. 
