In [None]:
# Authenticate with the Hugging Face Hub through its CLI so that later
# cells can load the tokenizer and model with the stored credentials.
!huggingface-cli login

In [None]:
# Install the pinned fine-tuning stack. `%pip` (instead of `!pip`) guarantees the
# packages land in the environment of the running kernel.
%pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.41.0 trl==0.4.7

In [None]:
# Upgrade the fine-tuning stack to the latest releases, installing into the
# running kernel's environment via `%pip`.
# NOTE(review): this unpinned upgrade replaces the versions pinned by the
# previous install cell; keep only one of the two for a reproducible setup.
%pip install --upgrade accelerate peft bitsandbytes transformers trl



In [None]:
# Tokenization and dataset utilities, installed into the running kernel's
# environment via `%pip`.
%pip install tokenizers datasets


# **Text preprocessing**


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the training data (the path assumes the CSV was uploaded to /content).
df = pd.read_csv("/content/train.csv")
print(len(df))

# Keep the preview as the cell's last expression so the notebook renders it;
# a bare `df.head()` in the middle of a cell produces no output.
df.head()

In [None]:
# Report how many missing values each column contains.
print("\n🔍 Null values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Keep only the rows that have a value in the 'DDX SNOMED' column.
df = df[df['DDX SNOMED'].notna()]

In [None]:
# For every column, print the number of distinct values and the values
# themselves (capped at the first 20 when there are more).
print("\n📊 Unique values per column:")
for column_name, column in df.items():
    distinct_count = column.nunique()
    print(f"\n🔹 Column: {column_name}")
    print(f"   Unique Count: {distinct_count}")

    distinct_values = column.unique()
    total = len(distinct_values)
    if total <= 20:
        print(f"   Values: {distinct_values}")
    else:
        print(f"   Sample Values (20 of {total}): {distinct_values[:20]}")


In [None]:
# Bar charts of value frequencies for text columns with fewer than 50 distinct values.
categorical_cols = [
    col
    for col in df.columns
    if df[col].dtype == 'object' and df[col].nunique() < 50
]

for col in categorical_cols:
    frequencies = df[col].value_counts()
    fig, ax = plt.subplots(figsize=(8, 4))
    frequencies.plot(kind='bar', ax=ax)
    ax.set_title(f"Value Counts for '{col}'")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
def convert_to_chat_template_format(df):
    """Turn every row of ``df`` into a three-message conversation.

    Each conversation is a dict with a single ``"messages"`` key holding:
      * a system message built from the row's ``County``, ``Health level``,
        ``Nursing Competency`` and ``Clinical Panel`` columns,
      * a user message taken from the ``Prompt`` column,
      * an assistant message taken from the ``Clinician`` column.

    Returns:
        A list with one conversation dict per row, in row order.
    """

    def system_prompt_for(row):
        # Fixed instruction followed by the per-case context lines.
        return (
            "You are a clinical decision support assistant. "
            "Context:\n"
            f"- location: {row['County']}, Kenya\n"
            f"- Facility: {row['Health level']}\n"
            f"- Nursing Competency: {row['Nursing Competency']}\n"
            f"- Clinical Panel: {row['Clinical Panel']}"
        )

    return [
        {
            "messages": [
                {"role": "system", "content": system_prompt_for(row)},
                {"role": "user", "content": row["Prompt"]},
                {"role": "assistant", "content": row["Clinician"]},
            ]
        }
        for _, row in df.iterrows()
    ]
# Convert the cleaned frame and preview the first conversation.
chat_data = convert_to_chat_template_format(df=df)
chat_data[0]

In [None]:
from transformers import AutoTokenizer

# Load tokenizer
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# `token=True` reuses the credentials stored by `huggingface-cli login`;
# it replaces the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

# Get the max sequence length from the tokenizer
max_length_tokenizer = tokenizer.model_max_length

# Pad with the EOS token. No separate '[PAD]' token is registered: it would be
# overwritten by this assignment anyway, and adding it would grow the tokenizer
# vocabulary beyond the model's embedding matrix, which is never resized.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Render a conversation as plain text, one "role: content" line per message
def apply_chat_template(convo):
    """Flatten ``convo["messages"]`` into newline-separated ``role: content`` lines.

    Returns an empty string when the conversation has no messages.
    """
    return "\n".join(
        f"{turn['role']}: {turn['content']}" for turn in convo["messages"]
    )

# Function to tokenize conversations using the chat template
def tokenize_conversations_with_chat_template(conversations, tokenizer, max_length=None):
    """Tokenize conversations into causal-LM training examples.

    Args:
        conversations: iterable of dicts with a ``"messages"`` list, as accepted
            by ``apply_chat_template``.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension.
        max_length: truncation length in tokens. ``None`` (the default) uses
            ``tokenizer.model_max_length`` of the tokenizer that was passed in,
            so the default can never go stale relative to the tokenizer.

    Returns:
        A list with one dict per conversation holding 1-D ``input_ids``,
        ``attention_mask`` and ``labels`` tensors.
    """
    if max_length is None:
        max_length = tokenizer.model_max_length

    tokenized_data = []

    for convo in conversations:
        formatted_convo = apply_chat_template(convo)

        # One sequence is encoded at a time, so no padding is requested here;
        # batching/padding is left to the data collator.
        encoded = tokenizer(
            formatted_convo,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        )
        input_ids = encoded["input_ids"].squeeze(0)  # Remove batch dimension
        attention_mask = encoded["attention_mask"].squeeze(0)  # Remove batch dimension

        # Labels are an unshifted copy of the inputs: Hugging Face causal-LM
        # heads shift labels internally when computing the loss, so shifting
        # here as well would train the model to predict the token after next.
        labels = input_ids.clone()
        # Positions excluded by the attention mask must not contribute to the
        # loss; -100 is the index the loss function ignores.
        labels[attention_mask == 0] = -100

        tokenized_data.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        })

    return tokenized_data

# Tokenize every conversation built from the dataset
tokenized_chat_data = tokenize_conversations_with_chat_template(
    chat_data,
    tokenizer,
)

# Inspect the first tokenized example
first_tokenized_example = tokenized_chat_data[0]
print(first_tokenized_example)

# **Fine-tuning the LLM**

In [None]:
# Import necessary packages for the fine-tuning process
import os                          # Operating system functionalities
import torch                       # PyTorch library for deep learning
from datasets import load_dataset  # Loading datasets for training
from transformers import (
    AutoModelForCausalLM,          # AutoModel for language modeling tasks
    AutoTokenizer,                # AutoTokenizer for tokenization
    BitsAndBytesConfig,           # Configuration for BitsAndBytes
    HfArgumentParser,             # Argument parser for Hugging Face models
    TrainingArguments,            # Training arguments for model training
    pipeline,                     # Creating pipelines for model inference
    logging,                      # Logging information during training
)
from peft import LoraConfig, PeftModel  # Packages for parameter-efficient fine-tuning (PEFT)
from trl import SFTTrainer         # SFTTrainer for supervised fine-tuning

In [None]:
# Alternative base model from the Hugging Face hub (disabled). While this line
# stays commented out, the `model_name` set in the tokenizer cell is the one used.
#model_name = "meta-llama/Llama-3.2-1B"



# Directory name under which the fine-tuned model and tokenizer are saved
new_model = "finetuned_model"

In [None]:
################################################################################
# bitsandbytes parameters
# These values are only used to build `bnb_config` (BitsAndBytesConfig).
# NOTE(review): the model-loading cell has `quantization_config=bnb_config`
# commented out, so they currently do not affect how the model is loaded.
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Name of the torch dtype used for 4-bit compute; resolved with
# getattr(torch, ...) when the config is built
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [None]:
################################################################################
# TrainingArguments parameters
# Plain constants that a later cell forwards to `TrainingArguments`.
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation (disabled: no evaluation set is configured)
#per_device_eval_batch_size = 8

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): this flag is not passed to the TrainingArguments call in this
# notebook, so it currently has no effect — confirm whether that is intended.
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs); -1 leaves the
# epoch count in control
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 25

# Log every X update steps
logging_steps = 25

In [None]:
################################################################################
# SFT parameters
# NOTE(review): `max_seq_length` and `packing` are defined here but are not
# passed to the SFTTrainer call in this notebook — confirm whether intended.
################################################################################

# Maximum sequence length to use (None: no explicit limit is set here)
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0 (used as `device_map` when loading the base model)
device_map = {"": 0}

In [None]:
# # Step 1 : Load dataset (you can process it here)
# # The instruction dataset to use
# dataset_name = "mlabonne/guanaco-llama2-1k"
# dataset = load_dataset(dataset_name, split="train")
# dataset

In [None]:
# def tokenize(example):
#     return tokenizer(
#         example["text"],
#         truncation=True,
#         padding="max_length",
#         max_length=512,  # 👈 set it here
#     )

# tokenized_dataset = formatted_text.map(tokenize, batched=True)


In [None]:
# tokenized_dataset

In [None]:
# Step 2: Build the bitsandbytes (QLoRA) quantization configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Step 3: Check GPU compatibility with bfloat16
# The capability query raises when no CUDA device is present, so it is only
# made when a GPU is actually available.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    # Index the tuple instead of unpacking into `_`, which would clobber the
    # notebook's "last output" variable.
    major = torch.cuda.get_device_capability()[0]
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [None]:
# Step 4: Load the base model onto the device(s) given by `device_map`.
# Quantized loading stays disabled: quantization_config=bnb_config is not passed.
load_kwargs = {"device_map": device_map}
model_base = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

# Adjust configuration flags on the loaded model
base_config = model_base.config
base_config.use_cache = False
base_config.pretraining_tp = 1

In [None]:
################################################################################
# QLoRA parameters
# Forwarded to `LoraConfig` in the next cell.
################################################################################

# LoRA attention dimension (passed as `r`)
lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

In [None]:
# Step 6: Describe the LoRA adapter that will be attached to the base model
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    # Only the query projection is adapted. Other candidate modules:
    # "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"
    target_modules=["q_proj"],
)

In [None]:
from peft import get_peft_model

# Wrap the loaded base model with the LoRA adapter. The base model is stored in
# `model_base`; `model` does not exist before this line on a fresh kernel.
model = get_peft_model(model_base, peft_config)
# Show how many parameters the adapter leaves trainable
model.print_trainable_parameters()

In [None]:
# Step 7: Collect the hyperparameters defined above into TrainingArguments
training_args = TrainingArguments(
    # Outputs: checkpoint/log location, their frequency, reporting backend
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    report_to="tensorboard",
    # Run length (`max_steps` overrides `num_train_epochs` if set)
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    # Batching
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # Optimizer, schedule and regularization
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # Mixed-precision switches
    fp16=fp16,
    bf16=bf16,
)

In [None]:
# Step 8: Set supervised fine-tuning parameters
from datasets import Dataset

# `hf_dataset` was referenced but never created anywhere in the notebook.
# Build it from the tokenized conversations, converting tensors to plain lists
# so every example is stored as ordinary integer sequences.
# NOTE(review): assumes the installed TRL version accepts an already-tokenized
# dataset (input_ids / attention_mask / labels columns) — confirm.
hf_dataset = Dataset.from_list(
    [
        {column: values.tolist() for column, values in example.items()}
        for example in tokenized_chat_data
    ]
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset,
    peft_config=peft_config
)

In [None]:
# Step 9: Run fine-tuning with the trainer configured above
trainer.train()

# Step 10: Save the trained model into the `new_model` directory
trainer.model.save_pretrained(new_model)
# Save the tokenizer into the same directory
tokenizer.save_pretrained(new_model)

In [None]:
# Load the TensorBoard notebook extension and open it on results/runs
# (presumably where the trainer writes its TensorBoard logs — verify).
%load_ext tensorboard
%tensorboard --logdir results/runs

In [None]:
import os

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel  # Loads a saved PEFT adapter on top of a base model

# Hub id of the base foundation model the adapter was trained on
foundation_model_path = "HuggingFaceTB/SmolLM2-135M-Instruct"


# Checkpoint directory of the fine-tuned PEFT adapter. The path is relative to
# the working directory, mirroring the relative "./results" output directory
# used for training, so it resolves in any environment instead of only under
# /kaggle/working.
peft_model_path = os.path.join("results", "checkpoint-100")

# Load the base foundation model
foundation_model = AutoModelForCausalLM.from_pretrained(foundation_model_path)

# Attach the fine-tuned adapter for inference only (not trainable)
fine_tuned_model = PeftModel.from_pretrained(foundation_model,
                                             peft_model_path,
                                             is_trainable=False)

# Load the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(foundation_model_path)

# Display the loaded fine-tuned model
fine_tuned_model

In [None]:
input_text = "i am a nurse with 18 years of experience in general nursing working in a sub county hospitals and nursing homes in uasin gishu county in kenya a 4 year old child presents to the emergency department with second degree burns on the forearm after accidentally touching a hot stove the child was playing in the kitchen when they reached out to touch the stove the burns cover about 5 of the total body surface area the child is alert and crying with redness blisters and swelling on the affected area the burns appear to be superficial to moderate in severity the child is in mild pain and there is no indication of airway or breathing distress no other injuries are noted questions 1 what is the immediate treatment protocol for second degree burns in paediatric patients 2 should any tetanus prophylaxis be considered in this case 3 what follow up care should be recommended for burn healing"

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")

# Sample one continuation from the fine-tuned model
generated_text = fine_tuned_model.generate(
    **inputs,
    max_length=500,          # Total budget: prompt tokens plus generated tokens
    num_return_sequences=1,
    do_sample=True,          # Sample instead of greedy decoding
    temperature=1.0,         # Sampling temperature
    top_p=0.9,               # Nucleus sampling threshold
    no_repeat_ngram_size=2   # Prevent repeated 2-grams
)

# Full decoded sequence (prompt followed by the continuation)
output = tokenizer.decode(generated_text[0], skip_special_tokens=True)

# Isolate the continuation by token position rather than by character count:
# decoding is not guaranteed to reproduce the prompt character-for-character,
# so slicing the decoded string by len(input_text) can cut in the wrong place.
prompt_token_count = inputs["input_ids"].shape[1]
completion = tokenizer.decode(
    generated_text[0][prompt_token_count:],
    skip_special_tokens=True,
)

# Print only the generated text (not the input prompt)
print(completion)