In [1]:
# Install the quantization / fine-tuning stack used by the cells below (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a re-run may pick up newer releases than the ones
# this notebook was developed against.
!pip install -q bitsandbytes transformers datasets peft accelerate

import os
# Environment switches, set before the Hugging Face libraries are imported further down.
# Presumably these turn off Weights & Biases reporting and the tokenizers library's
# internal parallelism — TODO confirm against the installed versions.
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m530.9/530.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [2]:
from kaggle_secrets import UserSecretsClient
# Fetch the Hugging Face access token from the Kaggle secret store so it never
# appears in the notebook source; it is passed to from_pretrained() later on.
secret_label = "hf_token"
secrets_client = UserSecretsClient()
hf_token = secrets_client.get_secret(secret_label)

In [2]:
import json
import torch
import numpy as np
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset

2026-02-07 18:29:17.750276: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770488957.926456      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770488957.975513      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770488958.380340      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770488958.380371      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770488958.380373      55 computation_placer.cc:177] computation placer alr

#### - Note: Kaggle is fickle at times. A dataset created on Kaggle sometimes appears under `/kaggle/input/` and sometimes under `/kaggle/input/datasets/<username>/`, and the path names may use either `_` or `-`. If you encounter a `FileNotFoundError`, list the files under `/kaggle/input` to find the correct path.

Steps to run
- Create datasets on Kaggle with the training-data/finetuning and training-data/splits folders produced by notebook 04-prepare-training-data
- Change the `DIMENSION` parameter to one value at a time. The list of values is ["accuracy", "clarity", "completeness", "compliance", "risk"]
- Click on Save Version at the top right corner
- Leave default version name, and ensure Version Type is Save & Run All (Commit). Click Continue
- Check log for successful completion
- Access the files from the Output tab. Download them, and then upload them to Kaggle as a dataset

### ============================================================================
### CONFIGURATION
### ============================================================================

In [13]:
# --- What to train -----------------------------------------------------------
# Base checkpoint that is loaded and fine-tuned.
MODEL_NAME = "google/medgemma-4b-it"
# Scoring dimension handled by this run; it selects the input files and the
# name of the output folder.
DIMENSION = "risk"

# --- Tunables ----------------------------------------------------------------
LORA_RANK = 16
MAX_SEQ_LENGTH = 2048

# --- Where the adapter, tokenizer and results are written --------------------
OUTPUT_DIR = f"/kaggle/working/models/{DIMENSION}_agent"



### ============================================================================
### LOAD DATA
### ============================================================================

In [14]:
def load_json(filepath):
    """Read a JSON file and return the decoded Python object.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever `json.load` produces for the file (dict, list, ...).

    Raises:
        FileNotFoundError: If `filepath` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale's default
    # encoding, so non-ASCII characters in the notes load the same everywhere.
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)

# Read the fine-tuning records for the configured dimension: one file for
# training and one held out for testing.
print(f"Loading {DIMENSION} training data...")

train_path = f"/kaggle/input/datasets/laxmsun/training-data/finetuning/{DIMENSION}_train.json"
test_path = f"/kaggle/input/datasets/laxmsun/training-data/finetuning/{DIMENSION}_test.json"
train_data = load_json(train_path)
test_data = load_json(test_path)

print(f"Train: {len(train_data)}, Test: {len(test_data)}")



Loading completeness training data...
Train: 120, Test: 30


### ============================================================================
### FORMAT DATA FOR TRAINING
### ============================================================================

In [None]:

# Wrap the raw record lists in HuggingFace Dataset objects (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_list(records) for records in (train_data, test_data)
)

# Peek at the start of the first training text as a quick format check.
print("\nSample training example:")
first_example = train_dataset[0]
print(first_example['text'][:500])



### ============================================================================
### LOAD MODEL WITH LORA
### ============================================================================

In [None]:


print("\nLoading model...")

# Load the base model with 4-bit NF4 weights (double quantization enabled) and
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    token=hf_token,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)
# Only fall back to EOS as the padding token when the tokenizer ships without
# one. Overwriting an existing pad token with EOS makes the language-modeling
# collator ignore every EOS label (it masks pad_token_id positions), so the
# model would never be trained to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare the quantized model for LoRA training
model = prepare_model_for_kbit_training(model)

# Adapters are attached to the attention projections only; the rank comes from
# the configuration cell so it is defined in a single place.
lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
# print_trainable_parameters() prints its own summary and returns None, so it
# must be called directly; embedding it in an f-string printed
# "Trainable params: None".
print("Trainable params:")
model.print_trainable_parameters()



### ============================================================================
### TRAINING CONFIGURATION
### ============================================================================

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of training texts, truncating to MAX_SEQ_LENGTH tokens.

    Args:
        examples: Batch dict supplied by `Dataset.map(batched=True)`; only the
            'text' column is read.

    Returns:
        The tokenizer output for the batch. No padding is applied here, so
        each sequence keeps its own length and batching-time padding is left
        to the data collator.
    """
    return tokenizer(examples['text'], truncation=True, max_length=MAX_SEQ_LENGTH)


# Single tokenization pass. The notebook previously defined tokenize_function
# twice and mapped the dataset twice; the first (padded-to-max-length) result
# was immediately overwritten, so only the unpadded variant is kept.
tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "note_id"]
)


### ============================================================================
### TRAINING
### ============================================================================

In [None]:

# Hyperparameters for the fine-tuning run, gathered in one mapping.
# With a per-device batch of 1 and 8 accumulation steps, gradients from
# 8 samples are accumulated before each optimizer update.
training_options = dict(
    output_dir=f"./{DIMENSION}_agent",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    logging_steps=5,
    save_strategy="epoch",
    fp16=True,
)
training_args = TrainingArguments(**training_options)

# Collator for language-model training with masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


### ============================================================================
### FINE-TUNE
### ============================================================================

In [None]:


print("\nStarting fine-tuning...")

# Bundle everything the Trainer needs: the LoRA-wrapped model, the training
# arguments, the tokenized training set and the collator.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_inputs)

print("\nTraining...")
trainer.train()

print("\nTraining complete!")



### ============================================================================
### SAVE MODEL
### ============================================================================

In [None]:


# Write the model and tokenizer files into OUTPUT_DIR.
# NOTE(review): `model` was wrapped by get_peft_model() in an earlier cell, so this
# presumably stores only the LoRA adapter weights rather than a full copy of the
# base model — confirm before relying on the output as a standalone model.
print(f"\nSaving model to {OUTPUT_DIR}...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Disabled: export of a merged (LoRA + base) model.
# NOTE(review): `save_pretrained_merged` does not look like a PEFT/transformers
# method (it resembles the Unsloth API) — confirm before re-enabling this block.
# # Also save merged model (LoRA + base)
# print("Saving merged model...")
# model.save_pretrained_merged(
#     f"{OUTPUT_DIR}/merged",
#     tokenizer,
#     save_method="merged_16bit",
# )



### ============================================================================
### EVALUATION
### ============================================================================

In [None]:
# Section banner for the evaluation output.
banner = "=" * 80
print("\n" + banner)
print("EVALUATION")
print(banner)

# Put the model into evaluation mode before generating predictions.
model.eval()

def predict_score(note_text):
    """Predict a 0-100 score for a clinical note with the fine-tuned model.

    Builds a single-turn Gemma chat prompt from the first 500 characters of
    `note_text`, generates at most 10 new tokens and parses an integer from
    the decoded reply.

    Args:
        note_text: Raw text of the clinical note.

    Returns:
        int: The reply converted with `int()` when it is a bare integer;
        otherwise the first run of digits found in the reply; 50 when the
        reply contains no digits at all.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # Create prompt in Gemma format
    prompt = (
        f"<start_of_turn>user\n"
        f"Score the {DIMENSION} of this clinical note (0-100):\n\n"
        f"{note_text[:500]}\n"  # Truncate long notes
        f"<end_of_turn>\n"
        f"<start_of_turn>model\n"
    )

    # Follow the model's own device instead of hard-coding "cuda".
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=10,
            # Greedy decoding: sampling made the reported metrics change from
            # run to run for the same model and the same notes.
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Extract only generated tokens (everything after the prompt)
    prompt_length = inputs.input_ids.shape[1]
    generated_tokens = output[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # Parse number from response. Only a failed integer conversion triggers
    # the fallback; a bare `except:` would also hide unrelated errors.
    try:
        return int(response)
    except ValueError:
        digits = re.search(r'\d+', response)
        return int(digits.group()) if digits else 50  # Default fallback

# Load test data: the fine-tuning records hold the expected scores, the split
# file holds the note texts that are fed to the model.
test_data = load_json(f"/kaggle/input/datasets/laxmsun/training-data/finetuning/{DIMENSION}_test.json")
test_split = load_json("/kaggle/input/datasets/laxmsun/training-data/splits/test_split.json")

MAX_EVAL_SAMPLES = 20  # evaluate at most this many test records
NUM_SHOWN = 5          # print per-note details for the first few records only

# Index the notes by id once instead of scanning test_split for every record.
# setdefault keeps the first occurrence of an id, which is what the previous
# linear search returned.
notes_by_id = {}
for split_note in test_split:
    notes_by_id.setdefault(split_note['note_id'], split_note)

eval_items = test_data[:MAX_EVAL_SAMPLES]

predictions = []
actuals = []

print(f"\nEvaluating on {len(eval_items)} test samples...")

for i, item in enumerate(eval_items):
    # Find corresponding note
    note = notes_by_id.get(item['note_id'])

    if note is None:
        print(f"Warning: Note {item['note_id']} not found in test split")
        continue

    pred = predict_score(note['note_text'])
    # The expected score is the model turn of the training text.
    output_part = item['text'].split("<start_of_turn>model\n")[1].split("<end_of_turn>")[0]
    actual = int(output_part)

    predictions.append(pred)
    actuals.append(actual)

    if i < NUM_SHOWN:  # Show first few
        print(f"{item['note_id']}: Predicted={pred}, Actual={actual}, Error={abs(pred-actual)}")

predictions = np.array(predictions)
actuals = np.array(actuals)

if predictions.size == 0:
    # Nothing could be matched: averaging empty arrays would only yield NaN
    # metrics (and NaN is not valid JSON), so report it and write nothing.
    print("\nNo test notes could be evaluated; skipping metrics and results file.")
else:
    # Calculate metrics from a single absolute-error array
    abs_errors = np.abs(predictions - actuals)

    mae = np.mean(abs_errors)
    rmse = np.sqrt(np.mean(abs_errors**2))
    within_10 = np.mean(abs_errors <= 10)*100
    within_20 = np.mean(abs_errors <= 20)*100

    print("\n" + "="*80)
    print("RESULTS")
    print("="*80)
    print(f"MAE: {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"Within 10pts: {within_10:.1f}%")
    print(f"Within 20pts: {within_20:.1f}%")

    results = {
        "dimension": DIMENSION,
        "mae": float(mae),
        "rmse": float(rmse),
        "within_10": float(within_10),
        "within_20": float(within_20),
        "predictions": predictions.tolist(),
        "actuals": actuals.tolist(),
    }

    # Make sure the target folder exists even when the save cell was skipped.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(f"{OUTPUT_DIR}/results.json", 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nResults saved to {OUTPUT_DIR}/results.json")

print("Done!")
print("Done!")