In [None]:
### 1. Install & Imports

# Install necessary packages
!pip install -q anthropic google-cloud-bigquery transformers datasets scikit-learn

# Standard imports
import os
import pandas as pd
import numpy as np

# BigQuery client
from google.cloud import bigquery

# For Anthropic
import anthropic
from anthropic import HUMAN_PROMPT, AI_PROMPT


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.6/296.6 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import auth
auth.authenticate_user()


In [None]:
# Set up BigQuery client
project_id = 'aihc-project'
client = bigquery.Client(project=project_id)


In [None]:
# 1. Mount Drive
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [None]:
import pandas as pd

# 2. File paths
NOTES_PATH = '/content/drive/MyDrive/mimic_data/NOTEEVENTS.csv'
D_PATH  = '/content/drive/MyDrive/mimic_data/DIAGNOSES_ICD.csv'

notes = pd.read_csv(NOTES_PATH)
diagnoses = pd.read_csv(D_PATH)


  notes = pd.read_csv(NOTES_PATH)


In [None]:
print("NOTEEVENTS columns:", notes.columns.tolist())
print("ICD9 columns:    ", diagnoses.columns.tolist())


NOTEEVENTS columns: ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'CHARTTIME', 'STORETIME', 'CATEGORY', 'DESCRIPTION', 'CGID', 'ISERROR', 'TEXT']
ICD9 columns:     ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE']


In [None]:




# 2. Filter to discharge summaries
disch_notes = notes[notes['CATEGORY'] == 'Discharge summary']

# 3. Keep only the *last* summary per admission
disch_notes = (
    disch_notes
      .sort_values(by='CHARTDATE')
      .groupby('HADM_ID', as_index=False)['TEXT']
      .last()
)

# 4. Group all ICD9 codes per admission into a list
diag_codes = (
    diagnoses
      .groupby('HADM_ID')['ICD9_CODE']
      .apply(list)
      .reset_index()
)

# 5. Merge and drop any rows missing TEXT or codes
merged = pd.merge(disch_notes, diag_codes, on='HADM_ID')
merged = merged.dropna(subset=['TEXT', 'ICD9_CODE'])

# 6. Ensure every code is a string
merged['ICD9_CODE'] = merged['ICD9_CODE'].apply(lambda codes: [str(c) for c in codes])

print("Merged DataFrame shape:", merged.shape)

Merged DataFrame shape: (52726, 3)


In [None]:
# Cell: Clean TEXT and filter to top-100 ICD-9 codes on existing `merged` DataFrame

import re
from collections import Counter

# 1. Text cleaning function (remove PHI placeholders, lowercase, collapse spaces)
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r'\[\*\*.*?\*\*\]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Apply cleaning to merged['TEXT']
merged['TEXT'] = merged['TEXT'].apply(clean_text)

# 2. Compute top-100 most frequent ICD-9 codes
all_codes = [c for codes in merged['ICD9_CODE'] for c in codes]
top100    = {code for code, _ in Counter(all_codes).most_common(100)}

# 3. Filter each admission’s code list to only top-100
def keep_top100(codes):
    filtered = [c for c in codes if c in top100]
    return filtered if filtered else None

merged['ICD9_CODE'] = merged['ICD9_CODE'].apply(keep_top100)

# 4. Drop admissions that lost all codes
merged = merged.dropna(subset=['ICD9_CODE']).reset_index(drop=True)

print(f"Post-cleaning & top-100 filter → {len(merged)} admissions, {len(top100)} codes")



Post-cleaning & top-100 filter → 50625 admissions, 100 codes


In [None]:
# Set Claude API key as an env var beforehand
anthropic_api_key = "APIKEY"
client = anthropic.Client(api_key=anthropic_api_key)
# 4.1 Install & import Anthropic
!pip install -q anthropic

import os, random
from anthropic import Anthropic
import re


In [None]:
# Claude multi-label ICD-9 (top-100) + micro-metrics — single cell

import re, json, time, random
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# ----- 0) Build top-100 label space from current merged -----
all_codes = [c for codes in merged["ICD9_CODE"] for c in codes]
top100 = {code for code, _ in Counter(all_codes).most_common(100)}

def keep_top100(codes):
    kept = [c for c in codes if c in top100]
    return kept if kept else None

filtered = merged.copy()
filtered["ICD9_CODE"] = filtered["ICD9_CODE"].apply(keep_top100)
filtered = filtered.dropna(subset=["ICD9_CODE"]).reset_index(drop=True)

# ----- 1) Train/val/test split (we only need test for evaluation here) -----
X = filtered["TEXT"].tolist()
Y = filtered["ICD9_CODE"].tolist()

X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

# MultiLabelBinarizer over exactly the top-100 set (stable class order)
mlb = MultiLabelBinarizer(classes=sorted(top100))
mlb.fit([[]])   # initialize with fixed classes
Y_test_bin = mlb.transform(Y_test)   # shape (N_test, 100)

# ----- 2) Claude helper: choose codes FROM the provided candidate list -----
# IMPORTANT: We force strict JSON array output and only allow codes from the candidate set.
def call_claude_multilabel(note: str, candidates: list[str], k: int = 5) -> list[str]:
    cand_str = ", ".join(candidates)  # e.g., "4019, 25000, V3000, E8156"
    system_prompt = (
        "You are a hospital coding assistant. "
        "Choose the ICD-9 diagnosis codes for the note ONLY "
        "from the provided candidate list. "
        f"Return a strict JSON array of strings (length 0..{k}). "
        "Each element must be EXACTLY one candidate code string "
        "(keep any leading letters like V or E; no dots). "
        "Return [] if none apply. Do not include any text besides the JSON."
    )
    user_prompt = (
        "Candidates: [" + cand_str + "]\n\n"
        "Clinical note:\n" + note + "\n\n"
        "JSON only:"
    )
    resp = client.messages.create(
        model="claude-sonnet-4-20250514",
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        max_tokens=128,
        temperature=0.0,
    )
    raw = resp.content[0].text.strip()

    # Robust parsing: try JSON first, then fall back to regex filter
    try:
        out = json.loads(raw)
        if isinstance(out, list):
            preds = [str(x).strip() for x in out]
        else:
            preds = []
    except Exception:
        # fallback: pick any token that exactly matches a candidate code
        toks = re.findall(r"[A-Z]?\d{3,5}", raw.upper())
        preds = [t for t in toks if t in candidates]

    # keep only valid candidates and cap to k
    cand_set = set(candidates)
    preds = [p for p in preds if p in cand_set]
    # de-duplicate preserving order
    seen = set(); preds = [p for p in preds if not (p in seen or seen.add(p))]
    return preds[:k]

# ----- 3) Run Claude on a test subset (to keep cost/time under control) -----
SAMPLE_N = 200  # adjust up/down depending on quota/time
random.seed(42)
idxs = random.sample(range(len(X_test)), k=min(SAMPLE_N, len(X_test)))

candidates = sorted(top100)  # pass the same label space every time
pred_lists = []
for i in idxs:
    pred = call_claude_multilabel(X_test[i], candidates, k=5)
    pred_lists.append(pred)
    time.sleep(0.15)  # mild pacing for API rate limits

# ----- 4) Evaluate like the transformers: micro-precision/recall/F1 -----
Y_true_sub = [Y_test[i] for i in idxs]
Y_true_bin = mlb.transform(Y_true_sub)

# ensure predictions are in the same space
pred_lists = [[p for p in preds if p in top100] for preds in pred_lists]
Y_pred_bin = mlb.transform(pred_lists)

micro_prec = precision_score(Y_true_bin, Y_pred_bin, average="micro", zero_division=0)
micro_rec  = recall_score( Y_true_bin, Y_pred_bin, average="micro", zero_division=0)
micro_f1   = f1_score(     Y_true_bin, Y_pred_bin, average="micro", zero_division=0)

print(f"Claude multi-label (top-100) on {len(idxs)} notes → "
      f"micro-precision: {micro_prec:.4f}, micro-recall: {micro_rec:.4f}, micro-F1: {micro_f1:.4f}")

# (Optional) a few qualitative examples
for j in range(min(5, len(idxs))):
    i = idxs[j]
    print("\n--- Example", j+1, "---")
    print("True:", Y_test[i])
    print("Pred:", pred_lists[j])


Claude multi-label (top-100) on 200 notes → micro-precision: 0.3635, micro-recall: 0.2476, micro-F1: 0.2945

--- Example 1 ---
True: ['486', '5856', '40391', '4280', '79902', '41401', 'V4581', '42731', 'V5867', 'V4501', '4439', '2449', 'V1582']
Pred: ['4280', '486', '5849', '25000', '412']

--- Example 2 ---
True: ['5849', '5990', '42731', '2768', '60000']
Pred: ['2930', '5119', '5990', '70703', '42731']

--- Example 3 ---
True: ['2851', '4019', '42789']
Pred: ['2851', '53081', '5715', '2761']

--- Example 4 ---
True: ['769']
Pred: []

--- Example 5 ---
True: ['41401', '4111', '2720', '4019', '53081', '2449', '3051']
Pred: ['41400', '4019', '5849', '2449', '5990']


In [None]:
# Claude multi-label ICD-9 (top-100) with hidden CoT + self-consistency voting

import re, json, time, random, numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

# =========================
# Tunables
# =========================
SAMPLE_N    = 40     # how many test notes to score (controls cost)
TOPK_PER_NOTE = 5     # max codes Claude may return per note
USE_VOTING  = True    # True = self-consistency voting; False = single pass
N_SAMPLES   = 3       # # of samples per note when voting
TEMP_SINGLE = 0.0     # temperature for single-pass
TEMP_VOTE   = 0.6     # temperature used when voting
PAUSE_S     = 0.05    # small sleep to avoid rate limits

# =========================
# 0) Build top-100 label space from current merged
# =========================
all_codes = [c for codes in merged["ICD9_CODE"] for c in codes]
top100 = {code for code, _ in Counter(all_codes).most_common(100)}

def keep_top100(codes):
    kept = [c for c in codes if c in top100]
    return kept if kept else None

filtered = merged.copy()
filtered["ICD9_CODE"] = filtered["ICD9_CODE"].apply(keep_top100)
filtered = filtered.dropna(subset=["ICD9_CODE"]).reset_index(drop=True)

# =========================
# 1) Split data
# =========================
X = filtered["TEXT"].tolist()
Y = filtered["ICD9_CODE"].tolist()

X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

# Stable class order for evaluation
mlb = MultiLabelBinarizer(classes=sorted(top100))
mlb.fit([[]])  # lock classes
# (We’ll transform test labels only for evaluation subset)

# =========================
# 2) Claude helpers
# =========================
def _parse_candidates_from_text(raw: str, candidates: list[str]) -> list[str]:
    # fallback: exact candidate matches (handles stray text)
    cand_set = set(candidates)
    # Extract tokens like V3000, E8156, 4019, 25000 (no dots)
    toks = re.findall(r"[A-Z]?\d{3,5}", raw.upper())
    # Normalize tokens to original candidate casing if present
    cand_map = {c.upper(): c for c in candidates}
    preds = [cand_map[t] for t in toks if t in cand_map]
    # dedupe preserving order
    seen = set(); preds = [p for p in preds if not (p in seen or seen.add(p))]
    return preds

def call_claude_multilabel(
    note: str,
    candidates: list[str],
    k: int = TOPK_PER_NOTE,
    hidden_cot: bool = False,
    temperature: float = 0.0,
) -> list[str]:
    cand_str = ", ".join(candidates)
    if hidden_cot:
        system_prompt = (
            "You are a hospital coding assistant.\n"
            "Think step-by-step in a hidden scratchpad that you will NOT include in your final answer.\n"
            f"Your FINAL answer must be a STRICT JSON array of strings of length 0..{k}, choosing ONLY from the provided candidates.\n"
            "Keep code formatting exactly as in candidates (preserve leading letters like V/E; no dots). "
            "If unsure, return fewer items. Output JSON only—no prose."
        )
        user_prompt = (
            "Candidates: [" + cand_str + "]\n\n"
            "Task: Determine which diagnosis codes best apply to this note. Reason silently.\n"
            "When ready, output JSON only.\n\n"
            "Clinical note:\n" + note + "\n\n"
            "JSON:"
        )
    else:
        system_prompt = (
            "You are a hospital coding assistant. "
            "Choose the ICD-9 diagnosis codes for the note ONLY from the provided candidate list. "
            f"Return a strict JSON array of strings (length 0..{k}). "
            "Each element must be EXACTLY one candidate code string (keep any leading letters like V or E; no dots). "
            "Return [] if none apply. Do not include any text besides the JSON."
        )
        user_prompt = (
            "Candidates: [" + cand_str + "]\n\n"
            "Clinical note:\n" + note + "\n\n"
            "JSON only:"
        )

    resp = client.messages.create(
        model="claude-sonnet-4-20250514",
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
        max_tokens=128,
        temperature=temperature,
    )
    raw = resp.content[0].text.strip()

    # Prefer strict JSON; otherwise fallback to candidate filtering
    preds: list[str]
    try:
        out = json.loads(raw)
        preds = [str(x).strip() for x in out] if isinstance(out, list) else []
    except Exception:
        preds = _parse_candidates_from_text(raw, candidates)

    # keep only valid candidates, dedupe, cap to k
    cand_set = set(candidates)
    seen = set()
    preds = [p for p in preds if p in cand_set and not (p in seen or seen.add(p))]
    return preds[:k]

def predict_with_voting(note: str, candidates: list[str], k: int, n_samples: int, temp: float) -> list[str]:
    votes = Counter()
    for _ in range(n_samples):
        p = call_claude_multilabel(
            note, candidates, k=k, hidden_cot=True, temperature=temp
        )
        votes.update(p)
        time.sleep(PAUSE_S)
    ranked = [c for c, _ in votes.most_common()]
    return ranked[:k]

# =========================
# 3) Run Claude on a test subset
# =========================
random.seed(42)
idxs = random.sample(range(len(X_test)), k=min(SAMPLE_N, len(X_test)))
candidates = sorted(top100)

pred_lists = []
for i in idxs:
    if USE_VOTING:
        pred = predict_with_voting(
            X_test[i], candidates, k=TOPK_PER_NOTE, n_samples=N_SAMPLES, temp=TEMP_VOTE
        )
    else:
        pred = call_claude_multilabel(
            X_test[i], candidates, k=TOPK_PER_NOTE, hidden_cot=False, temperature=TEMP_SINGLE
        )
    pred_lists.append(pred)
    if not USE_VOTING:
        time.sleep(PAUSE_S)  # mild pacing even for single-pass

# =========================
# 4) Evaluate (micro-precision/recall/F1) same as transformers
# =========================
Y_true_sub = [Y_test[i] for i in idxs]
Y_true_bin = mlb.transform(Y_true_sub)
# ensure predictions are in the same space
pred_lists = [[p for p in preds if p in top100] for preds in pred_lists]
Y_pred_bin = mlb.transform(pred_lists)

micro_prec = precision_score(Y_true_bin, Y_pred_bin, average="micro", zero_division=0)
micro_rec  = recall_score( Y_true_bin, Y_pred_bin, average="micro", zero_division=0)
micro_f1   = f1_score(     Y_true_bin, Y_pred_bin, average="micro", zero_division=0)

mode = "CoT+Voting" if USE_VOTING else "Single-pass"
print(f"Claude multi-label (top-100, {mode}) on {len(idxs)} notes → "
      f"micro-precision: {micro_prec:.4f}, micro-recall: {micro_rec:.4f}, micro-F1: {micro_f1:.4f}")

# Optional: qualitative examples
for j in range(min(5, len(idxs))):
    i = idxs[j]
    print("\n--- Example", j+1, "---")
    print("True:", Y_test[i])
    print("Pred:", pred_lists[j])


Claude multi-label (top-100, CoT+Voting) on 40 notes → micro-precision: 0.5455, micro-recall: 0.0242, micro-F1: 0.0463

--- Example 1 ---
True: ['486', '5856', '40391', '4280', '79902', '41401', 'V4581', '42731', 'V5867', 'V4501', '4439', '2449', 'V1582']
Pred: ['42833', '99591']

--- Example 2 ---
True: ['5849', '5990', '42731', '2768', '60000']
Pred: []

--- Example 3 ---
True: ['2851', '4019', '42789']
Pred: []

--- Example 4 ---
True: ['769']
Pred: []

--- Example 5 ---
True: ['41401', '4111', '2720', '4019', '53081', '2449', '3051']
Pred: ['41071']


In [None]:
# 1. Install & imports
!pip install -q transformers datasets scikit-learn

#Restart session now!

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/11.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/11.3 MB[0m [31m1.9 MB/s[0m eta [36m0:00:06[0m[2K   [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/11.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:04[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/11.3 MB[0m [31m6.5 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/11.3 MB[0m [31m11.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/11.3 MB[0m [31m20.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m7.0/11.3 MB[0m [31m34.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m11.1/11.3 MB[0m [31m89.7 MB/s

In [None]:
# --- 6. Multi-Label ICD-9 with DistilBERT (labels as float32) ---

# 6.1 Install & imports
# !pip install -q transformers datasets scikit-learn

import numpy as np
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoConfig,
    AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)

# 6.2 Binarize ICD-9 lists and cast to float32
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(merged["ICD9_CODE"].tolist()).astype(np.float32)
X = merged["TEXT"].tolist()

# 6.3 Train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# 6.4 Build HF Datasets (lists of floats persist)
train_ds = Dataset.from_dict({"text": X_train, "labels": Y_train.tolist()})
test_ds  = Dataset.from_dict({"text": X_test,  "labels": Y_test.tolist()})

# 6.5 Tokenize
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

train_tok = train_ds.map(tokenize_fn, batched=True)
test_tok  = test_ds.map(tokenize_fn,  batched=True)

# 6.6 Set format to torch (labels now float tensors)
train_tok.set_format("torch", columns=["input_ids","attention_mask","labels"])
test_tok.set_format("torch",  columns=["input_ids","attention_mask","labels"])

# 6.7 Load model configured for multi-label
config = AutoConfig.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", config=config
).to("cuda")

# 6.8 Define micro-metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    probs = torch.sigmoid(torch.from_numpy(logits))
    preds = (probs > 0.5).int().numpy()
    return {
        "micro_precision": precision_score(labels, preds, average="micro", zero_division=0),
        "micro_recall":    recall_score(labels, preds, average="micro", zero_division=0),
        "micro_f1":        f1_score(labels, preds, average="micro", zero_division=0),
    }

# 6.9 TrainingArguments & Trainer
training_args = TrainingArguments(
    output_dir="results/distilbert_multilabel",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=2,
    report_to=[]  # disable WandB
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    compute_metrics=compute_metrics
)

# 6.10 Train & evaluate
trainer.train()
metrics = trainer.evaluate()
print("Multi-label micro-metrics:", metrics)


Map:   0%|          | 0/40500 [00:00<?, ? examples/s]

Map:   0%|          | 0/10125 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.5655
100,0.3814
150,0.2649
200,0.2229
250,0.2136
300,0.2131
350,0.2134
400,0.2072
450,0.2057
500,0.2087


Multi-label micro-metrics: {'eval_loss': 0.1548798680305481, 'eval_micro_precision': 0.7187262666811327, 'eval_micro_recall': 0.22264607525165106, 'eval_micro_f1': 0.3399751093776061, 'eval_runtime': 78.5263, 'eval_samples_per_second': 128.938, 'eval_steps_per_second': 16.122, 'epoch': 3.0}


In [None]:
from pathlib import Path

drive_path = "/content/drive/MyDrive/mimic_data/distilbert_multilabel"
Path(drive_path).mkdir(parents=True, exist_ok=True)

# Save both model & tokenizer
trainer.save_model(drive_path)
tokenizer.save_pretrained(drive_path)

print("✅ Multi-label DistilBERT saved to:", drive_path)


✅ Multi-label DistilBERT saved to: /content/drive/MyDrive/mimic_data/distilbert_multilabel


In [None]:
# Cell 7: Threshold Tuning for Multi-Label ICD-9 Classification

# 7.1 Imports
import numpy as np
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

# 7.2 Get predictions (logits) and true labels
pred_out = trainer.predict(test_tok)     # uses Trainer from Section 6
logits   = pred_out.predictions          # shape (N_examples, N_labels)
y_true   = pred_out.label_ids            # same shape

# 7.3 Convert to probabilities
probs = torch.sigmoid(torch.from_numpy(logits)).numpy()

# 7.4 Sweep thresholds
best_thr, best_f1 = 0.5, 0.0
for thr in np.arange(0.1, 0.91, 0.05):
    y_pred = (probs > thr).astype(int)
    f1     = f1_score(y_true, y_pred, average="micro", zero_division=0)
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"Best threshold = {best_thr:.2f}, micro-F1 = {best_f1:.4f}")

# 7.5 Report precision & recall at that threshold
y_pred_best = (probs > best_thr).astype(int)
prec = precision_score(y_true, y_pred_best, average="micro", zero_division=0)
rec  = recall_score(y_true, y_pred_best, average="micro", zero_division=0)
print(f"At threshold {best_thr:.2f} → micro-precision: {prec:.4f}, micro-recall: {rec:.4f}, micro-F1: {best_f1:.4f}")


Best threshold = 0.20, micro-F1 = 0.4444
At threshold 0.20 → micro-precision: 0.4395, micro-recall: 0.4494, micro-F1: 0.4444


In [None]:
# Cell 6b‐LM: Memory‐Safe Scratch‐BERT Multi‐Label Training (on a 20k subset)

# 1. Imports & disable WandB
import os
os.environ["WANDB_DISABLED"] = "true"

import numpy as np
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import Dataset
from transformers import (
    BertConfig, BertForSequenceClassification,
    AutoTokenizer, Trainer, TrainingArguments
)

# 2. Subsample to 20K examples to fit in RAM
merged_small = merged.sample(n=20000, random_state=42)
X = merged_small["TEXT"].tolist()
Y = MultiLabelBinarizer().fit_transform(merged_small["ICD9_CODE"].tolist()).astype(np.float32)

# 3. Train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# 4. Build HF Datasets & tokenize with shorter max_length
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_fn(batch):
    return tokenizer(batch["text"],
                     padding="max_length",
                     truncation=True,
                     max_length=128)  # shorter

train_ds = Dataset.from_dict({"text": X_train, "labels": Y_train.tolist()})
test_ds  = Dataset.from_dict({"text": X_test,  "labels": Y_test.tolist()})
train_tok = train_ds.map(tokenize_fn, batched=True)
test_tok  = test_ds.map(tokenize_fn,  batched=True)
train_tok.set_format("torch", ["input_ids","attention_mask","labels"])
test_tok.set_format("torch",  ["input_ids","attention_mask","labels"])

# 5. Build a tiny BERT config & model from scratch
config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=128,
    num_hidden_layers=3,
    num_attention_heads=4,
    intermediate_size=512,
    max_position_embeddings=128,
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
)
model_scratch = BertForSequenceClassification(config).to("cuda")

# 6. Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    probs = torch.sigmoid(torch.from_numpy(logits))
    preds = (probs > 0.5).int().numpy()
    return {
        "micro_precision": precision_score(labels, preds, average="micro", zero_division=0),
        "micro_recall":    recall_score(labels, preds, average="micro", zero_division=0),
        "micro_f1":        f1_score(labels, preds, average="micro", zero_division=0),
    }

# 7. TrainingArguments: small batch, gradient accumulation, fp16
training_args_scratch = TrainingArguments(
    output_dir="results/bert_scratch_multilabel",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,    # effective batch size = 8
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,                        # use half precision
    logging_steps=50,
    save_total_limit=1,
    report_to=[]
)

trainer_scratch = Trainer(
    model=model_scratch,
    args=training_args_scratch,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    compute_metrics=compute_metrics
)

# 8. Train & evaluate
trainer_scratch.train()
metrics_scratch = trainer_scratch.evaluate()
print("Scratch-BERT multi-label micro-metrics:", metrics_scratch)


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Step,Training Loss
50,0.6613
100,0.56
150,0.4379
200,0.3485
250,0.2959
300,0.2692
350,0.2517
400,0.2408
450,0.2286
500,0.225


Scratch-BERT multi-label micro-metrics: {'eval_loss': 0.201567143201828, 'eval_micro_precision': 0.0, 'eval_micro_recall': 0.0, 'eval_micro_f1': 0.0, 'eval_runtime': 2.5712, 'eval_samples_per_second': 1555.689, 'eval_steps_per_second': 194.461, 'epoch': 5.0}


In [None]:
# Cell 7b: Threshold Tuning for Scratch-BERT

import numpy as np
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

# 7b.1 Get logits & true labels
pred_out = trainer_scratch.predict(test_tok)
logits   = pred_out.predictions
y_true   = pred_out.label_ids
probs    = torch.sigmoid(torch.from_numpy(logits)).numpy()

# 7b.2 Sweep thresholds
best_thr, best_f1 = 0.5, 0.0
for thr in np.arange(0.1, 0.91, 0.05):
    y_pred = (probs > thr).astype(int)
    f1     = f1_score(y_true, y_pred, average="micro", zero_division=0)
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"Best threshold = {best_thr:.2f}, micro-F1 = {best_f1:.4f}")

prec = precision_score(y_true, (probs>best_thr).astype(int), average="micro", zero_division=0)
rec  = recall_score(y_true, (probs>best_thr).astype(int), average="micro", zero_division=0)
print(f"At thr={best_thr:.2f} → precision: {prec:.4f}, recall: {rec:.4f}")


Best threshold = 0.15, micro-F1 = 0.2735
At thr=0.15 → precision: 0.2524, recall: 0.2984


In [None]:
# Cell 8b: Save Scratch-BERT Model & Tokenizer to Google Drive

from pathlib import Path

drive_path = "/content/drive/MyDrive/mimic_data/bert_scratch_multilabel"
Path(drive_path).mkdir(parents=True, exist_ok=True)

trainer_scratch.save_model(drive_path)
tokenizer.save_pretrained(drive_path)

print("✅ Scratch-BERT multi-label model saved to:", drive_path)


✅ Scratch-BERT multi-label model saved to: /content/drive/MyDrive/mimic_data/bert_scratch_multilabel


In [None]:
# Cell 7c: Multi-Label ICD-9 Classification with Pretrained GPT-2

# 7c.1 Install & imports
!pip install -q transformers datasets scikit-learn

import numpy as np
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoConfig,
    GPT2ForSequenceClassification,
    Trainer, TrainingArguments
)

# 7c.2 Prepare multi-label data (use full merged)
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(merged["ICD9_CODE"].tolist()).astype(np.float32)
X = merged["TEXT"].tolist()

# 7c.3 Train/validation split (for multi-label, we'll tune threshold on val later)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# 7c.4 Build HF Datasets & tokenize
train_ds = Dataset.from_dict({"text": X_train, "labels": Y_train.tolist()})
test_ds  = Dataset.from_dict({"text": X_test,  "labels": Y_test.tolist()})

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad by default

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

train_tok = train_ds.map(tokenize_fn, batched=True)
test_tok  = test_ds.map(tokenize_fn,  batched=True)
train_tok.set_format("torch", columns=["input_ids","attention_mask","labels"])
test_tok.set_format("torch",  columns=["input_ids","attention_mask","labels"])

# 7c.5 Load GPT-2 with a multi-label head
config = AutoConfig.from_pretrained(
    "gpt2",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
    pad_token_id=tokenizer.eos_token_id
)
model = GPT2ForSequenceClassification.from_pretrained("gpt2", config=config).to("cuda")

# 7c.6 Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    probs = torch.sigmoid(torch.from_numpy(logits))
    preds = (probs > 0.5).int().numpy()
    return {
        "micro_precision": precision_score(labels, preds, average="micro", zero_division=0),
        "micro_recall":    recall_score(labels, preds, average="micro", zero_division=0),
        "micro_f1":        f1_score(labels, preds, average="micro", zero_division=0),
    }

# 7c.7 TrainingArguments & Trainer
training_args = TrainingArguments(
    output_dir="results/gpt2_multilabel",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,   # effective batch=8
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    weight_decay=0.01,
    fp16=True,
    logging_steps=50,
    save_total_limit=1,
    report_to=[]
)

trainer_gpt2 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    compute_metrics=compute_metrics
)

# 7c.8 Train & evaluate
trainer_gpt2.train()
metrics_gpt2 = trainer_gpt2.evaluate()
print("Pretrained GPT-2 multi-label metrics:", metrics_gpt2)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/40500 [00:00<?, ? examples/s]

Map:   0%|          | 0/10125 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.6422
100,0.2126
150,0.2068
200,0.2101
250,0.2095
300,0.2112
350,0.212
400,0.2044
450,0.2032
500,0.2055


Pretrained GPT-2 multi-label metrics: {'eval_loss': 0.15306520462036133, 'eval_micro_precision': 0.6916261666164896, 'eval_micro_recall': 0.27023711496126507, 'eval_micro_f1': 0.3886270813697769, 'eval_runtime': 66.015, 'eval_samples_per_second': 153.374, 'eval_steps_per_second': 19.177, 'epoch': 3.0}


In [None]:
# Cell 8c: Threshold Tuning & Evaluation for Pretrained GPT-2 Fine-Tuned Multi-Label

import numpy as np
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

# 8c.1 Get logits & true labels on the test split from Section 7c
pred_out = trainer_gpt2.predict(test_tok)   # uses  Trainer from the GPT-2 fine-tune cell
logits   = pred_out.predictions             # shape (N, num_labels)
y_true   = pred_out.label_ids               # same shape

# 8c.2 Convert to probabilities
probs = torch.sigmoid(torch.from_numpy(logits)).numpy()

# 8c.3 Sweep thresholds and pick best by micro-F1
best_thr, best_f1 = 0.5, 0.0
for thr in np.arange(0.1, 0.91, 0.05):
    y_pred = (probs > thr).astype(int)
    f1     = f1_score(y_true, y_pred, average="micro", zero_division=0)
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"🔍 Pretrained GPT-2 best threshold = {best_thr:.2f}, micro-F1 = {best_f1:.4f}")

# 8c.4 Report precision & recall at best threshold
y_pred_best = (probs > best_thr).astype(int)
prec = precision_score(y_true, y_pred_best, average="micro", zero_division=0)
rec  = recall_score(y_true, y_pred_best, average="micro", zero_division=0)
print(f"At thr={best_thr:.2f} → micro-precision: {prec:.4f}, micro-recall: {rec:.4f}")


🔍 Pretrained GPT-2 best threshold = 0.25, micro-F1 = 0.4518
At thr=0.25 → micro-precision: 0.4680, micro-recall: 0.4368


In [None]:
# Cell 10c: Save Pretrained GPT-2 Fine-Tuned Multi-Label Model to Google Drive

from pathlib import Path

# 1. Define target directory in Drive
drive_path = Path("/content/drive/MyDrive/mimic_data/gpt2_multilabel")
drive_path.mkdir(parents=True, exist_ok=True)

# 2. Save model & tokenizer
trainer_gpt2.save_model(drive_path)       # saves model weights + config
tokenizer.save_pretrained(drive_path)     # saves tokenizer files

print(f"✅ GPT-2 fine-tuned model saved to: {drive_path}")


✅ GPT-2 fine-tuned model saved to: /content/drive/MyDrive/mimic_data/gpt2_multilabel


In [None]:
# Cell 7d: Multi-Label ICD-9 Classification with Tiny GPT-2 Trained From Scratch

# 7d.1 Install & imports
!pip install -q transformers datasets scikit-learn

import numpy as np
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import Dataset
from transformers import (
    GPT2Config, GPT2ForSequenceClassification,
    AutoTokenizer, Trainer, TrainingArguments
)

# 7d.2 Subsample 20k for scratch training
merged_small = merged.sample(n=20000, random_state=42)
X = merged_small["TEXT"].tolist()
Y = MultiLabelBinarizer().fit_transform(merged_small["ICD9_CODE"].tolist()).astype(np.float32)

# 7d.3 Split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# 7d.4 Build & tokenize
train_ds = Dataset.from_dict({"text": X_train, "labels": Y_train.tolist()})
test_ds  = Dataset.from_dict({"text": X_test,  "labels": Y_test.tolist()})

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_tok = train_ds.map(tokenize_fn, batched=True)
test_tok  = test_ds.map(tokenize_fn,  batched=True)
train_tok.set_format("torch", columns=["input_ids","attention_mask","labels"])
test_tok.set_format("torch",  columns=["input_ids","attention_mask","labels"])

# 7d.5 Build a tiny GPT-2 config & model from scratch
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=128,
    n_ctx=128,
    n_embd=128,
    n_layer=3,
    n_head=4,
    num_labels=Y.shape[1],
    problem_type="multi_label_classification",
    pad_token_id=tokenizer.eos_token_id
)
model_scratch_gpt2 = GPT2ForSequenceClassification(config).to("cuda")

# 7d.6 Metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    probs = torch.sigmoid(torch.from_numpy(logits))
    preds = (probs > 0.5).int().numpy()
    return {
        "micro_precision": precision_score(labels, preds, average="micro", zero_division=0),
        "micro_recall":    recall_score(labels, preds, average="micro", zero_division=0),
        "micro_f1":        f1_score(labels, preds, average="micro", zero_division=0),
    }

# 7d.7 TrainingArguments & Trainer
training_args_scratch = TrainingArguments(
    output_dir="results/gpt2_scratch_multilabel",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    logging_steps=100,
    save_total_limit=1,
    report_to=[]
)

trainer_scratch_gpt2 = Trainer(
    model=model_scratch_gpt2,
    args=training_args_scratch,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    compute_metrics=compute_metrics
)

# 7d.8 Train & evaluate
trainer_scratch_gpt2.train()
metrics_scratch_gpt2 = trainer_scratch_gpt2.evaluate()
print("Scratch GPT-2 multi-label metrics:", metrics_scratch_gpt2)


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Step,Training Loss
100,0.5221
200,0.3718
300,0.2924
400,0.2557
500,0.233
600,0.22
700,0.2146
800,0.2109
900,0.2051
1000,0.2056


Scratch GPT-2 multi-label metrics: {'eval_loss': 0.18259835243225098, 'eval_micro_precision': 0.6295387634936211, 'eval_micro_recall': 0.10755302204711208, 'eval_micro_f1': 0.1837187656619174, 'eval_runtime': 3.2463, 'eval_samples_per_second': 1232.186, 'eval_steps_per_second': 154.023, 'epoch': 5.0}


In [None]:
# Cell 9c: Threshold Tuning & Evaluation for Tiny GPT-2 Trained From Scratch

import numpy as np
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

# 9c.1 Get logits & true labels on the test split from Section 7d
pred_out = trainer_scratch_gpt2.predict(test_tok)
logits   = pred_out.predictions
y_true   = pred_out.label_ids

# 9c.2 Convert to probabilities
probs = torch.sigmoid(torch.from_numpy(logits)).numpy()

# 9c.3 Sweep thresholds for best micro-F1
best_thr_s, best_f1_s = 0.5, 0.0
for thr in np.arange(0.1, 0.91, 0.05):
    y_pred = (probs > thr).astype(int)
    f1     = f1_score(y_true, y_pred, average="micro", zero_division=0)
    if f1 > best_f1_s:
        best_f1_s, best_thr_s = f1, thr

print(f"🔍 Scratch GPT-2 best threshold = {best_thr_s:.2f}, micro-F1 = {best_f1_s:.4f}")

# 9c.4 Report precision & recall at that threshold
y_pred_best_s = (probs > best_thr_s).astype(int)
prec_s = precision_score(y_true, y_pred_best_s, average="micro", zero_division=0)
rec_s  = recall_score(y_true, y_pred_best_s, average="micro", zero_division=0)
print(f"At thr={best_thr_s:.2f} → micro-precision: {prec_s:.4f}, micro-recall: {rec_s:.4f}")


🔍 Scratch GPT-2 best threshold = 0.15, micro-F1 = 0.3389
At thr=0.15 → micro-precision: 0.2921, micro-recall: 0.4035


In [None]:
# Cell 11c: Save Scratch GPT-2 Multi-Label Model to Google Drive

from pathlib import Path

# 1. Define target directory in Drive
drive_path = Path("/content/drive/MyDrive/mimic_data/gpt2_scratch_multilabel")
drive_path.mkdir(parents=True, exist_ok=True)

# 2. Save model & tokenizer
trainer_scratch_gpt2.save_model(drive_path)
tokenizer.save_pretrained(drive_path)

print(f"✅ Scratch GPT-2 model saved to: {drive_path}")


✅ Scratch GPT-2 model saved to: /content/drive/MyDrive/mimic_data/gpt2_scratch_multilabel


In [None]:
# Cell A (corrected): Bio_ClinicalBERT Multi‐Label Training with Enhanced Head

# 1) Install & imports
!pip install -q transformers datasets scikit-learn

import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoConfig,
    BertForSequenceClassification,
    Trainer, TrainingArguments
)

# 2) Prepare data
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(merged["ICD9_CODE"].tolist()).astype(np.float32)
X = merged["TEXT"].tolist()

# 3) 3‐way split
X_trainval, X_test,  Y_trainval, Y_test  = train_test_split(X, Y, test_size=0.20, random_state=42)
X_train,    X_val,   Y_train,    Y_val    = train_test_split(X_trainval, Y_trainval, test_size=0.25, random_state=42)

# 4) Build HF Datasets
train_ds = Dataset.from_dict({"text": X_train, "labels": Y_train.tolist()})
val_ds   = Dataset.from_dict({"text": X_val,   "labels": Y_val.tolist()})

# 5) Tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

def tokenize_fn(batch):
    return tokenizer(batch["text"],
                     padding="max_length",
                     truncation=True,
                     max_length=256)

# 6) Map & set_format separately
train_tok = train_ds.map(tokenize_fn, batched=True)
train_tok.set_format("torch", columns=["input_ids","attention_mask","labels"])

val_tok = val_ds.map(tokenize_fn, batched=True)
val_tok.set_format("torch", columns=["input_ids","attention_mask","labels"])

# 7) Sanity check lengths
print(f"train_tok length = {len(train_tok)} examples")
print(f"val_tok   length = {len(val_tok)} examples")

# 8) Load model & attach EnhancedHead
config = AutoConfig.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=Y.shape[1],
    problem_type="multi_label_classification"
)
model = BertForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT", config=config
)

class EnhancedHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.fc1     = nn.Linear(config.hidden_size, config.hidden_size // 2)
        self.act     = nn.ReLU()
        self.out     = nn.Linear(config.hidden_size // 2, config.num_labels)
    def forward(self, features, **kwargs):
        x = self.dropout(features)
        x = self.act(self.fc1(x))
        return self.out(x)

model.classifier = EnhancedHead(config).to("cuda")

# 9) Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    probs = torch.sigmoid(torch.from_numpy(logits))
    preds = (probs > 0.5).int().numpy()
    return {
        "micro_precision": precision_score(labels, preds, average="micro", zero_division=0),
        "micro_recall":    recall_score(labels, preds, average="micro", zero_division=0),
        "micro_f1":        f1_score(labels, preds, average="micro", zero_division=0),
    }

# 10) Trainer & train
training_args = TrainingArguments(
    output_dir="results/clinicalbert_multilabel",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=1,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics
)

trainer.train()


Map:   0%|          | 0/30375 [00:00<?, ? examples/s]

Map:   0%|          | 0/10125 [00:00<?, ? examples/s]

train_tok length = 30375 examples
val_tok   length = 10125 examples


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.599
100,0.4197
150,0.2833
200,0.2251
250,0.2172
300,0.2127
350,0.2053
400,0.2005
450,0.2083
500,0.2049


TrainOutput(global_step=11391, training_loss=0.1839593316011962, metrics={'train_runtime': 1615.6218, 'train_samples_per_second': 56.402, 'train_steps_per_second': 7.051, 'total_flos': 1.2034502947584e+16, 'train_loss': 0.1839593316011962, 'epoch': 3.0})

In [None]:
# Cell B: Threshold Tuning on the Validation Set

import numpy as np
import torch
from sklearn.metrics import f1_score

# 1. Get logits & true labels on val
pred_out = trainer.predict(val_tok)
logits   = pred_out.predictions
y_val    = pred_out.label_ids
probs_val = torch.sigmoid(torch.from_numpy(logits)).numpy()

# 2. Sweep thresholds for best micro-F1
best_thr, best_f1 = 0.5, 0.0
for thr in np.arange(0.1, 0.91, 0.05):
    preds = (probs_val > thr).astype(int)
    f1    = f1_score(y_val, preds, average="micro", zero_division=0)
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"🔍 Best validation threshold = {best_thr:.2f}, micro-F1 = {best_f1:.4f}")


🔍 Best validation threshold = 0.20, micro-F1 = 0.3801


In [None]:
#Acknowledgement: Chatgpt was used for formatting the notebook.