In [30]:
import pandas as pd
import torch
import string
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BertTokenizer

os.environ["WANDB_DISABLED"] = "true"  # Disable Weights & Biases logging

In [31]:
# Load dataset
df = pd.read_csv("MLHC_train_classification_2.csv")

# Text cleaning function
def clean_text(text):
    text = text.lower().translate(str.maketrans("", "", string.punctuation))  # Lowercase & remove punctuation
    text = text.replace(" ", "")  # Remove spaces
    return text

# Mapping triage levels to numbers
triage_mapping = {
    "immediate": 0,
    "emergent": 1,
    "urgent": 2,
    "semiurgent": 3,
    "nonurgent": 4
}

df["triage_value"] = df["triage_level"].astype(str).apply(clean_text).map(triage_mapping)
df.dropna(inplace=True)

In [32]:

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text_data"].tolist(), df["triage_value"].tolist(), test_size=0.2, random_state=42
)

In [33]:
# Load ClinicalBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Tokenization function
def tokenize_data(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)


In [34]:
# Tokenizing train and validation texts
train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)

In [35]:
# Define a PyTorch Dataset class
class TriageDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])  # Ensure labels are tensors
        return item

In [36]:
# Convert to PyTorch dataset format
train_dataset = TriageDataset(train_encodings, train_labels)
val_dataset = TriageDataset(val_encodings, val_labels)

In [37]:
# Define compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Get predicted class by taking the argmax over the logits
    predictions = np.argmax(logits, axis=1)
    accuracy = accuracy_score(labels, predictions)
    return {"accuracy": accuracy}

In [38]:
# Load and modify the configuration with increased dropout rates
config = AutoConfig.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=5,
)

# Load the tokenizer as before
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Load the model for a classification task with the new configuration
model = AutoModelForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", config=config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)  # Move model to the correct device

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [40]:
# Define your training arguments
training_args = TrainingArguments(
    output_dir="./clinicalbert_triage",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)



In [41]:
# Define your Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0426,0.999017,0.583691
2,0.8882,0.985125,0.593501
3,0.8736,0.986294,0.591048


TrainOutput(global_step=1224, training_loss=1.0104815952138964, metrics={'train_runtime': 2637.896, 'train_samples_per_second': 7.418, 'train_steps_per_second': 0.464, 'total_flos': 5148958929878016.0, 'train_loss': 1.0104815952138964, 'epoch': 3.0})

In [42]:
# Evaluate on validation set
results = trainer.evaluate()
print(results)

{'eval_loss': 0.986294150352478, 'eval_accuracy': 0.5910484365419988, 'eval_runtime': 34.8365, 'eval_samples_per_second': 46.819, 'eval_steps_per_second': 2.928, 'epoch': 3.0}
