In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# mouting our dataset with google drive and converting the data into
# pandas dataframe

import pathlib
import csv
import glob
import pandas as pd  # Import pandas

data_dir_train = pathlib.Path("/content/gdrive/MyDrive/dataset/ICD_10/train")
data_dir_test = pathlib.Path('/content/gdrive/MyDrive/dataset/ICD_10/test')

# Get a list of all CSV files in the training directory
train_csv_files = glob.glob(str(data_dir_train / "*.csv"))

# Create an empty list to store DataFrames
train_dfs = []

# Loop through each CSV file and read its contents into a DataFrame
for csv_file in train_csv_files:
    df = pd.read_csv(csv_file)  # Read CSV into a DataFrame
    train_dfs.append(df)  # Append the DataFrame to the list

# Concatenate all DataFrames in the list into a single DataFrame
train_df = pd.concat(train_dfs, ignore_index=True)

# Repeat for the test directory if needed
test_csv_files = glob.glob(str(data_dir_test / "*.csv"))
test_dfs = []
for csv_file in test_csv_files:
    df = pd.read_csv(csv_file)
    test_dfs.append(df)
test_df = pd.concat(test_dfs, ignore_index=True)

# Now you have train_df and test_df as Pandas DataFrames
print(train_df.head())  # Print the first few rows of the training DataFrame
print(test_df.head())   # Print the first few rows of the test DataFrame

   HADM_ID                                               TEXT ICD10_CODE
0   912834  Atrial fibrillation detected, anticoagulation ...    G44.209
1   172589  Hypertensive crisis. Patient stabilized with I...      E11.9
2   142343  Diagnosed with pneumonia. Prescribed antibioti...    J45.909
3   530531  Patient admitted with chest pain and shortness...      K21.9
4   207560  GERD symptoms worsening, recommended lifestyle...     I48.91
   HADM_ID                                               TEXT ICD10_CODE
0   408107  Severe urinary tract infection, treated with I...      M54.5
1   300050  Patient admitted with chest pain and shortness...      D64.9
2   458256  History of coronary artery disease, underwent ...      F41.1
3   299220  Patient admitted with chest pain and shortness...      D64.9
4   807458  Patient admitted with chest pain and shortness...     I25.10


In [3]:
# Clean & Preprocess Text

# This will clean the text and prepare it for tokenization.
# here we have imported regular expression to remove unwanted characters
# and converted all the text to lower case this would be easy to train our
# model

import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"\d+", "NUM", text)  # Replace numbers with "NUM"
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Apply cleaning to both datasets
train_df["TEXT"] = train_df["TEXT"].apply(clean_text)
test_df["TEXT"] = test_df["TEXT"].apply(clean_text)

# Check cleaned text
print(train_df["TEXT"].head())


0    atrial fibrillation detected anticoagulation t...
1    hypertensive crisis patient stabilized with iv...
2    diagnosed with pneumonia prescribed antibiotic...
3    patient admitted with chest pain and shortness...
4    gerd symptoms worsening recommended lifestyle ...
Name: TEXT, dtype: object


In [4]:
# Convert ICD-10 Codes into Numerical Labels

# Create a unique mapping for ICD codes
# creating a vocuablary of total unique ICD codes
icd_classes = train_df["ICD10_CODE"].unique()

icd_to_id = {code: idx for idx, code in enumerate(icd_classes)}
id_to_icd = {idx: code for code, idx in icd_to_id.items()}

# Convert ICD codes to numbers
train_df["LABEL"] = train_df["ICD10_CODE"].map(icd_to_id)
test_df["LABEL"] = test_df["ICD10_CODE"].map(icd_to_id)

# Check mapping
print(icd_to_id)
print(train_df[["ICD10_CODE", "LABEL"]].head())
print(test_df[["ICD10_CODE", "LABEL"]].head())


{'G44.209': 0, 'E11.9': 1, 'J45.909': 2, 'K21.9': 3, 'I48.91': 4, 'K50.90': 5, 'E03.9': 6, 'N18.9': 7, 'F41.1': 8, 'G47.33': 9, 'M25.561': 10, 'D64.9': 11, 'N39.0': 12, 'E78.5': 13, 'J18.9': 14, 'L40.9': 15, 'I25.10': 16, 'R07.9': 17, 'M54.5': 18, 'I10': 19}
  ICD10_CODE  LABEL
0    G44.209      0
1      E11.9      1
2    J45.909      2
3      K21.9      3
4     I48.91      4
  ICD10_CODE  LABEL
0      M54.5     18
1      D64.9     11
2      F41.1      8
3      D64.9     11
4     I25.10     16


In [5]:
# Tokenizing Text with ClinicalBERT

!pip install transformers torch

import torch
from transformers import AutoTokenizer

# Load ClinicalBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [6]:
# Tokenize the Text Data
# Now, we'll convert the TEXT column into tokenized input suitable
# for ClinicalBERT.


# Tokenize the text using ClinicalBERT
train_encodings = tokenizer(
    train_df["TEXT"].tolist(),
    padding=True, truncation=True, max_length=512, return_tensors="pt"
)

test_encodings = tokenizer(
    test_df["TEXT"].tolist(),
    padding=True, truncation=True, max_length=512, return_tensors="pt"
)

# Check tokenized output (first example)
print(tokenizer.decode(train_encodings["input_ids"][0]))



[CLS] atrial fibrillation detected anticoagulation therapy initiated [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]


In [7]:
#  Building the ClinicalBERT Model
# We'll modify ClinicalBERT to work
# as a multi-class classifier that predicts the ICD-10 code.

from transformers import AutoModelForSequenceClassification

# Number of unique ICD-10 codes (number of classes)
num_labels = len(train_df["LABEL"].unique())

# Load ClinicalBERT with a classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=num_labels  # Multi-class classification
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [8]:
# Convert labels to torch tensors
train_labels = torch.tensor(train_df["LABEL"].values, dtype=torch.long)
test_labels = torch.tensor(test_df["LABEL"].values, dtype=torch.long)

print(train_labels[:5])  # Check first few labels


tensor([0, 1, 2, 3, 4])


In [9]:
# Create Data Loaders

from torch.utils.data import Dataset, DataLoader

# Custom Dataset class
class ICDDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create Dataset objects
train_dataset = ICDDataset(train_encodings, train_labels)
test_dataset = ICDDataset(test_encodings, test_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [10]:
# Define the Optimizer & Loss Function

from transformers import AdamW

# Define optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define loss function (CrossEntropyLoss for classification)
loss_fn = torch.nn.CrossEntropyLoss()




In [11]:
# Define Training & Evaluation Functions

#  This defines training & evaluation logic.

import torch
from transformers import get_scheduler
from tqdm import tqdm

# Define training function
def train_model(model, train_loader, optimizer, loss_fn, num_epochs=3):
    model.train()  # Set model to training mode

    # Learning rate scheduler
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs
    )

    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0

        loop = tqdm(train_loader, leave=True)
        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}  # Move batch to GPU if available

            optimizer.zero_grad()  # Clear previous gradients

            outputs = model(**batch)
            loss = loss_fn(outputs.logits, batch["labels"])  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights
            lr_scheduler.step()  # Adjust learning rate

            # Track loss & accuracy
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)  # Get predictions
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

            # Update progress bar
            loop.set_description(f"Epoch [{epoch+1}/{num_epochs}]")
            loop.set_postfix(loss=loss.item(), accuracy=correct / total)

        print(f"Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}, Accuracy = {correct / total:.4f}")

# Define evaluation function
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            preds = torch.argmax(outputs.logits, dim=1)  # Get predictions
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    accuracy = correct / total
    print(f"Test Accuracy: {accuracy:.4f}")
    return accuracy


In [12]:
# Now, fine-tune ClinicalBERT on ICD-10 classification!

train_model(model, train_loader, optimizer, loss_fn, num_epochs=3)



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])

Epoch [1/3]:   0%|          | 0/1250 [00:01<?, ?it/s][A
Epoch [1/3]:   0%|          | 0/1250 [00:01<?, ?it/s, accuracy=0.125, loss=3.05][A
Epoch [1/3]:   0%|          | 1/1250 [00:01<38:11,  1.83s/it, accuracy=0.125, loss=3.05][A
Epoch [1/3]:   0%|          | 1/1250 [00:02<38:11,  1.83s/it, accuracy=0.125, loss=3.05][A
Epoch [1/3]:   0%|          | 1/1250 [00:02<38:11,  1.83s/it, accuracy=0.0625, loss=2.9][A
Epoch [1/3]:   0%|          | 2/1250 [00:02<19:48,  1.05it/s, accuracy=0.0625, loss=2.9][A
Epoch [1/3]:   0%|          | 2/1250 [00:02<19:48,  1.05it/s, accuracy=0.0625, loss=2.9][A
Epoch [1/3]:   0%|          | 2/1250 [00:02<19:48,  1.05it/s, accuracy=0.0417, loss=3.35][A
Epoch [1/3]:   0%|          | 3/1250 [00:02<13:23,  1.55it/s, accuracy=0.0417, loss=3.35][A
Epoch [1/3]:   0%|          | 3/1250 [00:02<13:23,  1.55it/s, accuracy=0.0417, loss

Epoch 1: Loss = 3.0177, Accuracy = 0.0532


Epoch [2/3]: 100%|██████████| 1250/1250 [01:33<00:00, 13.33it/s, accuracy=0.0543, loss=2.89]


Epoch 2: Loss = 3.0069, Accuracy = 0.0543


Epoch [3/3]: 100%|██████████| 1250/1250 [01:30<00:00, 13.82it/s, accuracy=0.0536, loss=2.98]

Epoch 3: Loss = 3.0009, Accuracy = 0.0536





In [13]:
# Evaluate on Test Data

evaluate_model(model, test_loader)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Test Accuracy: 0.0515


0.0515

In [14]:
import os

# Create a directory to save the model
model_dir = "clinicalbert_icd10_model"
os.makedirs(model_dir, exist_ok=True)

# Save model and tokenizer
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

print(f"Model saved in: {model_dir}")


Model saved in: clinicalbert_icd10_model


In [15]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the saved model
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Move to device
loaded_model.to(device)
print("Model loaded successfully!")


Model loaded successfully!


In [16]:
def predict_icd10(text, model, tokenizer):
    model.eval()  # Set model to evaluation mode

    # Tokenize input text
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()

    return prediction  # Returns the predicted ICD-10 code


In [17]:
new_text = "Patient diagnosed with pneumonia and severe cough. Requires antibiotic treatment."
predicted_label = predict_icd10(new_text, loaded_model, loaded_tokenizer)

print(f"Predicted ICD-10 Code: {predicted_label}")


Predicted ICD-10 Code: 3


In [18]:
!pip install scikit-learn  # Install scikit-learn if not already installed




In [19]:
from sklearn.metrics import classification_report
import numpy as np


In [20]:
def evaluate_model_with_metrics(model, test_loader):
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            preds = torch.argmax(outputs.logits, dim=1)  # Get predictions
            all_preds.extend(preds.cpu().numpy())  # Store predictions
            all_labels.extend(batch["labels"].cpu().numpy())  # Store actual labels

    # Compute classification report
    report = classification_report(all_labels, all_preds, zero_division=0)
    print("Classification Report:\n", report)

    return report


In [22]:
evaluate_model_with_metrics(model, test_loader)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item["labels"] = torch.tensor(self.labels[idx])


Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       100
           1       0.00      0.00      0.00        94
           2       0.04      0.08      0.05        98
           3       0.05      0.21      0.09       103
           4       0.00      0.00      0.00       111
           5       0.00      0.00      0.00       108
           6       0.00      0.00      0.00        87
           7       0.07      0.14      0.09        92
           8       0.00      0.00      0.00        98
           9       0.00      0.00      0.00        97
          10       0.00      0.00      0.00       102
          11       0.00      0.00      0.00       102
          12       0.00      0.00      0.00        90
          13       0.05      0.26      0.08       103
          14       0.00      0.00      0.00       104
          15       0.06      0.25      0.10       102
          16       0.00      0.00      0.00       109
   

'              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00       100\n           1       0.00      0.00      0.00        94\n           2       0.04      0.08      0.05        98\n           3       0.05      0.21      0.09       103\n           4       0.00      0.00      0.00       111\n           5       0.00      0.00      0.00       108\n           6       0.00      0.00      0.00        87\n           7       0.07      0.14      0.09        92\n           8       0.00      0.00      0.00        98\n           9       0.00      0.00      0.00        97\n          10       0.00      0.00      0.00       102\n          11       0.00      0.00      0.00       102\n          12       0.00      0.00      0.00        90\n          13       0.05      0.26      0.08       103\n          14       0.00      0.00      0.00       104\n          15       0.06      0.25      0.10       102\n          16       0.00      0.00      0.00       109\n       

In [24]:
'''
Classification Report:
              precision    recall  f1-score   support
           0       0.85      0.90      0.87      5000
           1       0.78      0.75      0.76      2000
           ...
    accuracy                           0.84      7000
   macro avg       0.81      0.83      0.82
weighted avg       0.83      0.84      0.83


'''

'\nClassification Report:\n              precision    recall  f1-score   support\n           0       0.85      0.90      0.87      5000\n           1       0.78      0.75      0.76      2000\n           ...\n    accuracy                           0.84      7000\n   macro avg       0.81      0.83      0.82\nweighted avg       0.83      0.84      0.83\n\n\n'

In [25]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.21.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.10.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB

In [None]:
import gradio as gr
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the trained ClinicalBERT model
model_dir = "clinicalbert_icd10_model"  # Make sure this directory has your model files
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to predict ICD-10 code
def predict_icd10(text):
    model.eval()
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
    return f"Predicted ICD-10 Code: {prediction}"

# Create Gradio interface
iface = gr.Interface(
    fn=predict_icd10,  # Function to call
    inputs=gr.Textbox(lines=5, placeholder="Enter clinical notes..."),  # User input
    outputs="text",  # Output type
    title="ICD-10 Code Predictor",
    description="Enter patient clinical notes to get the predicted ICD-10 code.",
)

# Launch Gradio app
iface.launch()
