In [1]:
# Install required libraries
%pip install transformers datasets scikit-learn pandas torch --quiet


In [3]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split
import torch

In [4]:
# Upload your expanded_noisy_training_data.csv file before running this cell
df = pd.read_excel("/content/sample_data/icd10_noisy_descriptions_long.xlsx").dropna()
df = df[["noisy_text", "clean_text"]]
# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
#get random 15000 records
df = df.sample(n=15000, random_state=42).reset_index(drop=True)
df.head()


Unnamed: 0,noisy_text,clean_text
0,Type 1 diabetes mellitus with unspecified diab...,Type 1 diabetes mellitus with unspecified diab...
1,Atherosclerotic heart disease of native corona...,Atherosclerotic heart disease of native corona...
2,"Toxic effect of othek peticndes, intentional s...","Toxic effect of other pesticides, intentional ..."
3,Malignant neoplasm offleft upper limb,Malignant neoplasm of left upper limb
4,yiffuse traumatic brain injury with loss of co...,Diffuse traumatic brain injury with loss of co...


In [5]:
# Split into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [6]:

from transformers import EncoderDecoderModel, BertTokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import torch
# Define the maximum input and target lengths
# Load tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("emilyalsentzer/Bio_ClinicalBERT", "emilyalsentzer/Bio_ClinicalBERT")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.

In [7]:

# Tokenization
def tokenize_function(example):
    input_encodings = tokenizer(example["noisy_text"], padding="max_length", truncation=True, max_length=64)
    target_encodings = tokenizer(example["clean_text"], padding="max_length", truncation=True, max_length=64)
    input_encodings["labels"] = target_encodings["input_ids"]
    return input_encodings


In [8]:
from sklearn.model_selection import train_test_split

train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

tokenized_train = train_dataset.map(tokenize_function, batched=False)
tokenized_eval = eval_dataset.map(tokenize_function, batched=False)


Map:   0%|          | 0/13500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest",  # or "max_length"
    return_tensors="pt"

)


In [13]:
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = Seq2SeqTrainingArguments(
    output_dir="./model_output_final",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    report_to="none"   # disable wandb / tensorboard / comet
)



In [14]:

# Set decoder_start_token_id and pad_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id


# Trainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator
)



trainer.train()
# Save the model and tokenizer
model.save_pretrained("./clinicalbert_noisy_text_correction_large_3")
tokenizer.save_pretrained("./clinicalbert_noisy_text_correction_large_3")

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,0.1182,0.087797
2,0.0533,0.060136
3,0.0321,0.064538
4,0.0214,0.081845
5,0.0167,0.117432
6,0.0125,0.093527
7,0.0106,0.142622
8,0.0093,0.171575
9,0.0082,0.201133
10,0.0067,0.182581


There were missing keys in the checkpoint model loaded: ['decoder.cls.predictions.decoder.weight', 'decoder.cls.predictions.decoder.bias'].


('./clinicalbert_noisy_text_correction_large_3/tokenizer_config.json',
 './clinicalbert_noisy_text_correction_large_3/special_tokens_map.json',
 './clinicalbert_noisy_text_correction_large_3/vocab.txt',
 './clinicalbert_noisy_text_correction_large_3/added_tokens.json',
 './clinicalbert_noisy_text_correction_large_3/tokenizer.json')

In [15]:
import pandas as pd
import torch
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.metrics import f1_score, recall_score, accuracy_score, confusion_matrix

def normalize(text):
    # Lowercase, remove spaces around punctuation, collapse multiple spaces
    text = text.lower()
    text = re.sub(r'\s*([-,])\s*', r'\1', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def evaluate_seq2seq_model(eval_df, model_path, noisy_col="noisy_text", clean_col="clean_text", sample_size=200, batch_size=64):
    # Shuffle and randomly select samples for evaluation
    eval_df = eval_df.sample(frac=1, random_state=42).reset_index(drop=True)
    texts = [normalize(t) for t in eval_df[noisy_col].tolist()]
    true_labels = [normalize(t) for t in eval_df[clean_col].tolist()]
    sample_indices = pd.Series(texts).sample(n=sample_size, random_state=42).index
    texts = [texts[i] for i in sample_indices]
    true_labels = [true_labels[i] for i in sample_indices]

    # Load the saved seq2seq model and tokenizer
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Move model to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Batch prediction
    decoded_preds = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                decoder_start_token_id=model.config.decoder_start_token_id,
                pad_token_id=model.config.pad_token_id,
                max_length=128,
                num_beams=1
            )
        decoded_preds.extend([normalize(tokenizer.decode(output, skip_special_tokens=True)) for output in outputs])

    # Word match metrics
    y_true = []
    y_pred = []
    for true_text, pred_text in zip(true_labels, decoded_preds):
        true_words = set(true_text.split())
        pred_words = set(pred_text.split())
        match = int(true_words.issubset(pred_words))
        y_true.append(1)
        y_pred.append(match)

    # Calculate metrics
    f1 = f1_score(y_true, y_pred, average='binary')
    recall = recall_score(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    print("F1 Score:", f1)
    print("Recall:", recall)
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", cm)

    for i in range(len(true_labels)):
        print(f"True: {true_labels[i]}\nPredicted: {decoded_preds[i]}\nMatch: {y_pred[i]}")

# Example usage:
evaluate_seq2seq_model(eval_df, "./clinicalbert_noisy_text_correction_large_3")

F1 Score: 0.7341772151898734
Recall: 0.58
Accuracy: 0.58
Confusion Matrix:
 [[  0   0]
 [ 84 116]]
True: depression,unspecified
Predicted: 
Match: 0
True: congenital malformations of corpus callosum
Predicted: congenital malformations of corpus callosum
Match: 1
True: unstable burst fracture of unspecified lumbar vertebra,initial encounter for closed fracture
Predicted: unstable burst fracture of unspecified lumbar vertebra,initial encounter for closed fracture
Match: 1
True: other specified diabetes mellitus with unspecified diabetic retinopathy without macular edema
Predicted: other specified diabetes mellitus with unspecified diabetic retinopathy without macular edema
Match: 1
True: malignant neoplasm of peripheral nerves of right lower limb,including hip
Predicted: 
Match: 0
True: burkitt lymphoma,lymph nodes of multiple sites
Predicted: 
Match: 0
True: toxic effect of chlorine gas,intentional self-harm,sequela
Predicted: toxic effect of chlorine gas,intentional self-harm,sequela
M

In [None]:
import shutil, os
from pathlib import Path

NOTEBOOK_PATH = "/content/biobert-finetune 1 (2).ipynb"
MODEL_DIR     = "/content/clinicalbert_noisy_text_correction_large_3"
LOG_DIR       = "/content/logs"
STAGING       = "/content/export_to_github"

shutil.rmtree(STAGING, ignore_errors=True)
os.makedirs(STAGING, exist_ok=True)

# Copy model
shutil.copytree(MODEL_DIR, Path(STAGING)/"models"/"clinicalbert_noisy_text_correction_large_3")

# Copy notebook
if os.path.exists(NOTEBOOK_PATH):
    shutil.copy2(NOTEBOOK_PATH, STAGING)

# Copy logs
if os.path.isdir(LOG_DIR):
    shutil.copytree(LOG_DIR, Path(STAGING)/"logs", dirs_exist_ok=True)

print("✅ Staging complete →", STAGING)
