In [None]:
%%capture
!pip install simple-icd-10 
!pip install simple-icd-10-cm
!pip install datasets transformers

In [None]:
import json 
import pandas as pd
import simple_icd_10_cm as icd
from datasets import load_dataset,Dataset
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn import metrics
import numpy as np
from transformers import (
    BertTokenizer
    ,BertForSequenceClassification
    ,DataCollatorWithPadding
    ,TrainingArguments
    ,Trainer
    ,pipeline
)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Load the raw ICD records and derive the hierarchy columns used as targets.
# The file is read inside a context manager so the handle is closed as soon as
# the JSON is parsed (the original `json.load(open(...))` never closed it).
with open("/content/icd_json.json", "r") as icd_file:
    df = pd.DataFrame(json.load(icd_file))

# Each `text` entry is a mapping; keep only its "codeDescription" value.
df["text"] = df.text.apply(lambda i: i.get("codeDescription"))
# First three characters of the ICD-10 code.
df["icd-l3"] = df.icd10Code.str[0:3]
# Parent of the three-character code, and that parent's description.
df["icd_group"] = df["icd-l3"].apply(icd.get_parent)
df["icd_group_description"] = df["icd_group"].apply(icd.get_description)
# One level further up: parent of the group, and its description.
df["icd_general_group"] = df["icd_group"].apply(icd.get_parent)
df["icd_general_group_description"] = df["icd_general_group"].apply(icd.get_description)

In [None]:
# Fit the encoder on the group descriptions so `label_encoder` (and its
# `classes_`) stays available to any later cell that wants it.
label_encoder = LabelEncoder()
label_encoder.fit(df["icd_group_description"])
# The label column holds the human-readable group description. The original
# cell first stored the integer codes from `fit_transform` in this column and
# then overwrote them on the very next line, so that assignment was a dead store.
df["labels"] = df["icd_group_description"]

In [None]:
# Export the text, group description and label columns as JSON Lines
# (one record per line).
df[["text", "icd_group_description", "labels"]].to_json(
    "icd_dataset.jsonl",
    orient="records",
    lines=True,
)

In [None]:
# Read the exported JSON Lines file back as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="/content/icd_dataset.jsonl",
    split="train",
)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-8ae1c4d4ea9d6a45/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-8ae1c4d4ea9d6a45/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


In [None]:
# Re-encode the "labels" column as class labels and rebind `dataset` to the result.
dataset = dataset.class_encode_column("labels")

Casting to class labels:   0%|          | 0/72 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/8 [00:00<?, ?ba/s]

In [None]:
# Load the tokenizer published with the "dmis-lab/biobert-v1.1" checkpoint.
tokenizer = BertTokenizer.from_pretrained("dmis-lab/biobert-v1.1")

In [None]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` entry of ``batch`` with the notebook-level ``tokenizer``.

    Truncation is requested; no padding argument is passed here.

    Returns whatever the tokenizer call returns for that text.
    """
    return tokenizer(batch["text"], truncation=True)

In [None]:
# Tokenize every example and drop the two raw-text columns, leaving the
# label column plus the tokenizer outputs.
tokenized_dataset = dataset.map(
    tokenize_text,
    remove_columns=["text", "icd_group_description"],
)

  0%|          | 0/71480 [00:00<?, ?ex/s]

In [None]:
# Hold out 20% of the examples for validation. The split is seeded so that a
# fresh kernel reproduces the same train/validation partition; unseeded, every
# run would train and evaluate on different subsets. The seed is the same
# value the notebook already uses when shuffling the training split.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=7854)

In [None]:
# The held-out "test" split is used as-is for validation; the training split
# is reshuffled with a fixed seed.
validation_dataset = tokenized_dataset["test"]
train_dataset = tokenized_dataset["train"].shuffle(seed=7854)

In [None]:
# Display the training split summary (features and row count).
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 57184
})

In [None]:
# Read the class-label metadata from the encoded "labels" feature.
target_feature = tokenized_dataset["train"].features["labels"]
num_classes = target_feature.num_classes
label_names = target_feature.names

# `id2label` keeps string keys because later cells look names up with
# `id2label.get(str(i))`.
id2label = {str(i): label for i, label in enumerate(label_names)}
# `label2id` maps each label name to its *integer* class id. The original
# inverted `id2label`, so the ids it passed to the model were the strings
# "0", "1", ... rather than integers.
label2id = {label: i for i, label in enumerate(label_names)}

In [None]:
#id2icd = {i:l for i,l in enumerate(label_encoder.classes_)}
#icd2id = {l:i for i,l in enumerate(label_encoder.classes_)}

In [None]:
# Size the classification head from the dataset's own class count rather than
# a hard-coded 275, so it cannot drift out of sync with `id2label`/`label2id`
# when the data changes.
model = BertForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-v1.1",
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze the first `num_freeze_param` parameter tensors of `model.bert`
# (in the order `parameters()` yields them); the remaining ones stay trainable.
num_freeze_param = 190
for position, parameter in enumerate(model.bert.parameters()):
    if position >= num_freeze_param:
        continue
    parameter.requires_grad = False

In [None]:
# Padding collator built from the tokenizer; handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Per-device training batch size; the evaluation batch size is set to twice this value.
batch_size=512

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, keeping at
# most three checkpoints under the "biobert-ner" output directory.
args = TrainingArguments(
    output_dir="biobert-ner",
    overwrite_output_dir=True,
    # schedule
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # optimisation
    learning_rate=1e-3,
    weight_decay=0.005,
    # batch sizes (evaluation uses twice the training batch size)
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class for
    each row is the arg-max over the last logits axis.

    Returns a dict with ``accuracy``, ``f1_score_micro`` and ``f1_score_macro``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    def f1(averaging):
        # Same references/predictions, only the averaging mode differs.
        return metrics.f1_score(references, predicted, average=averaging)

    return {
        "accuracy": metrics.accuracy_score(references, predicted),
        "f1_score_micro": f1("micro"),
        "f1_score_macro": f1("macro"),
    }

In [None]:
# Assemble the Trainer from the model, arguments, datasets, collator and
# metric function, then run fine-tuning. `trainer.train()` stays the last
# expression so the cell still displays its return value.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

***** Running training *****
  Num examples = 57184
  Num Epochs = 10
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 1120


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score Micro,F1 Score Macro
1,No log,0.918478,0.741606,0.741606,0.327259
2,No log,0.509191,0.856603,0.856603,0.539826
3,No log,0.376228,0.891928,0.891928,0.637681
4,No log,0.303986,0.915571,0.915571,0.686972
5,0.921400,0.257553,0.926133,0.926133,0.708756
6,0.921400,0.221157,0.941032,0.941032,0.754794
7,0.921400,0.19676,0.945859,0.945859,0.762241
8,0.921400,0.180806,0.952084,0.952084,0.770291
9,0.304100,0.164086,0.95852,0.95852,0.793418
10,0.304100,0.158036,0.960758,0.960758,0.799646


***** Running Evaluation *****
  Num examples = 14296
  Batch size = 1024
Saving model checkpoint to biobert-ner/checkpoint-112
Configuration saved in biobert-ner/checkpoint-112/config.json
Model weights saved in biobert-ner/checkpoint-112/pytorch_model.bin
tokenizer config file saved in biobert-ner/checkpoint-112/tokenizer_config.json
Special tokens file saved in biobert-ner/checkpoint-112/special_tokens_map.json
Deleting older checkpoint [biobert-ner/checkpoint-4766] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 14296
  Batch size = 1024
Saving model checkpoint to biobert-ner/checkpoint-224
Configuration saved in biobert-ner/checkpoint-224/config.json
Model weights saved in biobert-ner/checkpoint-224/pytorch_model.bin
tokenizer config file saved in biobert-ner/checkpoint-224/tokenizer_config.json
Special tokens file saved in biobert-ner/checkpoint-224/special_tokens_map.json
Deleting older checkpoint [biobert-ner/checkpoint-7149] due to args.save_total_

TrainOutput(global_step=1120, training_loss=0.5706945640700204, metrics={'train_runtime': 2033.9292, 'train_samples_per_second': 281.15, 'train_steps_per_second': 0.551, 'total_flos': 1.2954276138792576e+16, 'train_loss': 0.5706945640700204, 'epoch': 10.0})

In [None]:
# Save the fine-tuned model to the `icd_biobert` folder under /content/drive.
trainer.save_model("/content/drive/MyDrive/other/icd_biobert")

Saving model checkpoint to /content/drive/MyDrive/other/icd_biobert
Configuration saved in /content/drive/MyDrive/other/icd_biobert/config.json
Model weights saved in /content/drive/MyDrive/other/icd_biobert/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/other/icd_biobert/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/other/icd_biobert/special_tokens_map.json


In [None]:
# Reload the model and tokenizer from the directory written by `trainer.save_model`.
model_ = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/other/icd_biobert")
tokenizer_ = BertTokenizer.from_pretrained("/content/drive/MyDrive/other/icd_biobert")

loading configuration file /content/drive/MyDrive/other/icd_biobert/config.json
Model config BertConfig {
  "_name_or_path": "dmis-lab/biobert-v1.1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Abnormal findings on diagnostic imaging and in function studies, without diagnosis (R90-R94)",
    "1": "Abnormal findings on examination of blood, without diagnosis (R70-R79)",
    "2": "Abnormal findings on examination of other body fluids, substances and tissues, without diagnosis (R83-R89)",
    "3": "Abnormal findings on examination of urine, without diagnosis (R80-R82)",
    "4": "Abnormal tumor markers",
    "5": "Accidental exposure to other specified factors (X52-X58)",
    "6": "Accidental non-transport drowning and submersion (W65-W74)",
    "7": "Acute kidney fail

In [None]:
from transformers import pipeline

In [None]:
# Wrap the reloaded model and tokenizer in a text-classification pipeline.
pipe = pipeline("text-classification",model=model_,tokenizer=tokenizer_)

In [None]:
# Try the pipeline on a single free-text description.
pipe(["high-grade gliomas or medulloblastomas"])

[{'label': 'Neoplasms of uncertain behavior, polycythemia vera and myelodysplastic syndromes (D37-D48)',
  'score': 0.5645073652267456}]

In [None]:
# Display the validation split summary (features and row count).
validation_dataset

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 14296
})

In [None]:
# Run the trainer's model over the validation split; the result's `.predictions` is read afterwards.
validation_ = trainer.predict(validation_dataset)

***** Running Prediction *****
  Num examples = 14296
  Batch size = 1024


In [None]:
# Predicted class id per example: arg-max of the predictions along axis 1.
y_hat = np.argmax(validation_.predictions,axis=1)

In [None]:
# Reference class ids taken from the validation split's "labels" column.
y_true = validation_dataset["labels"]

In [None]:
# Translate predicted class ids into label names (`id2label` has string keys).
y_hat_ = [id2label.get(str(class_id)) for class_id in y_hat]

In [None]:
# Translate reference class ids into label names (`id2label` has string keys).
y_true_ = [id2label.get(str(class_id)) for class_id in y_true]

In [None]:
# Per-class precision / recall / F1 report, computed on the label names.
print(
    metrics.classification_report(
        y_true_,
        y_hat_,
        zero_division=False,
    )
)

                                                                                                                                                                                          precision    recall  f1-score   support

                                                                                            Abnormal findings on diagnostic imaging and in function studies, without diagnosis (R90-R94)       0.91      0.91      0.91        11
                                                                                                                  Abnormal findings on examination of blood, without diagnosis (R70-R79)       0.89      0.67      0.76        12
                                                                              Abnormal findings on examination of other body fluids, substances and tissues, without diagnosis (R83-R89)       0.96      1.00      0.98        23
                                                                                               

In [None]:
import os
# Upload the reloaded model and tokenizer to the "biobert-ICD10-L3" Hub repository.
# The access token is read from the TOKEN_HF environment variable rather than
# being written into the notebook; `os.getenv` yields None when it is unset.
model_.push_to_hub("biobert-ICD10-L3",use_auth_token=os.getenv("TOKEN_HF"))
tokenizer_.push_to_hub("biobert-ICD10-L3",use_auth_token=os.getenv("TOKEN_HF"))

Configuration saved in /tmp/tmpezto9jjw/config.json
Model weights saved in /tmp/tmpezto9jjw/pytorch_model.bin
Uploading the following files to rjac/biobert-ICD10-L3: pytorch_model.bin,config.json
tokenizer config file saved in /tmp/tmpemjmldis/tokenizer_config.json
Special tokens file saved in /tmp/tmpemjmldis/special_tokens_map.json
Uploading the following files to rjac/biobert-ICD10-L3: special_tokens_map.json,vocab.txt,tokenizer_config.json


CommitInfo(commit_url='https://huggingface.co/rjac/biobert-ICD10-L3/commit/2db78aaec0703db5f06d837f0f70d1faa2a2be81', commit_message='Upload tokenizer', commit_description='', oid='2db78aaec0703db5f06d837f0f70d1faa2a2be81', pr_url=None, pr_revision=None, pr_num=None)