In [None]:
%%capture
!pip install simple-icd-10 
!pip install simple-icd-10-cm
!pip install datasets transformers

In [None]:
import json 
import pandas as pd
import simple_icd_10_cm as icd
from datasets import load_dataset,Dataset
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn import metrics
import numpy as np
from transformers import (
    AutoTokenizer
    ,AutoModelForSequenceClassification
    ,DataCollatorWithPadding
    ,TrainingArguments
    ,Trainer
    ,pipeline
)


In [None]:
# Hub checkpoint id used for both the tokenizer and the starting model weights.
MODEL_VERSION = "rjac/setfit-ST-ICD10-L3"

In [None]:
# Build the working frame from the raw ICD JSON export.
# The file is opened in a `with` block so the handle is closed deterministically;
# the bare `open(...)` previously handed to json.load was never closed.
with open("/content/icd_json.json", "r", encoding="utf-8") as icd_file:
    df = pd.DataFrame(json.load(icd_file))

# Each `text` entry is a mapping; keep only its "codeDescription" value.
df["text"] = df.text.apply(lambda i: i.get("codeDescription"))
# First three characters of the ICD-10 code.
df["icd-l3"] = df.icd10Code.str[0:3]
# Parent of the three-character code, with its description.
df["icd_group"] = df["icd-l3"].apply(icd.get_parent)
df["icd_group_description"] = df["icd_group"].apply(icd.get_description)
# Parent of that group (one level further up), with its description.
df["icd_general_group"] = df["icd_group"].apply(icd.get_parent)
df["icd_general_group_description"] = df["icd_general_group"].apply(icd.get_description)

In [None]:
# The classification target is the description of the ICD group.
df["labels"] = df.loc[:, "icd_group_description"]

In [None]:
# Persist the modelling columns as JSON Lines (one record per line).
df[["text", "icd_group_description", "labels"]].to_json(
    "icd_dataset.jsonl",
    orient="records",
    lines=True,
)

In [None]:
# Read back the JSON Lines file produced by the export cell. The path is the same
# relative name that cell writes ("icd_dataset.jsonl"), so loading no longer
# depends on the working directory being /content.
dataset = load_dataset("json", data_files="icd_dataset.jsonl", split="train")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-e0479e0881c49dd3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-e0479e0881c49dd3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


In [None]:
# Cast the `labels` column to a class-label feature.
dataset = dataset.class_encode_column(column="labels")

Casting to class labels:   0%|          | 0/72 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/8 [00:00<?, ?ba/s]

In [None]:
# Load the tokenizer that ships with the MODEL_VERSION checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_VERSION)

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
def tokenize_text(batch):
    """Tokenize the ``text`` field of ``batch`` with the module-level tokenizer.

    Truncation is enabled; no padding argument is passed here.
    """
    return tokenizer(batch["text"], truncation=True)

In [None]:
# Tokenize in batches: `batched=True` hands the tokenizer whole lists of texts
# instead of one example per call, producing the same columns much faster.
# The raw string columns are dropped afterwards.
tokenized_dataset = dataset.map(
    tokenize_text,
    batched=True,
    remove_columns=["text", "icd_group_description"],
)

  0%|          | 0/71480 [00:00<?, ?ex/s]

In [None]:
# 80/20 train/test split. The seed is fixed so the partition (and therefore the
# reported validation metrics) is reproducible across kernel restarts.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=7854)

In [None]:
# The held-out split is used as-is for validation; the training split is
# reshuffled with a fixed seed.
validation_dataset = tokenized_dataset["test"]
train_dataset = tokenized_dataset["train"].shuffle(seed=7854)

In [None]:
# Display the training split summary (features and row count).
train_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 57184
})

In [None]:
# The `labels` feature of the training split carries the class count and names.
target_feature = tokenized_dataset["train"].features["labels"]
num_classes = target_feature.num_classes
label_names = target_feature.names

# id2label keeps string keys because later cells look labels up via str(i).
id2label = {str(i): label for i, label in enumerate(label_names)}
# label2id maps each name to an *integer* id. Inverting id2label (as before)
# stored the string keys ("0", "1", ...) as ids in the model config instead.
label2id = {label: i for i, label in enumerate(label_names)}

In [None]:
# NOTE(review): disabled leftover that builds id<->ICD maps from `label_encoder`;
# no `label_encoder` is defined in the visible cells, so these lines stay off.
#id2icd = {i:l for i,l in enumerate(label_encoder.classes_)}
#icd2id = {l:i for i,l in enumerate(label_encoder.classes_)}

In [None]:
# Load the checkpoint with a sequence-classification head sized to the label set
# and the label maps attached to the config.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_VERSION,
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at rjac/setfit-ST-ICD10-L3 were not used when initializing MPNetForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at rjac/setfit-ST-ICD10-L3 and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to 

In [None]:
# Disable gradients for the first `num_freeze_param` parameter tensors of
# `model.mpnet`, in the order `parameters()` yields them; the rest are untouched.
num_freeze_param = 190
for frozen_param in list(model.mpnet.parameters())[:num_freeze_param]:
    frozen_param.requires_grad = False

In [None]:
# Collator built around the tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Per-device training batch size; evaluation uses twice this value.
batch_size = 512

In [None]:
# Training configuration: checkpoints go to ./model, with evaluation and saving
# both scheduled per epoch.
args = TrainingArguments(
    output_dir="model",
    overwrite_output_dir=True,
    # optimisation
    num_train_epochs=10,
    learning_rate=1e-3,
    weight_decay=0.005,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=2 * batch_size,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last logits axis. Returns a dict with ``accuracy``,
    ``f1_score_micro`` and ``f1_score_macro``.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    scores = {"accuracy": metrics.accuracy_score(references, predicted_ids)}
    for average in ("micro", "macro"):
        scores[f"f1_score_{average}"] = metrics.f1_score(
            references, predicted_ids, average=average
        )
    return scores

In [None]:
# Wire the model, data, collator and metric function into a Trainer, then fit.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
# Left as the last expression so the cell displays the returned training output.
trainer.train()

***** Running training *****
  Num examples = 57184
  Num Epochs = 10
  Instantaneous batch size per device = 512
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 1120
You're using a MPNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Score Micro,F1 Score Macro
1,No log,0.813335,0.730834,0.730834,0.456656
2,No log,0.529755,0.813514,0.813514,0.588727
3,No log,0.407093,0.864158,0.864158,0.649337
4,No log,0.357667,0.882275,0.882275,0.690422
5,0.838800,0.311951,0.892068,0.892068,0.732508
6,0.838800,0.29025,0.9034,0.9034,0.748269
7,0.838800,0.253276,0.920817,0.920817,0.785354
8,0.838800,0.221692,0.93082,0.93082,0.802538
9,0.366100,0.20919,0.937045,0.937045,0.819868
10,0.366100,0.199223,0.941662,0.941662,0.829331


***** Running Evaluation *****
  Num examples = 14296
  Batch size = 1024
Saving model checkpoint to model/checkpoint-112
Configuration saved in model/checkpoint-112/config.json
Model weights saved in model/checkpoint-112/pytorch_model.bin
tokenizer config file saved in model/checkpoint-112/tokenizer_config.json
Special tokens file saved in model/checkpoint-112/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 14296
  Batch size = 1024
Saving model checkpoint to model/checkpoint-224
Configuration saved in model/checkpoint-224/config.json
Model weights saved in model/checkpoint-224/pytorch_model.bin
tokenizer config file saved in model/checkpoint-224/tokenizer_config.json
Special tokens file saved in model/checkpoint-224/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 14296
  Batch size = 1024
Saving model checkpoint to model/checkpoint-336
Configuration saved in model/checkpoint-336/config.json
Model weights saved in model/checkpoint-336/py

TrainOutput(global_step=1120, training_loss=0.5692728723798479, metrics={'train_runtime': 2941.4099, 'train_samples_per_second': 194.41, 'train_steps_per_second': 0.381, 'total_flos': 1.1826999557225664e+16, 'train_loss': 0.5692728723798479, 'epoch': 10.0})

In [None]:
# Write the fine-tuned model to the Drive folder.
trainer.save_model(output_dir="/content/drive/MyDrive/other/icd_stclassification")

Saving model checkpoint to /content/drive/MyDrive/other/icd_stclassification
Configuration saved in /content/drive/MyDrive/other/icd_stclassification/config.json
Model weights saved in /content/drive/MyDrive/other/icd_stclassification/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/other/icd_stclassification/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/other/icd_stclassification/special_tokens_map.json


In [None]:
# Reload model and tokenizer from the directory the trainer saved to.
saved_model_dir = "/content/drive/MyDrive/other/icd_stclassification"
model_ = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer_ = AutoTokenizer.from_pretrained(saved_model_dir)

loading configuration file /content/drive/MyDrive/other/icd_stclassification/config.json
Model config MPNetConfig {
  "_name_or_path": "/content/drive/MyDrive/other/icd_stclassification",
  "architectures": [
    "MPNetForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Abnormal findings on diagnostic imaging and in function studies, without diagnosis (R90-R94)",
    "1": "Abnormal findings on examination of blood, without diagnosis (R70-R79)",
    "2": "Abnormal findings on examination of other body fluids, substances and tissues, without diagnosis (R83-R89)",
    "3": "Abnormal findings on examination of urine, without diagnosis (R80-R82)",
    "4": "Abnormal tumor markers",
    "5": "Accidental exposure to other specified factors (X52-X58)",
    "6": "Accidental non-transport drowning and submersion (W65-W74)",
    "7": "A

In [None]:
# Never hard-code the Hugging Face token in the notebook: it is saved with the
# file and echoed into the cell output. Prompt for it instead, and only when the
# TOKEN_HF environment variable is not already set, so later cells can keep
# reading it with os.getenv("TOKEN_HF").
# NOTE(review): the token previously committed here must be treated as leaked
# and revoked in the Hugging Face account settings.
import os
from getpass import getpass

if not os.environ.get("TOKEN_HF"):
    os.environ["TOKEN_HF"] = getpass("Hugging Face write token: ")

env: TOKEN_HF=<redacted>


In [None]:
import os

# Publish model and tokenizer to the same Hub repository, authenticating with
# the token held in the TOKEN_HF environment variable.
hub_repo = "setfit-ICD10-L3"
hub_token = os.getenv("TOKEN_HF")
model_.push_to_hub(hub_repo, use_auth_token=hub_token)
tokenizer_.push_to_hub(hub_repo, use_auth_token=hub_token)

Configuration saved in /tmp/tmpqstr17vq/config.json
Model weights saved in /tmp/tmpqstr17vq/pytorch_model.bin
Uploading the following files to rjac/setfit-ICD10-L3: config.json,pytorch_model.bin
tokenizer config file saved in /tmp/tmphkr23a2p/tokenizer_config.json
Special tokens file saved in /tmp/tmphkr23a2p/special_tokens_map.json
Uploading the following files to rjac/setfit-ICD10-L3: tokenizer.json,special_tokens_map.json,tokenizer_config.json,vocab.txt


CommitInfo(commit_url='https://huggingface.co/rjac/setfit-ICD10-L3/commit/42f1d61d186ee71899114a89e9f3b8f37f89a4a4', commit_message='Upload tokenizer', commit_description='', oid='42f1d61d186ee71899114a89e9f3b8f37f89a4a4', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
from transformers import pipeline

In [None]:
# Wrap the reloaded model and tokenizer in a text-classification pipeline.
pipe = pipeline(task="text-classification", model=model_, tokenizer=tokenizer_)

In [None]:
# Smoke test: classify one free-text phrase and display the predicted label/score.
pipe(["high-grade gliomas or medulloblastomas"])

Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'label': 'Malignant neoplasms of lymphoid, hematopoietic and related tissue (C81-C96)',
  'score': 0.372622549533844}]

In [None]:
# Display the validation split summary (features and row count).
validation_dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 14296
})

In [None]:
# Run the trained model over the validation split and keep the prediction output.
validation_ = trainer.predict(test_dataset=validation_dataset)

***** Running Prediction *****
  Num examples = 14296
  Batch size = 1024


In [None]:
# Predicted class id per example: index of the largest value along axis 1.
y_hat = validation_.predictions.argmax(axis=1)

In [None]:
# Reference class ids, read from the validation split's `labels` column.
y_true = validation_dataset["labels"]

In [None]:
# Translate predicted ids to label names (id2label is keyed by the id as a string).
y_hat_ = [id2label.get(str(class_id)) for class_id in y_hat]

In [None]:
# Translate reference ids to label names (id2label is keyed by the id as a string).
y_true_ = [id2label.get(str(class_id)) for class_id in y_true]

In [None]:
# Per-class precision / recall / F1, keyed by the label names.
print(
    metrics.classification_report(
        y_true_,
        y_hat_,
        zero_division=False,
    )
)

                                                                                                                                                                                          precision    recall  f1-score   support

                                                                                            Abnormal findings on diagnostic imaging and in function studies, without diagnosis (R90-R94)       1.00      0.83      0.91         6
                                                                                                                  Abnormal findings on examination of blood, without diagnosis (R70-R79)       0.88      0.58      0.70        12
                                                                              Abnormal findings on examination of other body fluids, substances and tissues, without diagnosis (R83-R89)       0.95      1.00      0.97        19
                                                                                               