In [None]:
%%capture
!pip install simple-icd-10 
!pip install simple-icd-10-cm
!pip install datasets sentence-transformers setfit


In [None]:
import json 
import pandas as pd
import simple_icd_10_cm as icd
from datasets import load_dataset,Dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from sklearn.preprocessing import LabelEncoder
import joblib

In [None]:
# Load the raw ICD-10 records and derive the hierarchy columns used for labelling.
# The file is read inside a `with` block so the handle is always closed
# (the previous `json.load(open(...))` never closed it).
with open("/content/icd_json.json", "r") as icd_file:
    df = pd.DataFrame(json.load(icd_file))

# Each raw `text` entry is a mapping; keep only its "codeDescription" value.
df["text"] = df.text.apply(lambda i: i.get("codeDescription"))

# Level-3 code = first three characters of the full ICD-10 code.
df["icd-l3"] = df.icd10Code.str[0:3]
df["icd-l3-description"] = df["icd-l3"].apply(icd.get_description)

# Walk two levels up the ICD hierarchy via `icd.get_parent`
# (presumably block, then chapter -- confirm against simple_icd_10_cm).
df["icd_group"] = df["icd-l3"].apply(icd.get_parent)
df["icd_group_description"] = df["icd_group"].apply(icd.get_description)
df["icd_general_group"] = df["icd_group"].apply(icd.get_parent)
df["icd_general_group_description"] = df["icd_general_group"].apply(icd.get_description)

# The classification target is the level-3 description (kept as text here).
df["labels"] = df["icd-l3-description"]

In [None]:
# Draw 15 rows per label. Sampling is with replacement, so a label with fewer
# than 15 rows is still drawn 15 times (rows may repeat).
# `random_state` pins the draw so a Restart & Run All reproduces the same sample.
# NOTE(review): because rows can repeat, identical rows may end up on both sides
# of the later train/test split -- confirm this is acceptable for evaluation.
sdf = df.groupby("labels").sample(15, replace=True, random_state=7854)

In [None]:
# Convert the sampled frame to a `datasets.Dataset`.
# `preserve_index=False` keeps the pandas index from being carried over as an
# extra `__index_level_0__` feature, which is not a model input.
dataset = Dataset.from_pandas(sdf, preserve_index=False)

In [None]:
# Encode the text "labels" column as class labels; the resulting feature
# exposes `names` / `num_classes`, which are read further down.
dataset = dataset.class_encode_column("labels")

Casting to class labels:   0%|          | 0/26 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/3 [00:00<?, ?ba/s]

In [None]:
# Hold out 20% of the rows for evaluation. The seed fixes the split so the
# reported metrics are comparable across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=7854)

In [None]:
# Read the class metadata from the encoded "labels" feature of the train split
# and build the id <-> name lookup tables. Ids are stored as strings.
target_feature = dataset["train"].features["labels"]
label_names = target_feature.names
num_classes = target_feature.num_classes

id2label = {str(class_id): class_name for class_id, class_name in enumerate(label_names)}
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

In [None]:
# The evaluation split is used as-is; the training split is shuffled with a
# fixed seed.
eval_dataset = dataset["test"]
train_dataset = dataset["train"].shuffle(seed=7854)

In [None]:
# Display the training split summary (feature names and row count).
train_dataset

Dataset({
    features: ['icd10Code', 'text', 'icd-l3', 'icd-l3-description', 'icd_group', 'icd_group_description', 'icd_general_group', 'icd_general_group_description', 'labels', '__index_level_0__'],
    num_rows: 20232
})

In [None]:

# Start from the pretrained sentence-transformer checkpoint on the Hub
model = SetFitModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)

# Configure the trainer; `column_mapping` maps the dataset's "text"/"labels"
# columns onto the "text"/"label" names the trainer expects
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss_class=CosineSimilarityLoss,
    metric="accuracy",
    batch_size=450,
    num_iterations=15,  # number of text pairs to generate for contrastive learning
    num_epochs=5,  # number of epochs to use for contrastive learning
    column_mapping={"text": "text", "labels": "label"},
)

# Fit the model, then score it on the evaluation split
trainer.train()
metrics = trainer.evaluate()


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 606960
  Num epochs = 5
  Total optimization steps = 1349
  Total train batch size = 450


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1349 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1349 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1349 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1349 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1349 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Applying column mapping to evaluation dataset


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

***** Running evaluation *****


In [None]:
# Persist the trained model to the mounted Drive folder
trainer.model.save_pretrained("/content/drive/MyDrive/other/icd_setfit")

In [None]:
# Store both lookup tables next to the model as a two-element list:
# index 0 is id2label, index 1 is label2id.
joblib.dump([id2label,label2id],"/content/drive/MyDrive/other/icd_setfit/label_encoder.joblib")

['/content/drive/MyDrive/other/icd_setfit/label_encoder.joblib']

In [None]:
# Never hardcode an access token in a notebook: it is saved with the file and
# echoed in the cell output. Use TOKEN_HF from the environment when it is
# already set; otherwise prompt for it without echoing or storing it in the file.
# NOTE(review): the token that was previously committed in this cell must be
# revoked on the Hugging Face account.
import os
from getpass import getpass

if not os.environ.get("TOKEN_HF"):
    os.environ["TOKEN_HF"] = getpass("Hugging Face access token: ")

env: TOKEN_HF=<redacted>


In [None]:
import os

# Publish the trained model to the Hub repository "setfit-ST-ICD10-L3",
# authenticating with the token read from the TOKEN_HF environment variable
trainer.push_to_hub(
    "setfit-ST-ICD10-L3",
    use_auth_token=os.environ.get("TOKEN_HF"),
)

Cloning https://huggingface.co/rjac/setfit-ST-ICD10-L3 into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.34k/418M [00:00<?, ?B/s]

Upload file model_head.pkl:   0%|          | 3.34k/9.91M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/rjac/setfit-ST-ICD10-L3
   637a5a8..76e1c33  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/rjac/setfit-ST-ICD10-L3
   637a5a8..76e1c33  main -> main



'https://huggingface.co/rjac/setfit-ST-ICD10-L3/commit/76e1c33f9eeca324f7d36e09901d214d8b78bc21'

In [None]:
# Reload the published model from the Hub to check that the upload is usable.
model_final = SetFitModel.from_pretrained("rjac/setfit-ST-ICD10-L3")

Downloading:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

In [None]:
# Smoke-test the reloaded model on a single input; the result is an array of
# predicted class ids. NOTE(review): the input is misspelled ("Diabetis") --
# confirm whether that is a deliberate robustness check.
model_final(["Diabetis"])

array([661])

In [None]:
# Load the saved lookup tables ([id2label, label2id]) back from Drive.
# joblib.load unpickles the file, so only load files you created yourself.
labeler = joblib.load("/content/drive/MyDrive/other/icd_setfit/label_encoder.joblib")

In [None]:
# Inspect the id -> label mapping (first element of the saved list).
# NOTE(review): this prints the whole mapping; consider showing only a slice.
labeler[0]

{'0': 'Abdominal and pelvic pain',
 '1': 'Abnormal and inconclusive findings on diagnostic imaging of breast',
 '2': 'Abnormal blood-pressure reading, without diagnosis',
 '3': 'Abnormal findings in cerebrospinal fluid',
 '4': 'Abnormal findings in other body fluids and substances',
 '5': 'Abnormal findings in specimens from digestive organs and abdominal cavity',
 '6': 'Abnormal findings in specimens from female genital organs',
 '7': 'Abnormal findings in specimens from male genital organs',
 '8': 'Abnormal findings in specimens from other organs, systems and tissues',
 '9': 'Abnormal findings in specimens from respiratory organs and thorax',
 '10': 'Abnormal findings on antenatal screening of mother',
 '11': 'Abnormal findings on diagnostic imaging of central nervous system',
 '12': 'Abnormal findings on diagnostic imaging of lung',
 '13': 'Abnormal findings on diagnostic imaging of other body structures',
 '14': 'Abnormal involuntary movements',
 '15': 'Abnormal results of function

In [None]:
# Predict a class id for one free-text description and translate it to its
# label name; the mapping's keys are strings, hence the str() conversion
labeler[0].get(
    str(
        model_final(["Injure in the back while it was driven a VAN"])[0]
    )
)

'Occupant of special all-terrain or other off-road motor vehicle, injured in transport accident'

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Load the same Hub checkpoint as a plain transformers sequence classifier.
# NOTE(review): the recorded load log reports the `classifier.*` weights as
# newly initialized, so this model's classification head is untrained and must
# be fine-tuned before its predictions are meaningful.
model_ = AutoModelForSequenceClassification.from_pretrained("rjac/setfit-ST-ICD10-L3")

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at rjac/setfit-ST-ICD10-L3 were not used when initializing MPNetForSequenceClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at rjac/setfit-ST-ICD10-L3 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to 

MPNetForSequenceClassification(
  (mpnet): MPNetModel(
    (embeddings): MPNetEmbeddings(
      (word_embeddings): Embedding(30527, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MPNetEncoder(
      (layer): ModuleList(
        (0): MPNetLayer(
          (attention): MPNetAttention(
            (attn): MPNetSelfAttention(
              (q): Linear(in_features=768, out_features=768, bias=True)
              (k): Linear(in_features=768, out_features=768, bias=True)
              (v): Linear(in_features=768, out_features=768, bias=True)
              (o): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         