<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/22_icd_code_descriptions_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ICD Code Descriptions Fine-Tuning

**Reference**:

https://www.pinecone.io/learn/unsupervised-training-sentence-transformers/

## Setup

In [None]:
!pip install datasets
!pip install sentence-transformers

In [1]:
import datasets

from sentence_transformers import SentenceTransformer, models
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from sentence_transformers.losses import DenoisingAutoEncoderLoss
from sentence_transformers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

import torch
from torch.utils.data import DataLoader

import re
import pandas as pd

In [2]:
from semantic_search import SemanticSearch
import config as cfg

In [None]:
# Fetch the NLTK resources named 'punkt' and 'stopwords'.
# NOTE(review): nothing in this notebook calls nltk directly; presumably the
# denoising dataset and/or the local `semantic_search` module need them — confirm.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Remove any previously extracted `data` directory, then unpack data.zip afresh.
# NOTE(review): the archive's contents are not shown here; the CSV loaded below is
# read from the working directory, not from `data/` — confirm what this provides.
!rm -rf data
!unzip data.zip

## ICD Dataset

In [8]:
# Load the combined ICD code/description table from the working directory.
# ICD codes are identifiers, not numbers: without an explicit dtype pandas may
# infer a float column and silently reformat them (e.g. "001.0" becomes 1.0),
# so `Code` is read as text exactly as it appears in the file.
icd_description_df = pd.read_csv(
    "combined_icd_code_and_description_v1.csv",
    dtype={"Code": str},
)
icd_description_df.head()

Unnamed: 0,Code,Keyword,SynId,Field_Target
0,1.0,Cholera due to vibrio cholerae,,
1,1.1,Cholera due to vibrio cholerae el tor,,
2,1.9,Cholera unspecified,,
3,2.0,Typhoid fever,,
4,2.1,Paratyphoid fever A,,


In [23]:
# Gather every value of the "Keyword" column (the ICD description text) into a
# plain Python list, preserving row order, and preview the first five entries.
sentences = list(icd_description_df["Keyword"])
sentences[:5]

['Cholera due to vibrio cholerae',
 'Cholera due to vibrio cholerae el tor',
 'Cholera unspecified',
 'Typhoid fever',
 'Paratyphoid fever A']

In [28]:
# Wrap the raw descriptions in the denoising dataset, which applies the noise
# to each sentence itself (no manual corruption step is needed here).
train_data = DenoisingAutoEncoderDataset(sentences)

# Standard PyTorch loader: small shuffled batches, dropping the final
# incomplete batch so every training step sees exactly 4 examples.
data_loader = DataLoader(
    dataset=train_data,
    batch_size=4,
    shuffle=True,
    drop_last=True,
)

## Model and Training

In [29]:
# Release cached, currently-unused GPU memory held by PyTorch before the model is
# built — presumably to reduce out-of-memory risk when re-running this section.
torch.cuda.empty_cache()

In [None]:
# Encoder: the pretrained "bert-base-uncased" checkpoint wrapped as a
# sentence-transformers module.
bert_base = models.Transformer(model_name_or_path="bert-base-uncased")

# Sentence vector = the [CLS] token embedding, sized to match the encoder output.
embedding_dim = bert_base.get_word_embedding_dimension()
pooling = models.Pooling(word_embedding_dimension=embedding_dim, pooling_mode="cls")

# Chain encoder -> pooling into a single SentenceTransformer.
model = SentenceTransformer(modules=[bert_base, pooling])

In [31]:
# Show the assembled module stack (transformer encoder followed by the pooling layer).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
# Denoising auto-encoder training loss built around `model`.
# tie_encoder_decoder=True requests that the decoder's weights be tied to the
# encoder's (per the sentence-transformers API) rather than trained separately.
loss = DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)

In [33]:
# Fine-tune with a single objective: the denoising loader paired with its loss.
denoising_objective = (data_loader, loss)

# One pass over the data, constant learning rate of 3e-5, no weight decay.
model.fit(
    train_objectives=[denoising_objective],
    epochs=1,
    weight_decay=0,
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
    show_progress_bar=True,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/28798 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [35]:
# Persist the fine-tuned model (weights, tokenizer and module configs) to a
# local directory named "synodex-icd-bert-base-uncased".
model.save("synodex-icd-bert-base-uncased")

In [37]:
!zip synodex-icd-bert-base-uncased.zip synodex-icd-bert-base-uncased/*

  adding: synodex-icd-bert-base-uncased/1_Pooling/ (stored 0%)
  adding: synodex-icd-bert-base-uncased/config.json (deflated 48%)
  adding: synodex-icd-bert-base-uncased/config_sentence_transformers.json (deflated 27%)
  adding: synodex-icd-bert-base-uncased/modules.json (deflated 53%)
  adding: synodex-icd-bert-base-uncased/pytorch_model.bin (deflated 8%)
  adding: synodex-icd-bert-base-uncased/README.md (deflated 58%)
  adding: synodex-icd-bert-base-uncased/sentence_bert_config.json (deflated 4%)
  adding: synodex-icd-bert-base-uncased/special_tokens_map.json (deflated 42%)
  adding: synodex-icd-bert-base-uncased/tokenizer_config.json (deflated 43%)
  adding: synodex-icd-bert-base-uncased/tokenizer.json (deflated 71%)
  adding: synodex-icd-bert-base-uncased/vocab.txt (deflated 53%)


## Semantic Search

In [3]:
# Create the target directory (if missing) and copy the page-7 text file into it.
# NOTE(review): this absolute path is hard-coded and machine-specific; presumably
# it is where `semantic_search`/`config` expects its input text files — confirm,
# and prefer a configurable directory so the notebook runs elsewhere.
!mkdir -p /home/ocreng/ocrhigh/txt-files
!cp page-7.txt /home/ocreng/ocrhigh/txt-files/

In [3]:
# ICD-10 codes to look up in the page text; each one is scored by the search below.
icd10_code_list = ['Z20.822', 'R05.9', 'E78.3']

In [4]:
# Create the project-local search helper and build corpus embeddings for page 7.
# NOTE(review): SemanticSearch is defined outside this notebook; which embedding
# model it loads (e.g. the one fine-tuned above) is not visible here — confirm.
semantic_search = SemanticSearch()
semantic_search.init_corpus_embedding(page_num=7)

In [6]:
# Score each requested ICD-10 code against the page corpus, keeping one result
# per code in the same order as `icd10_code_list`. Despite its name,
# `matched_dict` is a list of per-code result records.
matched_dict = [
    semantic_search.get_similarity_score(icd10_code)
    for icd10_code in icd10_code_list
]
matched_dict

[{'corpus_id': 10,
  'score': 0.681359052658081,
  'original_description': 'Contact with and (suspected) exposure to COVID-19',
  'match_description': 'Diagnosis Contact suspected exposure COVID19 Z20822'},
 {'corpus_id': 21,
  'score': 0.5364741086959839,
  'original_description': 'Cough, unspecified',
  'match_description': 'Diagnosis Cough R059'},
 {'corpus_id': 13,
  'score': 0.4869689643383026,
  'original_description': 'Hyperchylomicronemia',
  'match_description': 'COVID19 Rapid Test Negative Reavionied'}]

In [7]:
# Print how many corpus sentences were extracted for the page, then display them all.
# NOTE(review): the rendered output contains personal names from the source
# document; clear or redact this cell's output before sharing the notebook.
print(len(semantic_search.sentence_list))
semantic_search.sentence_list

40


['Patient DOB',
 'Reviewed Tristan Guevara',
 'Order Date',
 'Ordered Muneer behalf',
 'Tristan Guevara',
 'Collected Date Site LABORATORY',
 'Reported Date Requisition',
 'Lab Reviewed COVID19 Rapid Test Accession',
 'Reviewed Tristan Guevara',
 'Notify Tristan Guevara',
 'Diagnosis Contact suspected exposure COVID19 Z20822',
 'Test Name Result Units Normal Range Status',
 'Final',
 'COVID19 Rapid Test Negative Reavionied',
 'Patient DOB',
 'aps Reviewed Bryonna Williams',
 'Order Date Ordered Emily Martinbianco',
 'Collected Date Site LABORATORY',
 'Reported Date Requisition',
 'Lab Reviewed COVID19 Rapid Test Accession',
 'Reviewed Bryonna Williams',
 'Diagnosis Cough R059',
 'Test Name Result Units Normal Range Status',
 'COVID19 Rapid Test Positive Abn Final',
 'Reviewed',
 'Patient DOB',
 'Reviewed Tristan Guevara',
 'Ordered Brenda Bierenga FNPBC',
 'Order Date',
 'behalf Tristan Guevara',
 'Collected Date Site Silver Pine Lab',
 'Reported Date',
 'Lab Reviewed Comprehensive Met