<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/text-similarity-works/22_icd_code_descriptions_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## ICD Code Descriptions Fine-Tuning

**Reference**:

https://www.pinecone.io/learn/unsupervised-training-sentence-transformers/

## Setup

In [None]:
!pip install datasets
!pip install sentence-transformers

In [None]:
# NOTE(review): repeats the sentence-transformers install from the cell above; redundant on a fresh run
!pip install sentence-transformers

In [None]:
import datasets

from sentence_transformers import SentenceTransformer, models
from sentence_transformers.datasets import DenoisingAutoEncoderDataset
from sentence_transformers.losses import DenoisingAutoEncoderLoss
from sentence_transformers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

import torch
from torch.utils.data import DataLoader

import re

In [None]:
# fetch the NLTK "punkt" tokenizer data up front
# NOTE(review): presumably needed by DenoisingAutoEncoderDataset's noise function — confirm
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Training Data

In [None]:
# open the English deduplicated OSCAR train split in streaming mode,
# so rows are fetched iteratively instead of downloading the corpus first
oscar = datasets.load_dataset("oscar", "unshuffled_deduplicated_en", split="train", streaming=True)

In [None]:
# pull just the first streamed record and display it
row = next(iter(oscar))
row

In [None]:
# split each document into single sentences with a naive rule:
# break on a literal period followed by at most one whitespace character
# and an optional newline
splitter = re.compile(r"\.\s?\n?")
# preview the first ten pieces of the sample row
splitter.split(row["text"])[:10]

In [None]:
# Collect a little over 50K sentences to feed into TSDAE fine-tuning.
# Sentence transformers recommends 10-100K sentences for training, and the
# full OSCAR en corpus is huge, so we stop streaming once we have enough.
MAX_SENTENCES = 50000      # stop after the first document that pushes us past this
MIN_SENTENCE_CHARS = 10    # fragments of this length or shorter are discarded

sentences = []
for row in oscar:
  # split the document and keep only the pieces that are long enough
  sentences.extend(
    line for line in splitter.split(row["text"])
    if len(line) > MIN_SENTENCE_CHARS
  )
  # whole documents are added at once, so the final count can overshoot the cap
  if len(sentences) > MAX_SENTENCES:
    break

# kept for parity with the running counter this cell used to maintain
num_sentences = len(sentences)

In [None]:
# preview the first two collected sentences
sentences[:2]

['Mtendere Village was inspired by the vision of Chief Napoleon Dzombe, which he shared with John Blanchard during his first visit to Malawi',
 'Chief Napoleon conveyed the desperate need for a program to intervene and care for the orphans and vulnerable children (OVC) in Malawi, and John committed to help']

In [None]:
# DenoisingAutoEncoderDataset wraps the raw sentences and has the
# noise functionality built in
train_data = DenoisingAutoEncoderDataset(sentences)

# standard PyTorch loader: shuffled mini-batches of 4, incomplete final batch dropped
data_loader = DataLoader(
  train_data,
  batch_size=4,
  shuffle=True,
  drop_last=True,
)

## Model and Training

In [None]:
# free unused cached GPU memory before building the model
torch.cuda.empty_cache()

In [None]:
# encoder: bert-base-uncased, followed by CLS-token pooling sized to its embeddings
bert_base = models.Transformer("bert-base-uncased")
embedding_dim = bert_base.get_word_embedding_dimension()
pooling = models.Pooling(embedding_dim, pooling_mode="cls")

# chain the two modules into a single sentence-embedding model
model = SentenceTransformer(modules=[bert_base, pooling])

In [None]:
# display the assembled module stack
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
# denoising auto-encoder (TSDAE) loss built around the model, with the
# encoder/decoder tying flag enabled
loss = DenoisingAutoEncoderLoss(model, tie_encoder_decoder=True)

In [None]:
# now ready to begin fine-tuning:
# one epoch over the data loader with the denoising loss, using a constant
# learning-rate schedule at 3e-5 and no weight decay
model.fit(train_objectives=[(data_loader, loss)],
          epochs=1,
          weight_decay=0,
          scheduler="constantlr",
          optimizer_params={"lr": 3e-5},
          show_progress_bar=True)

In [None]:
# persist the fine-tuned model under a local output directory
model.save("output/tsdae-bert-base-uncased")

## Evaluate model performance

In [None]:
# load the validation split of the GLUE STSb task to use as the evaluation set
sts = datasets.load_dataset("glue", "stsb", split="validation")

In [None]:
# show the dataset's features and row count
sts

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})

In [None]:
# normalize the 0 -> 5 similarity labels to the 0 -> 1 range.
# This cell rebinds `sts`, so the guard keeps a re-run from dividing the
# already-normalized labels by 5 a second time.
if max(sts["label"]) > 1.0:
  sts = sts.map(lambda x: {"label": x["label"] / 5.0})

In [None]:
# reformat every STSb row as an InputExample: the sentence pair plus its label
samples = [
  InputExample(
    texts=[pair["sentence1"], pair["sentence2"]],
    label=pair["label"],
  )
  for pair in sts
]

In [None]:
# build the similarity evaluator from the labelled sentence pairs;
# CSV output is switched off via write_csv=False
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(samples, write_csv=False)

In [None]:
# score the TSDAE fine-tuned model on the STSb validation pairs
evaluator(model)

0.7607400232154499

In [None]:
# baseline for comparison: the stock pretrained bert-base-uncased checkpoint
# with the same CLS pooling, but without the TSDAE fine-tuning step
bert_base2 = models.Transformer("bert-base-uncased")
baseline_dim = bert_base2.get_word_embedding_dimension()
pooling2 = models.Pooling(baseline_dim, pooling_mode="cls")

model2 = SentenceTransformer(modules=[bert_base2, pooling2])

In [None]:
# score the baseline (no TSDAE fine-tuning) on the same STSb pairs
evaluator(model2)

0.3173615250643977

In [None]:
# compare against the original SBERT checkpoint, loaded by name
model3 = SentenceTransformer("bert-base-nli-mean-tokens")

In [None]:
# score the SBERT checkpoint on the same STSb pairs
evaluator(model3)

0.807870792395701

In [None]:
# compare against a more advanced pretrained model (MPNet-based), loaded by name
model4 = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base")

In [None]:
# score the MPNet-based model on the same STSb pairs
evaluator(model4)

0.8883451638682623