<a href="https://colab.research.google.com/github/pradeepbatchu/machine-learning/blob/master/nlp/ICD10_Chunk_Entity_Resolver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json

from google.colab import files

# Ask for the license file (JSON) through the Colab upload widget.
# The upload result is kept under its own name so that `license_keys` only ever
# refers to the parsed license dictionary, which makes this cell safe to re-run.
uploaded_files = files.upload()
if not uploaded_files:
    raise ValueError("No license file was uploaded; upload the JSON keys file to continue.")

# Only the first uploaded file is read; any additional uploads are ignored.
license_file_name = next(iter(uploaded_files))
with open(license_file_name, encoding="utf-8") as f:
    license_keys = json.load(f)

Saving keys.json to keys.json


In [2]:
%%capture
# %%capture keeps this cell's output (installer logs and, presumably, the echoed
# environment values) out of the saved notebook.

# Export every entry of the license dictionary as an environment variable;
# $k and $v are expanded from the loop variables by the %set_env line magic.
for k,v in license_keys.items(): 
    %set_env $k=$v

# Download and run the John Snow Labs Colab setup script.
# NOTE(review): the script is fetched from the `master` branch, so its contents
# can change between runs — consider pinning a commit.
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jsl_colab_setup.sh
!bash jsl_colab_setup.sh

# Provides sparknlp_display (EntityResolverVisualizer), imported later in the notebook.
# NOTE(review): no version is pinned here.
! pip install spark-nlp-display

In [4]:
import json
import os
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
from sparknlp.util import *
import sparknlp_jsl
import sparknlp

from sparknlp.pretrained import ResourceDownloader
from pyspark.sql import functions as F

# Spark settings handed to the session start call below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the session through sparknlp_jsl, authenticating with the SECRET entry
# of the uploaded license dictionary.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Report the versions of both libraries so the run can be reproduced.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

Spark NLP Version : 3.0.2
Spark NLP_JSL Version : 3.0.2


In [5]:
# Display the session object created in the previous cell.
spark

In [6]:
# Wraps the raw "text" column of the input DataFrame into "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained healthcare sentence detector: "document" -> "sentence".
sentenceDetectorDL = (
    SentenceDetectorDLModel
    .pretrained("sentence_detector_dl_healthcare", "en", 'clinical/models')
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: "sentence" -> "raw_token" (tokens before stop-word filtering).
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("raw_token")
)

# Stop-word cleaner: "raw_token" -> "token", the token column that the
# embedding, NER and resolver stages read.
stopwords = (
    StopWordsCleaner()
    .setInputCols(["raw_token"])
    .setOutputCol("token")
)
  

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 363.9 KB
[OK!]


In [7]:
# Pretrained clinical word embeddings ("embeddings_clinical"). This is a large
# download — the recorded run reports roughly 1.6 GB.
word_embeddings = (
    WordEmbeddingsModel
    .pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)
  

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]


In [8]:
# Clinical named-entity recognition: reads the "sentence", "token" and
# "embeddings" columns and writes its tags to "ner".
clinical_ner = (
    MedicalNerModel
    .pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# The tag-to-chunk converter is configured in the following cell with
# NerConverterInternal (an earlier NerConverter-based variant was dropped).


ner_clinical download started this may take some time.
Approximate size to download 13.9 MB
[OK!]


In [9]:
# Converts NER tags into chunks ("ner_chunk"), keeping only PROBLEM entities;
# these chunks are what the ICD-10 resolver below works on.
# (.setPreservePosition(False) was tried here and is left disabled.)
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['PROBLEM'])
)

# Combines the "ner_chunk" and "embeddings" columns into "chunk_embeddings".
chunk_embeddings = (
    ChunkEmbeddings()
    .setInputCols("ner_chunk", "embeddings")
    .setOutputCol("chunk_embeddings")
)

# Pretrained ICD-10-CM chunk resolver; writes its result to "icd10cm_code".
# The distance function is COSINE ("EUCLIDEAN" is the alternative noted in the
# original cell) and the neighbour count is set to 5.
icd10cm_resolution = (
    ChunkEntityResolverModel
    .pretrained("chunkresolve_icd10cm_clinical", "en", "clinical/models")
    .setInputCols(["token", "chunk_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("COSINE")
    .setNeighbours(5)
)

chunkresolve_icd10cm_clinical download started this may take some time.
Approximate size to download 166.2 MB
[OK!]


In [11]:
# Sample clinical case note used as the example document. The adjacent string
# literals inside the parentheses are concatenated into one single-line string
# (each piece ends with a space or continues mid-sentence, so no newlines are added).
clinical_note = (
    'A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years '
    'prior to presentation and subsequent type two diabetes mellitus (T2DM), one prior '
    'episode of HTG-induced pancreatitis three years prior to presentation, associated '
    'with an acute hepatitis, and obesity with a body mass index (BMI) of 33.5 kg/m2, '
    'presented with a one-week history of polyuria, polydipsia, poor appetite, and vomiting. '
    'Two weeks prior to presentation, she was treated with a five-day course of amoxicillin '
    'for a respiratory tract infection. She was on metformin, glipizide, and dapagliflozin '
    'for T2DM and atorvastatin and gemfibrozil for HTG. She had been on dapagliflozin for six months '
    'at the time of presentation. Physical examination on presentation was significant for dry oral mucosa; '
    'significantly, her abdominal examination was benign with no tenderness, guarding, or rigidity. Pertinent '
    'laboratory findings on admission were: serum glucose 111 mg/dl, bicarbonate 18 mmol/l, anion gap 20, '
    'creatinine 0.4 mg/dL, triglycerides 508 mg/dL, total cholesterol 122 mg/dL, glycated hemoglobin (HbA1c) '
    '10%, and venous pH 7.27. Serum lipase was normal at 43 U/L. Serum acetone levels could not be assessed '
    'as blood samples kept hemolyzing due to significant lipemia. The patient was initially admitted for '
    'starvation ketosis, as she reported poor oral intake for three days prior to admission. However, '
    'serum chemistry obtained six hours after presentation revealed her glucose was 186 mg/dL, the anion gap '
    'was still elevated at 21, serum bicarbonate was 16 mmol/L, triglyceride level peaked at 2050 mg/dL, and '
    'lipase was 52 U/L. The β-hydroxybutyrate level was obtained and found to be elevated at 5.29 mmol/L - '
    'the original sample was centrifuged and the chylomicron layer removed prior to analysis due to '
    'interference from turbidity caused by lipemia again. The patient was treated with an insulin drip '
    'for euDKA and HTG with a reduction in the anion gap to 13 and triglycerides to 1400 mg/dL, within '
    '24 hours. Her euDKA was thought to be precipitated by her respiratory tract infection in the setting '
    'of SGLT2 inhibitor use. The patient was seen by the endocrinology service and she was discharged on '
    '40 units of insulin glargine at night, 12 units of insulin lispro with meals, and metformin 1000 mg '
    'two times a day. It was determined that all SGLT2 inhibitors should be discontinued indefinitely. She '
    'had close follow-up with endocrinology post discharge.'
)

# One-row DataFrame whose single "text" column holds the clinical note; this is
# the DataFrame the pipeline is fitted on.
data_ner = spark.createDataFrame([(clinical_note,)], ["text"])

In [12]:
# Assemble the stages in execution order: each stage reads columns written by
# the stages listed before it.
pipeline_icd10 = Pipeline().setStages([
    documentAssembler,
    sentenceDetectorDL,
    tokenizer,
    stopwords,
    word_embeddings,
    clinical_ner,
    ner_converter,
    chunk_embeddings,
    icd10cm_resolution,
])

# Fit on the sample note to obtain the PipelineModel used for annotation.
model_icd10 = pipeline_icd10.fit(data_ner)


In [14]:
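# NOTE: pickling is unlikely to work for a fitted Spark PipelineModel (it wraps JVM objects);
# the Spark ML way to persist it is model_icd10.write().overwrite().save("<path>").
#import pickle
#pickle.dump(model_icd10, open("model.pkl", "wb"))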

In [15]:
#model = pickle.load(open("model.pkl", "rb"))  # Spark ML alternative: PipelineModel.load("<path>")

In [16]:
# Wrap the fitted model in a LightPipeline; the cells below call its
# fullAnnotate method directly on plain Python strings.
light_pipeline_icd10 = LightPipeline(model_icd10)

In [17]:
# Annotate one short sentence. The result is indexed with [0] below, i.e. the
# first (and, for a single input string, presumably only) entry.
full_light_result = light_pipeline_icd10.fullAnnotate('patient has a cold.')

# Show the resolver annotations ("icd10cm_code") produced for that sentence.
full_light_result[0]['icd10cm_code']

[Annotation(entity, 14, 17, J00, {'chunk': '0', 'all_k_results': 'J00:::P800:::L502', 'all_k_distances': '0.5192:::0.7329:::1.3345', 'confidence': '0.4444', 'all_k_cosine_distances': '0.1785:::0.3330:::0.2385', 'all_k_resolutions': 'Acute nasopharyngitis [common cold]:::Cold injury syndrome:::Urticaria due to cold and heat', 'target_text': 'cold', 'all_k_aux_labels': '', 'token': 'cold', 'resolved_text': 'Acute nasopharyngitis [common cold]', 'all_k_confidences': '0.4444:::0.3589:::0.1967', 'distance': '0.5192', 'sentence': '0'})]

In [None]:
#full_light_result[0]

In [18]:
# Show the entity chunks ("ner_chunk") for the same sentence, to compare with
# the resolved codes displayed earlier.
full_light_result[0]["ner_chunk"]

[Annotation(chunk, 14, 17, cold, {'entity': 'PROBLEM', 'sentence': '0', 'chunk': '0', 'confidence': '0.8805'})]

In [19]:
from sparknlp_display import EntityResolverVisualizer

# Render the annotated sentence with each chunk shown next to its resolved code.
vis = EntityResolverVisualizer()

# Override the colour used for PROBLEM chunks.
vis.set_label_colors({'PROBLEM': '#008080'})

vis.display(
    full_light_result[0],
    label_col='ner_chunk',
    resolution_col='icd10cm_code',
)


In [20]:
def ICD10(text):
    """Annotate ``text`` with the ICD-10 light pipeline and return codes and chunks.

    Args:
        text: Input passed unchanged to ``light_pipeline_icd10.fullAnnotate``.

    Returns:
        A 2-tuple ``(icd, ner)`` holding the ``'icd10cm_code'`` and
        ``'ner_chunk'`` entries of the first result returned by ``fullAnnotate``.
    """
    # Only the first result is used, matching a single input string.
    first_result = light_pipeline_icd10.fullAnnotate(text)[0]
    return first_result['icd10cm_code'], first_result['ner_chunk']



In [21]:
# Second ad-hoc example, this time routed through the ICD10 helper.
text ='A patient has a fever'

# NOTE(review): in the recorded output "fever" resolves to A921 (O'nyong-nyong fever)
# with confidence 0.3332 — a low-confidence match worth sanity-checking.
ICD10(text=text)

([Annotation(entity, 16, 20, A921, {'chunk': '0', 'all_k_results': 'A921:::A790:::A250:::B550', 'all_k_distances': '0.0000:::0.4048:::0.4048:::0.4048', 'confidence': '0.3332', 'all_k_cosine_distances': '0.0000:::0.0000:::0.0000:::0.0000', 'all_k_resolutions': "O'nyong-nyong fever:::Trench fever:::Spirillosis:::Visceral leishmaniasis", 'target_text': 'fever', 'all_k_aux_labels': '', 'token': 'fever', 'resolved_text': "O'nyong-nyong fever", 'all_k_confidences': '0.3332:::0.2223:::0.2223:::0.2223', 'distance': '0.0000', 'sentence': '0'})],
 [Annotation(chunk, 16, 20, fever, {'entity': 'PROBLEM', 'sentence': '0', 'chunk': '0', 'confidence': '0.7135'})])