![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zCMIC4DyUgCYibA69-rXsrPrVPMRIvUl?usp=sharing)

# **Interview Task**
[Running a Spark NLP Healthcare Pipeline and Training a Custom NER Model](https://docs.google.com/document/d/1l_SpYGAlVGAEe9x-b8avgvKipCXetdap2ttc4UKreO4/edit?tab=t.0)  
## **Chapter-I Pipeline Implementation:**


## 1. Set Up Spark NLP for Healthcare

In [4]:
import json
import os

from google.colab import files

# Name under which the license file is kept in the working directory
LICENSE_FILE = 'spark_jsl.json'

# Ask for an upload only when the license file is not already present,
# so re-running the cell does not prompt again.
if not os.path.exists(LICENSE_FILE):
    uploaded = files.upload()
    if not uploaded:
        # Without this check a cancelled upload fails later with an opaque IndexError
        raise FileNotFoundError(
            f"No license file was uploaded; '{LICENSE_FILE}' is required to continue."
        )
    # The uploaded file keeps its original name; store it under the expected one
    os.rename(next(iter(uploaded)), LICENSE_FILE)

with open(LICENSE_FILE) as f:
    license_keys = json.load(f)

# Expose every license entry as a notebook-level variable and as an
# environment variable, so the `!` shell commands below can reference
# them as $NAME.
globals().update(license_keys)
os.environ.update(license_keys)

Saving Medical Language Models for Data Scientists  Training License.json to Medical Language Models for Data Scientists  Training License.json


In [5]:
license_keys.keys()

dict_keys(['SPARK_NLP_LICENSE', 'SECRET', 'JSL_VERSION', 'PUBLIC_VERSION', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN'])

In [6]:
license_keys['JSL_VERSION']

'5.4.1'

In [7]:
license_keys['PUBLIC_VERSION']

'5.4.1'

In [8]:
# Installing pyspark and spark-nlp
# $PUBLIC_VERSION is one of the license-file entries exported by the license cell above.
! pip install --upgrade -q pyspark==3.4.1 spark-nlp==$PUBLIC_VERSION


# Installing Spark NLP Healthcare
# $JSL_VERSION and $SECRET are license-file entries as well; $SECRET is part of
# the extra index URL, so avoid echoing this command or its output when sharing.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): this package is not version-pinned, unlike the two installs above.
! pip install -q spark-nlp-display

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.2/579.2 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m554.8/554.8 kB[0m [31m909.8 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp_jsl.pipeline_tracer import PipelineTracer
from sparknlp_jsl.pipeline_output_parser import PipelineOutputParser

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

# Spark configuration handed to the Healthcare session started below
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Report the installed library versions before starting the session
print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Start the licensed session with the SECRET entry of the license file
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Last expression of the cell: display the session object
spark

Spark NLP Version : 5.4.1
Spark NLP_JSL Version : 5.4.1


In [10]:
from sparknlp_jsl.pretrained import InternalResourceDownloader

# Name prefixes of the two NER model families used in this notebook
wanted_prefixes = ("ner_clinical", "ner_posology")

# Each entry unpacks into (model name, language, version)
ner_models = InternalResourceDownloader.returnPrivateModels("MedicalNerModel")
for model, lang, version in ner_models:
  # Only English models are of interest
  if lang != "en":
    continue
  if model.startswith(wanted_prefixes):
    print(model)

ner_clinical
ner_posology
ner_posology_small
ner_posology_greedy
ner_posology_large
ner_clinical_large
ner_posology_healthcare
ner_posology_large_biobert
ner_clinical_biobert
ner_posology_biobert
ner_posology_experimental
ner_clinical_trials_abstracts
ner_posology_emb_clinical_medium
ner_posology_emb_clinical_large
ner_posology_langtest
ner_clinical_langtest
ner_clinical_large_langtest
ner_clinical_abbreviation_langtest


## 2. Dataset Selection

In [11]:
# mt_samples dataset from John Snow Labs
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples_10.csv

In [12]:
mt_samples_df = spark.read.csv("mt_samples_10.csv", header=True, multiLine=True)

In [13]:
mt_samples_df.printSchema()

root
 |-- index: string (nullable = true)
 |-- text: string (nullable = true)



In [14]:
mt_samples_df.show(truncate=100)

+-----+----------------------------------------------------------------------------------------------------+
|index|                                                                                                text|
+-----+----------------------------------------------------------------------------------------------------+
|    0|Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nDischarge Summary - Mesoth...|
|    1|Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nBCCa Excision - Lower Lid\...|
|    2|Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nAnemia - Consult\nDescript...|
|    3|Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nIntensity-Modulated Radiat...|
|    4|Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nNeck Dissection\nDescripti...|
|    5|Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nHDR Brachytherapy\nDescrip...|
|    6|Sample Type 

In [15]:
print(mt_samples_df.limit(1).collect()[0]['text'])

Sample Type / Medical Specialty:
Hematology - Oncology
Sample Name:
Discharge Summary - Mesothelioma - 1
Description:
Mesothelioma, pleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.
(Medical Transcription Sample Report)
PRINCIPAL DIAGNOSIS:
Mesothelioma.
SECONDARY DIAGNOSES:
Pleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.
PROCEDURES
1. On August 24, 2007, decortication of the lung with pleural biopsy and transpleural fluoroscopy.
2. On August 20, 2007, thoracentesis.
3. On August 31, 2007, Port-A-Cath placement.
HISTORY AND PHYSICAL:
The patient is a 41-year-old Vietnamese female with a nonproductive cough that started last week. She has had right-sided chest pain radiating to her back with fever starting yesterday. She has a history of pericarditis and pericardectomy in May 2006 and developed cough with right-sided chest pain, and went to an urgent care cen

## 3. NER Pipeline Execution

### Pipeline Function

In [16]:
def get_pipeline_model(embeddings, model_name = 'ner_clinical'):
  """Build and fit a clinical NER pipeline.

  The pipeline chains, in order: DocumentAssembler, SentenceDetector,
  Tokenizer, a pretrained WordEmbeddingsModel, a pretrained MedicalNerModel
  and NerConverterInternal. It reads the ``text`` column and produces the
  ``document``, ``sentence``, ``token``, ``embeddings``, ``ner`` and
  ``ner_chunk`` columns.

  Args:
    embeddings: name of the pretrained English word-embeddings model to load
      from "clinical/models".
    model_name: name of the pretrained English MedicalNerModel to load from
      "clinical/models".

  Returns:
    The pipeline fitted on a one-row DataFrame holding an empty string.

  Note:
    Relies on the notebook-level ``spark`` session to create that DataFrame.
  """
  # Raw text -> document annotation
  document_stage = (
      DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")
  )

  # Document -> sentences
  sentence_stage = (
      SentenceDetector()
      .setInputCols(["document"])
      .setOutputCol("sentence")
  )

  # Sentences -> tokens
  token_stage = (
      Tokenizer()
      .setInputCols(["sentence"])
      .setOutputCol("token")
  )

  # Pretrained word embeddings, selected by the caller
  embedding_stage = (
      WordEmbeddingsModel.pretrained(embeddings, "en", "clinical/models")
      .setInputCols(["sentence", "token"])
      .setOutputCol("embeddings")
  )

  # Pretrained NER model, selected by the caller
  ner_stage = (
      MedicalNerModel.pretrained(model_name, "en", "clinical/models")
      .setInputCols(["sentence", "token", "embeddings"])
      .setOutputCol("ner")
  )

  # Token-level NER tags -> entity chunks
  chunk_stage = (
      NerConverterInternal()
      .setInputCols(["sentence", "token", "ner"])
      .setOutputCol("ner_chunk")
  )

  stages = [
      document_stage,
      sentence_stage,
      token_stage,
      embedding_stage,
      ner_stage,
      chunk_stage,
  ]

  # Fit on a placeholder DataFrame with a single empty text
  placeholder_df = spark.createDataFrame([[""]]).toDF("text")
  return Pipeline(stages=stages).fit(placeholder_df)

### Test ner_clinical pipeline

In [17]:
# Names of the pretrained word embeddings and NER model to load
embeddings = 'embeddings_clinical'
model_name = 'ner_clinical'

# get_pipeline_model returns the fitted pipeline; LightPipeline wraps it so that
# plain Python strings can be annotated directly.
light_pipeline_model = get_pipeline_model(embeddings, model_name)
light_model = LightPipeline(light_pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical download started this may take some time.
[OK!]


In [18]:
from pprint import pprint

# Two short sentences used to check the pipeline on a single string
text = "I had a headache yesterday. I took an Advil."

# Annotate the string and pretty-print the complete annotation output
light_result = light_model.fullAnnotate(text)
pprint(light_result)

[{'document': [Annotation(document, 0, 43, I had a headache yesterday. I took an Advil., {}, [])],
  'embeddings': [Annotation(word_embeddings, 0, 0, I, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'I', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 2, 4, had, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'had', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 6, 6, a, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'a', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 8, 15, headache, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'headache', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 17, 25, yesterday, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'yesterday', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 26, 26, ., {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true',

### Extract **ner_clinical** predictions from the mt_samples via the pipeline

#### Prepare input texts

In [19]:
# Bring the `text` column to the driver as a plain Python list, one item per row
texts = [row[0] for row in mt_samples_df.select("text").collect()]
print(len(texts))
print(type(texts), type(texts[0]))

10
<class 'list'> <class 'str'>


In [20]:
# Preview the first three documents, each under a dash-padded banner
for idx, text in enumerate(texts[:3]):
  banner = f"Text {idx + 1}:".center(100, '-')
  print(banner)
  print(text)

----------------------------------------------Text 1:-----------------------------------------------
Sample Type / Medical Specialty:
Hematology - Oncology
Sample Name:
Discharge Summary - Mesothelioma - 1
Description:
Mesothelioma, pleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.
(Medical Transcription Sample Report)
PRINCIPAL DIAGNOSIS:
Mesothelioma.
SECONDARY DIAGNOSES:
Pleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.
PROCEDURES
1. On August 24, 2007, decortication of the lung with pleural biopsy and transpleural fluoroscopy.
2. On August 20, 2007, thoracentesis.
3. On August 31, 2007, Port-A-Cath placement.
HISTORY AND PHYSICAL:
The patient is a 41-year-old Vietnamese female with a nonproductive cough that started last week. She has had right-sided chest pain radiating to her back with fever starting yesterday. She has a history of pericarditis and peric

#### **pipeline_tracer** to get structured output

In [21]:
# Trace the fitted pipeline that is currently bound to light_pipeline_model
pipeline_tracer = PipelineTracer(light_pipeline_model)

# Take the parser dictionary generated by the tracer and set its
# "document_identifier" entry before building the output parser from it
column_maps = pipeline_tracer.createParserDictionary()
column_maps.update({"document_identifier": "ner_pipeline"})
pipeline_parser = PipelineOutputParser(column_maps)

#### Getting predictions

In [22]:
from pyspark.sql.functions import monotonically_increasing_id
import pandas as pd

# Accumulators: one record per extracted entity, one record per input text
all_results = []
all_texts = []

# Annotate every text; its position in `texts` is used as text_id
for idx, text in enumerate(texts):
    light_result = light_model.fullAnnotate([text])
    result = pipeline_parser.run(light_result)

    # A single text was submitted, so its entities are under result['result'][0]
    all_results.extend(
        {
            'text_id': idx,
            'begin': entity['begin'],
            'end': entity['end'],
            'chunk': entity['chunk'],
            'entity': entity['ner_label'],
        }
        for entity in result['result'][0]['entities']
    )

    all_texts.append({'text_id': idx,'text': text})

# Build each DataFrame once from the accumulated records
result_df = pd.DataFrame(all_results)
text_df = pd.DataFrame(all_texts)

In [23]:
result_df.head(10)

Unnamed: 0,text_id,begin,end,chunk,entity
0,0,88,99,Mesothelioma,PROBLEM
1,0,118,129,Mesothelioma,PROBLEM
2,0,132,147,pleural effusion,PROBLEM
3,0,150,168,atrial fibrillation,PROBLEM
4,0,171,176,anemia,PROBLEM
5,0,179,185,ascites,PROBLEM
6,0,188,204,esophageal reflux,PROBLEM
7,0,222,243,deep venous thrombosis,PROBLEM
8,0,305,316,Mesothelioma,PROBLEM
9,0,340,355,Pleural effusion,PROBLEM


In [24]:
entity_counts = result_df['entity'].value_counts()
print(entity_counts)

entity
PROBLEM      400
TREATMENT    262
TEST         140
Name: count, dtype: int64


#### **Save** the NER results and the corresponding texts as CSV files.

In [25]:
result_df.to_csv("/content/ner_clinical_mtsamples_ner_results_for_conll.csv", index=False)
text_df.to_csv("/content/mtsamples_texts.csv", index=False)

### Test ner_posology pipeline

In [27]:
text ='''The patient was prescribed 1 capsule of Parol with meals .
He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day .
It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months .'''

In [28]:
# Same word embeddings as before, but with the posology NER model
embeddings = 'embeddings_clinical'
model_name = 'ner_posology'

# Rebuild the fitted pipeline and its LightPipeline wrapper for the new model
light_pipeline_model = get_pipeline_model(embeddings, model_name)
light_model = LightPipeline(light_pipeline_model)

# Rebuild the output parser from the new pipeline, so that it is derived from
# the pipeline actually in use and this section does not depend on a parser
# created for a different model.
pipeline_tracer = PipelineTracer(light_pipeline_model)
column_maps = pipeline_tracer.createParserDictionary()
column_maps.update({"document_identifier": "ner_pipeline"})
pipeline_parser = PipelineOutputParser(column_maps)

# Annotate the sample text defined in the previous cell
light_result = light_model.fullAnnotate(text)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_posology download started this may take some time.
[OK!]


In [29]:
pprint(light_result)

[{'document': [Annotation(document, 0, 339, The patient was prescribed 1 capsule of Parol with meals .
He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals , and metformin 1000 mg two times a day .
It was determined that all SGLT2 inhibitors should be discontinued indefinitely fro 3 months ., {}, [])],
  'embeddings': [Annotation(word_embeddings, 0, 2, The, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'The', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 4, 10, patient, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'patient', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 12, 14, was, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token': 'was', 'sentence': '0'}, []),
                 Annotation(word_embeddings, 16, 25, prescribed, {'isOOV': 'false', 'pieceId': '-1', 'isWordStart': 'true', 'token'

### Extract **ner_posology** predictions from the mt_samples dataset via the pipeline

In [33]:
# Input texts for pipeline
from pyspark.sql import Row

# text_id is the position of each text in the collected result (0, 1, 2, ...).
# monotonically_increasing_id() only guarantees unique, increasing values, not
# consecutive ones, so its ids depend on how Spark partitions the data.
texts = [
    Row(text=row["text"], text_id=position)
    for position, row in enumerate(mt_samples_df.select("text").collect())
]

In [36]:
texts[0]

Row(text='Sample Type / Medical Specialty:\nHematology - Oncology\nSample Name:\nDischarge Summary - Mesothelioma - 1\nDescription:\nMesothelioma, pleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.\n(Medical Transcription Sample Report)\nPRINCIPAL DIAGNOSIS:\nMesothelioma.\nSECONDARY DIAGNOSES:\nPleural effusion, atrial fibrillation, anemia, ascites, esophageal reflux, and history of deep venous thrombosis.\nPROCEDURES\n1. On August 24, 2007, decortication of the lung with pleural biopsy and transpleural fluoroscopy.\n2. On August 20, 2007, thoracentesis.\n3. On August 31, 2007, Port-A-Cath placement.\nHISTORY AND PHYSICAL:\nThe patient is a 41-year-old Vietnamese female with a nonproductive cough that started last week. She has had right-sided chest pain radiating to her back with fever starting yesterday. She has a history of pericarditis and pericardectomy in May 2006 and developed cough with right-sided chest pain, and 

In [39]:
# Column layouts of the two output tables. Passing them explicitly keeps the
# columns present even when no entity (or no text) was collected.
ENTITY_COLUMNS = ['text_id', 'begin', 'end', 'chunk', 'entity']
TEXT_COLUMNS = ['text_id', 'text']

# Initialize empty lists to store results
all_results = []
all_texts = []

# Process each text through the pipeline
for row in texts:
  text_id = int(row['text_id'])
  text = row['text']

  # Save the text and its id
  all_texts.append({'text_id': text_id,'text': text})

  # Run the posology NER pipeline
  light_result = light_model.fullAnnotate([text])
  result = pipeline_parser.run(light_result)

  # Create a row for each entity; a single text was submitted, so its
  # entities are under result['result'][0]
  for entity in result['result'][0]['entities']:
      all_results.append({
          'text_id': text_id,
          'begin': entity['begin'],
          'end': entity['end'],
          'chunk': entity['chunk'],
          'entity': entity['ner_label']
      })

# Convert results to DataFrames
result_df = pd.DataFrame(all_results, columns=ENTITY_COLUMNS)
texts_df = pd.DataFrame(all_texts, columns=TEXT_COLUMNS)

# Save as CSV files (paths are relative to the current working directory)
# NOTE(review): "mtsamples_texts.csv" is also written by the ner_clinical
# section under /content/; if the working directory is /content, this
# overwrites that file - confirm this is intended.
result_df.to_csv("ner_posology_ner_results.csv", index=False)
texts_df.to_csv("mtsamples_texts.csv", index=False)

In [40]:
result_df.head(10)

Unnamed: 0,text_id,begin,end,chunk,entity
0,0,1609,1616,Coumadin,DRUG
1,0,1618,1621,1 mg,STRENGTH
2,0,1623,1627,daily,FREQUENCY
3,0,1696,1705,Amiodarone,DRUG
4,0,1707,1712,100 mg,STRENGTH
5,0,1714,1716,p.o,ROUTE
6,0,1719,1723,daily,FREQUENCY
7,0,2770,2777,Coumadin,DRUG
8,0,2880,2886,Lovenox,DRUG
9,0,2888,2892,40 mg,STRENGTH


In [42]:
result_df["entity"].value_counts()

Unnamed: 0_level_0,count
entity,Unnamed: 1_level_1
DRUG,77
ROUTE,14
STRENGTH,13
FREQUENCY,12
FORM,3
DURATION,2
DOSAGE,1


---  
*`R.Caliskan`*
