# Install and import dependencies

In [None]:
from google.colab import files
import json
import os
import csv
import io
import pandas as pd
import numpy as np
import copy
import spacy

# Path to the JSON file containing the license keys; fill this in before running.
PATH_TO_LICENSE_KEY = ''

# Load the license keys from the JSON file at PATH_TO_LICENSE_KEY.
# (Fixed: a stray character after the colon made this line a SyntaxError.)
with open(PATH_TO_LICENSE_KEY) as f:
    license_keys = json.load(f)

# `secret` is used further down for the licensed package index URL and to start the session.
secret = license_keys['SECRET']

# Export the credentials found in the license file as environment variables.
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID'] = license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']

# Package versions to install are taken from the license file as well.
sparknlp_version = license_keys["PUBLIC_VERSION"]
jsl_version = license_keys["JSL_VERSION"]

print ('SparkNLP Version:', sparknlp_version)
print ('SparkNLP-JSL Version:', jsl_version)

# Install Java
# Installs the headless OpenJDK 8 package via apt; the install output is discarded.
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# NOTE: JAVA_HOME / PATH are only set further down in this cell, so this prints the
# version of whichever `java` is currently first on PATH (the saved output shows 11.0.9.1).
! java -version

# Install pyspark
# Pinned to 2.4.4.
! pip install --ignore-installed -q pyspark==2.4.4

# Install Spark NLP
# Versions come from the `sparknlp_version` / `jsl_version` variables set earlier in this cell.
# NOTE(review): the extra index URL embeds `secret`, and pip echoes that URL into the
# cell output - clear the outputs before sharing this notebook.
! pip install --ignore-installed spark-nlp==$sparknlp_version
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version --extra-index-url https://pypi.johnsnowlabs.com/$secret

# Point JAVA_HOME at the Java 8 install location and put its bin directory first on PATH.
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ['PATH'] = os.environ['JAVA_HOME'] + "/bin:" + os.environ['PATH']

# These imports sit below the installs because the packages are only installed above.
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# NOTE(review): the annotator / base classes used later in the notebook
# (e.g. DocumentAssembler, NerDLModel) presumably come from these wildcard imports.
import sparknlp
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

# Start the session through sparknlp_jsl using the license secret; `spark` is used
# later in the notebook to create DataFrames.
spark = sparknlp_jsl.start(secret)

SparkNLP Version: 2.6.5
SparkNLP-JSL Version: 2.7.2
openjdk version "11.0.9.1" 2020-11-04
OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
[K     |████████████████████████████████| 215.7MB 31kB/s 
[K     |████████████████████████████████| 204kB 21.3MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Collecting spark-nlp==2.6.5
[?25l  Downloading https://files.pythonhosted.org/packages/c6/1d/9a2a7c17fc3b3aa78b3921167feed4911d5a055833fea390e7741bba0870/spark_nlp-2.6.5-py2.py3-none-any.whl (130kB)
[K     |████████████████████████████████| 133kB 4.0MB/s 
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-2.6.5
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/2.7.2-7ad44c2a1a61c48b6a74446b0a7cb6b97c58dba0
Collecting spark-nlp-jsl==2.7.2
[?25l  Downloading https://pypi.johnsnowlabs.com/2.7.2-7ad44c2a1a61c48

# Define pipeline elements

In [None]:
# --- Text preparation: 'text' -> 'document' -> 'sentence' -> 'token' ---
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

sentence_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# --- Pretrained clinical word embeddings and NER model ---
word_embeddings_clinical = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

ner_jsl = (
    NerDLModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Converter whose whitelist is restricted to the 'Diagnosis' label.
ner_converter_diagnosis = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['Diagnosis'])
)

# NOTE(review): the "chunk_embeddings" output column is not listed as an input of any
# later stage defined in this cell - confirm whether this stage is still needed.
chunk_embeddings = (
    ChunkEmbeddings()
    .setInputCols(["ner_chunk", "embeddings"])
    .setOutputCol("chunk_embeddings")
)

# --- Chunk -> document -> sentence-BERT embedding -> ICD-10-CM resolver ---
c2doc = (
    Chunk2Doc()
    .setInputCols("ner_chunk")
    .setOutputCol("ner_chunk_doc")
)

sbiobert_embedder = (
    BertSentenceEmbeddings.pretrained("sbiobert_base_cased_mli", 'en', 'clinical/models')
    .setInputCols(["ner_chunk_doc"])
    .setOutputCol("sbert_embeddings")
)

sbert_resolver = (
    SentenceEntityResolverModel.pretrained("sbiobertresolve_icd10cm_augmented", "en", "clinical/models")
    .setInputCols(["ner_chunk", "sbert_embeddings"])
    .setOutputCol("icd10cm_code")
    .setDistanceFunction("EUCLIDEAN")
)

# Assemble the stages in execution order.
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    word_embeddings_clinical,
    ner_jsl,
    ner_converter_diagnosis,
    chunk_embeddings,
    c2doc,
    sbiobert_embedder,
    sbert_resolver,
])

# Fit on a one-row DataFrame holding an empty string, then wrap the fitted model in a
# LightPipeline, which is what the helper functions below call fullAnnotate() on.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipeline_model = pipeline.fit(empty_df)
light_pipeline = sparknlp.base.LightPipeline(pipeline_model)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]
sbiobert_base_cased_mli download started this may take some time.
Approximate size to download 384.3 MB
[OK!]
sbiobertresolve_icd10cm_augmented download started this may take some time.
Approximate size to download 1.2 GB
[OK!]


# Define functions

In [None]:
def get_codes(light, code, text, url):
  '''
  Annotate one text and return its NER chunks paired with their resolved codes.

  example call: get_codes(light_pipeline, 'icd10cm_code', FEED_TEXT, FEED_URL)

  Parameters
  ----------
  light : object exposing fullAnnotate(text); only the first element of its result is read
  code  : key of the resolver output inside the annotation result (e.g. 'icd10cm_code')
  text  : text passed to light.fullAnnotate
  url   : value copied into the 'url' column of every output row

  Returns
  -------
  pandas.DataFrame with one row per (chunk, resolution) pair and the columns
  url, chunks, begin, end, sent, code, results, resolutions, res_distances.
  An empty annotation result, or a text without chunks, yields an empty frame
  that still has these columns.

  Raises
  ------
  KeyError if 'ner_chunk' or `code` is missing from the annotation result, or if
  an expected metadata key is missing from an annotation.
  '''
  columns = ['url', 'chunks', 'begin', 'end', 'sent', 'code',
             'results', 'resolutions', 'res_distances']

  full_light_result = light.fullAnnotate(text)

  # Guard against an empty result list instead of failing on the [0] lookup below.
  if not full_light_result:
    return pd.DataFrame(columns=columns)

  annotations = full_light_result[0]

  # Chunks and resolutions are paired positionally. The loop variable is named
  # `resolved` so that it no longer shadows the `code` parameter.
  records = [
      {
          'url': url,
          'chunks': chunk.result,
          'begin': chunk.begin,
          'end': chunk.end,
          'sent': chunk.metadata['sentence'],
          'code': resolved.result,
          'results': resolved.metadata['all_k_results'],
          'resolutions': resolved.metadata['all_k_resolutions'],
          'res_distances': resolved.metadata['all_k_distances'],
      }
      for chunk, resolved in zip(annotations['ner_chunk'], annotations[code])
  ]

  # Passing `columns` keeps the column set and order stable even when there are no records.
  return pd.DataFrame(records, columns=columns)

In [None]:
import itertools

def run_pipeline(feed):
  '''
  Run get_codes on every row of `feed` and stack the per-row results.

  `feed` is iterated with iterrows(); each row's 'text' and 'url' values are
  passed to get_codes together with the module-level `light_pipeline` and the
  'icd10cm_code' output key. Returns the pd.concat of all per-row DataFrames.
  '''
  per_row_frames = [
      get_codes(light_pipeline, 'icd10cm_code', record['text'], record['url'])
      for _, record in feed.iterrows()
  ]

  # Stack the per-row frames into a single DataFrame.
  return pd.concat(per_row_frames)

# Import and process feed data

- If you are running this on Google Colab (recommended), it might be best to export the feed data from gfm.db into a few smaller .json files, sized to fit the runtime restrictions of your license.

- Here, we exported the columns "url" and "fund_description" from all data into 4 separate .json files (~25k records in each file)

Example code:

```
feed = feed[['url','fund_description']]
feed.rename(columns={'fund_description':'text'}, inplace=True)
dfs = np.array_split(feed, 4)

PATH_TO_DATA_FOR_COLAB = ''

for i in range(4):
    with open(PATH_TO_DATA_FOR_COLAB + 'feed_chunk_' + str(i) + '.json', 'w', encoding='utf-8') as file:
        dfs[i].to_json(file, orient="records", force_ascii=False)

```


In [None]:
# Read in one exported chunk and analyze the chunks one at a time.
# PATH_TO_CHUNK is the directory/prefix of the exported files; chunk_n selects the file.
PATH_TO_CHUNK = ''
chunk_n = 0

# The export example writes these files as UTF-8 with force_ascii=False, so read them
# back explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(PATH_TO_CHUNK + 'feed_chunk_' + str(chunk_n) + '.json', encoding='utf-8') as json_file:
    feed = json.load(json_file)

In [1]:
# Turn the parsed JSON into a DataFrame; note that this rebinds `feed` from the raw
# JSON object to a DataFrame.
# NOTE(review): assumes a list of records with 'url' and 'text' keys, as produced by
# the export example above - confirm against the actual files.
feed = pd.DataFrame(feed)

### Text preprocessing

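The Spark NLP tokenizer does not reliably split tokens at punctuation that is not followed by whitespace, e.g. "end.Beginning".

We therefore preprocess the text manually, inserting a space after each of `.,!?` wherever it is directly followed by a non-whitespace character.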

In [None]:
import re

def CustomTokenize(df):
  '''
  Insert a space after . , ! or ? wherever it is directly followed by a
  non-whitespace character (e.g. "end.Beginning" -> "end. Beginning").

  Parameters
  ----------
  df : pandas.DataFrame with a 'text' column of strings.

  Returns
  -------
  list of str, one cleaned string per row, in row order. The input frame is
  not modified.
  '''
  # Zero-width match at the position right after one of . , ! ? that is followed by
  # a non-whitespace character. The previous class also contained a backslash, so a
  # space was inserted after literal backslashes as well.
  boundary = re.compile(r'(?<=[.,!?])(?=\S)')

  # Iterate over the column values directly so that any index works, not only a
  # default 0..n-1 index.
  return [boundary.sub(' ', text) for text in df['text']]

In [None]:
# Replace the raw 'text' column with its cleaned version. Dropping first and then
# assigning keeps 'text' as the last column.
feed = feed.drop(columns=['text']).assign(text=CustomTokenize(feed))

# Run pipeline

In [9]:
# Run the pipeline over every row of `feed`; %time reports how long the call took
# (the saved output shows a wall time of roughly 17 hours for this chunk).
%time r = run_pipeline(feed)

CPU times: user 1h 41min 45s, sys: 24min 39s, total: 2h 6min 24s
Wall time: 16h 58min 8s


In [10]:
# Directory/prefix for the exported results; fill this in before running.
EXPORT_PATH = ''

# Write this chunk's results to '<EXPORT_PATH>feed_chunk_<chunk_n>.csv' without the index.
r.to_csv(f'{EXPORT_PATH}feed_chunk_{chunk_n}.csv', index=False)