Import license keys

In [1]:
import os
import json
import csv
import io
import pandas as pd
import numpy as np

from google.colab import files

# Ask for the license JSON through the Colab file picker. The upload is kept
# under its own name so that `license_keys` only ever holds the parsed JSON.
uploaded_license = files.upload()
license_filename = list(uploaded_license.keys())[0]

with open(license_filename) as f:
    license_keys = json.load(f)

# Credentials are exported through the environment; `secret` is reused by the
# install cell and by sparknlp_jsl.start().
secret = license_keys['SECRET']
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID'] = license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']

# Library versions to install are dictated by the license file.
sparknlp_version = license_keys["PUBLIC_VERSION"]
jsl_version = license_keys["JSL_VERSION"]

print('SparkNLP Version:', sparknlp_version)
print('SparkNLP-JSL Version:', jsl_version)

Saving keys.json to keys.json
SparkNLP Version: 2.6.4
SparkNLP-JSL Version: 2.7.1


# Colab setup

Import dependencies into Python and start the Spark session

In [2]:
# Install Java
# NOTE: `java -version` below runs before JAVA_HOME/PATH are pointed at the
# Java 8 install, so it reports whatever JVM is already first on PATH.
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4

# Install Spark NLP
# The public and licensed versions come from the license file loaded above.
# NOTE: pip echoes the extra index URL, which embeds the license secret, in the
# cell output -- clear the output before sharing this notebook.
! pip install --ignore-installed spark-nlp==$sparknlp_version
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version --extra-index-url https://pypi.johnsnowlabs.com/$secret

# Point this process (and the JVM that Spark launches from it) at Java 8.
os.environ['JAVA_HOME'] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ['PATH'] = os.environ['JAVA_HOME'] + "/bin:" + os.environ['PATH']

import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

# Start the licensed Spark NLP session; `spark` is used by every function below.
spark = sparknlp_jsl.start(secret)

openjdk version "11.0.9.1" 2020-11-04
OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
[K     |████████████████████████████████| 215.7MB 65kB/s 
[K     |████████████████████████████████| 204kB 42.5MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Collecting spark-nlp==2.6.4
[?25l  Downloading https://files.pythonhosted.org/packages/d9/26/f7a6ac12339d2f1ed271c46c16705665620059e4559f323695925f3c63b4/spark_nlp-2.6.4-py2.py3-none-any.whl (129kB)
[K     |████████████████████████████████| 133kB 8.5MB/s 
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-2.6.4
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/2.7.1-c069474a59bb52cf25c5ed6e7beb05b04c42e7ca
Collecting spark-nlp-jsl==2.7.1
  Downloading https://pypi.johnsnowlabs.com/2.7.1-c069474a59bb52cf25c5ed6e7beb05b04c42e7ca/spark-nlp-jsl/spark-nlp-jsl-2.7.

# Define common pipeline elements

In [3]:
# Shared annotators. The functions below pick a subset of these and chain them
# into a Pipeline; columns are wired by name: text -> document -> sentence -> token.
document_assembler = DocumentAssembler() \
  .setInputCol('text')\
  .setOutputCol('document')

sentence_detector = SentenceDetector() \
  .setInputCols(['document'])\
  .setOutputCol('sentence')

tokenizer = Tokenizer()\
  .setInputCols(['sentence']) \
  .setOutputCol('token')

#-------------------------------------------------------------------------------
#embeddings
# Both embedding stages write to the same "embeddings" column, so a pipeline
# should contain only one of them.
# NOTE(review): the healthcare embeddings read "document" while the clinical
# embeddings read "sentence" -- confirm the difference is intended.
#-------------------------------------------------------------------------------
word_embeddings_healthcare = WordEmbeddingsModel.pretrained("embeddings_healthcare_100d","en","clinical/models")\
	.setInputCols(["document","token"])\
	.setOutputCol("embeddings")
 
word_embeddings_clinical = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("embeddings")
#-------------------------------------------------------------------------------


#-------------------------------------------------------------------------------
#NER
# Every NER model reads sentence/token/embeddings and writes to the same "ner"
# column, so a pipeline should contain only one of them.
#-------------------------------------------------------------------------------
ner_healthcare = NerDLModel.pretrained("ner_healthcare", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

ner_clinical = NerDLModel.pretrained("ner_clinical", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

ner_diseases = NerDLModel.pretrained("ner_diseases", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

ner_jsl = NerDLModel.pretrained("ner_jsl", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")
#-------------------------------------------------------------------------------

# NER converters: each one turns the "ner" tags into "ner_chunk" annotations and
# keeps a single whitelisted label (PROBLEM, Disease or Diagnosis).
ner_converter_problem = NerConverter() \
  .setInputCols(["sentence", "token", "ner"]) \
  .setOutputCol("ner_chunk")\
  .setWhiteList(['PROBLEM'])

ner_converter_diseases = NerConverter() \
  .setInputCols(["sentence", "token", "ner"]) \
  .setOutputCol("ner_chunk")\
  .setWhiteList(['Disease'])

ner_converter_diagnosis = NerConverter() \
  .setInputCols(["sentence", "token", "ner"]) \
  .setOutputCol("ner_chunk")\
  .setWhiteList(['Diagnosis'])

# Chunk-level embeddings computed from "ner_chunk" and "embeddings"; this is the
# input consumed by the entity resolvers below.
chunk_embeddings = ChunkEmbeddings()\
    .setInputCols(["ner_chunk", "embeddings"])\
    .setOutputCol("chunk_embeddings")
    
# Entity resolvers. All three read token/chunk_embeddings, write to the same
# "entity" column and use the COSINE distance function.
athena = ChunkEntityResolverModel.pretrained("chunkresolve_athena_conditions_healthcare","en","clinical/models")\
	.setInputCols(["token","chunk_embeddings"])\
	.setOutputCol("entity")\
  .setDistanceFunction("COSINE")
 
icd10 = ChunkEntityResolverModel.pretrained("chunkresolve_icd10cm_clinical","en","clinical/models")\
	.setInputCols(["token","chunk_embeddings"])\
	.setOutputCol("entity")\
  .setDistanceFunction("COSINE")
 
snomed = ChunkEntityResolverModel.pretrained("chunkresolve_snomed_findings_clinical","en","clinical/models")\
	.setInputCols(["token","chunk_embeddings"])\
	.setOutputCol("entity")\
  .setDistanceFunction("COSINE")


embeddings_healthcare_100d download started this may take some time.
Approximate size to download 475.8 MB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_healthcare download started this may take some time.
Approximate size to download 13.4 MB
[OK!]
ner_clinical download started this may take some time.
Approximate size to download 13.8 MB
[OK!]
ner_diseases download started this may take some time.
Approximate size to download 13.7 MB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]
chunkresolve_athena_conditions_healthcare download started this may take some time.
Approximate size to download 180.7 MB
[OK!]
chunkresolve_icd10cm_clinical download started this may take some time.
Approximate size to download 166.3 MB
[OK!]
chunkresolve_snomed_findings_clinical download started this may take some time.
Approximate size to download 162.6 MB
[OK!]


# Inputs (SKIP)

In [None]:
# input_list = [
# '''
# Hello,   I am asking for dontations for my childhood friend, Lorrie Leppo age 55. Six weeks ago she was diogosed with A.T.C thyriod cancer. This is a rare and very fast growing cancer. She is currently in the end stage at Hershey Medical Center. Her five children, 19 being the youngest lost their father 3 years ago to cancer. They are very much in need of prayers and help for all the unexpected expenses to come. Any donation you can afford will be greatly appreciated as her children have been through enough hardship over the past few years. Thank you for all you contributions.  
# '''
# ]

# input_list = [
# '''
# "We had just finished eating lunch and Judah and Eloise were playing upstairs for a few minutes before I was going to go up and put them to bed for a nap. I heard a noise and went upstairs to investigate. I found Eloise looking out my bedroom window and the screen missing. I looked down to see Judah laying just below the second step, unconscious. As quickly as I could, I ran down stairs and onto the back patio where Judah came to and began crying and I called 911. A neighbor I've never even met came outside, saw the commotion, and jumped the fence with his army medic kit and went about stabilizing Judah, while I went to get Ellie who was still standing in the upstairs window crying. The paramedics arrived within minutes, checked his vitals, cut his shirt off, and got him on a board to transfer him via ambulance to Mary Bridge.  Friends. God is SO SO good. Yes, he fell out of a window. Yes, he landed on his head on pavement and fractured his skull. But it could have been SO much worse. He was less than an inch from that bottom step. I don't even want to think about what his injuries would have been had he landed on the steps instead. It is nothing short of a miracle that his injuries are not more extensive.  Why do bad things happen? So that His glory, His grace and mercy, can be made known. So that we can better see the work of His hands. So that we press into Him more than we knew we could. I am SO beyond thankful for His careful watch on my son and our family and the way He has taken care of us through this trauma. Your prayers, words of encouragement, messages, etc have all been a part of that. Thank you.  ‚ù§Ô∏è " My brother is a full time working father while he wife Sam stays home with their 3  kids. We are so very grateful my nephew is alive and without injuries he could have sustained. Medical bills are coming their way that are unxpected and plenty for a family with one income. We trust God to always take care in our times of need. 
# If you would like to help out Jubal financially with the hospital stay and recovering process thats ahead, please feel free to do so here! Thank you and God bless!     
# '''
# ]

# Active sample input: a single fundraiser description, kept verbatim (including
# the encoding artifacts of the scraped text) for the experiment loop below.
input_list = [
'''
Have you ever watched television and wondered what it felt like when the doctors yell "CLEAR!"¬ù and shock someone's heart?  Debbie could tell you "‚Äú she's been cardioverted (the technical term for getting shocked) over 100 times in her life.  Born with a laundry list of complex congenital heart, Debbie's entire life has been about defying medical odds.  Her current heart is a miracle of science and God.  After 45 years of life and love, it is time for Debbie to receive a new heart "‚Äú and she needs your help.  As she waits for a determination by one of the nation's best transplant teams, her family is accruing expenses.  Travel between home and the hospital (in another state), an apartment near the hospital, the potential cost of an emergency private flight to meet her new heart, and other expenses will continue to mount. Debbi is a mother of two, wife and beloved daughter, sister, aunt, & cousin is seeking funds to cover medical expenses associated with getting a heart transplant. Debbie has setup her Heart Fund to allow donations to be tax deductible. The Tax ID for Debbie's Heart Fund is 93-6026869.              
'''
]

# input_list = [
# '''
# Hello my name is Randy Pimentel and I am creating this in hopes to help a friend that is in dire need of a life saving surgery.  The woman in the picture of this campaign is Amanda Somers. She is a 29 year old mother of an 8 year old little girl. Amanda is a bright, happy, caring, selfless person.  4 years ago she was diagnosed with Hyperthyroidism and Graves Disease. If you are not familiar with these they are Auto Immune Diseases that are debilitating in many ways. She is hospitalized often, she is not able to work anymore, can't partake in any type of physical activities, is losing weight daily, most of her hair has fallen out,  is on a liquid diet, has severe insomnia, and her throat is so swollen that it makes it hard for her to breathe or swallow and  she is now taking 18 pills a day to help with all of her symptoms but it isnt working as much as the doctors had hoped for. About 4 months ago she had a mini stroke which caused her face to be partially paralyzed. Her heart is and has been beating out of control, blood pressure is extremely high which creates more of a problem on its own.  It has taken 3 years for her to get cardiac clearance to have this upcoming surgery. Her Endocrinologist and the Cardiac Surgeon are in agreement that without this surgery she could lose her life at a very young age. She  will still have permanent heart damage because it has taken so long to get the approval. The cost of the operation is just about $5000. ....  and approximately 35 days away...with her being unable to work and already being financially strapped with other medical expenses I'm hoping that with alot of outside help we can all make this surgery happen for her and so she can get back to living her life as a woking person and mother. She didn't want to ask for help so I thought the least I could do was ask myself. Even if all you can donate is $1.00 it is beyond appreciated. Thank You All!! Sincerely, Randy Pimentel
# '''              
# ]

# input_list = [
# '''
# We are a hard working family. We were recently shocked with news of my husband Jose Gutierrez being diagnosed with A.M.L Leukemia. It has been a roller coaster of what if's? This last week has been hard on him and our family as we try to look for help and deal with everything the Doctors are telling me. I must confess that asking for donations it is hard to do because we have always worked for everything we need for us and our kids. So here I am asking for your kindness to help us by donating $1, $5 or if you are blessed to help us with more we will greatly appreciated it!! We can not afford to pay the hospital bills and he doesn't meet the criteria needed for Medicaid. He was told that he will not be able to work and even if the chemotherapy works he will not be able to work for a long period of time. He is now fighting for his life! I ask for your prayers for him and my family. I understand if you can't help with money. If I can please ask for  prayers for him to be cure. I would like to say thank you so much from my heart.  thank you for helping me save my husband by  getting  the treatments he needs and pay for his hospital bills and his care that he will need once he is out of the hospital.
# '''
# ]

# input_list = [

# "My daughter and I are Midwesterners, cornfields and hardworking people, and it was a wonderful place to raise her and her sister. Graudating from college she set out on a road trip to experience the western wilderness and fell in love with the mountains. She settled into a small town in Idaho for over 14 years, working at the local health clinic while being a volunteer EMT. Seven years later, she met her beloved partner Tony. Tony had been coming to Stanley since he was 18 years old and was every mountain girls dream. He was a skier, coaching Olympic hopefuls and having fun in the backcountry. Tony was also a skilled and respected fly fisherman when he wasn‚Äôt working. Tony convinced Raechel of his intentions and they were married two years later in the spring, in a field of wildflowers, looking over the grand Rocky Mountains with friends and family. Two and a half years later they were blessed with my grand-daughter Sadie. Now, this family wants nothing more than to be together this day and every day. Raechel suffers from health complications and after great consideration, they decided to move from their beloved mountain home to a more populated place where Sadie could go to school and Raechel could get fresh produce in a much lower elevation. Sadie started kindergarten and Raechels health improved greatly. After a year of moving to their new location, Raechel went back to school, being accepted into a graduate program at a local Christian College. They saved enough money to buy a house in their new home state of Oregon, yet, the same week they closed on their house tony was diagnosed with stage 2 Ewing Sarcoma which is a rare form of cancer. The tumor is in the right forearm of his ulna bone and extending into the soft tissue. The cancer is considered localized which is good news but it is also very aggressive and fast growing therefore the treatment for this cancer is extremely aggressive. 
# It requires hospitalization two times a month for high risk chemotherapy. He has a three day hospitalization, followed by a 5 day hospitalization, 12 days later. This cycle will continue until late November when they will order more imaging to determine how the cancer has responded to treatment. Tony will undergo surgery for removal of the tumor and his ulna bone which they intend to replace with a cadaver bone. After surgery he will resume the chemotherapy cycles until the end of March. Due to the severity of the treatment for this cancer, Tony will no longer be able to work for at least a year And will be rendered virtually disabled. Their financial needs are going to be for transportation, cost of living and medical. Driving over an hour, one way, to and from treatment and back and forth while he is hospitalized. Additional trips in between chemo treatment will also be necessary.  Fighting cancer is our number one priority.   We are all still working on being humble when it comes to asking and accepting help We appreciate your support and words of hope and encouragement. Thank you for visiting. We send out our greatest appreciation for all your help, love and prayers!!!"
# ]

# input_list = [
# "Our son Tyler Zens was diagnosed with Necrotizing Fasciitis (a rare bacterial infection)  last night, underwent surgery to remove the dead tissue in his leg and spent the night in ICU.  He will have to continue to undergo these surgeries just to remove and replace the antibiotic packings to stop the spread of the flesh eating bacteria in his body.  He is going to be in the hospital for an undetermined amount of time for his treatments.  Prayers are much needed at this time for Tyler and his precious family. We are reaching out for your assistance to help with medical costs that are not covered by their work insurance. Tyler and his wife Rachel have three beautiful, young children that will also need help.  Since Tyler will be out of work for awhile, his family still needs to be provided for and all the daily expenses with raising a young family.  Any donation will be graciously appreciated and very much needed. Please share our story with everyone you know.  Tyler has the biggest, kindest and most generous heart for such a young gentleman.   He has a strong faith based family and support group and we are reaching out to you for our son.   Thank you in advance for your generosity and blessings. The Zens Family"                        
# ]



# Functions

In [4]:
def RunNER(input_list, ner_model):
  '''
  Run one named-entity-recognition pipeline over a list of raw texts.

  inputs:
    input_list: list of strings; each string becomes one row of the 'text' column
    ner_model: string in ['ner_healthcare','ner_clinical','ner_diseases','ner_jsl']

  output:
    whatever LightPipeline.transform returns for the input rows
    (GetNERChunks calls .toPandas() on it and reads the 'ner_chunk' column)

  raises:
    ValueError if ner_model is not one of the supported names
    (previously the function silently returned None)
  '''

  # Each NER model is paired with the embeddings it is run with here and with
  # the converter that whitelists its label.
  if ner_model == 'ner_healthcare':
    embeddings, ner, converter = word_embeddings_healthcare, ner_healthcare, ner_converter_problem
  elif ner_model == 'ner_clinical':
    embeddings, ner, converter = word_embeddings_clinical, ner_clinical, ner_converter_problem
  elif ner_model == 'ner_diseases':
    embeddings, ner, converter = word_embeddings_clinical, ner_diseases, ner_converter_diseases
  elif ner_model == 'ner_jsl':
    embeddings, ner, converter = word_embeddings_clinical, ner_jsl, ner_converter_diagnosis
  else:
    raise ValueError('unknown ner_model: ' + repr(ner_model))

  empty_df = spark.createDataFrame([['']]).toDF('text')
  df = spark.createDataFrame(pd.DataFrame({"text": input_list}))

  pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    embeddings,
    ner,
    converter])

  # All stages are pretrained or stateless, so an empty frame is presumably
  # enough for fit() to produce the PipelineModel.
  pipeline_model = pipeline.fit(empty_df)
  light_pipeline = LightPipeline(pipeline_model)
  return light_pipeline.transform(df)

In [5]:
def GetNERChunks(result):
  '''
  input: NER result exposing .toPandas() with a 'ner_chunk' column
  output: list with the 'result' text of every chunk in the FIRST row only
  '''
  first_row_chunks = result.toPandas()['ner_chunk'][0]
  return [chunk['result'] for chunk in first_row_chunks]

In [6]:
def RemoveStopwords(chunks):
  '''
  Strip articles, demonstratives and possessives from each chunk.

  input: list of strings
  output: list of strings of the same length; the remaining words of each chunk
          are re-joined with single spaces (a chunk made only of stop words
          becomes the empty string)
  '''
  # Built once per call instead of once per chunk; matching is case-insensitive.
  stopwords = frozenset(['a', 'an', 'the', 'this', 'that', 'these', 'his', 'her', 'their'])
  return [
    ' '.join(word for word in chunk.split() if word.lower() not in stopwords)
    for chunk in chunks
  ]

In [7]:
def ChunksToNewDoc(chunks):
  '''Link the chunks into one document, separated by " and ".'''
  separator = ' and '
  new_doc = separator.join(chunks)
  return new_doc

In [8]:
def RunEntityResolution(result_ner, entity_resolution_model):
  '''
  Resolve the NER chunks of `result_ner` with one of the pretrained resolvers.

  inputs:
    result_ner: NER output to transform (must carry the columns read by
      chunk_embeddings and the resolver)
    entity_resolution_model: string in ['athena','icd10','snomed'], selecting
      chunkresolve_athena_conditions_healthcare, chunkresolve_icd10cm_clinical
      or chunkresolve_snomed_findings_clinical respectively

  output:
    whatever LightPipeline.transform returns, with the resolver's 'entity' column

  raises:
    ValueError for an unknown entity_resolution_model
    (previously this surfaced as an UnboundLocalError on `pipe`)
  '''

  if entity_resolution_model == 'athena':
    resolver = athena
  elif entity_resolution_model == 'icd10':
    resolver = icd10
  elif entity_resolution_model == 'snomed':
    resolver = snomed
  else:
    raise ValueError('unknown entity_resolution_model: ' + repr(entity_resolution_model))

  empty_df = spark.createDataFrame([['']]).toDF('text')

  pipe = Pipeline(stages=[chunk_embeddings, resolver])
  pipe_model = pipe.fit(empty_df)
  pipe_light = LightPipeline(pipe_model)
  results = pipe_light.transform(result_ner)
  return results

In [9]:
def GetEntityMetadata(result_df):
  '''
  input: pandas dataframe of entity resolution pipeline results
  output: list holding the metadata of each resolved entity in the FIRST row
  '''
  first_row_entities = result_df['entity'][0]
  return [resolved.metadata for resolved in first_row_entities]

# Execute functions (SKIP)

In [None]:
# Mount Google Drive; the loop below writes its JSON files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Try every first-pass NER model with every entity resolver and store the
# resolved metadata as one JSON file per (ner, er) combination.
# NOTE: this cell uses the two-argument RunNER from the "Functions" section; the
# "Scalable pipeline" section redefines RunNER with a different signature, so
# run this cell before that redefinition.
ners = ['ner_healthcare','ner_clinical','ner_diseases','ner_jsl']
ers = ['athena', 'icd10', 'snomed']

# NER model used for the second pass, per resolver.
second_pass_ner = {
  'athena': 'ner_healthcare',
  'icd10': 'ner_clinical',
  'snomed': 'ner_clinical',
}

for ner in ners:
  # The first pass does not depend on the resolver, so it runs once per NER
  # model instead of once per (ner, er) pair.
  ner_result = RunNER(input_list, ner)
  chunks = GetNERChunks(ner_result)
  chunks_no_stop = RemoveStopwords(chunks)
  doc = ChunksToNewDoc(chunks_no_stop)

  for er in ers:
    # Second pass: re-run NER on the condensed document, then resolve entities.
    ner_result_2 = RunNER([doc], second_pass_ner[er])
    entity = RunEntityResolution(ner_result_2, er).toPandas()

    metadata = GetEntityMetadata(entity)

    filename = '/content/drive/My Drive/Crowdfunding/' + ner + '_' + er + '.json'
    with open(filename, 'w') as fp:
      json.dump(metadata, fp)


# Scalable pipeline

## Import feed data

In [10]:
# Upload the feed CSV (e.g. example_50.csv or example_51-100.csv).
uploaded = files.upload()

# Read whichever file was uploaded instead of a hard-coded name, so switching
# input files no longer requires editing this cell.
feed_filename = list(uploaded.keys())[0]
feed = pd.read_csv(io.BytesIO(uploaded[feed_filename]))

Saving example_51-100.csv to example_51-100.csv


In [11]:
import copy

# Inspect a deep copy so that nothing done here can alter `feed`.
a = copy.deepcopy(feed)

# Show the first fundraiser description as a sanity check of the upload.
a['fund_description'][0]

"Anyone who has met Piper knows how lively and sweet she is. After weeks of seeing her 'not herself', it was confirmed this weekend that she has arthritis in her hips. One hip is particularly bad right now and will need hip replacement, but this will be a chronic issue. Without proper treatment she could be immobilized. This is incredibly painful for her, but she is in great spirits. Unfortunately this treatment is 12,000$ and not covered by insurance and the family does not have the means to give Piper the help that she needs. Piper is just a baby at 3 years old and has a bright future full of love, frisbees, and wet kisses ahead of her. PLEASE help us with this cause and keep PIPER HIPer, any bit will be more helpful than you know! #keeppiperHIPer"

## Define helper functions

In [12]:
from pyspark.sql.types import *

# Auxiliary functions
def equivalent_type(f):
    """Map a pandas/numpy dtype to the Spark SQL type used for that column.

    f: a dtype (or dtype string) such as the entries of ``DataFrame.dtypes``.
    Returns a Spark type instance; any dtype not listed falls back to StringType.
    """
    # datetime64[ns] carries a time of day, which DateType would drop.
    if f == 'datetime64[ns]': return TimestampType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    # float64 is double precision; FloatType is 32-bit and would lose precision.
    elif f == 'float64': return DoubleType()
    else: return StringType()

def define_structure(string, format_type):
    """Build the StructField for one column.

    string: column name.
    format_type: the column's pandas dtype, translated by equivalent_type.
    Returns StructField(string, <spark type>); if the dtype cannot be
    translated the column is declared as StringType.
    """
    try:
        typo = equivalent_type(format_type)
    # Only ordinary errors trigger the fallback; a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    except Exception:
        typo = StringType()
    return StructField(string, typo)

# Given a pandas dataframe, return the equivalent Spark dataframe.
def pandas_to_spark(pandas_df):
    """Convert a pandas DataFrame to a Spark DataFrame with an explicit schema.

    The schema has one StructField per column, derived from the column's pandas
    dtype through define_structure.
    """
    struct_list = [
        define_structure(column, typo)
        for column, typo in zip(pandas_df.columns, pandas_df.dtypes)
    ]
    p_schema = StructType(struct_list)
    # `sqlContext` is never defined in this notebook; use the SparkSession
    # created during setup instead.
    return spark.createDataFrame(pandas_df, p_schema)

In [13]:
def RunNER(feed, ner_model, first_column):
  '''
  Run one named-entity-recognition pipeline over a whole feed.

  inputs:
    feed: pandas df; must contain the text column named by first_column
    ner_model: string in ['ner_healthcare','ner_clinical','ner_diseases','ner_jsl']
    first_column: name of the column holding the text to analyse

  output:
    the transformed dataframe returned by the fitted pipeline
    (callers convert it with .toPandas() and read 'sentence' / 'ner_chunk')

  raises:
    ValueError if ner_model is not one of the supported names
    (previously the function silently returned None)
  '''

  # Each NER model is paired with the embeddings it is run with here and with
  # the converter that whitelists its label.
  if ner_model == 'ner_healthcare':
    embeddings, ner, converter = word_embeddings_healthcare, ner_healthcare, ner_converter_problem
  elif ner_model == 'ner_clinical':
    embeddings, ner, converter = word_embeddings_clinical, ner_clinical, ner_converter_problem
  elif ner_model == 'ner_diseases':
    embeddings, ner, converter = word_embeddings_clinical, ner_diseases, ner_converter_diseases
  elif ner_model == 'ner_jsl':
    embeddings, ner, converter = word_embeddings_clinical, ner_jsl, ner_converter_diagnosis
  else:
    raise ValueError('unknown ner_model: ' + repr(ner_model))

  #initiate empty df
  empty_df = spark.createDataFrame([['']]).toDF(first_column)

  #load feed data into df
  df = spark.createDataFrame(feed)

  #local document assembler reading the requested column (the module-level one
  #is fixed to 'text')
  document_assembler = DocumentAssembler() \
    .setInputCol(first_column)\
    .setOutputCol('document')

  pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    embeddings,
    ner,
    converter])

  #all stages are pretrained or stateless, so an empty frame is presumably
  #enough for fit() to produce the PipelineModel
  pipeline_model = pipeline.fit(empty_df)
  return pipeline_model.transform(df)

In [14]:
def GetChunksAndSentences(lofd):
  '''
  input: list of row dicts with the keys url, fund_description, sentence, ner_chunk
  output: list of dicts keeping url and fund_description, plus
          num_sentences (length of 'sentence') and, for every chunk, its
          'result' text and its sentence index converted to int
  '''
  summaries = []
  for record in lofd:
    summary = {
      'url': record['url'],
      'fund_description': record['fund_description'],
      'num_sentences': len(record['sentence']),
    }
    summary['ner_chunk'] = [
      {'result': chunk['result'], 'sentence': int(chunk.metadata['sentence'])}
      for chunk in record['ner_chunk']
    ]
    summaries.append(summary)
  return summaries

In [15]:
def PrepareNewDoc(lofd):
  '''
  Condense every description into a short document made of its NER chunks.

  input: list of dicts as produced by GetChunksAndSentences
  output: pandas df with the input fields plus
          chunks: chunk texts with stop words removed
          new_doc: the chunks joined by ChunksToNewDoc ('' when a row has no
                   chunks, so the column always exists and never holds NaN)
  '''
  records = []
  for d in lofd:
    dic = copy.deepcopy(d)
    chunk_texts = [chunk['result'] for chunk in d['ner_chunk']]
    # Clean and join once per row; doing it inside the chunk loop repeated the
    # work for every chunk and skipped rows without chunks entirely.
    dic['chunks'] = RemoveStopwords(chunk_texts)
    dic['new_doc'] = ChunksToNewDoc(dic['chunks'])
    records.append(dic)
  return pd.DataFrame(records)

In [16]:
# def PrepareNewDoc(lofd):
#   r = []
#   for d in lofd:
#     dic = copy.deepcopy(d)
#     num_sent = dic['num_sentences']
#     df = pd.DataFrame(dic['ner_chunk']) 

#     new_doc = []
#     for i in range(0, num_sent):
#       if i not in df['sentence'].to_list():
#         new_doc.append(" . ")
#       else:
#         words_to_join = df[df['sentence'] == i]['result'].to_list()
#         words_to_join = RemoveStopwords(words_to_join)
#         joined = " and ".join(words_to_join)
#         new_doc.append(joined + ". ")
    
#     dic['new_doc'] = "".join(new_doc)
#     r.append(dic)
#   return pd.DataFrame(r)

In [17]:
def RunEntityResolution(result_ner, entity_resolution_model):
  '''
  Resolve the NER chunks of `result_ner` with one of the pretrained resolvers.

  inputs:
    result_ner: NER output to transform (must carry the columns read by
      chunk_embeddings and the resolver)
    entity_resolution_model: string in ['athena','icd10','snomed']

  output:
    the transformed dataframe, including the resolver's 'entity' column

  raises:
    ValueError for an unknown entity_resolution_model
    (previously this surfaced as an UnboundLocalError on `pipe`)
  '''

  if entity_resolution_model == 'athena':
    resolver = athena
  elif entity_resolution_model == 'icd10':
    resolver = icd10
  elif entity_resolution_model == 'snomed':
    resolver = snomed
  else:
    raise ValueError('unknown entity_resolution_model: ' + repr(entity_resolution_model))

  empty_df = spark.createDataFrame([['']]).toDF('text')

  pipe = Pipeline(stages=[chunk_embeddings, resolver])
  pipe_model = pipe.fit(empty_df)
  results = pipe_model.transform(result_ner)
  return results

In [18]:
def GetEntityMetadata(lofd):
  '''
  input: list of row dicts, each with an 'entity' list of resolved entities
  output: new list of dicts (shallow copies of the rows) in which 'entity' is
          replaced by the list of each entity's metadata and placed last
  '''
  def _with_metadata(record):
    converted = {key: value for key, value in record.items() if key != 'entity'}
    converted['entity'] = [resolved.metadata for resolved in record['entity']]
    return converted

  return [_with_metadata(record) for record in lofd]

## Define main function

In [None]:
def RunPipeline(feed, ner_model_1, ner_model_2, entity):
  '''
  Two-pass NER followed by entity resolution.

  inputs:
    feed: pandas df of feed data, 2 columns: url and fund_description
    ner_model_1: string in ['ner_healthcare','ner_clinical','ner_diseases','ner_jsl']
    ner_model_2: string in ['ner_healthcare','ner_clinical','ner_diseases','ner_jsl']
    entity: string in ['icd10','athena','snomed']

    for entity == athena, ner_model_2 must be ner_healthcare

  output:
    list of dicts, one per feed row, as returned by GetEntityMetadata
  '''

  #first NER pass over the raw descriptions
  first_pass = RunNER(feed, ner_model_1, 'fund_description')

  #keep only the columns needed downstream, as a list of row dicts
  first_pass_columns = ['url','fund_description','sentence','ner_chunk']
  first_pass_records = first_pass.toPandas()[first_pass_columns].to_dict('records')
  chunk_records = GetChunksAndSentences(first_pass_records)

  #condensed documents for the second pass; NaN values become empty strings
  docs = PrepareNewDoc(chunk_records).fillna('')

  #second NER pass over the condensed documents
  second_pass = RunNER(docs[['url','new_doc']], ner_model_2, 'new_doc')

  #entity resolution, reduced to the join key and the resolved entities
  resolved = RunEntityResolution(second_pass, entity).toPandas()[['url','entity']]

  #attach the resolved entities to the first-pass information
  merged_records = docs.merge(resolved, on='url', how='left').to_dict('records')

  #replace the entity annotations by their metadata
  return GetEntityMetadata(merged_records)

In [22]:
def Text2NER(feed, list_of_ner_models):
  '''
  Run several NER models over the feed and return one long-format table per model.

  inputs:
    feed: pandas df of feed data, 2 columns: url and fund_description
    list_of_ner_models: ['ner_healthcare','ner_clinical','ner_diseases','ner_jsl']

  output:
    list of pandas dfs (one per model, in the given order) with one row per
    extracted chunk and the columns url, fund_description, text, sentence,
    start_char, end_char, entity, ner_model. A description without chunks
    keeps a single row whose chunk columns are NaN.
  '''

  chunk_columns = ['text', 'sentence', 'start_char', 'end_char', 'entity']

  def _chunk_fields(chunk):
    #explode() leaves a float NaN where a row had no chunks; test the type
    #rather than relying on object identity with np.nan
    if chunk is None or isinstance(chunk, float):
      return (np.nan,) * len(chunk_columns)
    return (
      chunk['result'],
      chunk.metadata['sentence'],
      chunk['begin'],
      chunk['end'],
      chunk.metadata['entity'],
    )

  r = []
  for model in list_of_ner_models:

    #send update
    print('starting NER for ' + model)

    #run NER
    results_ner = RunNER(feed, model, 'fund_description')

    #send update
    print('finished NER for ' + model)

    #process results: one row per chunk
    df = results_ner.toPandas()
    print('finished pandas for ' + model)
    df = df[['url', 'fund_description','ner_chunk']]
    df = df.explode('ner_chunk')
    ner = df.pop('ner_chunk')

    #extract all chunk fields in a single pass, then add them column by column
    fields = [_chunk_fields(x) for x in ner]
    for position, column in enumerate(chunk_columns):
      df[column] = [row[position] for row in fields]

    #add label for ner model
    df['ner_model'] = model

    #add to results
    r.append(df)

  return r

# Run Main Function

In [None]:
# Two-pass run: ner_diseases first, ner_clinical on the condensed documents,
# then resolution with the icd10 resolver.
results = RunPipeline(feed, 'ner_diseases', 'ner_clinical', 'icd10')

In [62]:
# Extract chunks with all four NER models; `res` holds one dataframe per model.
res = Text2NER(feed, ['ner_jsl','ner_diseases', 'ner_clinical','ner_healthcare'])

starting NER for ner_jsl
finished NER for ner_jsl
finished pandas for ner_jsl
starting NER for ner_diseases
finished NER for ner_diseases
finished pandas for ner_diseases
starting NER for ner_clinical
finished NER for ner_clinical
finished pandas for ner_clinical
starting NER for ner_healthcare
finished NER for ner_healthcare
finished pandas for ner_healthcare


In [64]:
# Stack the per-model dataframes; the ner_model column tells them apart.
df = pd.concat(res)

In [67]:
# Mount Google Drive so the results can be saved under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [68]:
# Save the combined NER table without the (repeated) row index.
df.to_csv('/content/drive/My Drive/Crowdfunding/ex_51-100_ner.csv', index=False)

In [None]:
# Save the RunPipeline results as JSON on Drive.
# NOTE(review): the recorded output of this cell is a TypeError, so `results`
# presumably contains a value json cannot serialise -- confirm which field and
# convert it before dumping.
filename = '/content/drive/My Drive/Crowdfunding/' + 'ner_disease_ner_clinical_icd10_cosine.json'
with open(filename, 'w') as fp:
    json.dump(results, fp)

TypeError: ignored

In [None]:
# Display the RunPipeline results (a list with one dict per feed row).
results