### Colab Setup

Import license keys

In [None]:
import os
from google.colab import files
import json
import pandas as pd

# Upload the license JSON from the local machine. files.upload() returns a
# mapping of uploaded file name -> raw file bytes.
uploaded_license = files.upload()
if not uploaded_license:
    raise ValueError('No license file was uploaded; re-run this cell and select the license JSON file.')

# Parse the uploaded bytes directly instead of re-opening the file by name:
# Colab may store an upload under a de-duplicated name (e.g. "keys (1).json"),
# in which case opening the original name would read an older copy.
license_keys = json.loads(next(iter(uploaded_license.values())))

secret = license_keys['SECRET']

# Export the license and AWS credentials as environment variables.
for env_key in ('SPARK_NLP_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[env_key] = license_keys[env_key]

# Library versions to install are dictated by the license file.
sparknlp_version = license_keys["PUBLIC_VERSION"]
jsl_version = license_keys["JSL_VERSION"]

print('SparkNLP Version:', sparknlp_version)
print('SparkNLP-JSL Version:', jsl_version)

Saving keys.json to keys.json
SparkNLP Version: 2.6.5
SparkNLP-JSL Version: 2.7.2


### Install dependencies

In [None]:
# Install Java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==$sparknlp_version
! python -m pip install --upgrade spark-nlp-jsl==$jsl_version --extra-index-url https://pypi.johnsnowlabs.com/$secret
! pip install --ignore-installed spark-nlp-display

openjdk version "11.0.9.1" 2020-11-04
OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04)
OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
[K     |████████████████████████████████| 215.7MB 72kB/s 
[K     |████████████████████████████████| 204kB 46.5MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
Collecting spark-nlp==2.6.5
[?25l  Downloading https://files.pythonhosted.org/packages/c6/1d/9a2a7c17fc3b3aa78b3921167feed4911d5a055833fea390e7741bba0870/spark_nlp-2.6.5-py2.py3-none-any.whl (130kB)
[K     |████████████████████████████████| 133kB 7.6MB/s 
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-2.6.5
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/<REDACTED_SECRET>
Collecting spark-nlp-jsl==2.7.2
[?25l  Downloading https://pypi.johnsnowlabs.com/<REDACTED_SECRET>/spark-nlp-jsl/spark_nlp_js

Import dependencies into Python and start the Spark session

In [None]:
# Select the Java 8 JDK and put its executables first on PATH.
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64")
os.environ['PATH'] = "".join([os.environ['JAVA_HOME'], "/bin:", os.environ['PATH']])

import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

# Start the Spark session through sparknlp_jsl, passing the license secret.
spark = sparknlp_jsl.start(secret)

# Manual alternative for starting the session, kept as real comments: a bare
# triple-quoted string at the end of a cell is an expression and gets echoed
# as the cell's output.
#
# spark = (
#     SparkSession.builder
#     .appName('Spark NLP Licensed')
#     .master('local[*]')
#     .config('spark.driver.memory', '16G')
#     .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
#     .config('spark.kryoserializer.buffer.max', '2000M')
#     .config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.11:' + sparknlp.version())
#     .config('spark.jars', f'https://pypi.johnsnowlabs.com/{secret}/spark-nlp-jsl-{jsl_version}.jar')
#     .getOrCreate()
# )

"\nspark = SparkSession.builder     .appName('Spark NLP Licensed')     .master('local[*]')     .config('spark.driver.memory', '16G')     .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')     .config('spark.kryoserializer.buffer.max', '2000M')     .config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.11:' +sparknlp.version())     .config('spark.jars', f'https://pypi.johnsnowlabs.com/{secret}/spark-nlp-jsl-{jsl_version}.jar').getOrCreate()\n"

In [None]:
# NOTE(review): this repeats the sparknlp_jsl.start(secret) call made in the
# previous cell; confirm whether calling it a second time is intended.
spark = sparknlp_jsl.start(secret)

### Define Pipeline Elements

In [None]:
# Shared preprocessing stages: 'text' -> 'document' -> 'sentence' -> 'token'.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

sentence_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Pretrained clinical word embeddings consumed by every NER model below.
word_embeddings = (
    WordEmbeddingsModel.pretrained('embeddings_clinical', 'en', 'clinical/models')
    .setInputCols(['sentence', 'token'])
    .setOutputCol('embeddings')
)

# Candidate NER models. All three read the same inputs and write to the same
# 'ner' column, so exactly one of them is placed in a pipeline at a time.
ner_clinical = (
    NerDLModel.pretrained('ner_clinical', 'en', 'clinical/models')
    .setInputCols(['sentence', 'token', 'embeddings'])
    .setOutputCol('ner')
)

ner_diseases = (
    NerDLModel.pretrained('ner_diseases', 'en', 'clinical/models')
    .setInputCols(['sentence', 'token', 'embeddings'])
    .setOutputCol('ner')
)

ner_jsl = (
    NerDLModel.pretrained('ner_jsl', 'en', 'clinical/models')
    .setInputCols(['sentence', 'token', 'embeddings'])
    .setOutputCol('ner')
)

# Turns the 'ner' annotations into 'ner_chunk' annotations.
ner_converter = (
    NerConverter()
    .setInputCols(['sentence', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical download started this may take some time.
Approximate size to download 13.8 MB
[OK!]
ner_diseases download started this may take some time.
Approximate size to download 13.7 MB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [None]:
# Preprocessing-only pipeline used for the quick check below: document
# assembly, sentence detection and tokenization. A single assignment replaces
# the earlier pair, whose first (NER) definition was overwritten before use.
# The per-model NER pipelines are assembled inside RunNER.
nlp_pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer])

In [None]:
# Two small hand-written records with the same columns as the real feed ('url', 'text').
feed = pd.DataFrame(
    [
        ('example1', "My mom's breast cancer"),
        ('example2', 'My mom has heart failure.'),
    ],
    columns=['url', 'text'],
)

In [None]:
# Fit the pipeline on a single empty-string row with a 'text' column.
empty_df = spark.createDataFrame([['']]).toDF('text')
pipeline_model = nlp_pipeline.fit(empty_df)

# Run the fitted pipeline over the sample feed and collect the output as pandas.
df = spark.createDataFrame(feed)
result = pipeline_model.transform(df).toPandas()

pyspark.sql.types.Row

### Define Functions

In [None]:
def RunNER(feed, ner_model, first_column):
  '''
  Run one pretrained NER pipeline over the feed and return its annotations.

  input:
    1. feed: pandas dataframe of feed data, col_1 = "url", col_2 = "text"
    2. ner_model: name of the NER model to apply; one of
       'ner_clinical', 'ner_diseases', 'ner_jsl'
    3. first_column: column name given to the one-row placeholder dataframe
       the pipeline is fitted on (document_assembler reads the 'text' column)
  output: pandas dataframe with nested results
  raises: ValueError if ner_model is not one of the supported names
  '''
  supported_models = ('ner_clinical', 'ner_diseases', 'ner_jsl')

  # Validate up front so a mistyped name fails loudly instead of falling
  # through every branch and silently returning None.
  if ner_model not in supported_models:
    raise ValueError(
        'Unknown ner_model %r; expected one of: %s'
        % (ner_model, ', '.join(supported_models)))

  #initiate empty df
  empty_df = spark.createDataFrame([['']]).toDF(first_column)

  #load feed data into df
  df = spark.createDataFrame(feed)

  # Resolve the model name to the matching notebook-level NER stage.
  ner_stage = {
      'ner_clinical': ner_clinical,
      'ner_diseases': ner_diseases,
      'ner_jsl': ner_jsl,
  }[ner_model]

  # The pipelines differ only in the NER stage, so build a single one.
  nlp_pipeline = Pipeline(stages=[
      document_assembler,
      sentence_detector,
      tokenizer,
      word_embeddings,
      ner_stage,
      ner_converter])

  # Use a separate name for the fitted pipeline so the ner_model argument
  # is not shadowed.
  fitted_pipeline = nlp_pipeline.fit(empty_df)
  result = fitted_pipeline.transform(df)
  return result.toPandas()


In [None]:
def Main(feed):
  '''
  Run every NER model over the feed and stack the results.

  input: pandas dataframe of feed data (passed unchanged to RunNER)
  output: pandas dataframe concatenating the RunNER output of 'ner_clinical',
    'ner_diseases' and 'ner_jsl', with a 'ner_model' column naming the model
    that produced each row
  '''
  def _tagged(model_name):
    # Run one model and label every row of its output with the model name.
    frame = RunNER(feed, model_name, 'text')
    frame.loc[:, 'ner_model'] = [model_name] * frame.shape[0]
    return frame

  return pd.concat([_tagged(name) for name in ('ner_clinical', 'ner_diseases', 'ner_jsl')])

### Import data

In [None]:
# Upload the feed JSON. files.upload() returns a mapping of uploaded file
# name -> raw file bytes.
uploaded = files.upload()
if not uploaded:
    raise ValueError('No feed file was uploaded; re-run this cell and select the feed JSON file.')

# Parse the uploaded bytes directly. Re-opening the file by its uploaded name
# is unreliable: when a file of that name already exists, Colab saves the new
# upload under a different name (e.g. "name (3).json"), so the original name
# can point at an older copy.
feed = json.loads(next(iter(uploaded.values())))

Saving rounds_3-6_for_prodigy.json to rounds_3-6_for_prodigy (3).json


In [None]:
# Load the parsed records into a DataFrame and keep only the first three rows.
feed = pd.DataFrame(feed).head(3)

### Execute Functions

In [None]:
# Run all three NER models over the feed; rows are labelled by the 'ner_model' column.
df = Main(feed)

In [None]:
# Display the combined results (the feed rows repeated once per NER model).
df

Unnamed: 0,url,text,document,sentence,token,embeddings,ner,ner_chunk,ner_model
0,https://www.gofundme.com/f/please-help-karens-...,Help save my daughter's health Imagine having ...,"[(document, 0, 4446, Help save my daughter's h...","[(document, 0, 169, Help save my daughter's he...","[(token, 0, 3, Help, {'sentence': '0'}, []), (...","[(word_embeddings, 0, 3, Help, {'sentence': '0...","[(named_entity, 0, 3, O, {'word': 'Help'}, [])...","[(chunk, 136, 143, symptoms, {'sentence': '0',...",ner_clinical
1,https://www.gofundme.com/f/patrickwalter,"Patrick Walter is one of the kindest, funniest...","[(document, 0, 678, Patrick Walter is one of t...","[(document, 0, 77, Patrick Walter is one of th...","[(token, 0, 6, Patrick, {'sentence': '0'}, [])...","[(word_embeddings, 0, 6, Patrick, {'sentence':...","[(named_entity, 0, 6, O, {'word': 'Patrick'}, ...","[(chunk, 161, 168, the blue, {'sentence': '2',...",ner_clinical
2,https://www.gofundme.com/f/gspdys,"As most of you know, Alexander was born with H...","[(document, 0, 570, As most of you know, Alexa...","[(document, 0, 75, As most of you know, Alexan...","[(token, 0, 1, As, {'sentence': '0'}, []), (to...","[(word_embeddings, 0, 1, As, {'sentence': '0',...","[(named_entity, 0, 1, O, {'word': 'As'}, []), ...","[(chunk, 50, 74, a congenital heart defect, {'...",ner_clinical
0,https://www.gofundme.com/f/please-help-karens-...,Help save my daughter's health Imagine having ...,"[(document, 0, 4446, Help save my daughter's h...","[(document, 0, 169, Help save my daughter's he...","[(token, 0, 3, Help, {'sentence': '0'}, []), (...","[(word_embeddings, 0, 3, Help, {'sentence': '0...","[(named_entity, 0, 3, O, {'word': 'Help', 'con...","[(chunk, 148, 168, severe mental illness, {'se...",ner_diseases
1,https://www.gofundme.com/f/patrickwalter,"Patrick Walter is one of the kindest, funniest...","[(document, 0, 678, Patrick Walter is one of t...","[(document, 0, 77, Patrick Walter is one of th...","[(token, 0, 6, Patrick, {'sentence': '0'}, [])...","[(word_embeddings, 0, 6, Patrick, {'sentence':...","[(named_entity, 0, 6, O, {'word': 'Patrick', '...","[(chunk, 239, 258, a mass in his throat, {'sen...",ner_diseases
2,https://www.gofundme.com/f/gspdys,"As most of you know, Alexander was born with H...","[(document, 0, 570, As most of you know, Alexa...","[(document, 0, 75, As most of you know, Alexan...","[(token, 0, 1, As, {'sentence': '0'}, []), (to...","[(word_embeddings, 0, 1, As, {'sentence': '0',...","[(named_entity, 0, 1, O, {'word': 'As', 'confi...","[(chunk, 21, 29, Alexander, {'sentence': '0', ...",ner_diseases
0,https://www.gofundme.com/f/please-help-karens-...,Help save my daughter's health Imagine having ...,"[(document, 0, 4446, Help save my daughter's h...","[(document, 0, 169, Help save my daughter's he...","[(token, 0, 3, Help, {'sentence': '0'}, []), (...","[(word_embeddings, 0, 3, Help, {'sentence': '0...","[(named_entity, 0, 3, O, {'word': 'Help', 'con...","[(chunk, 148, 153, severe, {'sentence': '0', '...",ner_jsl
1,https://www.gofundme.com/f/patrickwalter,"Patrick Walter is one of the kindest, funniest...","[(document, 0, 678, Patrick Walter is one of t...","[(document, 0, 77, Patrick Walter is one of th...","[(token, 0, 6, Patrick, {'sentence': '0'}, [])...","[(word_embeddings, 0, 6, Patrick, {'sentence':...","[(named_entity, 0, 6, O, {'word': 'Patrick', '...","[(chunk, 171, 172, he, {'sentence': '2', 'chun...",ner_jsl
2,https://www.gofundme.com/f/gspdys,"As most of you know, Alexander was born with H...","[(document, 0, 570, As most of you know, Alexa...","[(document, 0, 75, As most of you know, Alexan...","[(token, 0, 1, As, {'sentence': '0'}, []), (to...","[(word_embeddings, 0, 1, As, {'sentence': '0',...","[(named_entity, 0, 1, O, {'word': 'As', 'confi...","[(chunk, 52, 74, congenital heart defect, {'se...",ner_jsl


### Export data

In [None]:
# Mount Google Drive at /content/drive so the results can be written there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Write the combined results to Drive as a JSON array with one object per row.
# NOTE(review): df still carries the 'embeddings' column; confirm it is meant to be exported.
df.to_json(
    path_or_buf='/content/drive/MyDrive/Crowdfunding/ner_prodigy.json',
    orient="records",
)