![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/training/english/dl-ner/ner_elmo.ipynb)

## 0. Colab Setup

In [8]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4


Collecting spark-nlp
  Using cached https://files.pythonhosted.org/packages/c6/1d/9a2a7c17fc3b3aa78b3921167feed4911d5a055833fea390e7741bba0870/spark_nlp-2.6.5-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.6.5


# How to train a NER classifier with ELMO embeddings based on Char CNNs - BiLSTM - CRF

## Download the file into the local File System 
### It is a training file in the standard CoNLL-2003 format

In [9]:
# Fetch the CoNLL 2003 training split, unless a local copy is already there
import os
from pathlib import Path
import urllib.request

download_path = "./eng.train"

if Path(download_path).is_file():
    # A previous run already fetched the file; nothing to do.
    print("File already present.")
else:
    print("File Not found will downloading it!")
    url = "https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train"
    urllib.request.urlretrieve(url, download_path)
    


File already present.


# Read the CoNLL dataset into a Spark DataFrame and automatically generate features for future tasks
The readDataset method of the CoNLL class handily adds all the features required in the next steps

In [10]:
import sparknlp
from sparknlp.training import CoNLL

# Obtain the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Parse the CoNLL file into a DataFrame; the preview below shows the
# text, document, sentence, token, pos and label columns it produces.
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, './eng.train')
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|EU rejects German...|[[document, 0, 47...|[[document, 0, 47...|[[token, 0, 1, EU...|[[pos, 0, 1, NNP,...|[[named_entity, 0...|
|     Peter Blackburn|[[document, 0, 14...|[[document, 0, 14...|[[token, 0, 4, Pe...|[[pos, 0, 4, NNP,...|[[named_entity, 0...|
| BRUSSELS 1996-08-22|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 7, BR...|[[pos, 0, 7, NNP,...|[[named_entity, 0...|
|The European Comm...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|
|Germany 's repres...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 6, Ge...|[[pos, 0, 6, NNP,..

# Define the NER Pipeline 

### This pipeline defines a pretrained elmo component and a trainable NerDLApproach which is based on the Char CNN - BiLSTM - CRF

Usually you have to add additional pipeline components before the ELMo stage to produce the document, sentence and token columns. But the CoNLL reader has already taken care of this for us, awesome!

In [11]:

from pyspark.ml import Pipeline

# Wildcard imports are kept as-is: later cells use annotator/base class names
# (e.g. NerDLModel, DocumentAssembler) without importing them again.
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

# Define the pretrained ELMo model.
# The "lstm_outputs2" pooling layer is selected because, per the original
# author's note, the "elmo" layer is not yet compatible with NerDL.
# Parentheses replace backslash continuations, so no stray trailing "\"
# can swallow whatever line happens to follow the chain.
elmo = (
    ElmoEmbeddings.pretrained()
    .setPoolingLayer("lstm_outputs2")
    .setInputCols("sentence", "token")
    .setOutputCol("elmo")
)

# Define the Char CNN - BiLSTM - CRF model. It consumes the ELMo embeddings
# together with the sentence and token columns, and learns from "label".
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "elmo"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(1)
    .setRandomSeed(0)
    .setVerbose(0)
)

# Put everything into the pipeline: embeddings first, then the trainable tagger.
pipeline = Pipeline(stages=[elmo, nerTagger])

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


# Fit the Pipeline and get results

In [12]:
# Fit on the first 10 rows only (presumably to keep the demo fast).
# To train on the whole dataset, fit on `training_data` instead of the subset.
training_subset = training_data.limit(10)
elmo_model = pipeline.fit(training_subset)

In [13]:
# Persist the trained NER stage (index 1 of the fitted pipeline; index 0 is
# the ELMo embeddings stage). `overwrite()` lets the cell be re-run when the
# output directory already exists instead of failing on the existing path.
elmo_model.stages[1].write().overwrite().save('NER_elmo_20200221')

In [14]:
# Restore the saved NER model from disk and declare its input/output columns.
loaded_ner_model = (
    NerDLModel.load("NER_elmo_20200221")
    .setInputCols(["sentence", "token", "elmo"])
    .setOutputCol("ner")
)

In [15]:
# Pre-processing stages for raw text: text -> document -> sentence -> token.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
sentence = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)
token = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

In [16]:
# Pretrained ELMo embeddings stage for the inference pipeline, using the
# "lstm_outputs2" pooling layer and writing to the "elmo" column.
# Parentheses replace backslash continuations, so there is no dangling
# trailing "\" that would join the chain to whatever line follows it.
elmo = (
    ElmoEmbeddings.pretrained()
    .setPoolingLayer("lstm_outputs2")
    .setInputCols("sentence", "token")
    .setOutputCol("elmo")
)

elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [17]:
# Load the saved NER model and bind its columns.
# This repeats the load performed a few cells earlier in the notebook.
loaded_ner_model = (
    NerDLModel.load("NER_elmo_20200221")
    .setInputCols(["sentence", "token", "elmo"])
    .setOutputCol("ner")
)

In [18]:
# Converter stage: reads the sentence, token and ner columns and writes
# its result to the "ner_span" column.
converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_span")
)

In [19]:
# End-to-end inference pipeline: raw text in, entity spans out.
# Stage order matters: each stage reads columns produced by earlier ones.
custom_ner_pipeline = Pipeline(
    stages=[document, sentence, token, elmo, loaded_ner_model, converter]
)

### Check out only the result columns

In [20]:
# Disabled example: `ner_df` is not defined in any cell shown above, so this
# line is left commented out and would need a DataFrame with 'text' and 'ner'
# columns to run.
# ner_df.select(*['text', 'ner']).limit(1).show(truncate=False)

In [21]:
# One example sentence, wrapped in a single-row DataFrame whose only
# column is named "text" (the column the DocumentAssembler stage reads).
text = "Peter Parker is a nice man and lives in New York"
rows = [[text]]
prediction_data = spark.createDataFrame(rows).toDF("text")

In [22]:
# Fit the inference pipeline on the example frame to obtain a pipeline model;
# `prediction_model` is reused later to transform the test set.
prediction_model = custom_ner_pipeline.fit(prediction_data)
# Run every stage over the example sentence and preview the resulting columns.
preds = prediction_model.transform(prediction_data)
preds.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                elmo|                 ner|            ner_span|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter Parker is a...|[[document, 0, 47...|[[document, 0, 47...|[[token, 0, 4, Pe...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 0, 4, Pe...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [23]:
# Show each token next to the tag predicted for it, without truncation.
token_and_tag = preds.select("token.result", "ner.result")
token_and_tag.show(truncate=False)

+------------------------------------------------------------+-----------------------------------------+
|result                                                      |result                                   |
+------------------------------------------------------------+-----------------------------------------+
|[Peter, Parker, is, a, nice, man, and, lives, in, New, York]|[I-PER, O, O, I-PER, O, O, O, O, O, O, O]|
+------------------------------------------------------------+-----------------------------------------+



In [24]:
import pyspark.sql.functions as F

# Zip each chunk's text with its metadata, then emit one row per chunk.
zipped_spans = F.arrays_zip("ner_span.result", "ner_span.metadata")
entities_df = preds.select(F.explode(zipped_spans).alias("entities"))

# Field '0' of the zipped struct is the chunk text; field '1' is the
# metadata, from which the `entity` value is read.
chunk_col = F.expr("entities['0']").alias("chunk")
entity_col = F.expr("entities['1'].entity").alias("entity")
entities_df.select(chunk_col, entity_col).show(truncate=False)

+-----+------+
|chunk|entity|
+-----+------+
|Peter|PER   |
|a    |PER   |
+-----+------+



In [25]:
# Fetch the CoNLL 2003 "testa" split, unless a local copy is already there
import os
from pathlib import Path
import urllib.request

download_path = "./eng.testa"

if Path(download_path).is_file():
    # A previous run already fetched the file; nothing to do.
    print("File already present.")
else:
    print("File Not found will downloading it!")
    url = "https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.testa"
    urllib.request.urlretrieve(url, download_path)

File Not found will downloading it!


In [26]:
import sparknlp
from sparknlp.training import CoNLL

# Obtain the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Parse the "testa" split into a DataFrame and preview it.
testa_reader = CoNLL()
test_data = testa_reader.readDataset(spark, './eng.testa')
test_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|CRICKET - LEICEST...|[[document, 0, 64...|[[document, 0, 64...|[[token, 0, 6, CR...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|
|   LONDON 1996-08-30|[[document, 0, 16...|[[document, 0, 16...|[[token, 0, 5, LO...|[[pos, 0, 5, NNP,...|[[named_entity, 0...|
|West Indian all-r...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 3, We...|[[pos, 0, 3, NNP,...|[[named_entity, 0...|
|Their stay on top...|[[document, 0, 20...|[[document, 0, 20...|[[token, 0, 4, Th...|[[pos, 0, 4, PRP$...|[[named_entity, 0...|
|After bowling Som...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 4, Af...|[[pos, 0, 4, IN, ..

In [27]:
# Apply the fitted inference pipeline to the CoNLL test frame and preview the
# result; the output adds elmo, ner and ner_span next to the existing columns.
test_preds = prediction_model.transform(test_data)
test_preds.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                elmo|                 ner|            ner_span|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|CRICKET - LEICEST...|[[document, 0, 64...|[[document, 0, 64...|[[token, 0, 6, CR...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|                  []|
|   LONDON 1996-08-30|[[document, 0, 16...|[[document, 0, 16...|[[token, 0, 5, LO...|[[pos, 0, 5, NNP,...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|                  []|
|West Indian all-r...|[[document, 0, 18...|[[

In [28]:
# Show the test-set tokens next to their predicted tags, without truncation.
test_token_and_tag = test_preds.select("token.result", "ner.result")
test_token_and_tag.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                        |result                                                                                                                         |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------