In [10]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import time
import zipfile

In [11]:


# Download CoNLL 2003 Dataset
import os
from pathlib import Path
import urllib.request

#if not Path("eng.train").is_file():
#    print("File Not found will downloading it!")
#    url = "https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train"
#    urllib.request.urlretrieve(url, 'eng.train')
#else:
#    print("File already present.")



In [12]:
import sparknlp 

spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.1.0
Apache Spark version:  3.0.2


In [13]:
from sparknlp.training import CoNLL


training_data = CoNLL().readDataset(spark, './eng_custom.train')
testing_data= CoNLL().readDataset(spark, './eng_custom.testa')
import pyspark.sql.functions as F

#data.select(F.explode(F.arrays_zip('token.result', 'label.result')).alias("cols")) \
#.select(F.expr("cols['0']").alias("token"),
#        F.expr("cols['1']").alias("ner_label")).show(truncate=50)

In [14]:
training_data.select(F.explode(F.arrays_zip('token.result', 'pos.result',  'label.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("pos"),
        F.expr("cols['2']").alias("ner_label")).show(truncate=50)

+--------+---+---------+
|   token|pos|ner_label|
+--------+---+---------+
|    make|  0|      Act|
|     can|  0|      Act|
|      do|  0|      Act|
|    read|  0|      Act|
|   write|  0|      Act|
|    talk|  0|      Act|
|    want|  0|      Act|
|    cook|  0|      Act|
|    brew|  0|      Act|
|   clean|  0|      Act|
|   brush|  0|      Act|
|    care|  0|      Act|
|    walk|  0|      Act|
|      go|  0|      Act|
|navigate|  0|      Act|
|activity|  0|      Act|
|   write|  0|      Act|
|    move|  0|      Act|
|    fall|  0|      Act|
|      up|  0|      Act|
+--------+---+---------+
only showing top 20 rows



In [15]:
!mkdir ner_logs

mkdir: cannot create directory ‘ner_logs’: File exists


In [16]:

#embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
#          .setInputCols(["document", "token"])\
#          .setOutputCol("embeddings")

document_assembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

embeddings = WordEmbeddingsModel.pretrained('glove_100d') \
        .setInputCols(['document', 'token']) \
        .setOutputCol('embeddings')

nerTagger = NerDLApproach()\
      .setInputCols(["sentence", "token", "embeddings"])\
      .setLabelColumn("label")\
      .setOutputCol("ner")\
      .setMaxEpochs(100)\
      .setLr(0.003)\
      .setBatchSize(32)\
      .setRandomSeed(0)\
      .setVerbose(1)\
      .setValidationSplit(0.2)\
      .setEvaluationLogExtended(True) \
      .setEnableOutputLogs(True)\
      .setIncludeConfidence(True)\
      .setOutputLogsPath('ner_logs') # if not set, logs will be written to ~/annotator_logs
 #    .setGraphFolder('graphs') >> put your graph file (pb) under this folder if you are using a custom graph generated thru 4.1 NerDL-Graph.ipynb notebook
 #    .setEnableMemoryOptimizer() >> if you have a limited memory and a large conll file, you can set this True to train batch by batch 
ner_converter = NerConverter() \
    .setInputCols(['document', 'token', 'ner']) \
    .setOutputCol('ner_chunk')

ner_pipeline = Pipeline(stages=[
      document_assembler, 
      tokenizer,
      embeddings,
      nerTagger,
      ner_converter
 ])

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [17]:



ready_data = embeddings.transform(training_data)

ready_data.show(10)

#ner_model = ner_pipeline.fit(data)
#ner_model.stages[-1].write().overwrite().save('outputs/ner_wiki_glove100d_en')

#embeddings = WordEmbeddingsModel.pretrained()\
#.setOutputCol('embeddings')



+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| text|            document|            sentence|               token|                 pos|               label|          embeddings|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| make|[[document, 0, 3,...|[[document, 0, 3,...|[[token, 0, 3, ma...|[[pos, 0, 3, 0, [...|[[named_entity, 0...|[[word_embeddings...|
|  can|[[document, 0, 2,...|[[document, 0, 2,...|[[token, 0, 2, ca...|[[pos, 0, 2, 0, [...|[[named_entity, 0...|[[word_embeddings...|
|   do|[[document, 0, 1,...|[[document, 0, 1,...|[[token, 0, 1, do...|[[pos, 0, 1, 0, [...|[[named_entity, 0...|[[word_embeddings...|
| read|[[document, 0, 3,...|[[document, 0, 3,...|[[token, 0, 3, re...|[[pos, 0, 3, 0, [...|[[named_entity, 0...|[[word_embeddings...|
|write|[[document, 0, 4,...|[[document, 0, 4,...|[[token, 0, 4

In [18]:
start = time.time()
print("Start fitting")
ner_model = nerTagger.fit(ready_data)
#ner_model = ner_pipeline.fit(ready_data)
print("Fitting has ended")
print (time.time() - start)


Start fitting


2022-10-13 20:21:56.350216: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-13 20:21:56.412959: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1139] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_nam

Training started - total epochs: 100 - lr: 0.003 - batch size: 32 - labels: 7 - chars: 24 - training examples: 138
Epoch 1/100 started, lr: 0.003, dataset size: 138
Epoch 1/100 - 0.69s - loss: 22.38504 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.17s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 9	 0	 3	 1.0	 0.75	 0.85714287
Act	 15	 31	 1	 0.32608697	 0.9375	 0.48387095
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 4	 1	 26	 0.8	 0.13333334	 0.22857144
Obj	 9	 3	 2	 0.75	 0.8181818	 0.78260875
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 37 fp: 35 fn: 35 labels: 6
Macro-average	 prec: 0.47934783, rec: 0.43983588, f1: 0.45874262
Micro-average	 prec: 0.5138889, rec: 0.5138889, f1: 0.5138889
Epoch 2/100 started, lr: 0.0029850747, dataset size: 138
Epoch 2/100 - 0.09s - loss: 20.640482 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 3	 0	 9	 1.0	 0.25	 0.4
Act	 10	

Epoch 16/100 - 0.08s - loss: 1.0625979 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 15	 2	 1	 0.88235295	 0.9375	 0.90909094
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 29	 0	 1	 1.0	 0.96666664	 0.9830508
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 65 fp: 7 fn: 7 labels: 6
Macro-average	 prec: 0.58753496, rec: 0.6203914, f1: 0.60351634
Micro-average	 prec: 0.9027778, rec: 0.9027778, f1: 0.9027778
Epoch 17/100 started, lr: 0.0027777776, dataset size: 138
Epoch 17/100 - 0.08s - loss: 1.0167599 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 15	 1	 1	 0.9375	 0.9375	 0.9375
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 29	 1	 1	 0.96666664	 0.96666664	 0.96666664
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.0	 0.0	 0

Epoch 31/100 - 0.08s - loss: 0.1611088 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 14	 1	 2	 0.93333334	 0.875	 0.9032258
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 29	 2	 1	 0.9354839	 0.96666664	 0.9508197
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 64 fp: 8 fn: 8 labels: 6
Macro-average	 prec: 0.58527905, rec: 0.60997474, f1: 0.59737176
Micro-average	 prec: 0.8888889, rec: 0.8888889, f1: 0.8888889
Epoch 32/100 started, lr: 0.0025974028, dataset size: 138
Epoch 32/100 - 0.08s - loss: 0.0715161 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 14	 1	 2	 0.93333334	 0.875	 0.9032258
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 29	 2	 1	 0.9354839	 0.96666664	 0.9508197
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.

Epoch 46/100 - 0.08s - loss: 0.04836112 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 14	 1	 2	 0.93333334	 0.875	 0.9032258
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 30	 2	 0	 0.9375	 1.0	 0.9677419
Obj	 9	 4	 2	 0.6923077	 0.8181818	 0.75
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 65 fp: 7 fn: 7 labels: 6
Macro-average	 prec: 0.5938569, rec: 0.6155303, f1: 0.6044994
Micro-average	 prec: 0.9027778, rec: 0.9027778, f1: 0.9027778
Epoch 47/100 started, lr: 0.0024390244, dataset size: 138
Epoch 47/100 - 0.08s - loss: 0.04236248 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 13	 1	 3	 0.9285714	 0.8125	 0.8666666
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 30	 2	 0	 0.9375	 1.0	 0.9677419
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 64 fp:

Epoch 61/100 - 0.08s - loss: 0.05167918 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 13	 2	 3	 0.8666667	 0.8125	 0.83870965
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 29	 1	 1	 0.96666664	 0.96666664	 0.96666664
Obj	 9	 6	 2	 0.6	 0.8181818	 0.69230765
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 63 fp: 9 fn: 9 labels: 6
Macro-average	 prec: 0.57222223, rec: 0.59955806, f1: 0.5855713
Micro-average	 prec: 0.875, rec: 0.875, f1: 0.875
Epoch 62/100 started, lr: 0.0022988506, dataset size: 138
Epoch 62/100 - 0.07s - loss: 0.010237203 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 14	 2	 2	 0.875	 0.875	 0.875
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 29	 1	 1	 0.96666664	 0.96666664	 0.96666664
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp:

Epoch 76/100 - 0.07s - loss: 0.012602276 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 13	 1	 3	 0.9285714	 0.8125	 0.8666666
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 30	 2	 0	 0.9375	 1.0	 0.9677419
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 64 fp: 8 fn: 8 labels: 6
Macro-average	 prec: 0.5848214, rec: 0.6051136, f1: 0.5947945
Micro-average	 prec: 0.8888889, rec: 0.8888889, f1: 0.8888889
Epoch 77/100 started, lr: 0.0021739132, dataset size: 138
Epoch 77/100 - 0.07s - loss: 0.024339551 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 13	 1	 3	 0.9285714	 0.8125	 0.8666666
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 30	 2	 0	 0.9375	 1.0	 0.9677419
Obj	 9	 5	 2	 0.64285713	 0.8181818	 0.72
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 64 

Epoch 91/100 - 0.08s - loss: 0.018287884 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 14	 2	 2	 0.875	 0.875	 0.875
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 30	 1	 0	 0.9677419	 1.0	 0.9836065
Obj	 9	 4	 2	 0.6923077	 0.8181818	 0.75
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 65 fp: 7 fn: 7 labels: 6
Macro-average	 prec: 0.5891749, rec: 0.6155303, f1: 0.6020643
Micro-average	 prec: 0.9027778, rec: 0.9027778, f1: 0.9027778
Epoch 92/100 started, lr: 0.0020618557, dataset size: 138
Epoch 92/100 - 0.08s - loss: 0.0090835 - batches: 5
Quality on validation dataset (20.0%), validation examples = 27
time to finish evaluation: 0.01s
label	 tp	 fp	 fn	 prec	 rec	 f1
Loc	 12	 0	 0	 1.0	 1.0	 1.0
Act	 14	 2	 2	 0.875	 0.875	 0.875
Neg	 0	 0	 1	 0.0	 0.0	 0.0
0	 30	 1	 0	 0.9677419	 1.0	 0.9836065
Obj	 9	 4	 2	 0.6923077	 0.8181818	 0.75
Pos	 0	 0	 2	 0.0	 0.0	 0.0
tp: 65 fp: 7 fn: 7 labe

In [19]:
ner_model.write().overwrite().save("./pip_wo_embedd/")
import shutil

shutil.make_archive("/home/sjhjrok/Documents/NLP/pip_wo_embedd", 'zip', "/home/sjhjrok/Documents/NLP/pip_wo_embedd")

2022-10-13 20:22:06.499895: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1139] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
AssignSub: CPU 
AddV2: CPU 
ScatterAdd: CPU 
NoOp: CPU 
Sqrt: CPU 
UnsortedSegmentSum: CPU 
RealDiv: CPU 
Const: CPU 
StridedSlice: CPU 
RandomUniform: CPU 
Unique: CPU 
Mul: CPU 
Add: CPU 
VariableV2: CPU 
Assign: CPU 
Identity: CPU 
Shape: CPU 
Cast: CPU 
Sub: CPU 
Gat

'/home/sjhjrok/Documents/NLP/pip_wo_embedd.zip'

IllegalArgumentException: requirement failed: Wrong or missing inputCols annotators in NerDLModel_746da9204197.

Current inputCols: sentence,token,embeddings. Dataset's columns:
(column_name=text,is_nlp_annotator=false)
(column_name=document,is_nlp_annotator=true,type=document)
(column_name=sentence,is_nlp_annotator=true,type=document)
(column_name=token,is_nlp_annotator=true,type=token)
(column_name=pos,is_nlp_annotator=true,type=pos)
(column_name=label,is_nlp_annotator=true,type=named_entity).
Make sure such annotators exist in your pipeline, with the right output names and that they have following annotator types: document, token, word_embeddings

NameError: name 'predictions' is not defined

In [22]:
from sparknlp.base import LightPipeline

light = LightPipeline(ner_model)
light.annotate("navigate to kitchen")

Py4JError: An error occurred while calling None.com.johnsnowlabs.nlp.LightPipeline. Trace:
py4j.Py4JException: Constructor com.johnsnowlabs.nlp.LightPipeline([class com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel, class java.lang.Boolean]) does not exist
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:179)
	at py4j.reflection.ReflectionEngine.getConstructor(ReflectionEngine.java:196)
	at py4j.Gateway.invoke(Gateway.java:237)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)



In [None]:
text = input("Enter Testing Text")
while(text != "exit"):
    print(light.annotate(text))
    text = input("Enter New Text")

In [None]:
text_list = [
    """Navagate to kitchen"""
]