Import appropriate modules

In [None]:
import os
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

import time
import zipfile

Download training dataset if not already there

In [None]:
# Download CoNLL 2003 Dataset
import os
from pathlib import Path
import urllib.request

# Fetch the CoNLL 2003 training file only when it is not already present.
# The download goes to a temporary ".part" file first and is moved into place
# only after it completes, so an interrupted transfer can never leave a
# truncated "eng.train" that the is_file() guard would accept on the next run.
conll_train = Path("eng.train")
if not conll_train.is_file():
    url = "https://github.com/patverga/torch-ner-nlp-from-scratch/raw/master/data/conll2003/eng.train"
    conll_partial = Path("eng.train.part")
    try:
        urllib.request.urlretrieve(url, str(conll_partial))
        os.replace(str(conll_partial), str(conll_train))
    finally:
        # After a successful os.replace the partial file no longer exists;
        # this only cleans up after a failed or interrupted download.
        if conll_partial.is_file():
            conll_partial.unlink()


Download Glove word embeddings

In [None]:
# Download Glove Word Embeddings
file = "glove.6B.zip"

# The archive is only needed to produce glove.6B.100d.txt, so skip both the
# download and the extraction when the embeddings file is already in place.
if not Path("glove.6B.100d.txt").is_file():
    if not Path(file).is_file():
        url = "http://nlp.stanford.edu/data/glove.6B.zip"
        print("Start downloading Glove Word Embeddings. It will take some time, please wait...")
        # Download to a temporary name and move it into place only on success,
        # so an interrupted transfer does not leave a corrupt zip that the
        # is_file() guard would accept on the next run.
        partial = file + ".part"
        try:
            urllib.request.urlretrieve(url, partial)
            os.replace(partial, file)
        finally:
            # No-op after a successful os.replace; removes leftovers otherwise.
            if Path(partial).is_file():
                os.remove(partial)
        print("Downloading finished")

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall("./")

Start Spark

In [None]:
# Create (or reuse) a single-threaded local Spark session for the NER example.
spark = (
    SparkSession.builder
    .appName("ner")
    .master("local[1]")
    .config("spark.driver.memory", "8G")
    .config("spark.driver.maxResultSize", "2G")
    # The Spark property for extra jars is "spark.jars" (plural); the
    # previous key "spark.jar" is not a Spark setting and was ignored.
    # NOTE(review): the path is relative to the working directory - confirm
    # that lib/sparknlp.jar exists where this notebook is run.
    .config("spark.jars", "lib/sparknlp.jar")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

1. Download CoNLL2003 dataset
2. Save 3 files eng.train, eng.testa, eng.testb, into working dir ./

Create annotator components in the right order, with their training Params. Finisher will output only NER. Put all in pipeline.

In [None]:
# URI prefix for resources addressed relative to the notebook's working directory.
cwd_uri = "file:///" + os.getcwd()

# Reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Reads "document" and writes "sentence".
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Reads "document" and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# POS tagger trained for 5 iterations on the small ANC corpus shipped with the
# test resources; "|" is passed as the second setCorpus argument.
posTagger = (
    PerceptronApproach()
    .setIterations(5)
    .setInputCols(["token", "document"])
    .setOutputCol("pos")
    .setCorpus(cwd_uri + "/../../../src/test/resources/anc-pos-corpus-small/", "|")
)

# CRF NER tagger. Its training data comes from the external eng.train file,
# not from the DataFrame later passed to fit().
# NOTE(review): the (100, 2) arguments of setEmbeddingsSource are presumably the
# embedding dimension and a format code - confirm against the Spark NLP version in use.
nerTagger = (
    NerCrfApproach()
    .setInputCols(["sentence", "token", "pos"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMinEpochs(1)
    .setMaxEpochs(5)
    .setLossEps(1e-3)
    .setEmbeddingsSource("glove.6B.100d.txt", 100, 2)
    .setExternalFeatures(cwd_uri + "/../../../src/test/resources/ner-corpus/dict.txt", ",")
    .setExternalDataset(cwd_uri + "/eng.train")
    .setL2(1)
    .setC0(1250000)
    .setRandomSeed(0)
    .setVerbose(2)
)

# Only the "ner" column is passed to the finisher, with keys included.
finisher = (
    Finisher()
    .setInputCols(["ner"])
    .setIncludeKeys(True)
)

# Stages are listed in the order they are passed to the Pipeline.
pipeline = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    posTagger,
    nerTagger,
    finisher,
])


Load a dataset for prediction. This dataset is not used for training: the NER model is trained from the external CoNLL file (eng.train) configured on the tagger.

In [None]:
# Load the input data to be annotated: the first 1000 rows of the sample
# parquet file, cached so later actions reuse the same rows.
data = (
    spark.read
    .parquet("file:///" + os.getcwd() + "/../../../src/test/resources/sentiment.parquet")
    .limit(1000)
    .cache()
)
# Run an action on the cached frame, then print a preview of it.
data.count()
data.show()

Train the model. Fitting does not really learn anything from this dataset itself; the POS and NER annotators are trained from the external corpora configured above.

In [None]:
# Fit the whole pipeline and report the wall-clock time in seconds.
start = time.time()
print("Start fitting")
model = pipeline.fit(data)
print("Fitting is ended")
elapsed_seconds = time.time() - start
print(elapsed_seconds)

Run the prediction

In [None]:
# Apply the fitted pipeline to the same DataFrame and print a preview of the result.
ner_data = model.transform(data)
ner_data.show()

Save the model and the pipeline to disk after training

In [None]:
# Persist the pipeline definition and the fitted model to separate directories,
# replacing any previous save at the same path.
pipeline.write().overwrite().save("./ner_pipeline")
model.write().overwrite().save("./ner_model")

Load the model and the pipeline

In [None]:
from pyspark.ml import PipelineModel, Pipeline

# Reload both artifacts written by the save step. The reloaded pipeline is now
# bound to a name; previously its value was discarded, so only the fitted model
# was actually available after this cell.
samePipeline = Pipeline.read().load("./ner_pipeline")
sameModel = PipelineModel.read().load("./ner_model")