Import the required modules, start a Spark session, and create our model downloader.

In [None]:
import os
import sys
# Put the directory two levels up on the module search path before the
# sparknlp imports below run. Presumably this makes the in-repo sparknlp
# package importable from this notebook's folder -- confirm against the
# repository layout.
sys.path.append('../../')

# Show the interpreter version in the cell output.
print(sys.version)

from sparknlp.pretrained import ResourceDownloader
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import *

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .appName("downloader-example")
    .master("local[*]")
    .getOrCreate()
)

# instantiate the downloader; the cells below use it to fetch models and pipelines
downloader = ResourceDownloader()


Create a dummy spark dataframe

In [None]:
# Mock data to play with: two (docID, text) rows.
l = [(1, 'To be or not to be'), (2, 'This is it!')]

# Column names are passed as the schema; Spark infers the column types.
data = spark.createDataFrame(l, schema=['docID', 'text'])

Next, we download a POS model by its name and language. The model requires tokenized text, so we first build the stages that prepare the data (document assembler, sentence detector and tokenizer).
Then we add the POS model alongside the other annotators in a pipeline and transform some text.

In [None]:
# download directly - models
#
# Preprocessing stages required by the POS tagger: text -> document -> sentence -> token.
document_assembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")  # explicit: the sentence detector below reads "document"

sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

# pos tagger: fetched by class, model name and language through the downloader
pos = downloader.downloadModel(PerceptronModel, "pos_fast", "en")

pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, pos])

# fit() turns the Pipeline into a PipelineModel, which then annotates the same DataFrame.
output = pipeline.fit(data).transform(data)
output.show()


Here we download a Pipeline by its name and language

In [None]:
# download directly - pipeline models

# simple pipeline with document assembler and tokenizer
pipeline = downloader.downloadPipeline("pipeline_basic", "en")

# Apply the downloaded pipeline to the mock data and display the result.
pipeline_output = pipeline.transform(data)
pipeline_output.show()

We clear the cache of the recently downloaded pipeline.

In [None]:
# Test clearCache: clear the cached "pipeline_basic" / "en" resource that was
# downloaded in the previous cell.
downloader.clearCache("pipeline_basic", "en")

We use the predefined BasicPipeline to annotate a DataFrame.

In [None]:
# download predefined - pipelines
from sparknlp.pretrained.pipeline.en import BasicPipeline

# Annotate the mock DataFrame; the second argument names the column that holds the text.
basic_data = BasicPipeline.annotate(data, "text")
basic_data.show()

We can also annotate a single string

In [None]:
# annotate quickly from a plain string; the result is displayed as the cell output
BasicPipeline().annotate("This world is made up of good and bad things")

Now we download a POS model, using the PerceptronModel class to retrieve it.
We do the same for the NER model.
Then we retrieve the Basic Pipeline and apply these models on top of its output, wiring their input columns so that each model's requirements are met.

In [None]:
# download predefined - models

# Default pretrained POS tagger, pointed at the "document" and "normal" columns.
pos = PerceptronModel.pretrained()
pos.setInputCols(["document", "normal"])
pos.setOutputCol("pos")

# Default pretrained CRF NER model; it additionally reads the "pos" column written above.
ner = NerCrfModel.pretrained()
ner.setInputCols(["pos", "normal", "document"])
ner.setOutputCol("ner")

# Run the basic pipeline first; presumably it supplies the "document" and
# "normal" columns the two models read -- verify against the pipeline's output.
annotation_pipeline = BasicPipeline.pretrained()
annotation_data = annotation_pipeline.transform(data)
annotation_data.show()

# Apply the models in dependency order: POS tags first, then NER.
pos_tagged = pos.transform(annotation_data)
ner_tagged = ner.transform(pos_tagged)
ner_tagged.show()