In [None]:
import os
import sys
# Add the directory two levels above the working directory to the module
# search path. This must run before the sparknlp imports below.
# NOTE(review): presumably this exposes an in-repo copy of the sparknlp
# package — confirm against the repository layout.
sys.path.append('../../')

# Show which Python interpreter the kernel is running.
print(sys.version)

# Downloader under test, plus the annotator classes passed to downloadModel.
from sparknlp.pretrained.downloader import ResourceDownloader
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import SentenceDetector

from pyspark.sql import SparkSession

# Build (or reuse) a Spark session named "downloader-example" that runs
# against the "local[1]" master.
spark = (
    SparkSession.builder
    .appName("downloader-example")
    .master("local[1]")
    .getOrCreate()
)

# One downloader instance, reused by the cells that follow.
downloader = ResourceDownloader()


In [None]:
# Mock data to play with: two short texts, each paired with a numeric id.
columns = ['docID', 'text']
l = [(1, 'To be or not to be'), (2, 'This is it!')]

# DataFrame consumed by every transform call further down.
data = spark.createDataFrame(l, columns)

In [None]:
# Download individual models directly through the downloader.

# Document assembler: fetch "document_std" / "en", run it over the mock data
# and keep the result, since the sentence detector consumes it.
assembler = downloader.downloadModel(DocumentAssembler, "document_std", "en")
output = assembler.transform(data)
output.show()

# Sentence detector: fetch "sentence_std" / "en" and apply it to the
# assembler's output.
detector = downloader.downloadModel(SentenceDetector, "sentence_std", "en")
sentences = detector.transform(output)
sentences.show()


In [None]:
# Download a complete pipeline model directly.

# "pipeline_std" is a simple pipeline with document assembler and tokenizer;
# it is applied to the raw mock data.
pipeline = downloader.downloadPipeline("pipeline_std", "en")
pipeline_output = pipeline.transform(data)
pipeline_output.show()

In [None]:
# Test clearCache on the pipeline downloaded in the previous cell.
# NOTE(review): presumably this discards the locally cached copy of the
# resource — confirm against the ResourceDownloader implementation.
cached_name, cached_lang = "pipeline_std", "en"

downloader.clearCache(cached_name, cached_lang)

In [None]:
# download predefined - models
# Import exactly the wrappers this cell uses instead of star-importing the
# module, so the origin of every name is visible and the notebook namespace
# is not polluted.
from sparknlp.pretrained.en.models import (
    CloudDocumentAssembler,
    CloudNerCrfModel,
    CloudPerceptronModel,
    CloudSentenceDetector,
    CloudTokenizer,
)

# Retrieve the standard document assembler and run it over the mock data.
assembler_std = CloudDocumentAssembler.retrieveStandard()
assembler_std.transform(data).show()

# Retrieve the remaining predefined models. The results are not kept; only
# the value of the last expression is shown as the cell output.
CloudSentenceDetector.retrieveStandard()
CloudTokenizer.retrieveStandard()
CloudPerceptronModel.retrieveSmall()
CloudNerCrfModel.retrieveSmall()

In [None]:
# download predefined - pipelines models

# Import only the two pipeline wrappers used below rather than star-importing
# the module, so it is clear where each name comes from.
from sparknlp.pretrained.en.pipelines import CloudNerCrfPipeline, CloudPOSPipeline

# part of speech: retrieve the small POS pipeline and tag the mock data
pos_std = CloudPOSPipeline.retrieveSmall()
pos_tagged = pos_std.transform(data)
pos_tagged.show()

# named entity recognition: retrieve the small NER CRF pipeline and run it
# over the same mock data
ner_std = CloudNerCrfPipeline.retrieveSmall()
ner_std.transform(data).show()
