Necessary imports

In [None]:
#Imports
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import RegexRule
from sparknlp.base import DocumentAssembler, Finisher

Create a Spark dataset

In [None]:
# Read the sentiment test corpus from parquet and keep at most 10,000 rows.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably
# the session is injected by the PySpark kernel/shell; confirm before running
# with a plain Python kernel.
data = (
    spark.read
    .parquet("../../../src/test/resources/sentiment.parquet")
    .limit(10000)
)

# Mark the DataFrame for caching; count() is an action, so it triggers the
# read and, as the last expression of the cell, displays the number of rows.
data.cache()
data.count()

Create the appropriate annotators. We detect sentences, tokenize them, and find the lemmas of the resulting tokens.
The Finisher will only output the sentiment.

In [None]:
# Entry point of the pipeline: reads the raw "text" column.
document_assembler = DocumentAssembler().setInputCol("text")

# "document" -> "sentence"
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token"
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# "token" -> "lemma", using an external lemma dictionary file whose keys are
# separated by "->" and whose values are separated by tabs.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary(
        "../../../src/test/resources/lemma-corpus-small/lemmas_small.txt",
        key_delimiter="->",
        value_delimiter="\t",
    )
)

# ("lemma", "sentence") -> "sentiment_score", using an external sentiment
# dictionary file with "," as its delimiter.
sentiment_detector = (
    SentimentDetector()
    .setInputCols(["lemma", "sentence"])
    .setOutputCol("sentiment_score")
    .setDictionary(
        "../../../src/test/resources/sentiment-corpus/default-sentiment-dict.txt",
        ",",
    )
)

# "sentiment_score" -> "sentiment": the column queried at the end of the notebook.
finisher = (
    Finisher()
    .setInputCols(["sentiment_score"])
    .setOutputCols(["sentiment"])
)

Train the pipeline. It is trained only from external resources (the lemma and sentiment dictionaries), not from the dataset we pass in.
The prediction then runs on the target dataset.

In [None]:
# Chain the annotators in the order their input/output columns depend on
# each other: document -> sentence -> token -> lemma -> sentiment_score -> sentiment.
pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        lemmatizer,
        sentiment_detector,
        finisher,
    ]
)

# Fit the pipeline on `data`, then apply the fitted model to the same DataFrame.
model = pipeline.fit(data)
result = model.transform(data)

We filter the finisher output to find the lines whose sentiment is not positive

In [None]:
# Keep only the rows whose `sentiment` value differs from 'positive' and
# print them.
(
    result
    .filter("sentiment != 'positive'")
    .show()
)