In [1]:
import os
# Arguments handed to spark-submit when the PySpark shell/gateway is launched:
# pulls the Spark NLP package at the pinned version. Must be set before the
# SparkContext is created in a later cell.
submit_args = '--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

# Spark configuration; defaults are used unless a setting below is enabled.
conf = pyspark.SparkConf()
# Optional tuning knob, currently disabled:
# conf.set("spark.executor.memory", "8g")

# Create the context, or reuse the one already running in this kernel.
# Calling the SparkContext constructor directly raises if this cell is
# executed a second time, because only one active context is allowed.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# SQL/DataFrame entry point bound to the context above.
spark = SparkSession(sc)

In [5]:
# Load the evaluation set from a Parquet file in the working directory.
# Later cells read its "norm_text" and "Class" columns.
test_data = spark.read.format("parquet").load("test-data.parquet")

In [None]:
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

# Entry stage: reads the raw text from the "norm_text" column.
# No output column is set here, so the next stage's "document" input
# presumably relies on the assembler's default output name (confirm).
document_assembler = (
    DocumentAssembler()
    .setInputCol("norm_text")
)

# "document" -> "sentence"
sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token"
tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# "token" -> "normal"
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

# "normal" -> "spell"
spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
)

# ("spell", "sentence") -> "sentiment", trained from the positive/negative
# corpus files below. An earlier variant of this stage pointed at
# "trumptweet/positive/1.txt" and "trumptweet/negative/1.txt" instead.
sentiment_detector = (
    ViveknSentimentApproach()
    .setInputCols(["spell", "sentence"])
    .setOutputCol("sentiment")
    .setPositiveSource("trumptweet/positive.txt")
    .setNegativeSource("trumptweet/negative.txt")
    .setPruneCorpus(False)
)

# Flattens the "sentiment" annotations into a plain column, keeping the keys
# (values appear as e.g. "result->positive" in the output).
finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setIncludeKeys(True)
)

# Stage order matters: each stage consumes the column produced before it.
pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        spell_checker,
        sentiment_detector,
        finisher,
    ]
)

# Fit the pipeline's estimator stages, then annotate the same DataFrame.
fitted_pipeline = pipeline.fit(test_data)
sentiment_data = fitted_pipeline.transform(test_data)

# Preview the first rows of the annotated result.
sentiment_data.show()

+--------------------+-----+--------------------+
|           norm_text|Class|  finished_sentiment|
+--------------------+-----+--------------------+
|"Yes, Donald Trum...|    0|result->negative@...|
|. Donald Trump is...|    1|    result->positive|
|. Yes. We learned...|    1|result->positive@...|
|. reports for New...|    0|    result->positive|
|Actually, Harbaug...|    0|    result->negative|
|After your videos...|    1|    result->negative|
|At a sad loss whe...|    0|    result->negative|
|Bernie Sanders bl...|    0|    result->negative|
|Bush with subtle ...|    0|    result->negative|
|Byron York's rece...|    1|    result->positive|
|Conservative Expe...|    0|    result->negative|
|Donald Trump Does...|    1|    result->positive|
|Donald Trump arri...|    1|    result->positive|
|Donald Trump can ...|    0|    result->negative|
|Donald Trump says...|    0|    result->negative|
|Exceptionally stu...|    0|    result->negative|
|FOX moderators fo...|    1|    result->negative|


In [None]:
from pyspark.sql.types import IntegerType

def sigmoid(s):
    """Collapse per-sentence sentiment labels into one 0/1 label by majority vote.

    Despite its name this is not a logistic function: it counts how many
    "@"-separated entries equal "result->positive" and compares that count
    with the total number of entries.

    Args:
        s: String of "@"-separated labels as found in the
            "finished_sentiment" column (e.g.
            "result->positive@result->negative"), or None.

    Returns:
        int: 1 if at least half of the entries are "result->positive"
        (ties count as positive), otherwise 0. None and the empty string
        yield 0.
    """
    if not s:
        return 0

    labels = s.split("@")
    positives = sum(1 for label in labels if label == "result->positive")

    # Exact integer comparison. The earlier `round(ratio + .01)` tie-break
    # wrongly returned 1 for minority-positive inputs whose ratio fell just
    # below one half (e.g. 25 positives out of 51 entries).
    return 1 if 2 * positives >= len(labels) else 0

# No visible cell imports `udf` explicitly; import it here so this cell does
# not depend on a name leaking in from elsewhere when run on a fresh kernel.
from pyspark.sql.functions import udf

# Wrap the Python aggregator as a Spark UDF that yields an integer column.
sigmoid_udf = udf(sigmoid, IntegerType())

# Add the per-row 0/1 label derived from the "finished_sentiment" column.
sentiment_data = sentiment_data.withColumn("total_sentiment", sigmoid_udf("finished_sentiment"))

Exception ignored in: <function JavaObject.__init__.<locals>.<lambda> at 0x7feffd1bfd90>
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1169, in <lambda>
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 555, in _garbage_collect_object
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 883, in send_command
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1028, in send_command
  File "/opt/conda/lib/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt: 
Exception ignored in: <function JavaObject.__init__.<locals>.<lambda> at 0x7feffd1bfe18>
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1169, in <lambda>
  File "/usr/local/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 555, in _garbag

In [None]:
# Rows whose aggregated prediction equals the value in the "Class" column
# (presumably the ground-truth label; confirm against the dataset).
matches = sentiment_data["Class"] == sentiment_data["total_sentiment"]
correct_count = sentiment_data.where(matches).count()
total_count = sentiment_data.count()

# Fraction of matching rows; shown as the cell's output.
correct_count / total_count