In [1]:
import os
# Arguments handed to the PySpark launcher via the environment; they request
# the Spark NLP package (pinned to 1.2.3) and the interactive pyspark shell.
pyspark_submit_args = '--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = pyspark_submit_args

In [2]:
import pyspark
from pyspark.sql import SparkSession

# Spark configuration for this notebook; defaults are used unless a setting
# below is enabled.
conf = pyspark.SparkConf()
# conf.set("spark.executor.memory", "4g")  # uncomment to request 4g per executor

# Use getOrCreate so re-running this cell in a live kernel reuses the running
# context instead of failing: constructing a second SparkContext directly
# raises an error. NOTE(review): when a context already exists, `conf` is
# presumably not re-applied to it -- restart the kernel to change settings.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
# Location of the evaluation set (Parquet), relative to the working directory.
test_data_path = "test-data.parquet"

# Load the frame and mark it for caching; it is read again when the pipeline
# is fitted and when it is applied. `cache()` is left as the final expression
# so the cell still displays the DataFrame schema.
test_data = spark.read.parquet(test_data_path)
test_data.cache()

DataFrame[norm_text: string, Class: string]

In [4]:
from pyspark.ml import Pipeline, PipelineModel
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

# --- Annotation stages -------------------------------------------------------
# Each stage reads the column(s) written by the previous one:
#   norm_text -> document -> sentence -> token -> normal -> spell -> sentiment

# Reads the raw text column. No output column is set here; the next stage
# reads "document", so this presumably relies on the assembler's default
# output column name -- TODO confirm against the Spark NLP version in use.
document_assembler = DocumentAssembler().setInputCol("norm_text")

sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
)

# Sentiment stage, configured with positive/negative source text files.
# Alternative corpus, currently disabled:
#     .setPositiveSource("trumptweet/positive/1.txt")
#     .setNegativeSource("trumptweet/negative/1.txt")
sentiment_detector = (
    ViveknSentimentApproach()
    .setInputCols(["spell", "sentence"])
    .setOutputCol("sentiment")
    .setPositiveSource("train-data.txt/positive.txt")
    .setNegativeSource("train-data.txt/negative.txt")
    .setPruneCorpus(False)
)

# Flattens the "sentiment" annotations into a plain output column (shown as
# "finished_sentiment" in the cell output), keeping the annotation keys.
finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setIncludeKeys(True)
)

# --- Assemble, fit and apply -------------------------------------------------
pipeline_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher,
]
pipeline = Pipeline(stages=pipeline_stages)

# Fit and transform are both run on the same frame, `test_data`.
fitted_pipeline = pipeline.fit(test_data)
sentiment_data = fitted_pipeline.transform(test_data)
sentiment_data.show()

+--------------------+-----+--------------------+
|           norm_text|Class|  finished_sentiment|
+--------------------+-----+--------------------+
|1/3 "The presiden...|    0|    result->negative|
|Donald Trump cont...|    1|result->positive@...|
|Halperin: Trump R...|    1|    result->positive|
|I've listened to ...|    0|result->negative@...|
|Maher: I Don't Al...|    1|    result->positive|
|Watched the inter...|    1|result->negative@...|
|Who the he'll wan...|    0|result->negative@...|
|timisteve Adviser...|    1|result->positive@...|
|I wonder if Donal...|    1|    result->negative|
|Rush continues ta...|    1|result->negative@...|
|Trump sayshe's go...|    0|result->negative@...|
|Trump would be th...|    1|result->negative@...|
|We can only hope....|    1|result->negative@...|
|realDonaldTrump H...|    0|    result->negative|
|Donald Trump talk...|    0|    result->negative|
|More striking? Re...|    1|result->negative@...|
|Scott Walker: Don...|    1|    result->positive|


In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from statistics import mean

def sigmoid(s):
    """Collapse per-segment sentiment labels into a single 0/1 vote.

    Despite its name this is a majority vote, not a logistic function.

    Args:
        s: A string of labels separated by "@" (for example
            "result->positive@result->negative"), or None.

    Returns:
        int: 1 when at least half of the "@"-separated segments are exactly
        "result->positive", otherwise 0. None yields 0.
    """
    if s is None:
        return 0

    labels = s.split("@")
    positives = sum(1 for label in labels if label == "result->positive")

    # Exact integer comparison: ties go to positive, and a positive minority
    # can never win. The previous `round(mean(...) + .01)` form also promoted
    # fractions just under one half (e.g. 25 of 51) to 1.
    return 1 if 2 * positives >= len(labels) else 0

# Expose the Python aggregator to Spark as a UDF producing an integer column.
sigmoid_udf = udf(sigmoid, returnType=IntegerType())

# Add the per-row aggregated prediction next to the flattened annotations.
sentiment_data = sentiment_data.withColumn(
    "total_sentiment",
    sigmoid_udf(sentiment_data["finished_sentiment"]),
)
# Left as the final expression so the cell still displays the resulting schema.
sentiment_data.cache()

DataFrame[norm_text: string, Class: string, finished_sentiment: string, total_sentiment: int]

In [None]:
# A row counts as correct when the label equals the aggregated prediction.
# Per the schemas printed above, `Class` is a string and `total_sentiment` an
# int, so this equality presumably relies on Spark's implicit type coercion.
is_correct = sentiment_data["Class"] == sentiment_data["total_sentiment"]

correct_count = sentiment_data.where(is_correct).count()
total_count = sentiment_data.count()

# Fraction of correctly classified rows, displayed as the cell output.
correct_count / total_count