# Data processing

In [1]:
import os
# Spark NLP is not included in the all-spark-notebook image, so request it as
# a package through the PySpark submit arguments. This is set here, ahead of
# the cell that creates the SparkContext.
os.environ.update({
    'PYSPARK_SUBMIT_ARGS': '--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell',
})

In [2]:
import pyspark
from pyspark.sql import SparkSession

# Give both the executors and the driver 4 GB of memory.
conf = (
    pyspark.SparkConf()
    .set("spark.executor.memory", "4g")
    .set("spark.driver.memory", "4g")
)

# getOrCreate() instead of the bare constructor: re-running this cell in a
# live kernel then reuses the running context rather than failing with
# "Cannot run multiple SparkContexts at once". On a fresh kernel a new context
# is created from `conf`, exactly as before.
# NOTE(review): when a context already exists it is returned as-is, so edits
# to `conf` only take effect after a kernel restart.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [13]:
# Load the data produced earlier, drop duplicate rows, and cache the
# de-duplicated frame before previewing it.
test_data = spark.read.parquet("test-data.parquet").distinct().cache()
test_data.show()

+--------------------+-----+
|           norm_text|Class|
+--------------------+-----+
|DidUMiss? Trump m...|    1|
|Next question to ...|    0|
|When Trump become...|    1|
|scott walker says...|    0|
|Dude.. I just ran...|    0|
|I donate to polit...|    1|
|Jackie Who? Donal...|    1|
|NYC News Donald T...|    1|
|Trump is dangerou...|    0|
|Majority of Ameri...|    1|
|The polls have be...|    1|
|Trump's latest fo...|    1|
|Very interesting....|    1|
|Donald Trump and ...|    1|
|Totally planned o...|    0|
|When will you wri...|    0|
|CNN: Donald Trump...|    1|
|Cleric Rejects Al...|    0|
|Donald Trump Does...|    1|
|Donald Trump has ...|    1|
+--------------------+-----+
only showing top 20 rows



Execute NLP pipeline using the Vivekn annotator.

In [14]:
from pyspark.ml import Pipeline, PipelineModel
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

# Stage 1: turn the raw `norm_text` column into Spark NLP documents.
# NOTE(review): no output column is set here; the next stage reads "document",
# so this presumably relies on the assembler's default output name -- confirm.
document_assembler = DocumentAssembler().setInputCol("norm_text")

# Stage 2: document -> sentence
sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: sentence -> token
tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: token -> normal
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

# Stage 5: normal -> spell
spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
)

# Stage 6: Vivekn sentiment over the spell-checked tokens and the sentences,
# configured with the positive/negative example files and corpus pruning off.
sentiment_detector = (
    ViveknSentimentApproach()
    .setInputCols(["spell", "sentence"])
    .setOutputCol("sentiment")
    .setPositiveSource("train-data.txt/positive.txt")
    .setNegativeSource("train-data.txt/negative.txt")
    .setPruneCorpus(False)
)

# Stage 7: flatten the sentiment annotations (keys included) into a plain
# column; the output below shows it as `finished_sentiment`.
finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setIncludeKeys(True)
)

pipeline = Pipeline(
    stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        normalizer,
        spell_checker,
        sentiment_detector,
        finisher,
    ]
)

# Fit the pipeline on the test frame and immediately apply it to the same frame.
sentiment_data = pipeline.fit(test_data).transform(test_data)
sentiment_data.show()

+--------------------+-----+--------------------+
|           norm_text|Class|  finished_sentiment|
+--------------------+-----+--------------------+
|DidUMiss? Trump m...|    1|result->positive@...|
|Next question to ...|    0|    result->negative|
|When Trump become...|    1|result->positive@...|
|scott walker says...|    0|result->negative@...|
|Dude.. I just ran...|    0|result->negative@...|
|I donate to polit...|    1|result->positive@...|
|Jackie Who? Donal...|    1|result->positive@...|
|NYC News Donald T...|    1|    result->positive|
|Trump is dangerou...|    0|    result->negative|
|Majority of Ameri...|    1|    result->positive|
|The polls have be...|    1|    result->positive|
|Trump's latest fo...|    1|result->positive@...|
|Very interesting....|    1|result->positive@...|
|Donald Trump and ...|    1|    result->negative|
|Totally planned o...|    0|result->negative@...|
|When will you wri...|    0|    result->negative|
|CNN: Donald Trump...|    1|    result->positive|


The pipeline adds an extra column with a sentiment indicated for each sentence in the tweet. The value of each indication is either `result->positive` or `result->negative`, and multiple sentences are represented by repeating these values, delimited by `@`. In order to get a single Boolean value for the tweet, we take the average of the sentiment indications and then round the result to 0 or 1. In Python 3, rounding a value of exactly .5 gives 0 (ties are rounded to the nearest even integer). Somewhat arbitrarily, I decided to be biased towards positive sentiment, nudging the result so that an average of .5 yields 1 instead. In testing, I found that doing so also resulted in more accurate predictions.

In [15]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from statistics import mean

def sigmoid(s):
    """Collapse per-sentence sentiment labels into a single 0/1 value.

    Despite the name, this is a majority vote rather than a logistic function.

    Args:
        s: String of per-sentence labels joined by "@" (for example
            "result->positive@result->negative"), or None.

    Returns:
        int: 1 when at least half of the labels are exactly
        "result->positive" (a tie counts as positive), otherwise 0.
        None input yields 0.
    """
    if s is None:
        return 0
    labels = s.split("@")
    positive_count = sum(1 for label in labels if label == "result->positive")
    # Exact integer comparison: a tie goes to positive, but a genuine negative
    # majority (e.g. 25 positive out of 51) is never pushed over the line, and
    # the result does not depend on float rounding rules.
    return 1 if 2 * positive_count >= len(labels) else 0

# Expose `sigmoid` to Spark as an integer-returning UDF.
sigmoid_udf = udf(sigmoid, returnType=IntegerType())

# Derive one 0/1 sentiment per tweet from the finished annotations, cache the
# enriched frame, and preview it.
sentiment_data = sentiment_data.withColumn(
    "total_sentiment",
    sigmoid_udf("finished_sentiment"),
).cache()
sentiment_data.show()

+--------------------+-----+--------------------+---------------+
|           norm_text|Class|  finished_sentiment|total_sentiment|
+--------------------+-----+--------------------+---------------+
|DidUMiss? Trump m...|    1|result->positive@...|              1|
|Next question to ...|    0|    result->negative|              0|
|When Trump become...|    1|result->positive@...|              1|
|scott walker says...|    0|result->negative@...|              0|
|Dude.. I just ran...|    0|result->negative@...|              0|
|I donate to polit...|    1|result->positive@...|              1|
|Jackie Who? Donal...|    1|result->positive@...|              1|
|NYC News Donald T...|    1|    result->positive|              1|
|Trump is dangerou...|    0|    result->negative|              0|
|Majority of Ameri...|    1|    result->positive|              1|
|The polls have be...|    1|    result->positive|              1|
|Trump's latest fo...|    1|result->positive@...|              1|
|Very inte

Finally, we determine the accuracy of the model.

In [16]:
# Accuracy = rows whose predicted sentiment equals the label / all rows.
total_count = sentiment_data.count()
correct_count = sentiment_data.where(
    sentiment_data["Class"] == sentiment_data["total_sentiment"]
).count()
correct_count / total_count

0.9105431309904153