In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell'

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

sc = pyspark.SparkContext()
spark = SparkSession(sc)

In [3]:
labeled_data = spark.read.csv("trumptweet-mod.csv", header = True, escape = '"')
labeled_data = labeled_data.select("text", "Class")

In [4]:
labeled_data = labeled_data.filter((labeled_data.Class == '0') | (labeled_data.Class == '1'))
labeled_data.select('Class').distinct().show()

+-----+
|Class|
+-----+
|    0|
|    1|
+-----+



In [5]:
# Normalize tweets
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import trim

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

user_regex = r"@\S+"
url_regex = r"http[s]?://\S+"
hashtag_regex = r"#\S+"
space_regex = r"\s{2,}|\n"

#import emot
#def strip_emo(text):
#    for data in emot.emoji(text):
#        text = text.replace(data['value'], '')   
#    for data in emot.emoticons(text):
#        text = text.replace(data['value'], '')
#    return text
#
#strip_emo_udf = udf(strip_emo, StringType())

rt_regex = r"(?=\s?)(RT)(?=\s?)"
user_regex = r"@\S+"
url_regex = r"http[s]?:\/\/\S+"
hashtag_regex = r"#\S+"
space_regex = r"\s{2,}|\n"

# TODO: Remove <ed><U+hex>
# TODO: Allow hashtags
# TODO: Allow users?

# Remove RT, users (@foo), URLs, and duplicate space
uber_regex =  "|".join([rt_regex, url_regex, space_regex, user_regex, hashtag_regex])#, user_regex, hashtag_regex])

#labeled_data = labeled_data.withColumn("norm_text", trim(strip_emo_udf(regexp_replace("text", uber_regex, ""))))
labeled_data = labeled_data.withColumn("norm_text", trim(regexp_replace("text", uber_regex, "")))
labeled_data = labeled_data.filter(labeled_data.norm_text != '')

In [6]:
labeled_data.show()

+--------------------+-----+--------------------+
|                text|Class|           norm_text|
+--------------------+-----+--------------------+
|RT @GOPBlackChick...|    1|Illegals must be ...|
|RT @KurtSchlichte...|    0|- The GOP Establi...|
|@ajpeacemaker @md...|    0|So much stupid go...|
|THE TRUMP IMMIGRA...|    1|THE TRUMP IMMIGRA...|
|RT @CNNPolitics: ...|    0|Christie on Donal...|
|@Morning_Joe Not ...|    1|Not a  fan, but h...|
|RT @ThePatriot143...|    0|Court Has To Step...|
|Trump is correct ...|    1|Trump is correct ...|
|"I'm going to pre...|    1|"I'm going to pre...|
|I really hope peo...|    0|I really hope peo...|
|Trump is claiming...|    0|Trump is claiming...|
|RT @marclamonthil...|    1|Latest poll has T...|
|BOOM ' Univision ...|    1|BOOM ' Univision ...|
|@GeoScarborough I...|    1|I am now all in f...|
|RT @charlescwcook...|    0|Today's Trump pos...|
|RT @mdabbss: Dona...|    0|Donald trump the ...|
|A little surprise...|    1|A little surprise...|


In [7]:
# break up into separate parts
train_data, test_data = labeled_data.randomSplit([0.8, 0.2], seed=71082)

# Note: This wouldn't work in a cluster
def write_df(df, dirname, filename):
    Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
    FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

    tmp_name = filename + ".tmp"
    df.coalesce(1).write.mode('overwrite').text(tmp_name)
    
    fs = FileSystem.get(Configuration())
    fs.mkdirs(Path(dirname))
    # Assume one file output
    file = fs.globStatus(Path(tmp_name + "/*.txt"))[0].getPath();
    fs.rename(file, Path(dirname + "/" + filename));
    fs.delete(Path(tmp_name), True);

# split training data into positive/negative
positive_data = train_data.filter(train_data.Class == "1").select("norm_text")
write_df(positive_data, "trumptweet/positive", "1.txt")
#positive_data.write.mode("overwrite").text("trumptweet/positive")

negative_data = labeled_data.filter(train_data.Class == "0").select("norm_text")
write_df(negative_data, "trumptweet/negative", "1.txt")
#negative_data.write.mode("overwrite").text("trumptweet/negative")


In [8]:
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

### Define the dataframe
document_assembler = DocumentAssembler().setInputCol("norm_text")
    
sentence_detector = SentenceDetectorModel().setInputCols(["document"]).setOutputCol("sentence")

tokenizer = RegexTokenizer().setInputCols(["sentence"]).setOutputCol("token")
        
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normal")        
        
spell_checker = NorvigSweetingApproach().setInputCols(["normal"]).setOutputCol("spell")
        
sentiment_detector = ViveknSentimentApproach().setInputCols(["spell", "sentence"]) \
    .setOutputCol("sentiment").setPositiveSource("trumptweet/positive/1.txt") \
    .setNegativeSource("trumptweet/negative/1.txt").setPruneCorpus(False) # when training on small data you may want to disable this to not cut off infrequent words
    
finisher = Finisher().setInputCols(["sentiment"]).setIncludeKeys(True)##.setCleanAnnotations(False)
    
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher
])

sentiment_data = pipeline.fit(test_data).transform(test_data)    
sentiment_data.show()

+--------------------+-----+--------------------+--------------------+
|                text|Class|           norm_text|  finished_sentiment|
+--------------------+-----+--------------------+--------------------+
| you claim you're...|    0|you claim you're ...|    result->negative|
|"Did you know tha...|    0|"Did you know tha...|    result->negative|
|"Former Reagan An...|    0|"Former Reagan An...|    result->negative|
|"In 24 days plus ...|    1|"In 24 days plus ...|result->negative@...|
|#DonaldTrump Copi...|    0|Copied  Immigrati...|    result->negative|
|#GoogleNews Conse...|    0|Conservative Expe...|    result->negative|
|#NYC #News From c...|    1|From campaign to ...|    result->positive|
|#Politics Donald ...|    1|Donald Trump: A f...|    result->positive|
|#Politics Trump l...|    1|Trump leaves camp...|    result->positive|
|#Politics Trump l...|    1|Trump leaves camp...|    result->positive|
|#Reuterspolitics ...|    1|Trump leaves camp...|    result->positive|
|#Trum

In [9]:
from pyspark.sql.types import IntegerType

def sigmoid(s):
    if s is not None:
        ls = list(map(lambda x: 1 if (x == "result->positive") else 0, s.split("@")))
        return 0 if (len(ls) == 0) else round(sum(ls) / len(ls) + .01)
    else:
        return 0

sigmoid_udf = udf(sigmoid, IntegerType())

sentiment_data = sentiment_data.withColumn("total_sentiment", sigmoid_udf("finished_sentiment"))
sentiment_data.cache()

DataFrame[text: string, Class: string, norm_text: string, finished_sentiment: string, total_sentiment: int]

In [10]:
sentiment_data.show()

+--------------------+-----+--------------------+--------------------+---------------+
|                text|Class|           norm_text|  finished_sentiment|total_sentiment|
+--------------------+-----+--------------------+--------------------+---------------+
| you claim you're...|    0|you claim you're ...|    result->negative|              0|
|"Did you know tha...|    0|"Did you know tha...|    result->negative|              0|
|"Former Reagan An...|    0|"Former Reagan An...|    result->negative|              0|
|"In 24 days plus ...|    1|"In 24 days plus ...|result->negative@...|              0|
|#DonaldTrump Copi...|    0|Copied  Immigrati...|    result->negative|              0|
|#GoogleNews Conse...|    0|Conservative Expe...|    result->negative|              0|
|#NYC #News From c...|    1|From campaign to ...|    result->positive|              1|
|#Politics Donald ...|    1|Donald Trump: A f...|    result->positive|              1|
|#Politics Trump l...|    1|Trump leaves ca

In [11]:
sentiment_data.filter(sentiment_data.Class == sentiment_data.total_sentiment).count() / sentiment_data.count()

0.8027586206896552