In [2]:
import os

In [3]:
# Tell the PySpark launcher to pull in the Spark NLP package; this is set
# before the SparkContext is created further down.
os.environ.update({'PYSPARK_SUBMIT_ARGS': '--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell'})

In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

In [5]:
# Reuse an already-running context/session when there is one, so this cell can
# be re-executed in a live kernel (a second bare SparkContext() raises an error).
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [6]:
# Hand-labelled tweets; the first CSV line holds the column names.
labeled_data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("trumptweet-mod.csv")
)

In [7]:
# Peek at the first row to see which columns the CSV provides.
labeled_data.head()

# NOTE(review): in the row printed below the text starts with "RT" yet
# retweeted='FALSE' - presumably a reminder that this flag does not identify
# retweets; confirm.

Row(X.1='1', X='4', text='RT @GOPBlackChick: Illegals must be deported, said @realDonaldTrump Glad somebody has the guts to use the D-word! https://t.co/y15YuRIE59', retweet_count='26', favorited='FALSE', truncated='FALSE', id_str='6.33E+017', in_reply_to_screen_name='NA', source='"<a href=""http://www.tweetcaster.com"" rel=""nofollow"">TweetCaster for Android</a>"', retweeted='FALSE', created_at='Mon Aug 17 12:22:27 +0000 2015', in_reply_to_status_id_str='NA', in_reply_to_user_id_str='NA', lang='en', listed_count='46', verified='FALSE', location='Gotham City', user_id_str='191986903', description='Do I look like Batman to you?', geo_enabled='FALSE', user_created_at='Fri Sep 17 21:55:51 +0000 2010', statuses_count='138514', followers_count='881', favourites_count='155', protected='FALSE', user_url='NA', name=' Red Hood ', time_zone='Eastern Time (US & Canada)', user_lang='en', utc_offset='-14400', friends_count='927', screen_name='Blaze_in_3D', country_code='NA', country='NA', place_ty

In [44]:
# Split the labelled rows 60/20/20 into train, test and validation sets;
# the fixed seed keeps the split repeatable.
(train_data,
 test_data,
 validation_data) = labeled_data.randomSplit(weights=[0.6, 0.2, 0.2], seed=71082)

# Note: This wouldn't work in a cluster
def write_df(df, name):
    """Write the rows of `df` to a single text file called `name`.

    The frame is coalesced to one partition and written with the text writer
    to the directory ``name + ".tmp"``; the first ``*.txt`` file found there
    is renamed to `name`, and the temporary directory is then deleted.

    Uses the notebook-level SparkContext `sc` to reach the Hadoop FileSystem
    classes through the py4j gateway.

    Args:
        df: DataFrame to write (the text writer is used, so it is expected to
            hold a single string column).
        name: Destination file path.

    Raises:
        IOError: If no ``*.txt`` part file was produced, or if the rename to
            `name` is reported as failed.
    """
    hadoop = sc._gateway.jvm.org.apache.hadoop
    Path = hadoop.fs.Path
    FileSystem = hadoop.fs.FileSystem
    Configuration = hadoop.conf.Configuration

    tmp_name = name + ".tmp"
    df.coalesce(1).write.mode('overwrite').text(tmp_name)

    fs = FileSystem.get(Configuration())
    # Assume one file output
    part_files = fs.globStatus(Path(tmp_name + "/*.txt"))
    if part_files is None or len(part_files) == 0:
        raise IOError("no *.txt part file found under " + tmp_name)
    part_path = part_files[0].getPath()

    target = Path(name)
    # A file left behind by an earlier run can make rename() fail on some
    # file systems, so clear it first (non-recursive: only a plain file).
    if fs.exists(target):
        fs.delete(target, False)
    # rename() signals failure through its return value; check it before the
    # temporary directory (the only copy of the data) is removed.
    if not fs.rename(part_path, target):
        raise IOError("could not rename " + part_path.toString() + " to " + name)
    fs.delete(Path(tmp_name), True)

# split training data into positive/negative
positive_data = train_data.filter(train_data.Class == "1").select("text")
write_df(positive_data, "trumptweet-pos.txt")

# Filter train_data here as well (not labeled_data), so that held-out
# test/validation rows never end up in the training corpus.
negative_data = train_data.filter(train_data.Class == "0").select("text")
write_df(negative_data, "trumptweet-neg.txt")

In [None]:
# Load the input data to be annotated
data = spark.read.json("Trump_2017-10-30.json.gz")

# Keep English tweets only and drop RTs (rows that carry a retweeted_status)
data = data.filter((data["lang"] == "en") & data["retweeted_status"].isNull())

# Optional inspection helpers:
#data.cache()
#data.count()
#data.show()
data.printSchema()

In [None]:
# Show the first remaining row after the language / retweet filters.
data.head()

In [None]:
# Normalize tweets
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import trim

import emot
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Patterns for the pieces that are stripped from each tweet.
# "@" followed by a run of non-whitespace characters (user mentions).
user_regex = r"@\S+"
# "http://" or "https://" followed by a run of non-whitespace characters.
url_regex = r"http[s]?://\S+"
# "#" followed by a run of non-whitespace characters (hashtags).
hashtag_regex = r"#\S+"
# Two or more consecutive whitespace characters, or a single newline.
space_regex = r"\s{2,}|\n"

def strip_emo(text):
    """Return `text` with every emoji and emoticon reported by `emot` removed.

    Emoji are removed first; emoticons are then detected on the emoji-free
    text. Each detector is expected to yield dicts whose 'value' entry is the
    matched substring.

    Args:
        text: Tweet text, or None when the source column is null.

    Returns:
        The cleaned string. None and the empty string are returned unchanged,
        so null values reaching the UDF do not raise inside the detectors.
    """
    if not text:
        return text
    for detect in (emot.emoji, emot.emoticons):
        # detect(text) is evaluated once per detector, on the text as it
        # stands after the previous detector's matches were removed.
        for match in detect(text):
            text = text.replace(match['value'], '')
    return text

strip_emo_udf = udf(strip_emo, StringType())

# Remove users (@foo), URLs and hashtags outright.
uber_regex = "|".join([user_regex, url_regex, hashtag_regex])

# Whitespace is handled in a separate, final pass that replaces each run (or
# newline) with ONE space. Replacing it with "" glued neighbouring words
# together, and doing it in the same pass as the removals left the gaps
# created by removed mentions/URLs/emoji uncollapsed.
data = data.withColumn(
    "norm_text",
    trim(
        regexp_replace(
            strip_emo_udf(regexp_replace("text", uber_regex, "")),
            space_regex,
            " ",
        )
    ),
)
data = data.select("text", "norm_text")

In [None]:
# Compare the raw `text` with the cleaned `norm_text` on the first row.
data.head()

In [None]:
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

In [None]:
### Define the annotation pipeline stages
# Feeds the cleaned tweet text into the annotators.
document_assembler = DocumentAssembler().setInputCol("norm_text")

sentence_detector = (
    SentenceDetectorModel()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normal")
)

spell_checker = (
    NorvigSweetingApproach()
    .setInputCols(["normal"])
    .setOutputCol("spell")
)

# NOTE(review): the trumptweet-pos.txt / trumptweet-neg.txt files written
# earlier in this notebook are not referenced here - confirm that these
# source paths are the intended training corpora.
sentiment_detector = (
    ViveknSentimentApproach()
    .setInputCols(["spell", "sentence"])
    .setOutputCol("sentiment")
    .setPositiveSource("vivekn/positive")
    .setNegativeSource("vivekn/negative")
    # when training on small data you may want to disable this to not cut off infrequent words
    .setPruneCorpus(False)
)

finisher = (
    Finisher()
    .setInputCols(["sentiment"])
    .setIncludeKeys(True)
    # .setCleanAnnotations(False)
)

stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher,
]
pipeline = Pipeline(stages=stages)

# Fit the pipeline on the tweets and annotate the same tweets with the result.
sentiment_data = pipeline.fit(data).transform(data)

In [None]:
# Print the first rows of the annotated pipeline output.
sentiment_data.show()

In [None]:
from pyspark.sql.types import IntegerType

def round_up(d):
    """Round `d` to the nearest integer, with exact halves going up.

    The fractional part is compared against 0.5 directly instead of nudging
    the value by .01 before calling round(): the nudge also pushed values
    slightly below one half (e.g. 0.495) up to 1.

    Args:
        d: Number to round.

    Returns:
        int: floor(d) when the fractional part is below 0.5, else floor(d) + 1.
    """
    whole = int(d // 1)  # floor of d as an int
    return whole + (1 if d - whole >= 0.5 else 0)

def calc_mean(s):
    """Collapse an "@"-separated list of sentiment results into one 0/1 label.

    Each "@"-separated part equal to "result->positive" counts as 1, any
    other part as 0; the mean of those flags is passed through round_up().

    Args:
        s: Finished sentiment string, or None when the column value is null.

    Returns:
        The rounded mean of the flags, or None when `s` is None (so a null
        input yields a null output instead of failing the Spark job).
    """
    if s is None:
        return None
    flags = [1 if part == "result->positive" else 0 for part in s.split("@")]
    # float() keeps this a true division even where "/" on two ints truncates.
    return round_up(float(sum(flags)) / len(flags))

# Expose calc_mean to Spark as a UDF producing an integer column.
calc_mean_udf = udf(calc_mean, returnType=IntegerType())

# Attach the per-tweet label derived from the finisher output, then preview 20 rows.
sentiment_data = sentiment_data.withColumn(
    "mean_sentiment",
    calc_mean_udf(sentiment_data["finished_sentiment"]),
)
sentiment_data.take(20)