In [2]:
import os

In [3]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell'

In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

In [5]:
sc = pyspark.SparkContext()
spark = SparkSession(sc)

In [6]:
labeled_data = spark.read.option("header", "true").csv("trumptweet-mod.csv")

In [7]:
labeled_data.head()

# retweeted='FALSE'

Row(X.1='1', X='4', text='RT @GOPBlackChick: Illegals must be deported, said @realDonaldTrump Glad somebody has the guts to use the D-word! https://t.co/y15YuRIE59', retweet_count='26', favorited='FALSE', truncated='FALSE', id_str='6.33E+017', in_reply_to_screen_name='NA', source='"<a href=""http://www.tweetcaster.com"" rel=""nofollow"">TweetCaster for Android</a>"', retweeted='FALSE', created_at='Mon Aug 17 12:22:27 +0000 2015', in_reply_to_status_id_str='NA', in_reply_to_user_id_str='NA', lang='en', listed_count='46', verified='FALSE', location='Gotham City', user_id_str='191986903', description='Do I look like Batman to you?', geo_enabled='FALSE', user_created_at='Fri Sep 17 21:55:51 +0000 2010', statuses_count='138514', followers_count='881', favourites_count='155', protected='FALSE', user_url='NA', name=' Red Hood ', time_zone='Eastern Time (US & Canada)', user_lang='en', utc_offset='-14400', friends_count='927', screen_name='Blaze_in_3D', country_code='NA', country='NA', place_ty

In [23]:
# break up into separate parts
# then split training data into positive/negative
positive_data = labeled_data.filter(labeled_data.Class == "1").select("text")
positive_data.write.mode('overwrite').option("header", "true").format('csv').save('vivekn/positive/1.csv')
negative_data = labeled_data.filter(labeled_data.Class == "0").select("text")
negative_data.write.mode('overwrite').option("header", "true").format('csv').save('vivekn/negative/1.csv')

seed=12345
positive_train, positive_test, positive_validation = positive_data.randomSplit([0.6, 0.2, 0.2], seed=seed)
negative_train, negative_test, negative_validation = negative_data.randomSplit([0.6, 0.2, 0.2], seed=seed)

In [24]:
#Load the input data to be annotated
data = spark.read.json("Trump_2017-10-30.json.gz")
data = data.filter(data.lang == "en")
# Drop RTs
data = data.filter(data.retweeted_status.isNull())
#data.cache()
#data.count()
#data.show()
data.printSchema()

root
 |-- contributors: string (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- type: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- entities: struct (nullable = true)
 |    |-- hashtags: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true)
 |    |    |    |-- text: string (nullable = true)
 |    |-- media: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- display_url: string (nullable = true)
 |    |    |    |-- expanded_url: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- id_str: string (nullable = true)
 |    |    |    |-- indices: array (nullable = true)
 |    |    |    |    |-- element: long (containsNull = true

In [25]:
data.head()

Row(contributors=None, coordinates=None, created_at='Mon Oct 30 23:59:59 +0000 2017', entities=Row(hashtags=[], media=None, symbols=[], urls=[Row(display_url='justicewell.com/trump', expanded_url='http://justicewell.com/trump', indices=[57, 80], url='https://t.co/70aonSe3js'), Row(display_url='twitter.com/i/web/status/9…', expanded_url='https://twitter.com/i/web/status/925150322124767233', indices=[116, 139], url='https://t.co/hSZ86lEyWn')], user_mentions=[Row(id=724655615335788545, id_str='724655615335788545', indices=[0, 12], name='thenameisandy', screen_name='morganandyy')]), extended_entities=None, favorite_count=0, favorited=False, geo=None, id=925150322124767233, id_str='925150322124767233', in_reply_to_screen_name='morganandyy', in_reply_to_status_id=924361936589938688, in_reply_to_status_id_str='924361936589938688', in_reply_to_user_id=724655615335788545, in_reply_to_user_id_str='724655615335788545', is_quote_status=False, lang='en', metadata=Row(iso_language_code='en', result_

In [27]:
# Normalize tweets
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import trim

import emot
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

user_regex = r"@\S+"
url_regex = r"http[s]?://\S+"
hashtag_regex = r"#\S+"
space_regex = r"\s{2,}|\n"

def strip_emo(text):
    for data in emot.emoji(text):
        text = text.replace(data['value'], '')   
    for data in emot.emoticons(text):
        text = text.replace(data['value'], '')
    return text

strip_emo_udf = udf(strip_emo, StringType())

# Remove users (@foo), URLs, and duplicate space
uber_regex =  "|".join([user_regex, url_regex, hashtag_regex, space_regex])# , emoji_regex])

data = data.withColumn("norm_text", trim(strip_emo_udf(regexp_replace("text", uber_regex, ""))))
data = data.select("text", "norm_text")

In [28]:
data.head()

Row(text="@morganandyy President Kennedy was shot in the throat.\n\n💥https://t.co/70aonSe3js 💥\n\nWhy isn't FBI indicting people… https://t.co/hSZ86lEyWn", norm_text="President Kennedy was shot in the throat. Why isn't FBI indicting people…")

In [9]:
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

In [15]:
### Define the dataframe
document_assembler = DocumentAssembler() \
            .setInputCol("norm_text")
    
sentence_detector = SentenceDetectorModel() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = RegexTokenizer() \
            .setInputCols(["sentence"]) \
            .setOutputCol("token")
        
normalizer = Normalizer() \
            .setInputCols(["token"]) \
            .setOutputCol("normal")        
        
spell_checker = NorvigSweetingApproach() \
            .setInputCols(["normal"]) \
            .setOutputCol("spell")
        
sentiment_detector = ViveknSentimentApproach() \
    .setInputCols(["spell", "sentence"]) \
    .setOutputCol("sentiment") \
    .setPositiveSource("vivekn/positive") \
    .setNegativeSource("vivekn/negative") \
    .setPruneCorpus(False) # when training on small data you may want to disable this to not cut off infrequent words
    
finisher = Finisher() \
    .setInputCols(["sentiment"]) \
    .setIncludeKeys(True) \
    ##.setCleanAnnotations(False)
    
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher
])

sentiment_data = pipeline.fit(data).transform(data)    

In [16]:
sentiment_data.show()

+--------------------+--------------------+--------------------+
|                text|           norm_text|  finished_sentiment|
+--------------------+--------------------+--------------------+
|@morganandyy Pres...|President Kennedy...|result->negative@...|
|@WhiteHouse Trump...|Trump's Tax Plan ...|    result->positive|
|@ThirteenCanuck @...|She’s not as stup...|    result->positive|
|REPORTER ASKED IF...|REPORTER ASKED IF...|    result->positive|
|But her emails!!!...|But her emails!!!...|    result->positive|
|Mike #Pence Could...|Mike  Could Be Th...|    result->negative|
|@AndreaR96631665 ...|               *come|    result->positive|
|@K1NGSOLOMON_ @Ch...|Know many people ...|    result->positive|
|@TinaMcGugan @G_D...|  Trump created that|    result->positive|
|@ResistanceDept W...|We'll see what ha...|result->positive@...|
|The day we jail o...|The day we jail o...|    result->positive|
|Trump is incapabl...| Trump is incapable.|    result->positive|
|@LlL_Donnie @FoxN...|Was

In [20]:
from pyspark.sql.types import IntegerType

def round_up(d):
    return round(d + .01)

def calc_mean(s):
    ls = list(map(lambda x: 1 if (x == "result->positive") else 0, s.split("@")))
    return round_up(sum(ls) / len(ls))

calc_mean_udf = udf(calc_mean, IntegerType())

sentiment_data = sentiment_data.withColumn("mean_sentiment", calc_mean_udf("finished_sentiment"))
sentiment_data.take(20)

[Row(text="@morganandyy President Kennedy was shot in the throat.\n\n💥https://t.co/70aonSe3js 💥\n\nWhy isn't FBI indicting people… https://t.co/hSZ86lEyWn", norm_text="President Kennedy was shot in the throat. Why isn't FBI indicting people…", finished_sentiment='result->negative@result->positive', mean_sentiment=1),
 Row(text="@WhiteHouse Trump's Tax Plan Could Lead to a Huge Deficit\nhttps://t.co/YLqFct52zm", norm_text="Trump's Tax Plan Could Lead to a Huge Deficit", finished_sentiment='result->positive', mean_sentiment=1),
 Row(text='@ThirteenCanuck @cnnbrk She’s not as stupid as Trump!', norm_text='She’s not as stupid as Trump!', finished_sentiment='result->positive', mean_sentiment=1),
 Row(text='REPORTER ASKED IF TRUMP WORKED WITH RUSSIA, WHAT SARAH SANDERS SAID... https://t.co/106ABMaxgf', norm_text='REPORTER ASKED IF TRUMP WORKED WITH RUSSIA, WHAT SARAH SANDERS SAID...', finished_sentiment='result->positive', mean_sentiment=1),
 Row(text='But her emails!!!!!!!!!!! #ButHerEmails