In [1]:
import os

In [2]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages JohnSnowLabs:spark-nlp:1.2.3 pyspark-shell'

In [3]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel

In [4]:
sc = pyspark.SparkContext()
spark = SparkSession(sc)

In [5]:
labeled_data = spark.read.option("header", "true").csv("trumptweet-mod.csv")

# Filter out retweets?

In [6]:
labeled_data.head(25)

# retweeted='FALSE'

[Row(X.1='1', X='4', text='RT @GOPBlackChick: Illegals must be deported, said @realDonaldTrump Glad somebody has the guts to use the D-word! https://t.co/y15YuRIE59', retweet_count='26', favorited='FALSE', truncated='FALSE', id_str='6.33E+017', in_reply_to_screen_name='NA', source='"<a href=""http://www.tweetcaster.com"" rel=""nofollow"">TweetCaster for Android</a>"', retweeted='FALSE', created_at='Mon Aug 17 12:22:27 +0000 2015', in_reply_to_status_id_str='NA', in_reply_to_user_id_str='NA', lang='en', listed_count='46', verified='FALSE', location='Gotham City', user_id_str='191986903', description='Do I look like Batman to you?', geo_enabled='FALSE', user_created_at='Fri Sep 17 21:55:51 +0000 2010', statuses_count='138514', followers_count='881', favourites_count='155', protected='FALSE', user_url='NA', name=' Red Hood ', time_zone='Eastern Time (US & Canada)', user_lang='en', utc_offset='-14400', friends_count='927', screen_name='Blaze_in_3D', country_code='NA', country='NA', place_t

In [7]:
# Normalize tweets
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import trim

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

user_regex = r"@\S+"
url_regex = r"http[s]?://\S+"
hashtag_regex = r"#\S+"
space_regex = r"\s{2,}|\n"

#import emot
#def strip_emo(text):
#    for data in emot.emoji(text):
#        text = text.replace(data['value'], '')   
#    for data in emot.emoticons(text):
#        text = text.replace(data['value'], '')
#    return text
#
#strip_emo_udf = udf(strip_emo, StringType())

rt_regex = r"(?=\s?)(RT)(?=\s?)"
user_regex = r"@\S+"
url_regex = r"http[s]?:\/\/\S+"
hashtag_regex = r"#\S+"
space_regex = r"\s{2,}|\n"

# Remove RT, users (@foo), URLs, and duplicate space
uber_regex =  "|".join([rt_regex, user_regex, url_regex, hashtag_regex, space_regex])

#labeled_data = labeled_data.withColumn("norm_text", trim(strip_emo_udf(regexp_replace("text", uber_regex, ""))))
labeled_data = labeled_data.withColumn("norm_text", trim(regexp_replace("text", uber_regex, "")))
labeled_data = labeled_data.where(labeled_data.text.isNotNull())

In [8]:
labeled_data.head(25)

[Row(X.1='1', X='4', text='RT @GOPBlackChick: Illegals must be deported, said @realDonaldTrump Glad somebody has the guts to use the D-word! https://t.co/y15YuRIE59', retweet_count='26', favorited='FALSE', truncated='FALSE', id_str='6.33E+017', in_reply_to_screen_name='NA', source='"<a href=""http://www.tweetcaster.com"" rel=""nofollow"">TweetCaster for Android</a>"', retweeted='FALSE', created_at='Mon Aug 17 12:22:27 +0000 2015', in_reply_to_status_id_str='NA', in_reply_to_user_id_str='NA', lang='en', listed_count='46', verified='FALSE', location='Gotham City', user_id_str='191986903', description='Do I look like Batman to you?', geo_enabled='FALSE', user_created_at='Fri Sep 17 21:55:51 +0000 2010', statuses_count='138514', followers_count='881', favourites_count='155', protected='FALSE', user_url='NA', name=' Red Hood ', time_zone='Eastern Time (US & Canada)', user_lang='en', utc_offset='-14400', friends_count='927', screen_name='Blaze_in_3D', country_code='NA', country='NA', place_t

In [9]:
# break up into separate parts
#train_data, test_data, validation_data = labeled_data.randomSplit([0.6, 0.2, 0.2], seed=71082)

#public DataFrame sample(boolean withReplacement,
#               double fraction,
#               long seed)
seed = 71082
train_data = labeled_data.sample(False, 0.6, seed)
test_data = labeled_data.sample(False, 0.2, seed)
validation_data = labeled_data.sample(False, 0.2, seed)
#train, test = data.randomSplit([0.9, 0.1], seed=12345)
# Maybe just need training and validation?

# Note: This wouldn't work in a cluster
def write_df(df, dirname, filename):
    Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
    FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

    tmp_name = filename + ".tmp"
    df.coalesce(1).write.mode('overwrite').text(tmp_name)
    
    fs = FileSystem.get(Configuration())
    fs.mkdirs(Path(dirname))
    # Assume one file output
    file = fs.globStatus(Path(tmp_name + "/*.txt"))[0].getPath();
    fs.rename(file, Path(dirname + "/" + filename));
    fs.delete(Path(tmp_name), True);

# split training data into positive/negative
positive_data = train_data.filter(train_data.Class == "1").select("norm_text")
write_df(positive_data, "trumptweet/positive", "1.txt")
#positive_data.write.mode("overwrite").text("trumptweet/positive")

negative_data = labeled_data.filter(train_data.Class == "0").select("norm_text")
write_df(negative_data, "trumptweet/negative", "1.txt")
#negative_data.write.mode("overwrite").text("trumptweet/negative")


In [10]:
test_data.head(25)
#labeled_data.head(25)

[Row(X.1='2', X='6', text='@CNN is there any other news than Trump in the whole world going on? News, please!', retweet_count='0', favorited='FALSE', truncated='FALSE', id_str='6.33E+017', in_reply_to_screen_name='CNN', source='"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"', retweeted='FALSE', created_at='Mon Aug 17 12:22:29 +0000 2015', in_reply_to_status_id_str='NA', in_reply_to_user_id_str='759251', lang='en', listed_count='12', verified='FALSE', location='Grand Rapids MI', user_id_str='18170798', description='Beyond Existing Enterprises LLC', geo_enabled=None, user_created_at=None, statuses_count=None, followers_count=None, favourites_count=None, protected=None, user_url=None, name=None, time_zone=None, user_lang=None, utc_offset=None, friends_count=None, screen_name=None, country_code=None, country=None, place_type=None, full_name=None, place_name=None, place_id=None, place_lat=None, place_lon=None, lat=None, lon=None, expanded_url=None

In [11]:
from sparknlp.annotator import *
from sparknlp.base import DocumentAssembler, Finisher

In [12]:
### Define the dataframe
document_assembler = DocumentAssembler() \
            .setInputCol("norm_text")
    
sentence_detector = SentenceDetectorModel() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = RegexTokenizer() \
            .setInputCols(["sentence"]) \
            .setOutputCol("token")
        
normalizer = Normalizer() \
            .setInputCols(["token"]) \
            .setOutputCol("normal")        
        
spell_checker = NorvigSweetingApproach() \
            .setInputCols(["normal"]) \
            .setOutputCol("spell")
        
sentiment_detector = ViveknSentimentApproach() \
    .setInputCols(["spell", "sentence"]) \
    .setOutputCol("sentiment") \
    .setPositiveSource("trumptweet/positive/1.txt") \
    .setNegativeSource("trumptweet/negative/1.txt") \
    .setPruneCorpus(False) # when training on small data you may want to disable this to not cut off infrequent words
    
finisher = Finisher() \
    .setInputCols(["sentiment"]) \
    .setIncludeKeys(True) \
    ##.setCleanAnnotations(False)
    
pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    normalizer,
    spell_checker,
    sentiment_detector,
    finisher
])

sentiment_data = pipeline.fit(test_data).transform(test_data)    

In [13]:
#sentiment_data.write.mode("overwrite").option("header", "true").csv("sentiment_data.csv")

sentiment_data.show()

+---+---+--------------------+--------------------+---------+---------+---------+-----------------------+--------------------+--------------------+--------------------+-------------------------+-----------------------+----+------------+--------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+---------------+----------------+---------+--------------------+--------------------+--------------------+---------+----------+-------------+---------------+------------+-------------+----------+------------+----------+--------+----------+-----------+----+----+--------------------+--------------------+-----+--------------------+--------------------+
|X.1|  X|                text|       retweet_count|favorited|truncated|   id_str|in_reply_to_screen_name|              source|           retweeted|          created_at|in_reply_to_status_id_str|in_reply_to_user_id_str|lang|listed_count|verified|            location| user_id_str|      

In [17]:
from pyspark.sql.types import IntegerType

def round_up(d):
    return round(d + .01)

def calc_mean(s):
    if s is not None:
        ls = list(map(lambda x: 1 if (x == "result->positive") else 0, s.split("@")))
        return 0 if (len(ls) == 0) else round_up(sum(ls) / len(ls))
    else:
        return 0

calc_mean_udf = udf(calc_mean, IntegerType())

sentiment_data = sentiment_data.withColumn("mean_sentiment", calc_mean_udf("finished_sentiment"))
sentiment_data.cache()
#sentiment_data.take(20)
#sentiment_data.printSchema()

DataFrame[X.1: string, X: string, text: string, retweet_count: string, favorited: string, truncated: string, id_str: string, in_reply_to_screen_name: string, source: string, retweeted: string, created_at: string, in_reply_to_status_id_str: string, in_reply_to_user_id_str: string, lang: string, listed_count: string, verified: string, location: string, user_id_str: string, description: string, geo_enabled: string, user_created_at: string, statuses_count: string, followers_count: string, favourites_count: string, protected: string, user_url: string, name: string, time_zone: string, user_lang: string, utc_offset: string, friends_count: string, screen_name: string, country_code: string, country: string, place_type: string, full_name: string, place_name: string, place_id: string, place_lat: string, place_lon: string, lat: string, lon: string, expanded_url: string, url: string, Class: string, norm_text: string, finished_sentiment: string, mean_sentiment: int]

In [18]:
sentiment_data.head(25)

[Row(X.1='2', X='6', text='@CNN is there any other news than Trump in the whole world going on? News, please!', retweet_count='0', favorited='FALSE', truncated='FALSE', id_str='6.33E+017', in_reply_to_screen_name='CNN', source='"<a href=""http://twitter.com/download/android"" rel=""nofollow"">Twitter for Android</a>"', retweeted='FALSE', created_at='Mon Aug 17 12:22:29 +0000 2015', in_reply_to_status_id_str='NA', in_reply_to_user_id_str='759251', lang='en', listed_count='12', verified='FALSE', location='Grand Rapids MI', user_id_str='18170798', description='Beyond Existing Enterprises LLC', geo_enabled=None, user_created_at=None, statuses_count=None, followers_count=None, favourites_count=None, protected=None, user_url=None, name=None, time_zone=None, user_lang=None, utc_offset=None, friends_count=None, screen_name=None, country_code=None, country=None, place_type=None, full_name=None, place_name=None, place_id=None, place_lat=None, place_lon=None, lat=None, lon=None, expanded_url=None

In [26]:
sentiment_data.filter(sentiment_data.Class == sentiment_data.mean_sentiment).count() / sentiment_data.count()
#sentiment_data.filter(sentiment_data.Class == sentiment_data.mean_sentiment).select("Class", "mean_sentiment").show()

0.6411251212415131