In [4]:
import spacy

In [5]:
# Load spaCy's 'en' pipeline once; the example cells below reuse it.
english = spacy.load('en')

In [6]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

We'll look at the sentiment of some example text from Jane Austen (we picked a notably recognizable excerpt):

In [7]:
# Two-sentence excerpt used as the first sentiment example.
sampletext = """ It is a truth universally acknowledged, that a single man in possession
of a good fortune, must be in want of a wife.

However little known the feelings or views of such a man may be on his
first entering a neighbourhood, this truth is so well fixed in the minds
of the surrounding families, that he is considered the rightful property
of some one or other of their daughters. """

In [8]:
# Run the spaCy pipeline over the excerpt.
result = english(sampletext)

In [9]:
# Check what kind of object the pipeline returned.
type(result)

spacy.tokens.doc.Doc

In [10]:
# Create one VADER analyzer instance; later cells reuse it.
analyzer = SentimentIntensityAnalyzer()

In [11]:
# Score each sentence that spaCy found in the excerpt.
[analyzer.polarity_scores(str(sent)) for sent in result.sents]

[{'neg': 0.0, 'neu': 0.711, 'pos': 0.289, 'compound': 0.6705},
 {'neg': 0.0, 'neu': 0.895, 'pos': 0.105, 'compound': 0.6147}]

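Unsurprisingly, the first two sentences of _Pride and Prejudice_ are dominated by neutral language (`neu` is by far the largest component for both), although their compound scores lean positive.  Let's try some raw text from the negative product reviews corpus: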

In [12]:
# Parse a few raw negative product reviews with the same spaCy pipeline.
negative = english(""" This oatmeal is not good. Its mushy, soft, I don't like it. Quaker Oats is the way to go. 

Seriously this product was as tasteless as they come. There are much better tasting products out 
there but at 100 calories its better than a special k bar or cookie snack pack. You just have to 
season it or combine it with something else to share the flavor.

These were nasty, they were so greasy and too rich for my blood, plus they lacked major flavor, 
no spicy jalapeno flavor at all.
""")
# Pair every detected sentence with its VADER scores so the two can be read side by side.
[(sent, analyzer.polarity_scores(str(sent))) for sent in negative.sents]

[( This oatmeal is not good.,
  {'neg': 0.376, 'neu': 0.624, 'pos': 0.0, 'compound': -0.3412}),
 (Its mushy, soft, I don't like it.,
  {'neg': 0.297, 'neu': 0.703, 'pos': 0.0, 'compound': -0.2755}),
 (Quaker Oats is the way to go. 
  , {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}),
 (Seriously this product was as tasteless as they come.,
  {'neg': 0.175, 'neu': 0.825, 'pos': 0.0, 'compound': -0.1779}),
 (There are much better tasting products out 
  there but at 100 calories its better than a special k bar or cookie snack pack.,
  {'neg': 0.0, 'neu': 0.658, 'pos': 0.342, 'compound': 0.8537}),
 (You just have to 
  season it or combine it with something else to share the flavor.
  , {'neg': 0.0, 'neu': 0.872, 'pos': 0.128, 'compound': 0.296}),
 (These were nasty, they were so greasy and too rich for my blood, plus they lacked major flavor, 
  no spicy jalapeno flavor at all.,
  {'neg': 0.191, 'neu': 0.691, 'pos': 0.118, 'compound': -0.296})]

Now we'll connect this sort of analysis to Spark so we can apply it to streaming data.

In [13]:
import os

# Version of the Kafka connector package to request.
SPARK_VERSION = "2.2.0"

# Ask PySpark to pull in the Kafka SQL connector. PySpark reads this variable
# when it launches the JVM, so it is set before the SparkSession is created.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages org.apache.spark:spark-sql-kafka-0-10_2.11:%s pyspark-shell"
    % (SPARK_VERSION,)
)

In [14]:
import pyspark

from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally under master "local[2]".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Social Firehose")
    .getOrCreate()
)

In [15]:
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import column, from_json

# Schema for the JSON payload: three nullable string fields.
structure = StructType(
    [StructField(name, StringType(), True) for name in "text user_id update_id".split()]
)

# Read the Kafka topic, cast each message value to a string, parse it as JSON
# against `structure`, and lift the parsed fields into top-level columns.
records = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka.kafka.svc:9092")
    .option("subscribe", "social-firehose")
    .load()
    .select(column("value").cast(StringType()).alias("value"))
    .select(from_json(column("value"), structure).alias("json"))
    .select(
        column("json.update_id"),
        column("json.user_id").alias("user_id"),
        column("json.text"),
    )
)

In [16]:
# This code is borrowed from Sparkling Pandas; see here:
# https://github.com/sparklingpandas/sparklingml/blob/627c8f23688397a53e2e9e805e92a54c2be1cf3d/sparklingml/transformation_functions.py#L53
class SpacyMagic(object):
    """
    Per-process cache of loaded spaCy models, used to minimize loading time.

    Each language is loaded at most once; later requests for the same
    language return the same object.
    >>> SpacyMagic.get("en")
    <spacy.en.English ...
    """
    # Maps a language name to the model returned by spacy.load for it.
    _spacys = {}

    @classmethod
    def get(cls, lang):
        """Return the spaCy model for `lang`, loading it on first request."""
        try:
            return cls._spacys[lang]
        except KeyError:
            # spaCy is imported only when a model actually has to be loaded.
            import spacy
            model = spacy.load(lang)
            cls._spacys[lang] = model
            return model

Now we can make a user-defined function to split social-media updates into sentences.  We will use spaCy, which is more expensive than most reasonable heuristics for splitting text into sentences (but also much smarter).

In [17]:
from pyspark.sql.types import ArrayType
from pyspark.sql.functions import udf

def split_sentences_impl(s):
    """ splits an English string into sentences, using spaCy

    Args:
        s: the text to split, or None.

    Returns:
        A list with one string per sentence detected by spaCy, or None when
        `s` is None.
    """
    # Spark hands SQL NULL values to Python UDFs as None. Propagate the null
    # instead of passing None to spaCy, which would fail the whole job.
    if s is None:
        return None
    english = SpacyMagic.get("en")
    return [str(sentence) for sentence in english(s).sents]

# Wrap the Python implementation as a Spark UDF that returns an array of strings.
split_sentences = udf(split_sentences_impl, ArrayType(StringType()))

To see what this looks like, we'll run it on the first 10 rows of the data frame (ordered by `update_id`):

In [18]:
# Take the first 10 records by update_id, split each update's text into
# sentences, and cache the result because later cells reuse it.
split_records = (
    records
    .orderBy("update_id")
    .limit(10)
    .select("update_id", "user_id", split_sentences(column("text")).alias("sentences"))
    .cache()
)

split_records.collect()

[Row(update_id='00000000000000000000', user_id='4665560161', sentences=["Elinor wished that the same forbearance could have extended towards herself, but that was impossible, and she was necessarily drawn from the mother's description.", '#socialmedia #marketing #yolo']),
 Row(update_id='00000000000000000000', user_id='1000040647', sentences=['It did not suit her situation or feelings, I might have rejoiced in its termination.', '#tbt #fail #yolo']),
 Row(update_id='00000000000000000000', user_id='9086078734', sentences=['The furniture was in all probability have gained some news of them; and till we know that she ever should receive another so perfectly gratifying in the occasion and the style.', '#retweet #yolo #ff']),
 Row(update_id='00000000000000000001', user_id='3082369400', sentences=['After this period every appearance of equal permanency.', '#health']),
 Row(update_id='00000000000000000001', user_id='5902440326', sentences=['Her performance was pleasing, though by no means tir

We can explode each array into multiple rows to make further processing easier:

In [19]:
from pyspark.sql.functions import explode
# Produce one output row per sentence, keeping the identifying columns.
sentences = split_records.select(
    "update_id",
    "user_id",
    explode(column("sentences")).alias("sentence"),
)
sentences.show(truncate=False)

+--------------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|update_id           |user_id   |sentence                                                                                                                                                                        |
+--------------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|00000000000000000000|4665560161|Elinor wished that the same forbearance could have extended towards herself, but that was impossible, and she was necessarily drawn from the mother's description.              |
|00000000000000000000|4665560161|#socialmedia #marketing #yolo                                                                                              

In [20]:
from pyspark.sql.types import FloatType

# Names of the score fields, in the order they appear in the result struct.
sentiment_fields = "pos neg neu compound".split()
# Result schema: one nullable float per score field.
sentiment_structure = StructType(
    [StructField(name, FloatType(), True) for name in sentiment_fields]
)

# Broadcast the analyzer created earlier so the UDF below can read it via `.value`.
analyzer_bcast = spark.sparkContext.broadcast(analyzer)

def vader_impl(s):
    """Score one piece of text with the broadcast VADER analyzer.

    Args:
        s: the text to score, or None.

    Returns:
        A list of scores ordered like `sentiment_fields`, or None when `s`
        is None.
    """
    # Spark hands SQL NULL values to Python UDFs as None. Propagate the null
    # rather than asking VADER to score a non-string.
    if s is None:
        return None
    va = analyzer_bcast.value
    result = va.polarity_scores(s)
    # Emit values in `sentiment_fields` order so they line up with the struct schema.
    return [result[key] for key in sentiment_fields]

# Wrap the scorer as a Spark UDF that returns a struct described by sentiment_structure.
sentiment_score = udf(vader_impl, sentiment_structure)

In [21]:
# Score every sentence and display the scores next to the identifying columns.
sentences.select(
    "update_id",
    "user_id",
    "sentence",
    sentiment_score(column("sentence")),
).show()

+--------------------+----------+--------------------+--------------------+
|           update_id|   user_id|            sentence|vader_impl(sentence)|
+--------------------+----------+--------------------+--------------------+
|00000000000000000000|4665560161|Elinor wished tha...|   [0.0,0.0,1.0,0.0]|
|00000000000000000000|4665560161|#socialmedia #mar...|   [0.0,0.0,1.0,0.0]|
|00000000000000000000|1000040647|It did not suit h...|[0.188,0.0,0.812,...|
|00000000000000000000|1000040647|    #tbt #fail #yolo|   [0.0,0.0,1.0,0.0]|
|00000000000000000000|9086078734|The furniture was...|[0.292,0.0,0.708,...|
|00000000000000000000|9086078734|  #retweet #yolo #ff|   [0.0,0.0,1.0,0.0]|
|00000000000000000001|3082369400|After this period...|   [0.0,0.0,1.0,0.0]|
|00000000000000000001|3082369400|             #health|   [0.0,0.0,1.0,0.0]|
|00000000000000000001|5902440326|Her performance w...|[0.252,0.146,0.60...|
|00000000000000000001|5902440326|#socialmedia #blo...|   [0.0,0.0,1.0,0.0]|
|00000000000