In [1]:
# Confirm local setup by checking SparkContext: evaluating the bare name
# makes the notebook display it as the cell output.
# NOTE(review): `sc` is not defined in any visible cell - presumably it is
# injected by the PySpark kernel/shell; confirm, otherwise this cell raises
# NameError on a fresh kernel.
sc

In [2]:
# initialize spark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Import only the function that is used: a wildcard import of
# pyspark.sql.functions shadows Python builtins (e.g. sum, max, min, round)
# for every later cell that runs in this kernel.
from pyspark.sql.functions import to_timestamp

spark = (
    SparkSession.builder
    .appName("StructruedKafkaRead")
    .getOrCreate()
)

# Subscribe to 1 topic, starting from the earliest offset
tweetStream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "TWEETS2")
    .option("startingOffsets", "earliest")
    .load()
)

# Keep the record timestamp and the payload cast to a string, then expose the
# timestamp a second time under the name eventTime.
tweetStream = tweetStream.selectExpr("timestamp", "CAST(value AS STRING)")
tweetStream = tweetStream.withColumn("eventTime", to_timestamp("timestamp"))

# Print each micro-batch, untruncated, to the console every 10 seconds
query = (
    tweetStream.writeStream
    .trigger(processingTime="10 seconds")
    .outputMode("append")
    .option("truncate", "False")
    .format("console")
    .start()
)

# Blocks this cell until the query stops (or the kernel is interrupted)
query.awaitTermination()


21/11/13 12:41:56 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/_t/lpyv0_x57m1gk04y_mc3m_q80000gn/T/temporary-e08c4a5c-3012-4397-a91b-7f5a365c17c1. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|timestamp              |value                                                                                                                                            |eventTime              |
+-----------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|2021-11-13 12:10:37.073|RT @rssurjewala: India’s biggest ever “Bitcoin Scam Coverup” under the Karnataka BJP Government!

Why were Interpol/NIA/ED/SFIO kept in the…     |2021-11-13 12:10:37.073|
|2021-11-13 12:10:37.1  |RT @mistabill: Bitcoin Creator Satoshi Nakamoto Could Be Unmasked at Florida Trial - WSJ https

KeyboardInterrupt: 

In [None]:
# initialize spark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import StopWordsRemover, Tokenizer

spark = SparkSession.builder.appName("StructruedKafkaRead").getOrCreate()

# Subscribe to ElonTweets topic where tweets are being sent from TweetStreamer.py,
# keeping only the record timestamp and the payload cast to a string
tweetStream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "ElonTweets")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("timestamp", "CAST(value AS STRING)")
)

# Tokenize first: StopWordsRemover expects an array of strings as its input
tokenizer = Tokenizer(inputCol="value", outputCol="tweetWords")
tweetTerms = tokenizer.transform(tweetStream)

# Strip stop words from tweetWords into a new filteredTweetWords column
stopRemover = StopWordsRemover(inputCol="tweetWords", outputCol="filteredTweetWords")
filteredTerms = stopRemover.transform(tweetTerms)

import re
from pyspark.sql.types import ArrayType, StringType

## UDF to clean tweet by removing hashtags, mentions, and links before performing analysis
def removeRegex(tokens: list) -> list:
    """Drop every token that contains a mention, hashtag, or link.

    Args:
        tokens: Tweet tokens (strings). ``None`` is treated as an empty
            list so a null column value does not raise inside the UDF.

    Returns:
        A new list holding the remaining non-empty tokens in their original
        order.
    """
    if tokens is None:
        return []
    # Raw strings keep \s and \. as regex escapes instead of (invalid) Python
    # string escapes. The mention class is [A-Za-z0-9_]; the previous
    # "0-a9" typo formed a 0..a range that also matched : ; < = > ? @ [ \ ] ^ `.
    expression = (r'(@[A-Za-z0-9_]+)|(#[A-Za-z0-9_]+)|'
                  r'(https?://[^\s<>"]+|www\.[^\s<>"]+)')
    regex = re.compile(expression)
    # search (not match): a token is dropped if the pattern occurs anywhere in it
    return [t for t in tokens if t and not regex.search(t)]

removeRegexUDF = F.udf(removeRegex, ArrayType(StringType()))

# Filter mentions/hashtags/links while filteredTweetWords is still an array of
# tokens (removeRegex iterates its argument token by token; given a joined
# string it would iterate single characters instead).
filteredTerms = filteredTerms.withColumn("filteredTweetWords", removeRegexUDF(filteredTerms["filteredTweetWords"]))
# Join the remaining tokens back into one space-separated string, the input
# expected by the TextBlob library for sentiment analysis.
filteredTerms = filteredTerms.withColumn("filteredTweetWords", F.concat_ws(" ", "filteredTweetWords"))
filteredTerms = filteredTerms.selectExpr("timestamp", "filteredTweetWords")

from textblob import TextBlob
from pyspark.sql.types import FloatType
# Sentiment comes from TextBlob's polarity score (subjectivity is ignored)
# and is collapsed into three classes: 0.0, 1.0 or 2.0.
def getSentiment(tweet):
    """Classify a tweet's sentiment from its TextBlob polarity.

    Returns 0.0 when the polarity is exactly zero (neutral), 1.0 when it is
    positive and 2.0 otherwise (negative).
    """
    polarity = TextBlob(tweet).sentiment.polarity
    if polarity == 0.0:
        return 0.0  # neutral
    # positive -> 1.0, everything else -> 2.0
    return 1.0 if polarity > 0.0 else 2.0
    
    
# Wrap getSentiment directly; the previous lambda called it through an
# undefined name (`obj`), which raises NameError when the UDF runs.
sentiment_score_udf = F.udf(getSentiment, FloatType())

tweetSentiment = filteredTerms.withColumn("value", sentiment_score_udf(filteredTerms["filteredTweetWords"]))
# Keep timestamp as well as value: the Kafka record key below is built from it.
tweetSentiment = tweetSentiment.selectExpr("timestamp", "value")

# Send to kafka topic - ElonSentiment which is picked up by LogStash
# The chain is parenthesised instead of using backslash continuations, which
# break on trailing whitespace or on a comment placed inside the chain.
# No trigger is set, so the default trigger applies; add
# .trigger(processingTime="10 seconds") before .start() to batch every 10 s.
# NOTE(review): the Kafka sink needs a checkpoint location - either
# .option("checkpointLocation", ...) here or spark.sql.streaming.checkpointLocation
# in the session config; confirm one of them is set in this environment.
query = (
    tweetSentiment
    .selectExpr("CAST(timestamp AS STRING) AS key", "CAST(value AS STRING)")
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("topic", "ElonSentiment")
    .start()
)

# Blocks this cell until the query stops (or the kernel is interrupted)
query.awaitTermination()
