# Introduction

This notebook uses a pretrained sparknlp model to extract entities from tweets to create a word cloud. The word cloud will contain the entities in the last 60 seconds and update every 10 seconds.

# Imports

In [5]:
from datetime import datetime, timedelta
from IPython.display import display, clear_output
import time
from itertools import chain
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from pyspark.sql import (
    SparkSession,
    functions as F
)
from pyspark.sql.types import (
    StringType,
    StructType,
    StructField,
    IntegerType,
    LongType,
    BooleanType
)
import sparknlp
from sparknlp import Finisher
from pyspark.ml import (
    Pipeline
)
from sparknlp.pretrained import PretrainedPipeline

from wordcloud import (
    WordCloud, 
    STOPWORDS, 
    ImageColorGenerator
)

from src.producers.twitter_kafka_producer import TwitterStreamer

In [6]:
spark = sparknlp.start()
print(f'sparknlp.version(): {sparknlp.version()}')
print(f'spark.version: {spark.version}')

sparknlp.version(): 3.1.0
spark.version: 3.1.1


# Set up directories

In [7]:
project_dir = Path.cwd().parent
models_dir = project_dir / 'models'
pretrained_models_dir = models_dir / 'pretrained'
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
processed_data_dir = data_dir / 'processed'

# Load pretrained pipeline

In [9]:
model_name = 'recognize_entities_dl'
pretrained_pipeline = PretrainedPipeline(name=model_name)

recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [10]:
# A Finisher is required to extract the ArrayColumn `entities`
finisher = Finisher().setInputCols(['entities'])
pipeline = Pipeline().setStages([
    pretrained_pipeline.model,
    finisher
])

# Read stream

In [23]:
tweet_schema = StructType([
    StructField('created_at', StringType(), True),
    StructField('id', LongType(), True),
    StructField('text', StringType(), True),
    StructField('is_quote_status', BooleanType(), True),
    StructField('in_reply_to_user_id', LongType(), True),
    StructField('user', StructType([
        StructField('id', LongType(), True),
        StructField('followers_count', IntegerType(), True),
        StructField('friends_count', IntegerType(), True),
        StructField('created_at', StringType(), True)
    ])),
    StructField('extended_tweet', StructType([
        StructField('full_text', StringType(), True)
    ])),
    StructField('retweeted_status', StructType([
        StructField('id', LongType(), True)
    ])),
    StructField('retweet_count', IntegerType(), True),
    StructField('favorite_count', IntegerType(), True),
    StructField('quote_count', IntegerType(), True),
    StructField('reply_count', IntegerType(), True)
])

In [24]:
stream_df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "broker:29092") \
  .option("startingOffsets", "earliest") \
  .option("subscribe", "twitterdata") \
  .load()

In [25]:
json_stream_df = (
    stream_df
    # Convert the key and value from binary to StringType
    .withColumn('key', stream_df['key'].cast(StringType()))
    .withColumn('value', stream_df['value'].cast(StringType()))
    # Assign fields to JSON
    .withColumn('value', F.from_json('value', tweet_schema))
)

In [26]:
json_stream_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- created_at: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- text: string (nullable = true)
 |    |-- is_quote_status: boolean (nullable = true)
 |    |-- in_reply_to_user_id: long (nullable = true)
 |    |-- user: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- followers_count: integer (nullable = true)
 |    |    |-- friends_count: integer (nullable = true)
 |    |    |-- created_at: string (nullable = true)
 |    |-- extended_tweet: struct (nullable = true)
 |    |    |-- full_text: string (nullable = true)
 |    |-- retweeted_status: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |-- retweet_count: integer (nullable = true)
 |    |-- favorite_count: integer (nullable = true)
 |    |-- quote_count: integer (nullable = true)
 |    |-- reply_count: integer (nullable = true)
 |-- topic: string (nullable = true)
 |-- pa

In [27]:
tweet_stream_df = (
    json_stream_df
    .select('timestamp',
            'value.created_at',
            'value.text',
            'value.extended_tweet.full_text')
    .withWatermark("timestamp", "1 minutes")
#     .filter(F.col('timestamp') > datetime.now() - timedelta(seconds=60)) # This doesn't seem to restrict the data to the last 60 seconds because the count keeps on growing.
#     .groupBy(F.window(json_stream_df.timestamp, window_duration, slide_duration))
)

In [28]:
tweet_stream_df.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- created_at: string (nullable = true)
 |-- text: string (nullable = true)
 |-- full_text: string (nullable = true)



In [29]:
tweet_stream = (
    tweet_stream_df
    .writeStream
    .format('memory')
    .queryName('tweet_view')
    .start()
)

# Stream to parquet

In [30]:
parquet_path = processed_data_dir / 'ner_parquet'
checkpoint_path = processed_data_dir / 'ner_checkpoint'

In [31]:
parquet_stream = (
    tweet_stream_df
    .writeStream
    .option('path', parquet_path.as_posix())
    .outputMode('append')
    .option('checkpointLocation', checkpoint_path.as_posix())
    .start()
)

Py4JJavaError: An error occurred while calling o470.start.
: java.io.IOException: mkdir of /data/processed/ner_parquet/_spark_metadata failed
	at org.apache.hadoop.fs.FileSystem.primitiveMkdir(FileSystem.java:1280)
	at org.apache.hadoop.fs.DelegateToFileSystem.mkdir(DelegateToFileSystem.java:183)
	at org.apache.hadoop.fs.FilterFs.mkdir(FilterFs.java:212)
	at org.apache.hadoop.fs.FileContext$4.next(FileContext.java:804)
	at org.apache.hadoop.fs.FileContext$4.next(FileContext.java:800)
	at org.apache.hadoop.fs.FSLinkResolver.resolve(FSLinkResolver.java:90)
	at org.apache.hadoop.fs.FileContext.mkdir(FileContext.java:807)
	at org.apache.spark.sql.execution.streaming.FileContextBasedCheckpointFileManager.mkdirs(CheckpointFileManager.scala:309)
	at org.apache.spark.sql.execution.streaming.HDFSMetadataLog.<init>(HDFSMetadataLog.scala:64)
	at org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog.<init>(CompactibleFileStreamLog.scala:47)
	at org.apache.spark.sql.execution.streaming.FileStreamSinkLog.<init>(FileStreamSinkLog.scala:86)
	at org.apache.spark.sql.execution.streaming.FileStreamSink.<init>(FileStreamSink.scala:141)
	at org.apache.spark.sql.execution.datasources.DataSource.createSink(DataSource.scala:330)
	at org.apache.spark.sql.streaming.DataStreamWriter.createV1Sink(DataStreamWriter.scala:484)
	at org.apache.spark.sql.streaming.DataStreamWriter.startInternal(DataStreamWriter.scala:453)
	at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:301)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [246]:
parquet_stream.stop()

# Create word cloud

In [32]:
while True:
    query = """
    SELECT 
      CASE WHEN ISNULL(full_text) THEN text 
           WHEN CHAR_LENGTH(text) > CHAR_LENGTH(full_text) THEN text ELSE full_text
      END as text
    FROM tweet_view
    WHERE timestamp > (CURRENT_TIMESTAMP() - INTERVAL 60 seconds)
    """
    tweet_df = spark.sql(query)
    clear_output(wait=True)
#     display(tweet_df.show(20))

    # Extract entities
    df_entities = pipeline.fit(tweet_df).transform(tweet_df)
    
    # Create word cloud: it takes about 10-15 seconds to process ~1100 tweets (60 seconds' worth of Tweets)
    # into an NER word cloud.
    # Most of the time goes into transforming into pandas dataframe
    # It would probably be faster to use spark to create the Counter, but don't know how.
    pdf_entities = df_entities.select('finished_entities').toPandas()
    
    # This line transforms a column of lists of entities into a Counter
    c = Counter(chain.from_iterable(x for x in pdf_entities.finished_entities if len(x) > 0))
    plt.figure(figsize=(15, 8), facecolor='k')
    wordcloud = WordCloud(width=1200, height=600).generate_from_frequencies(c)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
    print(f'Number of tweets: {tweet_df.count()}')
    time.sleep(1)

ValueError: We need at least 1 word to plot a word cloud, got 0.

<Figure size 1080x576 with 0 Axes>