In [1]:
# Kafka connection settings used by the write step further down.
kafka_brokers = "se-var-vastdb-ingest:19092"  # bootstrap server as host:port
topic = "streaming-demo"  # destination topic for the generated tweets

In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, lit, to_timestamp

# Spark configuration: request the Kafka SQL connector and Log4j 2 jars,
# exclude the listed SLF4J artifacts, and turn on userClassPathFirst for
# both the driver and the executors.
# NOTE(review): the "_2.13" / "3.4.3" coordinates presumably need to match the
# Scala build and version of the installed Spark -- confirm.
conf = (
    SparkConf()
    .set(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3,"
        "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0,"
        "org.apache.logging.log4j:log4j-api:2.19.0,"
        "org.apache.logging.log4j:log4j-core:2.19.0",
    )
    .set("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12")
    .set("spark.driver.userClassPathFirst", "true")
    .set("spark.executor.userClassPathFirst", "true")
)

# Create the session (or reuse an existing one) with a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("TweetGenerator")
    .config(conf=conf)
    .getOrCreate()
)

# Bare last expression so the notebook displays the SparkContext.
spark.sparkContext

In [3]:
from random import randint, choice
import json
import time
import uuid

# Vocabulary the tweet generator samples from.
words = [
    "awesome", "fantastic", "amazing", "incredible", "great",
    "wonderful", "lovely", "beautiful", "happy", "fun",
]
phrases = [
    "can't believe",
    "so excited about",
    "just discovered",
    "finally got",
    "totally in love with",
]
hashtags = ["SPARK", "SPARK ML", "SPARK STREAMING"]


def generate_tweet():
    """Build one random tweet as a plain dict.

    Returns:
        dict with keys, in this order:
          - "text": "<phrase> how <word> <hashtag> is!" with each part drawn
            at random from ``phrases``, ``words`` and ``hashtags``.
          - "created_at": current UTC time formatted "%Y-%m-%d %H:%M:%S Z".
          - "id": random integer in [1, 9223372036854775807].
          - "id_str": the same id rendered as a string.
    """
    # Draw order (phrase, word, hashtag, then id) is kept so a seeded
    # random generator yields the same tweet as before.
    text = f"{choice(phrases)} how {choice(words)} {choice(hashtags)} is!"
    created_at = time.strftime("%Y-%m-%d %H:%M:%S Z", time.gmtime())
    tweet_id = randint(1, 9223372036854775807)
    return {
        "text": text,
        "created_at": created_at,
        "id": tweet_id,
        "id_str": str(tweet_id),
    }

# Produce 11 Kafka-style records: the key is the tweet's string id and the
# value is the whole tweet serialised as JSON.
tweets = [
    {"key": generated["id_str"], "value": json.dumps(generated)}
    for generated in (generate_tweet() for _ in range(11))
]

In [4]:
# Create DataFrame from tweets.
# The schema is declared explicitly instead of being inferred from the dicts:
# inference fails on an empty list, and a declared schema pins both columns to
# string, which is what the Kafka write below is given as key and value.
df = spark.createDataFrame(tweets, schema="key string, value string")

df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [5]:
# Preview the records; these arguments spell out show()'s defaults
# (at most 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+-------------------+--------------------+
|                key|               value|
+-------------------+--------------------+
|2650335822288723984|{"text": "finally...|
|5780384821381590520|{"text": "just di...|
|6131546600826454432|{"text": "can't b...|
|5067458675131736321|{"text": "finally...|
|5524828952239728309|{"text": "can't b...|
|6302052410047609796|{"text": "so exci...|
|3584088957218351083|{"text": "so exci...|
|4036443486884828215|{"text": "just di...|
|8947696289899628888|{"text": "finally...|
|7774630392055017433|{"text": "so exci...|
|5701966646253446429|{"text": "totally...|
+-------------------+--------------------+



In [6]:
# Publish every row of the DataFrame to the Kafka topic configured at the top
# of the notebook, using Spark's batch Kafka writer in append mode.
(
    df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_brokers)
    .option("topic", topic)
    .mode("append")
    .save()
)

# Shut the Spark session down now that the batch has been written.
spark.stop()