In [1]:
# Connection settings for the Kafka endpoint the generated tweets are sent to.
kafka_brokers = "vastdb-ingest:9093"  # bootstrap server(s) as host:port
topic = "streaming-demo"              # destination topic name

In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, lit, to_timestamp

# Spark configuration: the Kafka SQL connector plus a Log4j 2 / SLF4J 2 binding
# are requested as extra packages, the older SLF4J artifacts are excluded, and
# user-supplied jars are given precedence on both driver and executors.
conf = (
    SparkConf()
    .set(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3,"
        "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0,"
        "org.apache.logging.log4j:log4j-api:2.19.0,"
        "org.apache.logging.log4j:log4j-core:2.19.0",
    )
    .set("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12")
    .set("spark.driver.userClassPathFirst", "true")
    .set("spark.executor.userClassPathFirst", "true")
)

# Single local-mode session named "TweetGenerator" (reused if one already exists).
spark = (
    SparkSession.builder
    .master("local")
    .appName("TweetGenerator")
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
from random import randint, choice
import json
import time
import uuid

# Vocabulary the tweet text is assembled from: one phrase, one word and one
# hashtag are picked at random for every generated tweet.
words = [
    "awesome", "fantastic", "amazing", "incredible", "great",
    "wonderful", "lovely", "beautiful", "happy", "fun",
]
phrases = [
    "can't believe",
    "so excited about",
    "just discovered",
    "finally got",
    "totally in love with",
]
hashtags = [
    "SPARK",
    "SPARK ML",
    "SPARK STREAMING",
]

# Function to generate a random date in the past year
def random_date():
    """Return a random moment from the last 365 days as a UTC string.

    Returns:
        str: the moment rendered as "YYYY-MM-DD HH:MM:SS Z" (the trailing
        "Z" is a literal character, the time itself is UTC via ``gmtime``).
    """
    year_ms = 365 * 24 * 60 * 60 * 1000

    # Start of the window: exactly one (365-day) year before "now", in ms.
    window_start_ms = int(round(time.time() * 1000)) - year_ms
    # Uniform offset into the window; both ends are inclusive.
    picked_ms = int(window_start_ms + randint(0, year_ms))

    # gmtime() wants seconds, so scale the millisecond value back down.
    picked_utc = time.gmtime(picked_ms / 1000)
    return time.strftime("%Y-%m-%d %H:%M:%S Z", picked_utc)

# Function to generate a random tweet
def generate_tweet():
    """Build one synthetic tweet as a plain, JSON-serialisable dict.

    Returns:
        dict: with the keys
            "text"       - sentence built from a random phrase, word and hashtag,
            "created_at" - random UTC time from ``random_date()`` rendered in
                           the Twitter layout, e.g. "Wed Oct 10 20:19:24 +0000 2018",
            "id"         - random integer in [1, 2**63 - 1],
            "id_str"     - the same id as a string.
    """
    tweet = {}
    tweet["text"] = f"{choice(phrases)} how {choice(words)} {choice(hashtags)} is!"

    # The date is formatted in plain Python: pyspark's to_timestamp() only
    # builds a lazy Column expression, so it cannot yield a value for the JSON
    # payload. random_date() returns "YYYY-MM-DD HH:MM:SS Z" in UTC, which is
    # parsed here and re-rendered with an explicit +0000 offset.
    # NOTE: %a / %b follow the process locale (English names unless changed).
    created = time.strptime(random_date(), "%Y-%m-%d %H:%M:%S Z")
    tweet["created_at"] = time.strftime("%a %b %d %H:%M:%S +0000 %Y", created)

    tweet["id"] = randint(1, 9223372036854775807)  # Random long value for tweet id
    tweet["id_str"] = str(tweet["id"])
    return tweet

# Produce 11 tweets and wrap each one as a Kafka-style record: the string id
# becomes the message key, the JSON-encoded tweet becomes the message value.
tweets = [
    dict(key=tweet["id_str"], value=json.dumps(tweet))
    for tweet in (generate_tweet() for _ in range(11))
]

In [4]:
# Create DataFrame from tweets.
# The schema is stated explicitly instead of being inferred from the dicts:
# this pins both columns to string and keeps the call working even when the
# record list is empty (inference has nothing to look at in that case).
df = spark.createDataFrame(tweets, schema="key string, value string")

df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [5]:
# Preview the records: up to 20 rows, with long cell values truncated.
df.show(n=20, truncate=True)

+-------------------+--------------------+
|                key|               value|
+-------------------+--------------------+
|4292053126331771219|{"text": "just di...|
|2555723666789404836|{"text": "so exci...|
|8675334356151675105|{"text": "finally...|
|1564043520580363040|{"text": "so exci...|
|6282775659133243706|{"text": "can't b...|
|3796718732763794232|{"text": "so exci...|
|2611660519021234635|{"text": "finally...|
| 513802857198179829|{"text": "totally...|
|2386229353206424825|{"text": "can't b...|
|7348020505670878482|{"text": "just di...|
|4463373452635096452|{"text": "just di...|
+-------------------+--------------------+



In [6]:
# Batch-write the records to Kafka: the DataFrame's "key" and "value" columns
# are sent to the configured topic on the configured brokers.
kafka_sink_options = {
    "kafka.bootstrap.servers": kafka_brokers,
    "topic": topic,
}
(
    df.write
    .format("kafka")
    .options(**kafka_sink_options)
    .mode("append")
    .save()
)

# Release the Spark session now that the write has completed.
spark.stop()