In [1]:
import os
import time
import json
from random import randint, choice
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Host name or IP address used to reach the Kafka broker, read from the
# DOCKER_HOST_OR_IP environment variable. When the variable is unset or empty,
# fall back to "localhost" so the broker address never becomes "None:19092".
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP") or "localhost"
print(f"{DOCKER_HOST_OR_IP=}")

# Kafka bootstrap address (port 19092 on the host above) and the topic the
# generated tweets are written to.
kafka_brokers = f'{DOCKER_HOST_OR_IP}:19092'
topic = 'streaming-demo-2'


DOCKER_HOST_OR_IP='10.143.11.241'


In [2]:
# Spark settings for this notebook:
#   * spark.jars.packages - Kafka SQL connector plus the Log4j 2 artifacts,
#     given as a comma-separated list of Maven coordinates.
#   * spark.jars.excludes - slf4j artifacts excluded from the resolved packages.
#   * userClassPathFirst  - enabled for both the driver and the executors.
#   * Arrow-based PySpark conversion is switched off.
conf = SparkConf().setAll([
    (
        "spark.jars.packages",
        ",".join([
            "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3",
            "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0",
            "org.apache.logging.log4j:log4j-api:2.19.0",
            "org.apache.logging.log4j:log4j-core:2.19.0",
        ]),
    ),
    ("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12"),
    ("spark.driver.userClassPathFirst", "true"),
    ("spark.executor.userClassPathFirst", "true"),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
])

# Create (or reuse) a local-mode session named "TweetGenerator" with the
# configuration above.
spark = (
    SparkSession.builder
    .master("local")
    .appName("TweetGenerator")
    .config(conf=conf)
    .getOrCreate()
)

# Vocabulary used to assemble the text of each generated tweet.

# Adjectives.
words = [
    "awesome",
    "fantastic",
    "amazing",
    "incredible",
    "great",
    "wonderful",
    "lovely",
    "beautiful",
    "happy",
    "fun",
]

# Sentence openers.
phrases = [
    "can't believe",
    "so excited about",
    "just discovered",
    "finally got",
    "totally in love with",
]

# Topic tags mentioned in the tweet text.
hashtags = [
    "SPARK",
    "SPARK ML",
    "SPARK STREAMING",
]

# Function to generate a random tweet
def generate_tweet():
    """Build one random tweet as a plain dict.

    Returns:
        dict with the keys, in this order:
            "text"       - sentence assembled from a random phrase, word and hashtag
            "created_at" - current time in milliseconds since the epoch (int)
            "id"         - random integer between 1 and 9223372036854775807
            "id_str"     - the same id rendered as a string
    """
    # Draw the random parts in the same order the sentence uses them.
    opener, adjective, tag = choice(phrases), choice(words), choice(hashtags)
    created_ms = int(time.time() * 1000)
    tweet_id = randint(1, 9223372036854775807)  # Random long value for tweet id
    return {
        "text": f"{opener} how {adjective} {tag} is!",
        "created_at": created_ms,
        "id": tweet_id,
        "id_str": str(tweet_id),
    }

In [3]:
import json
import time
from contextlib import contextmanager

class TweetProducer:
    """Generates random tweets and appends them to a Kafka topic through Spark.

    Each batch is built with ``generate_tweet()``, turned into a two-column
    (key, value) DataFrame and written with Spark's "kafka" data source.
    """

    def __init__(self, spark, kafka_brokers, topic):
        """Store the Spark session and the Kafka destination.

        Args:
            spark: Spark session used to create and write the DataFrames.
            kafka_brokers: value passed as "kafka.bootstrap.servers".
            topic: name of the Kafka topic to write to.
        """
        self.spark = spark
        self.kafka_brokers = kafka_brokers
        self.topic = topic
        # Running total of tweets written so far.
        self.tweet_count = 0
        # Loop flag checked by run(); cleared by stop().
        self.active = True

    def generate_batch(self, batch_size=5):
        """Generate ``batch_size`` tweets and append them to the Kafka topic.

        Args:
            batch_size: number of tweets to produce. Zero is a no-op.

        Raises:
            ValueError: if ``batch_size`` is negative.
        """
        if batch_size < 0:
            raise ValueError(f"batch_size must be non-negative, got {batch_size}")
        if batch_size == 0:
            # Nothing to write; an empty list would give Spark no rows to
            # infer a schema from.
            return

        # Kafka record: the tweet id string is the key, the JSON-encoded tweet
        # is the value.
        tweets = [
            {"key": tweet["id_str"], "value": json.dumps(tweet)}
            for tweet in (generate_tweet() for _ in range(batch_size))
        ]

        df = self.spark.createDataFrame(tweets)

        df.write \
          .format("kafka") \
          .option("kafka.bootstrap.servers", self.kafka_brokers) \
          .option("topic", self.topic) \
          .mode("append") \
          .save()

        self.tweet_count += batch_size
        # "\r" keeps the counter on a single, continuously overwritten line.
        print(f"Produced: {self.tweet_count} tweets", end="\r")

    def run(self, batch_size=5, interval=1):
        """Produce batches in a loop until ``stop()`` clears ``self.active``.

        Args:
            batch_size: tweets produced per iteration (default 5).
            interval: seconds to sleep between batches (default 1).

        Errors derived from ``Exception`` are printed and end the loop without
        being re-raised. The summary lines are always printed on exit.
        """
        try:
            while self.active:
                self.generate_batch(batch_size)
                time.sleep(interval)
        except Exception as e:
            print(f"\nError during tweet generation: {e}")
        finally:
            print("\nTweet generation stopped")
            print(f"Total tweets produced: {self.tweet_count}")

    def stop(self):
        """Ask the run() loop to finish after its current iteration."""
        self.active = False

@contextmanager
def tweet_producer_session(spark, kafka_brokers, topic):
    """Context manager that yields a TweetProducer and stops it on exit.

    Args:
        spark: Spark session handed to the producer.
        kafka_brokers: Kafka bootstrap servers string handed to the producer.
        topic: Kafka topic name handed to the producer.

    Yields:
        The newly created TweetProducer.
    """
    session_producer = TweetProducer(
        spark=spark,
        kafka_brokers=kafka_brokers,
        topic=topic,
    )
    try:
        yield session_producer
    finally:
        # Executed on normal exit and when the with-body raises.
        session_producer.stop()

# Main execution: produce tweets until the producer stops or the kernel is
# interrupted.
try:
    # The context manager calls producer.stop() when the with-block exits,
    # including when an exception propagates out of it.
    with tweet_producer_session(spark, kafka_brokers, topic) as producer:
        producer.run()
except KeyboardInterrupt:
    # run() only handles Exception subclasses, so a keyboard interrupt
    # propagates out of it and is reported here.
    print("\nShutdown initiated...")

Produced: 15 tweets
Tweet generation stopped
Total tweets produced: 15

Shutdown initiated...
