In [49]:
import os

# Host name or IP address of the machine exposing the Kafka broker, taken from
# the DOCKER_HOST_OR_IP environment variable. os.getenv returns None when the
# variable is unset, which would later be formatted into a broker address of
# "None:<port>"; fall back to "localhost" when it is unset or empty instead.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP") or "localhost"
# Echo the resolved value so the address actually in use is visible in the output.
print(f"{DOCKER_HOST_OR_IP=}")

DOCKER_HOST_OR_IP='10.143.11.241'


In [50]:
# Kafka settings shared by the cells below.
# Topic that receives the generated tweets.
topic = 'streaming-demo-2'
# Bootstrap server address: the configured host plus the broker port 19092.
kafka_brokers = f'{DOCKER_HOST_OR_IP}:19092'

In [51]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, lit, to_timestamp
import pandas as pd
# Widen pandas' column display so the JSON payloads are not truncated too early.
pd.set_option("max_colwidth", 150)

# Maven coordinates passed to spark.jars.packages: the Kafka SQL connector plus
# Log4j 2 artifacts.
# NOTE(review): the Log4j jars and the slf4j excludes below presumably work
# around a logging-binding conflict - confirm before changing versions.
spark_packages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3",
    "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0",
    "org.apache.logging.log4j:log4j-api:2.19.0",
    "org.apache.logging.log4j:log4j-core:2.19.0",
]
# Coordinates passed to spark.jars.excludes.
spark_excludes = [
    "org.slf4j:slf4j-api",
    "org.slf4j:slf4j-log4j12",
]

conf = SparkConf().setAll([
    ("spark.jars.packages", ",".join(spark_packages)),
    ("spark.jars.excludes", ",".join(spark_excludes)),
    # Ask Spark to give user-supplied jars precedence on driver and executors.
    ("spark.driver.userClassPathFirst", "true"),
    ("spark.executor.userClassPathFirst", "true"),
    # Arrow-based conversion is switched off for pandas interop (e.g. toPandas()).
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
])

# Create the session (or reuse an existing one) with the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("TweetGenerator")
    .config(conf=conf)
    .getOrCreate()
)

# Last expression of the cell: display the SparkContext.
spark.sparkContext

In [52]:
from random import randint, choice
import json
import time
import uuid

# Vocabulary used to assemble the synthetic tweet text.
words = ["awesome", "fantastic", "amazing", "incredible", "great", "wonderful", "lovely", "beautiful", "happy", "fun"]
phrases = ["can't believe", "so excited about", "just discovered", "finally got", "totally in love with"]
hashtags = ["SPARK", "SPARK ML", "SPARK STREAMING"]

# Upper bound for generated tweet ids: the largest signed 64-bit integer
# (9223372036854775807), so ids always fit in a "long".
MAX_TWEET_ID = 2**63 - 1


def generate_tweet(rng=None):
    """Build one synthetic tweet as a plain dict.

    Args:
        rng: Optional object exposing ``choice`` and ``randint`` (for example a
            seeded ``random.Random``). When omitted, the module-level
            ``choice`` and ``randint`` imported above are used, so calling
            ``generate_tweet()`` behaves exactly as before. Passing a seeded
            instance makes the text and id reproducible.

    Returns:
        dict with the keys, in this order:
            ``text``       - "<phrase> how <word> <hashtag> is!"
            ``created_at`` - current time in whole milliseconds since the epoch
            ``id``         - random integer in [1, MAX_TWEET_ID]
            ``id_str``     - ``id`` rendered as a string
    """
    use_rng = rng is not None
    pick = rng.choice if use_rng else choice
    draw_int = rng.randint if use_rng else randint

    # Draw order (phrase, word, hashtag, then id) matches the original code so a
    # globally seeded `random` produces the same sequence as before.
    text = f"{pick(phrases)} how {pick(words)} {pick(hashtags)} is!"
    created_at = int(time.time() * 1000)
    tweet_id = draw_int(1, MAX_TWEET_ID)

    return {
        "text": text,
        "created_at": created_at,
        "id": tweet_id,
        "id_str": str(tweet_id),
    }

# Step 1: generate ten tweets up front.
generated_tweets = [generate_tweet() for _ in range(10)]

# Step 2: wrap each tweet as a record with a string "key" (the tweet's id_str)
# and a "value" holding the whole tweet serialised as JSON.
tweets = [
    {"key": generated["id_str"], "value": json.dumps(generated)}
    for generated in generated_tweets
]

In [53]:
# Create the DataFrame from the tweet records with an explicit schema instead
# of letting Spark infer it from the dicts: the column order and types are then
# fixed by this cell, and an empty `tweets` list still yields a valid (empty)
# DataFrame rather than a schema-inference error.
df = spark.createDataFrame(tweets, schema="key STRING, value STRING")

# Show the resulting columns and types.
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [54]:
# Collect the Spark DataFrame into a pandas DataFrame so the notebook renders
# the key/value rows as a table.
df.toPandas()

Unnamed: 0,key,value
0,427431201886804724,"{""text"": ""can't believe how awesome SPARK ML is!"", ""created_at"": 1736856181853, ""id"": 427431201886804724, ""id_str"": ""427431201886804724""}"
1,3037795565582831546,"{""text"": ""finally got how wonderful SPARK ML is!"", ""created_at"": 1736856181853, ""id"": 3037795565582831546, ""id_str"": ""3037795565582831546""}"
2,6280810761316908011,"{""text"": ""so excited about how wonderful SPARK is!"", ""created_at"": 1736856181853, ""id"": 6280810761316908011, ""id_str"": ""6280810761316908011""}"
3,4335145873731869316,"{""text"": ""finally got how fantastic SPARK is!"", ""created_at"": 1736856181853, ""id"": 4335145873731869316, ""id_str"": ""4335145873731869316""}"
4,5484066861208472072,"{""text"": ""totally in love with how fantastic SPARK is!"", ""created_at"": 1736856181853, ""id"": 5484066861208472072, ""id_str"": ""5484066861208472072""}"
5,9087417459438598094,"{""text"": ""can't believe how awesome SPARK ML is!"", ""created_at"": 1736856181853, ""id"": 9087417459438598094, ""id_str"": ""9087417459438598094""}"
6,1032723330297491998,"{""text"": ""finally got how amazing SPARK ML is!"", ""created_at"": 1736856181853, ""id"": 1032723330297491998, ""id_str"": ""1032723330297491998""}"
7,6891567505696657966,"{""text"": ""can't believe how beautiful SPARK is!"", ""created_at"": 1736856181853, ""id"": 6891567505696657966, ""id_str"": ""6891567505696657966""}"
8,33634797300494019,"{""text"": ""just discovered how great SPARK ML is!"", ""created_at"": 1736856181853, ""id"": 33634797300494019, ""id_str"": ""33634797300494019""}"
9,8585735133437743944,"{""text"": ""finally got how happy SPARK STREAMING is!"", ""created_at"": 1736856181853, ""id"": 8585735133437743944, ""id_str"": ""8585735133437743944""}"


In [55]:
# Send the rows of df to the Kafka topic: configure a writer for the "kafka"
# format pointed at the configured brokers and topic, in append mode ...
kafka_writer = (
    df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_brokers)
    .option("topic", topic)
    .mode("append")
)
# ... then perform the write.
kafka_writer.save()

In [56]:
# Uncomment the line below to stop the SparkSession once you are done with the notebook.
# spark.stop()