In [57]:
import os

# Host (name or IP) that the Kafka broker address is built from, read from the
# DOCKER_HOST_OR_IP environment variable. An unset or empty variable falls back
# to "localhost": os.getenv alone would return None, and the broker address
# derived from it would silently become the unusable "None:19092".
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP") or "localhost"
print(f"{DOCKER_HOST_OR_IP=}")

DOCKER_HOST_OR_IP='10.143.11.241'


In [58]:
# Kafka connection settings: the target topic, and the broker address formed
# from the configured host plus port 19092.
topic = "streaming-demo-2"
kafka_brokers = "{}:19092".format(DOCKER_HOST_OR_IP)

In [59]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, lit, to_timestamp
import pandas as pd
# Let pandas show up to 150 characters per column when a frame is displayed.
pd.set_option("max_colwidth", 150)

# Spark configuration, built in one expression (setAll returns the conf itself).
conf = SparkConf().setAll([
    (
        # Extra packages requested at start-up: the Spark SQL Kafka connector
        # and the Log4j 2 artifacts, joined into one comma-separated list.
        "spark.jars.packages",
        ",".join([
            "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3",
            "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0",
            "org.apache.logging.log4j:log4j-api:2.19.0",
            "org.apache.logging.log4j:log4j-core:2.19.0",
        ]),
    ),
    ("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12"),
    ("spark.driver.userClassPathFirst", "true"),
    ("spark.executor.userClassPathFirst", "true"),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
])

# Create the session (or reuse one that already exists) with a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("TweetGenerator")
    .config(conf=conf)
    .getOrCreate()
)

# Last expression of the cell: display the SparkContext summary.
spark.sparkContext

In [60]:
from random import randint, choice
import json
import time
import uuid

# Vocabulary from which the random tweet text is assembled
words = [
    "awesome",
    "fantastic",
    "amazing",
    "incredible",
    "great",
    "wonderful",
    "lovely",
    "beautiful",
    "happy",
    "fun",
]
phrases = [
    "can't believe",
    "so excited about",
    "just discovered",
    "finally got",
    "totally in love with",
]
hashtags = ["SPARK", "SPARK ML", "SPARK STREAMING"]


def generate_tweet():
    """Build one random tweet.

    Returns:
        dict: with the keys, in this order,
            "text"       - sentence made of a random phrase, word and hashtag,
            "created_at" - current time as integer epoch milliseconds,
            "id"         - random integer between 1 and 9223372036854775807,
            "id_str"     - the same id rendered as a string.
    """
    # Draw order (phrase, word, hashtag, then id) is kept fixed so that a seeded
    # random generator yields the same tweet as before.
    opener, adjective, tag = choice(phrases), choice(words), choice(hashtags)
    text = f"{opener} how {adjective} {tag} is!"
    created_at = int(time.time() * 1000)
    tweet_id = randint(1, 9223372036854775807)  # upper bound is the largest signed 64-bit value
    return {
        "text": text,
        "created_at": created_at,
        "id": tweet_id,
        "id_str": str(tweet_id),
    }

def _to_record(tweet):
    """Wrap a tweet dict as a record: its id string as "key", its JSON text as "value"."""
    return {"key": tweet["id_str"], "value": json.dumps(tweet)}


# Generate 10 tweets, each wrapped as a key/value record
tweets = [_to_record(generate_tweet()) for _ in range(10)]

In [61]:
# Create the DataFrame from the tweet records with an explicit schema rather
# than letting Spark infer it from the dicts: both columns are pinned to string,
# the column order is fixed, and an empty `tweets` list no longer breaks
# schema inference.
df = spark.createDataFrame(tweets, schema="key string, value string")

df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [62]:
# Convert the Spark DataFrame to pandas so the notebook displays the records as a table.
df.toPandas()

Unnamed: 0,key,value
0,8037718634281042486,"{""text"": ""finally got how lovely SPARK STREAMING is!"", ""created_at"": 1736856328329, ""id"": 8037718634281042486, ""id_str"": ""8037718634281042486""}"
1,4860774655311241774,"{""text"": ""totally in love with how fantastic SPARK ML is!"", ""created_at"": 1736856328329, ""id"": 4860774655311241774, ""id_str"": ""4860774655311241774""}"
2,8475397230643489571,"{""text"": ""so excited about how fantastic SPARK is!"", ""created_at"": 1736856328329, ""id"": 8475397230643489571, ""id_str"": ""8475397230643489571""}"
3,676194985410546175,"{""text"": ""finally got how incredible SPARK ML is!"", ""created_at"": 1736856328329, ""id"": 676194985410546175, ""id_str"": ""676194985410546175""}"
4,6687307504160519154,"{""text"": ""just discovered how amazing SPARK ML is!"", ""created_at"": 1736856328329, ""id"": 6687307504160519154, ""id_str"": ""6687307504160519154""}"
5,5153671730750461177,"{""text"": ""finally got how great SPARK is!"", ""created_at"": 1736856328329, ""id"": 5153671730750461177, ""id_str"": ""5153671730750461177""}"
6,3371199005618122196,"{""text"": ""totally in love with how happy SPARK ML is!"", ""created_at"": 1736856328329, ""id"": 3371199005618122196, ""id_str"": ""3371199005618122196""}"
7,7614655453023225679,"{""text"": ""so excited about how fantastic SPARK ML is!"", ""created_at"": 1736856328329, ""id"": 7614655453023225679, ""id_str"": ""7614655453023225679""}"
8,8196052907276102343,"{""text"": ""finally got how wonderful SPARK STREAMING is!"", ""created_at"": 1736856328329, ""id"": 8196052907276102343, ""id_str"": ""8196052907276102343""}"
9,8306563723109330038,"{""text"": ""totally in love with how awesome SPARK is!"", ""created_at"": 1736856328329, ""id"": 8306563723109330038, ""id_str"": ""8306563723109330038""}"


In [63]:
# Append the rows of df (key and value columns) to the Kafka topic
# through the brokers configured above.
(
    df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_brokers)
    .option("topic", topic)
    .mode("append")
    .save()
)

In [64]:
# Stop the SparkSession once you are finished with the notebook (uncomment to run)
# spark.stop()