In [65]:
import os

# Host name or IP address used to reach the Kafka broker.
# The DOCKER_HOST_OR_IP environment variable takes precedence; the address
# that used to be hard-coded here is kept only as a fallback for when the
# variable is unset or empty, so the notebook no longer silently ignores it.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP") or "10.143.11.241"

# Show the value that is actually in effect (not just the raw env lookup).
print(f"{DOCKER_HOST_OR_IP=}")


In [66]:
# Kafka connection settings: target topic, and the bootstrap address built
# from the configured host plus port 19092.
topic = 'streaming-demo-2'
kafka_brokers = '{}:19092'.format(DOCKER_HOST_OR_IP)

In [67]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.functions import col, lit, to_timestamp
import pandas as pd
# Let pandas show up to 150 characters of a column's text before truncating.
pd.set_option("max_colwidth", 150)


# Spark configuration. setAll() returns the SparkConf it was called on, so
# the object is created and populated in a single expression.
conf = SparkConf().setAll([
    # Maven coordinates resolved at start-up: the Spark/Kafka SQL connector
    # plus an explicit set of Log4j 2 artifacts, joined into the
    # comma-separated form that spark.jars.packages expects.
    ("spark.jars.packages", ",".join([
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.3",
        "org.apache.logging.log4j:log4j-slf4j2-impl:2.19.0",
        "org.apache.logging.log4j:log4j-api:2.19.0",
        "org.apache.logging.log4j:log4j-core:2.19.0",
    ])),
    # NOTE(review): the excludes and the two userClassPathFirst flags look
    # like a workaround for SLF4J/Log4j binding conflicts -- confirm.
    ("spark.jars.excludes", "org.slf4j:slf4j-api,org.slf4j:slf4j-log4j12"),
    ("spark.driver.userClassPathFirst", "true"),
    ("spark.executor.userClassPathFirst", "true"),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
])

# Get (or reuse) a local session named "TweetGenerator" with the settings above.
spark = (
    SparkSession.builder
    .master("local")
    .appName("TweetGenerator")
    .config(conf=conf)
    .getOrCreate()
)

# Bare last expression so the notebook renders the SparkContext summary.
spark.sparkContext

In [68]:
from random import randint, choice
import json
import time
import uuid

# Vocabulary from which the synthetic tweet text is assembled.

# Adjectives.
words = [
    "awesome",
    "fantastic",
    "amazing",
    "incredible",
    "great",
    "wonderful",
    "lovely",
    "beautiful",
    "happy",
    "fun",
]

# Sentence openers.
phrases = [
    "can't believe",
    "so excited about",
    "just discovered",
    "finally got",
    "totally in love with",
]

# Topic tags (inserted into the text verbatim, without a leading '#').
hashtags = [
    "SPARK",
    "SPARK ML",
    "SPARK STREAMING",
]

def generate_tweet():
    """Build one random synthetic tweet.

    The text is "<phrase> how <word> <hashtag> is!", with each part drawn at
    random from the module-level ``phrases``, ``words`` and ``hashtags`` lists.

    Returns:
        dict with keys, in this order:
            "text"       -- the generated sentence
            "created_at" -- current wall-clock time in whole milliseconds
            "id"         -- random integer in [1, 9223372036854775807]
            "id_str"     -- ``str`` form of "id"
    """
    # Draw in the order phrase -> word -> hashtag so the sequence of random
    # calls is stable.
    opener = choice(phrases)
    adjective = choice(words)
    tag = choice(hashtags)

    timestamp_ms = int(time.time() * 1000)
    tweet_id = randint(1, 9223372036854775807)  # upper bound is 2**63 - 1

    return {
        "text": f"{opener} how {adjective} {tag} is!",
        "created_at": timestamp_ms,
        "id": tweet_id,
        "id_str": str(tweet_id),
    }

# Produce 10 tweets up front, then wrap each one as a key/value record:
# the key is the tweet's string id, the value is the whole tweet as JSON text.
tweets = [
    dict(key=generated["id_str"], value=json.dumps(generated))
    for generated in [generate_tweet() for _ in range(10)]
]

In [69]:
# Create the Spark DataFrame from the tweet records with an explicit schema.
# Without one, Spark infers the columns by inspecting the dicts, which raises
# on an empty list and takes the column order from the sorted dict keys.
# Declaring both columns as strings keeps the key/value layout fixed
# regardless of the input.
df = spark.createDataFrame(tweets, schema="key STRING, value STRING")

# Print the resulting schema as a sanity check.
df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



In [70]:
# Convert the Spark DataFrame to a pandas DataFrame; as the cell's last
# expression, the notebook displays it as a table for inspection.
df.toPandas()

Unnamed: 0,key,value
0,2820175276698830073,"{""text"": ""finally got how awesome SPARK STREAMING is!"", ""created_at"": 1736856704332, ""id"": 2820175276698830073, ""id_str"": ""2820175276698830073""}"
1,6749500828762348162,"{""text"": ""just discovered how beautiful SPARK STREAMING is!"", ""created_at"": 1736856704332, ""id"": 6749500828762348162, ""id_str"": ""67495008287623481..."
2,1854151201497862184,"{""text"": ""just discovered how amazing SPARK ML is!"", ""created_at"": 1736856704332, ""id"": 1854151201497862184, ""id_str"": ""1854151201497862184""}"
3,401714987810481096,"{""text"": ""just discovered how awesome SPARK STREAMING is!"", ""created_at"": 1736856704332, ""id"": 401714987810481096, ""id_str"": ""401714987810481096""}"
4,3282668407013200087,"{""text"": ""totally in love with how happy SPARK STREAMING is!"", ""created_at"": 1736856704332, ""id"": 3282668407013200087, ""id_str"": ""3282668407013200..."
5,8155505717449008239,"{""text"": ""just discovered how beautiful SPARK is!"", ""created_at"": 1736856704332, ""id"": 8155505717449008239, ""id_str"": ""8155505717449008239""}"
6,2829747659462582929,"{""text"": ""can't believe how wonderful SPARK STREAMING is!"", ""created_at"": 1736856704332, ""id"": 2829747659462582929, ""id_str"": ""2829747659462582929""}"
7,3727933186940585215,"{""text"": ""just discovered how wonderful SPARK is!"", ""created_at"": 1736856704332, ""id"": 3727933186940585215, ""id_str"": ""3727933186940585215""}"
8,1305322468508154747,"{""text"": ""just discovered how great SPARK STREAMING is!"", ""created_at"": 1736856704332, ""id"": 1305322468508154747, ""id_str"": ""1305322468508154747""}"
9,8818361261225405661,"{""text"": ""so excited about how lovely SPARK ML is!"", ""created_at"": 1736856704332, ""id"": 8818361261225405661, ""id_str"": ""8818361261225405661""}"


In [71]:
# Publish the DataFrame rows to the Kafka topic through Spark's "kafka"
# batch writer, appending to whatever the topic already holds.
(
    df.write
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_brokers)
    .option("topic", topic)
    .mode("append")
    .save()
)

In [72]:
# Stop SparkSession
# spark.stop()