## Project Template

In [1]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("MyApp") \
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0') \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .getOrCreate()


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e63d5e80-4214-4bbd-a7e3-d883127bd329;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 422ms :: artifacts dl 13

Be sure to start the stream on Kafka!

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType

schema = StructType(
      [
        StructField("name", StringType(), False),
        StructField("price", DoubleType(), False),
        StructField("timestamp", TimestampType(), False),
      ]
    )

In [3]:
kafka_server = "kafka1:9092"   
from pyspark.sql.functions import from_json

lines = (spark.readStream                        # Get the DataStreamReader
  .format("kafka")                                 # Specify the source format as "kafka"
  .option("kafka.bootstrap.servers", kafka_server) # Configure the Kafka server name and port
  .option("subscribe", "stock")                       # Subscribe to the "en" Kafka topic 
  .option("startingOffsets", "earliest")           # The start point when a query is started
  .option("maxOffsetsPerTrigger", 100)             # Rate limit on max offsets per trigger interval
  .load()
  .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
# Load the DataFrame
)
df = lines.select("parsed_value.*")


## The assignment starts here

You can create a

## Select the N most valuable stocks in a window

In [None]:
from pyspark.sql.functions import window, desc, col, collect_list, struct, slice

# Define a 1-minute sliding time window and collect prices within each window
windowed_df = df.groupBy(window(df.timestamp, "1 minute")).agg(
    collect_list(struct("name", "price")).alias("stocks")
)

# Define a UDF (User Defined Function) to calculate the total value of each stock
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
@udf(returnType=FloatType())
def calculate_max_value(stocks):
    return max(stock.price for stock in stocks)

# Calculate the total value for each stock within each window
windowed_df = windowed_df.withColumn("total_value", calculate_total_value(col("stocks")))

# Select the N most valuable stocks within each window
n = 5  # Define the value of N
most_valuable_stocks = windowed_df.select("window", slice(col("stocks"), 1, n).alias("5 most valuable stocks and their prices"))

# Output the results to the console for real-time display
query = most_valuable_stocks.writeStream.outputMode("complete").format("console").option("truncate", False).start()
# Start the streaming query
query.awaitTermination()


23/11/06 20:43:33 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-475afcb0-fc6d-4c09-8174-1eea67300a45. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 14:55:00, 2023-11-03 14:56:00]|[[ISRG, 192.6465], [BBT, 30.8], [FITB, 16.63], [ZION, 24.57], [HCP, 47.08]]|
|[2023-11-03 14:56:00, 2023-11-03 14:57:00]|[[PFE, 27.75], [AKAM, 38.76], [ILMN, 49.74], [FLS, 53.4166], [TGT, 63.21]] |
+------------------------------------------+---------------------------------------------------------------------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 14:55:00, 2023-11-03 14:56:00]|[[ISRG, 192.6465], [BBT, 30.8], [FITB, 16.63], [ZION, 24.57], [HCP, 47.08]]|
|[2023-11-03 14:56:00, 2023-11-03 14:57:00]|[[PFE, 27.75], [AKAM, 38.76], [ILMN, 49.74], [FLS, 53.4166], [TGT, 63.21]] |
+------------------------------------------+---------------------------------------------------------------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 14:55:00, 2023-11-03 14:56:00]|[[ISRG, 192.6465], [BBT, 30.8], [FITB, 16.63], [ZION, 24.57], [HCP, 47.08]]|
|[2023-11-03 14:56:00, 2023-11-03 14:57:00]|[[PFE, 27.75], [AKAM, 38.76], [ILMN, 49.74], [FLS, 53.4166], [TGT, 63.21]] |
+------------------------------------------+------------

                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 14:55:00, 2023-11-03 14:56:00]|[[ISRG, 192.6465], [BBT, 30.8], [FITB, 16.63], [ZION, 24.57], [HCP, 47.08]]|
|[2023-11-03 14:56:00, 2023-11-03 14:57:00]|[[PFE, 27.75], [AKAM, 38.76], [ILMN, 49.74], [FLS, 53.4166], [TGT, 63.21]] |
|[2023-11-03 14:59:00, 2023-11-03 15:00:00]|[[FDX, 99.85

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 14:55:00, 2023-11-03 14:56:00]|[[ISRG, 192.6465], [BBT, 30.8], [FITB, 16.63], [ZION, 24.57], [HCP, 47.08]]|
|[2023-11-03 14:56:00, 2023-11-03 14:57:00]|[[PFE, 27.75], [AKAM, 38.76], [ILMN, 49.74], [FLS, 53.4166], [TGT, 63.21]] |
|[2023-11-03 14:59:00, 2023-11-03 15:00:00]|[[FDX, 99.85

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MGM, 16.25], [CMCSA, 22.315], [IRM, 29.08], [ALK, 31.4795], [MPC, 35.05]]|
|[2023-11-03 14:55:00, 2023-11-03 14:56:00]|[[ISRG, 192.6465], [BBT, 30.8], [FITB, 16.63], [ZION, 24.57], [HCP, 47.08]]|
|[2023-11-03 14:56:00, 2023-11-03 14:57:00]|[[PFE, 27.75

                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]      |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MGM, 16.25], [CMCSA, 22.315], [IRM, 29.08], [ALK, 31.4795], [MPC, 35.05]]|
|[2023-11-03 14:55:00, 2023-11-03 14:56:00]|[[ISRG, 192.

                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]      |
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 15:02:00, 2023-11-03 15:03:00]|[[MAA, 63.25], [AES, 13.41], [HOLX, 21.25], [MCO, 71.28], [FFIV, 89.96]]   |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MGM, 16.25

                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]      |
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 15:02:00, 2023-11-03 15:03:00]|[[MAA, 63.25], [AES, 13.41], [HOLX, 21.25], [MCO, 71.28], [FFIV, 89.96]]   |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MGM, 16.25

                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]      |
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 15:02:00, 2023-11-03 15:03:00]|[[MAA, 63.25], [AES, 13.41], [HOLX, 21.25], [MCO, 71.28], [FFIV, 89.96]]   |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MGM, 16.25

                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]      |
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 15:02:00, 2023-11-03 15:03:00]|[[MAA, 63.25], [AES, 13.41], [HOLX, 21.25], [MCO, 71.28], [FFIV, 89.96]]   |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MGM, 16.2

                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+------------------------------------------+---------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                    |
+------------------------------------------+---------------------------------------------------------------------------+
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]      |
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]]|
|[2023-11-03 15:02:00, 2023-11-03 15:03:00]|[[MAA, 63.25], [AES, 13.41], [HOLX, 21.25], [MCO, 71.28], [FFIV, 89.96]]   |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]    |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MGM, 16.2

                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+------------------------------------------+----------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                     |
+------------------------------------------+----------------------------------------------------------------------------+
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]       |
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]] |
|[2023-11-03 15:02:00, 2023-11-03 15:03:00]|[[MAA, 63.25], [AES, 13.41], [HOLX, 21.25], [MCO, 71.28], [FFIV, 89.96]]    |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LYB, 63.17], [TGT, 68.83], [AGN, 93.13], [PGR, 25.68], [MSFT, 28.85]]     |
|[2023-11-03 15:00:00, 2023-11-03 15:01:00]|[[MG

                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+------------------------------------------+----------------------------------------------------------------------------+
|window                                    |5 most valuable stocks and their prices                                     |
+------------------------------------------+----------------------------------------------------------------------------+
|[2023-11-03 15:01:00, 2023-11-03 15:02:00]|[[OMC, 62.0], [MKC, 70.49], [SLG, 88.21], [BLL, 22.65], [CMA, 42.11]]       |
|[2023-11-03 14:58:00, 2023-11-03 14:59:00]|[[FRT, 116.94], [HST, 18.5191], [GWW, 254.0], [IQV, 44.33], [SCG, 53.2375]] |
|[2023-11-03 15:07:00, 2023-11-03 15:08:00]|[[TJX, 60.16], [MKC, 71.54], [CME, 74.86], [FLS, 76.58], [EMN, 85.7]]       |
|[2023-11-03 15:02:00, 2023-11-03 15:03:00]|[[MAA, 63.25], [AES, 13.41], [HOLX, 21.25], [MCO, 71.28], [FFIV, 89.96]]    |
|[2023-11-03 14:57:00, 2023-11-03 14:58:00]|[[LY

## Select the stocks that lost value between two windows

In [None]:
from pyspark.sql.functions import from_json, col, lit
# Define a watermark for event-time processing
df = df.withWatermark("timestamp", "10 minutes")

# Create a previous DataFrame to join with the current DataFrame
previous_df = df.withColumnRenamed("name", "prev_name") \
    .withColumnRenamed("timestamp", "prev_timestamp") \
    .withColumnRenamed("price", "prev_price")

# Join the current and previous DataFrames
df_with_price_difference = df.join(
    previous_df,
    (df["name"] == previous_df["prev_name"]) & (df["timestamp"] > previous_df["prev_timestamp"]),
    "left"
)

# Calculate price_difference
df_with_price_difference = df_with_price_difference.withColumn("price_difference", col("price") - col("prev_price"))

# Filter for rows with a price loss
df_with_loss = df_with_price_difference.filter(col("price_difference") <= 0)
df_to_output = df_with_loss.select("name", "price", "timestamp", "prev_price", "prev_timestamp", "price_difference")

# Output the results to the console
output_query = df_with_loss.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Start the streaming query
output_query.awaitTermination()

## Select the stock that gained the most (between windows)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lag, window, col
from pyspark.sql.window import Window

# Function to handle each batch of data
def process_batch(df, epoch_id):
    window_spec = Window.partitionBy("name").orderBy("window")

    # Create a column "previous_price" by using the lag function
    df = df.withColumn("PreviousAverage", lag("avg(price)").over(window_spec))
    df = df.withColumn("previous_window", lag("window").over(window_spec))
    # Calculate the difference and add a new column
   

    # Filter where there is a previous price and the previous price is greater than the current avg price
    df = df.filter(col("PreviousAverage").isNotNull() & (col("PreviousAverage") < col("avg(price)")))
    df = df.withColumn("HighestGain", col("avg(price)") - col("PreviousAverage"))
    df = df.orderBy(col("HighestGain").desc())

    # Take only the stock with the biggest increase
    df = df.limit(1)
    # Proceed with data processing here, for example: print to the screen
    df.select("name", "window", "avg(price)", "PreviousAverage", "previous_window", "HighestGain").show(truncate=False)

windowedDF_2 = df \
    .withWatermark("timestamp", "3 seconds") \
    .groupBy(window("timestamp", "5 minutes"), "name") \
    .agg({"price": "avg"})

# No need to sort here as we're not writing out the sorted results, sorting will happen in process_batch if needed
# lost_value_stocks = windowedDF_2.orderBy("avg(price)", ascending=False)

query_2 = (windowedDF_2.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("TheStocksThatLostValue1")
    .foreachBatch(process_batch)
    .start())

query_2.awaitTermination()

## Checkpoint

In [None]:
from pyspark.sql.functions import window, col, first, last

windowed_data = df.withWatermark("timestamp", "1 hour").groupBy(
    window("timestamp", "1 hour"),
    col("name")
).agg(
    (((first(df.price) - last(df.price)) / first(df.price)) * 100).alias("percentage_change")
)

# Define the threshold for acceptable percentage change
threshold = 5  # For example, a 5% change threshold

# Filter the data for stocks that did not lose too much value
control_pass = windowed_data.filter(col("percentage_change") >= -threshold)

# Start the streaming query
query = (control_pass
    .writeStream
    .outputMode("update")
    .format("console")
    .start()
)

query.awaitTermination()


## Compute your assets

In [None]:
from pyspark.sql.functions import col, corr
# Define the schema for your stocks ownership data
schema = StructType([
    StructField("name", StringType(), True),
    StructField("price", DoubleType(), True)  # Represents the total value of the stocks owned by Aida
])

# Sample stocks ownership data with the total value of the stocks owned by Aida
data = [("Aida", 50.0), 
        ("Aida", 75.0), 
        ("Aida", 36.0)]  

# Create a DataFrame with the ownership data
stocks_ownership_df = spark.createDataFrame(data, schema)

# Select the relevant columns from the parsed Kafka data and alias them
df = lines.select(
    col("parsed_value.name").alias("stock_name"),
    col("parsed_value.price").alias("stock_price")
)

# Join your ownership data with the streaming data using an inner join
merged_df = df.join(stocks_ownership_df, df.stock_name == stocks_ownership_df.name, "inner")

# Calculate the correlation between your assets and each stock's price
correlation_result = merged_df.select(corr("price", "stock_price").alias("correlation"))

# Start the streaming query to continuously monitor changes in asset value and the correlation
query = (correlation_result
    .writeStream
    .outputMode("update")
    .format("console")
    .start()
)

query.awaitTermination()

23/11/06 20:18:18 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-abe18fab-9eb1-47fe-8c2e-7e78bb6839a3. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+
|correlation|
+-----------+
|       null|
+-----------+



[Stage 7:>                                                          (0 + 1) / 1]

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+
|correlation|
+-----------+
|       null|
+-----------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------+
|correlation|
+-----------+
|       null|
+-----------+



23/11/06 20:23:29 WARN NettyRpcEnv: Ignored failure: java.util.concurrent.TimeoutException: Cannot receive any reply from dc4eaebee9b9:46857 in 10000 milliseconds
23/11/06 20:23:29 WARN NetworkClient: [Consumer clientId=consumer-spark-kafka-source-f720b012-de44-4ee3-afa0-14f452e0f986--108891843-driver-0-1, groupId=spark-kafka-source-f720b012-de44-4ee3-afa0-14f452e0f986--108891843-driver-0] Error connecting to node kafka1:9092 (id: 2147483646 rack: null)
java.net.UnknownHostException: kafka1: Temporary failure in name resolution
	at java.net.Inet4AddressImpl.lookupAllHostAddr(Native Method)
	at java.net.InetAddress$2.lookupAllHostAddr(InetAddress.java:929)
	at java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1330)
	at java.net.InetAddress.getAllByName0(InetAddress.java:1283)
	at java.net.InetAddress.getAllByName(InetAddress.java:1199)
	at java.net.InetAddress.getAllByName(InetAddress.java:1127)
	at org.apache.kafka.clients.ClientUtils.resolve(ClientUtils.java:104)
	at o

-------------------------------------------
Batch: 3
-------------------------------------------
+-----------+
|correlation|
+-----------+
|       null|
+-----------+



23/11/06 20:23:30 WARN NetworkClient: [Consumer clientId=consumer-spark-kafka-source-f720b012-de44-4ee3-afa0-14f452e0f986--108891843-driver-0-1, groupId=spark-kafka-source-f720b012-de44-4ee3-afa0-14f452e0f986--108891843-driver-0] Error connecting to node kafka1:9092 (id: 2147483646 rack: null)
java.net.UnknownHostException: kafka1
	at java.net.InetAddress.getAllByName0(InetAddress.java:1287)
	at java.net.InetAddress.getAllByName(InetAddress.java:1199)
	at java.net.InetAddress.getAllByName(InetAddress.java:1127)
	at org.apache.kafka.clients.ClientUtils.resolve(ClientUtils.java:104)
	at org.apache.kafka.clients.ClusterConnectionStates$NodeConnectionState.currentAddress(ClusterConnectionStates.java:403)
	at org.apache.kafka.clients.ClusterConnectionStates$NodeConnectionState.access$200(ClusterConnectionStates.java:363)
	at org.apache.kafka.clients.ClusterConnectionStates.currentAddress(ClusterConnectionStates.java:151)
	at org.apache.kafka.clients.NetworkClient.initiateConnect(NetworkClie

-------------------------------------------
Batch: 4
-------------------------------------------
+-----------+
|correlation|
+-----------+
|       null|
+-----------+



23/11/06 20:27:06 WARN NetworkClient: [Consumer clientId=consumer-spark-kafka-source-f720b012-de44-4ee3-afa0-14f452e0f986--108891843-driver-0-1, groupId=spark-kafka-source-f720b012-de44-4ee3-afa0-14f452e0f986--108891843-driver-0] Error connecting to node kafka1:9092 (id: 2147483646 rack: null)
java.net.UnknownHostException: kafka1: Temporary failure in name resolution
	at java.net.Inet4AddressImpl.lookupAllHostAddr(Native Method)
	at java.net.InetAddress$2.lookupAllHostAddr(InetAddress.java:929)
	at java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1330)
	at java.net.InetAddress.getAllByName0(InetAddress.java:1283)
	at java.net.InetAddress.getAllByName(InetAddress.java:1199)
	at java.net.InetAddress.getAllByName(InetAddress.java:1127)
	at org.apache.kafka.clients.ClientUtils.resolve(ClientUtils.java:104)
	at org.apache.kafka.clients.ClusterConnectionStates$NodeConnectionState.currentAddress(ClusterConnectionStates.java:403)
	at org.apache.kafka.clients.ClusterConnection

-------------------------------------------
Batch: 5
-------------------------------------------
+-----------+
|correlation|
+-----------+
|       null|
+-----------+

