In [9]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.window import Window

spark = SparkSession.builder \
    .appName("MyApp") \
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0') \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .getOrCreate()

In [10]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType

schema = StructType(
      [
        StructField("name", StringType(), False),
        StructField("price", DoubleType(), False),
        StructField("timestamp", TimestampType(), False),
      ]
    )

In [11]:
kafka_server = "kafka1:9092"   
from pyspark.sql.functions import from_json

lines = (spark.readStream                        # Get the DataStreamReader
  .format("kafka")                                 # Specify the source format as "kafka"
  .option("kafka.bootstrap.servers", kafka_server) # Configure the Kafka server name and port
  .option("subscribe", "stock")                       # Subscribe to the "en" Kafka topic 
  .option("startingOffsets", "earliest")           # The start point when a query is started
  .option("maxOffsetsPerTrigger", 100)             # Rate limit on max offsets per trigger interval
  .load()
  .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
# Load the DataFrame
)
df = lines.select("parsed_value.*")


## The assignment starts here

You can create a


## Select the stock that gained the most (between windows)

In [None]:
#remember you can register another stream

from pyspark.sql.functions import lag, window

# The function processes each batch of data
def process_batch(df, epoch_id):
    
    window_spec = Window.partitionBy("name").orderBy("window")

    # Create a column "previous_window" and "previous_price" using the lag function
    df = df.withColumn("previous_price", lag("avg(price)").over(window_spec))
    df = df.withColumn("previous_window", lag("window").over(window_spec))
    df = df.filter(df["previous_price"] < df["avg(price)"])
    
    # Show 
    df.show(truncate=False)
    
    
# Apply a time window to the data with a watermark of 30 seconds
# Group the data by a 5-minute window and the stock name
# Calculate the average price within each window for each stock
windowedDF_3 = df \
        .withWatermark("timestamp", "30 seconds") \
        .groupBy(window("timestamp", "5 minutes"), "name") \
        .agg({"price": "avg"})

gained_value_stocks = windowedDF_3.orderBy("avg(price)", ascending=False)


# Apply process_batch function, save and show result
query= (gained_value_stocks.writeStream
           .outputMode("complete")
           .format("memory")
           .queryName("GainedValueStocks")
           .option("truncate", False)
           .foreachBatch(process_batch)
           .start())
    
query.awaitTermination()

23/11/06 17:43:56 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-8014e752-b9af-4b7d-ba44-d692b46adee2. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

+------+----+----------+--------------+---------------+
|window|name|avg(price)|previous_price|previous_window|
+------+----+----------+--------------+---------------+
+------+----+----------+--------------+---------------+



                                                                                

+------+----+----------+--------------+---------------+
|window|name|avg(price)|previous_price|previous_window|
+------+----+----------+--------------+---------------+
+------+----+----------+--------------+---------------+



                                                                                

+------+----+----------+--------------+---------------+
|window|name|avg(price)|previous_price|previous_window|
+------+----+----------+--------------+---------------+
+------+----+----------+--------------+---------------+



                                                                                

+------+----+----------+--------------+---------------+
|window|name|avg(price)|previous_price|previous_window|
+------+----+----------+--------------+---------------+
+------+----+----------+--------------+---------------+



                                                                                

+------------------------------------------+----+----------+------------------+------------------------------------------+
|window                                    |name|avg(price)|previous_price    |previous_window                           |
+------------------------------------------+----+----------+------------------+------------------------------------------+
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|PRU |72.0      |68.6              |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|AGN |122.38    |93.13             |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|CHRW|60.36     |57.82             |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|NOV |73.1      |69.255            |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|JPM |55.85     |50.19             |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:

                                                                                

+------------------------------------------+----+----------+-----------------+------------------------------------------+
|window                                    |name|avg(price)|previous_price   |previous_window                           |
+------------------------------------------+----+----------+-----------------+------------------------------------------+
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|ULTA|99.21     |86.75            |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|FLIR|33.229    |26.44            |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|PNW |60.05     |54.63            |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|PRU |75.32     |68.6             |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|ADI |48.79     |44.7             |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2

                                                                                

+------------------------------------------+----+----------+--------------+------------------------------------------+
|window                                    |name|avg(price)|previous_price|previous_window                           |
+------------------------------------------+----+----------+--------------+------------------------------------------+
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|MMM |117.14    |105.88        |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|WEC |41.35     |39.9          |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|ULTA|99.21     |86.75         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|FLIR|32.5095   |26.44         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|EMN |77.89     |70.84         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|PNW 

                                                                                

+------------------------------------------+----+------------------+--------------+------------------------------------------+
|window                                    |name|avg(price)        |previous_price|previous_window                           |
+------------------------------------------+----+------------------+--------------+------------------------------------------+
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|MMM |117.14            |105.88        |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|WEC |41.245000000000005|39.9          |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|ULTA|99.21             |86.75         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|FLIR|32.208000000000006|26.44         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|EMN |77.89             |70.84         |[2023-11-05 11:05:00, 2023-1

                                                                                

+------------------------------------------+----+------------------+--------------+------------------------------------------+
|window                                    |name|avg(price)        |previous_price|previous_window                           |
+------------------------------------------+----+------------------+--------------+------------------------------------------+
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|MMM |117.14            |105.88        |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|WEC |41.245000000000005|39.9          |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|ULTA|99.21             |86.75         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|HAS |52.51             |40.93         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|FLIR|32.208000000000006|26.44         |[2023-11-05 11:05:00, 2023-1

                                                                                

+------------------------------------------+----+------------------+--------------+------------------------------------------+
|window                                    |name|avg(price)        |previous_price|previous_window                           |
+------------------------------------------+----+------------------+--------------+------------------------------------------+
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|MMM |117.14            |105.88        |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|WEC |41.245000000000005|39.9          |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|EA  |22.57             |18.82         |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|OXY |93.2              |84.435        |[2023-11-05 11:05:00, 2023-11-05 11:10:00]|
|[2023-11-05 11:10:00, 2023-11-05 11:15:00]|ULTA|99.21             |86.75         |[2023-11-05 11:05:00, 2023-1

