## Project Template

In [1]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("MyApp") \
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0') \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .getOrCreate()


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-157bce1d-d2dd-4fec-b8d8-efa1a33dc1e1;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 193ms :: artifacts dl 4m

Be sure to start the stream on Kafka!

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType

schema = StructType(
      [
        StructField("name", StringType(), False),
        StructField("price", DoubleType(), False),
        StructField("timestamp", TimestampType(), False),
      ]
    )

In [3]:
kafka_server = "kafka1:9092"   
from pyspark.sql.functions import from_json

lines = (spark.readStream                        # Get the DataStreamReader
  .format("kafka")                                 # Specify the source format as "kafka"
  .option("kafka.bootstrap.servers", kafka_server) # Configure the Kafka server name and port
  .option("subscribe", "stock")                       # Subscribe to the "en" Kafka topic 
  .option("startingOffsets", "earliest")           # The start point when a query is started
  .option("maxOffsetsPerTrigger", 100)             # Rate limit on max offsets per trigger interval
  .load()
  .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
# Load the DataFrame
)
df = lines.select("parsed_value.*")


## The assignment starts here

You can create a

## Select the N most valuable stocks in a window

In [None]:
from pyspark.sql.functions import window, desc, col, collect_list, struct, slice

# Define a 1-minute sliding time window and collect prices within each window
windowed_df = df.groupBy(window(df.timestamp, "1 minute")).agg(
    collect_list(struct("name", "price")).alias("stocks")
)

# Define a UDF (User Defined Function) to calculate the total value of each stock
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
@udf(returnType=FloatType())
def calculate_max_value(stocks):
    return max(stock.price for stock in stocks)

# Calculate the total value for each stock within each window
windowed_df = windowed_df.withColumn("total_value", calculate_max_value(col("stocks")))

# Select the N most valuable stocks within each window
n = 5  # Define the value of N
most_valuable_stocks = windowed_df.select("window", slice(col("stocks"), 1, n).alias("5 most valuable stocks and their prices"))

# Output the results to the console for real-time display
query = most_valuable_stocks.writeStream.outputMode("complete").format("console").option("truncate", False).start()
# Start the streaming query
query.awaitTermination()


## Select the stocks that lost value between two windows

In [None]:
from pyspark.sql.functions import from_json, col, lit
# Define a watermark for event-time processing
df = df.withWatermark("timestamp", "10 minutes")

# Create a previous DataFrame to join with the current DataFrame
previous_df = df.withColumnRenamed("name", "prev_name") \
    .withColumnRenamed("timestamp", "prev_timestamp") \
    .withColumnRenamed("price", "prev_price")

# Join the current and previous DataFrames
df_with_price_difference = df.join(
    previous_df,
    (df["name"] == previous_df["prev_name"]) & (df["timestamp"] > previous_df["prev_timestamp"]),
    "left"
)

# Calculate price_difference
df_with_price_difference = df_with_price_difference.withColumn("price_difference", col("price") - col("prev_price"))

# Filter for rows with a price loss
df_with_loss = df_with_price_difference.filter(col("price_difference") <= 0)
df_to_output = df_with_loss.select("name", "price", "timestamp", "prev_price", "prev_timestamp", "price_difference")

# Output the results to the console
output_query = df_with_loss.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Start the streaming query
output_query.awaitTermination()

## Select the stock that gained the most (between windows)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lag, window, col
from pyspark.sql.window import Window

# Function to handle each batch of data
def process_batch(df, epoch_id):
    window_spec = Window.partitionBy("name").orderBy("window")

    # Create a column "previous_price" by using the lag function
    df = df.withColumn("PreviousAverage", lag("avg(price)").over(window_spec))
    df = df.withColumn("previous_window", lag("window").over(window_spec))
    # Calculate the difference and add a new column
   

    # Filter where there is a previous price and the previous price is greater than the current avg price
    df = df.filter(col("PreviousAverage").isNotNull() & (col("PreviousAverage") < col("avg(price)")))
    df = df.withColumn("HighestGain", col("avg(price)") - col("PreviousAverage"))
    df = df.orderBy(col("HighestGain").desc())

    # Take only the stock with the biggest increase
    df = df.limit(1)
    # Proceed with data processing here, for example: print to the screen
    df.select("name", "window", "avg(price)", "PreviousAverage", "previous_window", "HighestGain").show(truncate=False)

windowedDF_2 = df \
    .withWatermark("timestamp", "3 seconds") \
    .groupBy(window("timestamp", "5 minutes"), "name") \
    .agg({"price": "avg"})

# No need to sort here as we're not writing out the sorted results, sorting will happen in process_batch if needed
# lost_value_stocks = windowedDF_2.orderBy("avg(price)", ascending=False)

query_2 = (windowedDF_2.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("TheStocksThatLostValue1")
    .foreachBatch(process_batch)
    .start())

query_2.awaitTermination()

## Checkpoint

In [4]:
from pyspark.sql.functions import window, col, first, last

windowed_data = df.withWatermark("timestamp", "1 hour").groupBy(
    window("timestamp", "1 hour"),
    col("name")
).agg(
    (((first(df.price) - last(df.price)) / first(df.price)) * 100).alias("percentage_change")
)

# Define the threshold for acceptable percentage change
threshold = 5  # For example, a 5% change threshold

# Filter the data for stocks that did not lose too much value
control_pass = windowed_data.filter(col("percentage_change") >= -threshold)

# Start the streaming query
query = (control_pass
    .writeStream
    .outputMode("update")
    .format("console")
    .start()
)

query.awaitTermination()


23/12/15 15:53:16 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-a6d0e78a-4fb6-41c8-9ff6-e92820203324. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+--------------------+----+-------------------+
|              window|name|  percentage_change|
+--------------------+----+-------------------+
|[2023-12-15 15:00...| CBG|                0.0|
|[2023-12-15 15:00...| PHM|                0.0|
|[2023-12-15 15:00...|ADSK|                0.0|
|[2023-12-15 15:00...| BEN|                0.0|
|[2023-12-15 15:00...| ETN|                0.0|
|[2023-12-15 15:00...|  MS|                0.0|
|[2023-12-15 15:00...|  GE|                0.0|
|[2023-12-15 15:00...| HAS|                0.0|
|[2023-12-15 15:00...| PKG|                0.0|
|[2023-12-15 15:00...| EOG|                0.0|
|[2023-12-15 15:00...|  GM|                0.0|
|[2023-12-15 15:00...| PPG|                0.0|
|[2023-12-15 15:00...| VFC|                0.0|
|[2023-12-15 15:00...| TSN|                0.0|
|[2023-12-15 15:00...| XYL|                0.0|
|[2023-12-15 15:00...|  IT|-0.694870222

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+----+------------------+
|              window|name| percentage_change|
+--------------------+----+------------------+
|[2023-12-15 15:00...| MYL|               0.0|
|[2023-12-15 15:00...| DOV|               0.0|
|[2023-12-15 15:00...| ROK|               0.0|
|[2023-12-15 15:00...| BXP|               0.0|
|[2023-12-15 15:00...|  MS| 8.469601677148859|
|[2023-12-15 15:00...| IFF|               0.0|
|[2023-12-15 15:00...| DPS|               0.0|
|[2023-12-15 15:00...| SNA|               0.0|
|[2023-12-15 15:00...|LRCX|               0.0|
|[2023-12-15 15:00...| GPC|               0.0|
|[2023-12-15 15:00...| STZ|               0.0|
|[2023-12-15 15:00...|  CA|               0.0|
|[2023-12-15 15:00...|DLTR|               0.0|
|[2023-12-15 15:00...| BMY|               0.0|
|[2023-12-15 15:00...| PLD|               0.0|
|[2023-12-15 15:00...| XYL|1.6399286987522312|
|[2023-12-

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------------+----+-------------------+
|              window|name|  percentage_change|
+--------------------+----+-------------------+
|[2023-12-15 15:00...| BXP|  0.799273387829242|
|[2023-12-15 15:00...|CTAS|                0.0|
|[2023-12-15 15:00...| MMM|                0.0|
|[2023-12-15 15:00...| SNA|-3.5983467055677045|
|[2023-12-15 15:00...| GPC|  4.346714395295328|
|[2023-12-15 15:00...| XOM|                0.0|
|[2023-12-15 15:00...|  PX|                0.0|
|[2023-12-15 15:00...| KMI|                0.0|
|[2023-12-15 15:00...|ORLY|                0.0|
|[2023-12-15 15:00...|ULTA|                0.0|
|[2023-12-15 15:00...| XYL| 1.1051693404634662|
|[2023-12-15 15:00...| VNO|                0.0|
|[2023-12-15 15:00...| AEE|                0.0|
|[2023-12-15 15:00...|  PG| -3.171570500573169|
|[2023-12-15 15:00...|PCAR|                0.0|
|[2023-12-15 15:00...| NKE|            

                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+--------------------+----+--------------------+
|              window|name|   percentage_change|
+--------------------+----+--------------------+
|[2023-12-15 15:00...| CHD|                 0.0|
|[2023-12-15 15:00...| BXP| -2.4023614895549597|
|[2023-12-15 15:00...|  LB|                 0.0|
|[2023-12-15 15:00...|CELG|                 0.0|
|[2023-12-15 15:00...| DRI|                 0.0|
|[2023-12-15 15:00...| XOM| -0.6932218309859262|
|[2023-12-15 15:00...| HII|                 0.0|
|[2023-12-15 15:00...| ADS|                 0.0|
|[2023-12-15 15:00...|  PX| -1.3736984863067574|
|[2023-12-15 15:00...| AYI|                 0.0|
|[2023-12-15 15:00...|ORLY|  1.7027725773379658|
|[2023-12-15 15:00...| LUV|                 0.0|
|[2023-12-15 15:00...| AMT|                 0.0|
|[2023-12-15 15:00...| XYL| 0.32085561497326154|
|[2023-12-15 15:00...| JEC|                 0.0|
|[2023-12-15 15:00...



KeyboardInterrupt: 

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+--------------------+----+-------------------+
|              window|name|  percentage_change|
+--------------------+----+-------------------+
|[2023-12-15 15:00...| BSX|                0.0|
|[2023-12-15 15:00...| BEN| 1.6799574694311519|
|[2023-12-15 15:00...| NUE|                0.0|
|[2023-12-15 15:00...|  GS|-4.7589659192210085|
|[2023-12-15 15:00...|  LB| 1.7094017094017007|
|[2023-12-15 15:00...|CTAS|-1.0867154579729315|
|[2023-12-15 15:00...|PRGO|                0.0|
|[2023-12-15 15:00...|CELG| 3.9626853771328325|
|[2023-12-15 15:00...| KSU|                0.0|
|[2023-12-15 15:00...| EMR|-1.0451140916216712|
|[2023-12-15 15:00...| EOG|-1.8888713993388995|
|[2023-12-15 15:00...|  NI|                0.0|
|[2023-12-15 15:00...| XYL|  5.062388591800363|
|[2023-12-15 15:00...| AEE| 2.5931034482758557|
|[2023-12-15 15:00...|HOLX|                0.0|
|[2023-12-15 15:00...| AEP|            



## Compute your assets

In [4]:
from pyspark.sql.functions import col, corr
# Define the schema for your stocks ownership data
schema = StructType([
    StructField("name", StringType(), True),
    StructField("price", DoubleType(), True)  # Represents the total value of the stocks owned by Aida
])

# Sample stocks ownership data with the total value of the stocks owned by Aida
data = [("BSX", 50.0), 
        ("EMR", 75.0), 
        ("ANSS", 36.0)]  

# Create a DataFrame with the ownership data
stocks_ownership_df = spark.createDataFrame(data, schema)

# Select the relevant columns from the parsed Kafka data and alias them
df = lines.select(
    col("parsed_value.name").alias("stock_name"),
    col("parsed_value.price").alias("stock_price")
)

# Join your ownership data with the streaming data using an inner join
merged_df = df.join(stocks_ownership_df, df.stock_name == stocks_ownership_df.name, "inner")

# Calculate the correlation between your assets and each stock's price
correlation_result = merged_df.select(corr("price", "stock_price").alias("correlation"))

# Start the streaming query to continuously monitor changes in asset value and the correlation
query = (correlation_result
    .writeStream
    .outputMode("update")
    .format("console")
    .start()
)

query.awaitTermination()

23/12/15 15:55:35 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-02de20e5-775c-4bcc-b2aa-39b621109e10. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+
|correlation|
+-----------+
|       null|
+-----------+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+
|correlation|
+-----------+
|        NaN|
+-----------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------+
|correlation|
+-----------+
|        NaN|
+-----------+

-------------------------------------------
Batch: 3
-------------------------------------------
+-----------+
|correlation|
+-----------+
|        NaN|
+-----------+

-------------------------------------------
Batch: 4
-------------------------------------------
+--------------------+
|         correlation|
+--------------------+
|-0.25956308445091103|
+--------------------+

-------------------------------------------
Batch: 5
-------------------------------------------
+-----------------

KeyboardInterrupt: 

-------------------------------------------
Batch: 85
-------------------------------------------
+--------------------+
|         correlation|
+--------------------+
|-0.08270688297560827|
+--------------------+

-------------------------------------------
Batch: 86
-------------------------------------------
+--------------------+
|         correlation|
+--------------------+
|-0.08270688297560827|
+--------------------+

-------------------------------------------
Batch: 87
-------------------------------------------
+--------------------+
|         correlation|
+--------------------+
|-0.08270688297560827|
+--------------------+

