## Project

In [1]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

# Create (or reuse) the notebook-wide SparkSession.
#   * spark.jars.packages              -> adds the spark-sql-kafka connector as a dependency
#   * spark.sql.repl.eagerEval.enabled -> turns on eager evaluation in the REPL
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-acee1fab-4598-4c3c-ab4d-8a61a2f7b6f4;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 202ms :: artifacts dl 3m

### Spark Stream with socket

### Spark streaming with Kafka

We will use the other notebook "Kafka-producer" to handle Kafka

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType

# Layout of one JSON stock message: the stock's name, its price and the
# timestamp of the quote. All three fields are declared non-nullable.
schema = (
    StructType()
    .add("name", StringType(), False)
    .add("price", DoubleType(), False)
    .add("timestamp", TimestampType(), False)
)

In [3]:
kafka_server = "kafka1:9092"
from pyspark.sql.functions import from_json

# Stage 1: streaming source reading the "stock" Kafka topic.
kafka_source = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)  # Kafka server name and port
    .option("subscribe", "stock")                     # topic to subscribe to
    .option("startingOffsets", "earliest")            # start point when a query is started
    .option("maxOffsetsPerTrigger", 100)              # rate limit on offsets per trigger interval
    .load()
)

# Stage 2: cast the Kafka `value` column to a string and parse it as JSON
# according to `schema`; the result is a single struct column "parsed_value".
parsed_value = from_json(col("value").cast("string"), schema).alias("parsed_value")
lines = kafka_source.select(parsed_value)

# Stage 3: expand the struct's fields into top-level columns.
df = lines.select("parsed_value.*")


## Select the N most valuable stocks in a window

In [4]:
from pyspark.sql.functions import window, col, desc, asc, rank

# Tunables for the ranking query (previously hard-coded inline).
TOP_N = 5                      # number of rows kept after ranking
WINDOW_DURATION = "5 seconds"  # length of each aggregation window
WATERMARK_DELAY = "5 seconds"  # watermark delay on the event-time column

# Average price per (stock, window), highest averages first, truncated to TOP_N rows.
# NOTE: the ranking runs over every (name, window) pair in the result table, not
# separately inside each window, so one stock can occupy several rows.
value = (
    df.withWatermark("timestamp", WATERMARK_DELAY)
    .groupBy("name", window("timestamp", WINDOW_DURATION))
    .avg("price")
    .orderBy(desc("avg(price)"))
    .limit(TOP_N)
)

# Keep the StreamingQuery handle so the query can be inspected or stopped later
# (top_stocks_query.stop()) instead of running until the kernel is restarted.
top_stocks_query = (
    value.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", False)  # print the full window bounds so rows can be told apart
    .start()
)
top_stocks_query

24/11/15 16:33:02 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-011009ba-607c-4f51-a586-db6b77d4ef6d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


<pyspark.sql.streaming.StreamingQuery at 0xffff9849cf40>

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------------------+----------+
|name|              window|avg(price)|
+----+--------------------+----------+
| AZO|[2024-11-15 16:32...|    384.01|
|ISRG|[2024-11-15 16:32...|  192.6465|
| PXD|[2024-11-15 16:32...|    124.69|
| CMI|[2024-11-15 16:32...|    119.79|
| KMB|[2024-11-15 16:32...|     90.08|
+----+--------------------+----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----+--------------------+----------+
|name|              window|avg(price)|
+----+--------------------+----------+
| AZO|[2024-11-15 16:32...|    384.01|
|ISRG|[2024-11-15 16:32...|  192.6465|
| PXD|[2024-11-15 16:33...|     128.0|
| PXD|[2024-11-15 16:32...|    124.69|
| CMI|[2024-11-15 16:32...|    119.79|
+----+--------------------+----------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----+--------------------+----------+
|name|              window|avg(price)|
+----+--------------------+----------+
| AZO|[2024-11-15 16:32...|    384.01|
|ISRG|[2024-11-15 16:32...|  192.6465|
| PXD|[2024-11-15 16:33...|     128.0|
| PXD|[2024-11-15 16:32...|    124.69|
| CMI|[2024-11-15 16:32...|    119.79|
+----+--------------------+----------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+----+--------------------+----------+
|name|              window|avg(price)|
+----+--------------------+----------+
| AZO|[2024-11-15 16:32...|    384.01|
|ISRG|[2024-11-15 16:32...|  192.6465|
| PXD|[2024-11-15 16:33...|     128.0|
| PXD|[2024-11-15 16:32...|    124.69|
| CMI|[2024-11-15 16:32...|    119.79|
+----+--------------------+----------+



## Select the stocks that lost value between two windows

In [None]:
# remember you can register another stream
from pyspark.sql.functions import expr, window, avg
from pyspark.sql import functions as F

# A second streaming read of the same "stock" topic, parsed with the same
# schema, so that the feed can be joined against itself.
secondStream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)  # Kafka server name and port
    .option("subscribe", "stock")                     # topic to subscribe to
    .option("startingOffsets", "earliest")            # start point when a query is started
    .option("maxOffsetsPerTrigger", 100)              # rate limit on offsets per trigger interval
    .load()
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
)

# Right-hand side of the join: same fields under new names so they do not
# collide with the left-hand side (name -> secondName, price -> newPrice,
# timestamp -> newTime). The watermark is set before the timestamp is renamed.
df2 = (
    secondStream.select("parsed_value.*")
    .withColumnRenamed("name", "secondName")
    .withColumnRenamed("price", "newPrice")
    .withWatermark("timestamp", "5 seconds")
    .withColumnRenamed("timestamp", "newTime")
)

# Left-hand side: the original stream with a 5-second watermark.
df1 = df.withWatermark("timestamp", "5 seconds")

# Pair each quote with quotes of the same stock that arrive strictly later,
# at most 5 seconds afterwards.
later_quote_condition = expr("""
    name = secondName AND
    newTime > timestamp AND
    newTime <= timestamp + interval 5 seconds
    """)

# Per stock and 5-second window of the earlier quote: average earlier price
# and average later price, keeping only groups where the later average is lower.
joined_streams_df = (
    df1.join(df2, later_quote_condition, "inner")
    .groupBy("name", "secondName", window("timestamp", "5 seconds"))
    .agg(F.avg("price"), F.avg("newPrice"))
    .filter(col("avg(newPrice)") < col("avg(price)"))
)

(
    joined_streams_df.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

## Select the stock that gained the most (between windows)

In [None]:
# remember you can register another stream
from pyspark.sql.functions import expr, window, avg, desc, abs
from pyspark.sql import functions as F

# Independent streaming read of the "stock" topic used as the "later quotes"
# side of a self-join.
secondStream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)  # Kafka server name and port
    .option("subscribe", "stock")                     # topic to subscribe to
    .option("startingOffsets", "earliest")            # start point when a query is started
    .option("maxOffsetsPerTrigger", 100)              # rate limit on offsets per trigger interval
    .load()
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
)

# Rename the parsed fields (secondName / newPrice / newTime); the watermark is
# attached while the column is still called "timestamp".
renamed_quotes = (
    secondStream.select("parsed_value.*")
    .withColumnRenamed("name", "secondName")
    .withColumnRenamed("price", "newPrice")
)
df2 = (
    renamed_quotes
    .withWatermark("timestamp", "5 seconds")
    .withColumnRenamed("timestamp", "newTime")
)

df1 = df.withWatermark("timestamp", "5 seconds")

# Same stock, later quote, no more than 5 seconds after the earlier one.
followup_quote_condition = expr("""
    name = secondName AND
    newTime > timestamp AND
    newTime <= timestamp + interval 5 seconds
    """)

# "amount" is the price change from the earlier quote to the later one.
quote_pairs = (
    df1.join(df2, followup_quote_condition, "inner")
    .withColumn("amount", col("newPrice") - col("price"))
)

# Average and maximum price change per stock and 5-second window.
joined_streams_df = (
    quote_pairs
    .groupBy("name", window("timestamp", "5 seconds"))
    .agg(F.avg("amount"), F.max("amount"))
)

(
    joined_streams_df.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

## Compute your assets

In [4]:
! pip install pandas

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/69/a8/6783854b7e7f64016f08c56b36a95ae5a89c6f7e99d68b8aea1c221cb68e/pandas-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata
  Downloading pandas-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (18 kB)
Collecting numpy>=1.22.4 (from pandas)
  Obtaining dependency information for numpy>=1.22.4 from https://files.pythonhosted.org/packages/41/95/1145b9072e39ef4c40d62f76d0d80be65a7c383ba3ef9ccd2d9a97974752/numpy-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata
  Downloading numpy-1.26.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packag

In [4]:
import pandas as pd

# Holdings, one [name, amount] row per stock.
assets = [
    ["CCL", 100],
    ["ADM", 200],
]

assetsDF = pd.DataFrame(data=assets, columns=["name", "amount"])
# Make DataFrame.items reachable under the name `iteritems` on this instance.
setattr(assetsDF, "iteritems", assetsDF.items)


In [5]:
# Repeats the alias already applied in the cell that builds assetsDF.
# NOTE(review): presumably a shim so code that still calls the pandas
# `DataFrame.iteritems` method keeps working — confirm, and consider removing
# this duplicate cell.
assetsDF.iteritems = assetsDF.items


In [6]:
# Turn the pandas holdings table into a Spark DataFrame, then print its
# schema and its rows.
sparkDF = spark.createDataFrame(data=assetsDF)
sparkDF.printSchema()
sparkDF.show()

  if is_datetime64tz_dtype(s.dtype):


root
 |-- name: string (nullable = true)
 |-- amount: long (nullable = true)

+----+------+
|name|amount|
+----+------+
| CCL|   100|
| ADM|   200|
+----+------+



                                                                                

In [7]:
from pyspark.sql.functions import window, col, desc, asc, rank

# Average price per stock over 5-second tumbling windows (5-second watermark).
windowed = (
    df.withWatermark("timestamp", "5 seconds")
    .groupBy("name", window("timestamp", "5 seconds"))
    .avg("price")
)

# Match the holdings to the windowed averages by stock name, then add
# "money" = amount held * average price in that window.
holdings_with_price = sparkDF.join(windowed, "name", "inner")
query = holdings_with_price.withColumn("money", sparkDF.amount * windowed["avg(price)"])

(
    query.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

23/10/13 14:18:31 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ad5b0caa-b95d-4a35-ad09-7c2d548e66a1. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.


<pyspark.sql.streaming.StreamingQuery at 0xffff653eae20>

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+----+------+--------------------+----------+------+
|name|amount|              window|avg(price)| money|
+----+------+--------------------+----------+------+
| ADM|   200|[2023-10-13 09:34...|     32.46|6492.0|
+----+------+--------------------+----------+------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+----+------+--------------------+----------+------+
|name|amount|              window|avg(price)| money|
+----+------+--------------------+----------+------+
| ADM|   200|[2023-10-13 09:57...|     32.46|6492.0|
+----+------+--------------------+----------+------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+----+------+--------------------+----------+------------------+
|name|amount|              window|avg(price)|             money|
+----+------+--------------------+----------+------------------+
| CCL|   100|[2023-10-13 09:58...|     33.95|3395.0000000000005|
| ADM|   200|[2023-10-13 09:57...|     32.82|            6564.0|
+----+------+--------------------+----------+------------------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+----+------+--------------------+----------+------+
|name|amount|              window|avg(price)| money|
+----+------+--------------------+----------+------+
| CCL|   100|[2023-10-13 09:59...|     32.35|3235.0|
+----+------+--------------------+----------+------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+----+------+--------------------+----------+------+
|name|amount|              window|avg(price)| money|
+----+------+--------------------+----------+------+
| ADM|   200|[2023-10-13 10:01...|     37.57|7514.0|
+----+------+--------------------+----------+------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+----+------+--------------------+----------+-----------------+
|name|amount|              window|avg(price)|            money|
+----+------+--------------------+----------+-----------------+
| ADM|   200|[2023-10-13 10:02...|     34.88|6976.000000000001|
+----+------+--------------------+----------+-----------------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+----+------+--------------------+----------+------------------+
|name|amount|              window|avg(price)|             money|
+----+------+--------------------+----------+------------------+
| CCL|   100|[2023-10-13 10:03...|   32.5101|           3251.01|
| CCL|   100|[2023-10-13 10:03...|     32.48|3247.9999999999995|
+----+------+--------------------+----------+------------------+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+----+------+--------------------+----------+-------+
|name|amount|              window|avg(price)|  money|
+----+------+--------------------+----------+-------+
| ADM|   200|[2023-10-13 10:05...|   43.7599|8751.98|
+----+------+--------------------+----------+-------+



                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+----+------+------+----------+-----+
|name|amount|window|avg(price)|money|
+----+------+------+----------+-----+
+----+------+------+----------+-----+



