# Stock Market Data Analysis
*Using PySpark, Spark SQL, and Confluent Kafka*

**Data Processing and Analytics (Prof. Angela Bonifati)**
* Loïc Chassin - 11720315 - loic.chassin-de-kergommeaux@etu.univ-lyon1.fr
* Myrfa Yumiaji Ferra - 12313695 - myrfa.yumiaji-ferra@etu.univ-lyon1.fr

**M2 DISS 2023/2024**

## Project Template

In [1]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

# Create the Spark session used by every cell of the notebook
# (getOrCreate reuses an already running session if there is one).
spark = (
    SparkSession.builder
    .appName("MyApp")
    # Kafka source connector for Structured Streaming, resolved at start-up
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0')
    # REPL eager-evaluation setting, so DataFrames are displayed without an explicit show()
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d0dc5f4b-b9b2-42a5-a7d1-08531d0d5aa0;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 180ms :: artifacts dl 4m

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType

# Schema used to parse the JSON messages read from Kafka:
# one record carries a name, a price and a timestamp.
schema = StructType([
    StructField("name", StringType(), nullable=False),
    StructField("price", DoubleType(), nullable=False),
    StructField("timestamp", TimestampType(), nullable=False),
])

In [3]:
from pyspark.sql.functions import from_json

# Address (host:port) of the Kafka broker
kafka_server = "kafka1:9092"

# Streaming source: subscribe to the "stock" topic and parse the value of each
# Kafka message (a JSON string) into a struct column named "parsed_value".
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", "stock")             # name of the Kafka topic
    .option("startingOffsets", "earliest")    # where to start when the query is started
    .option("maxOffsetsPerTrigger", 100)      # rate limit on offsets read per trigger
    .load()
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
)

# Flatten the parsed struct: one top-level column per schema field
df = lines.select("parsed_value.*")


## The assignment starts here


## 1. Select the N most valuable stocks in a window

In [None]:
# Query 1
# For every time window, keep the highest price reached by each stock, then
# report the TOP_N rows ordered by most recent window and by price.

# `max` is imported under an alias so that Python's builtin max() is not shadowed
from pyspark.sql.functions import col, window, max as spark_max

# Parameters of the query
TOP_N = 10                      # number of stocks to report (the "N" of the title)
WINDOW_DURATION = "5 minutes"   # length of the time window

# Group the data by time window and stock name, keeping the highest price seen
windowed_df = df.groupBy(window(col("timestamp"), WINDOW_DURATION), col("name")) \
    .agg(spark_max(col("price")).alias("price"))

# Most recent window first, then highest price first; keep the first TOP_N rows
sorted_df = windowed_df.orderBy(col("window").desc(), col("price").desc()).limit(TOP_N)

# Display the results to console.
# "complete" output mode is required here: sorting a streaming result is only
# possible in that mode.
query = sorted_df.writeStream.outputMode("complete").format("console").start()
query.awaitTermination()

## 2. Select the stocks that lost value between two windows

In [None]:
# Second, separate subscription to the "stock" topic.
# Query 2 joins `df` with the stream created here.

kafka_server = "kafka1:9092"

# Same source definition as for `df`: read the topic from the earliest offset
# and parse the JSON value of each message with `schema`.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", "stock")             # name of the Kafka topic
    .option("startingOffsets", "earliest")    # where to start when the query is started
    .option("maxOffsetsPerTrigger", 100)      # rate limit on offsets read per trigger
    .load()
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
)

# Flatten the parsed struct: one top-level column per schema field
new_df = lines.select("parsed_value.*")


In [None]:
# Query 2
# Stocks that lost value between two consecutive 1-minute windows.

from pyspark.sql.functions import expr, col, max, window

# Rename the columns of the 2 streams so they can be told apart after the join,
# and tag every record with the 1-minute window it falls in
df_old = df.withColumnRenamed("timestamp", "oldTime") \
        .withColumnRenamed("price", "oldPrice") \
        .withColumnRenamed("name", "oldName") \
        .withColumn("oldWindow", window("oldTime", "1 minutes"))

df_new = new_df.withColumnRenamed("timestamp", "newTime") \
        .withColumnRenamed("price", "newPrice") \
        .withColumnRenamed("name", "newName") \
        .withColumn("newWindow", window("newTime", "1 minutes"))


# Join every old record with the later, cheaper records of the same stock that
# fall in the window immediately following its own window.
# The adjacency is tested explicitly (newWindow starts where oldWindow ends):
# the 2-minute bound on the timestamps alone would also accept windows that
# are separated by a full window (e.g. 00:00:59 and 00:02:30).
joined_stream_df = (
    df_old.join(df_new, expr("""
        oldName = newName AND
        newWindow.start = oldWindow.end AND
        oldTime < newTime AND
        newTime <= oldTime + interval 2 minute AND
        oldPrice > newPrice """))
    # newPrice - oldPrice: always negative here, since only losses are joined
    .withColumn("difference", col("newPrice") - col("oldPrice"))
    .select(col("oldName").alias("name"), col("difference"), col("oldWindow"), col("newWindow"))
)

# Display results to Console
query = joined_stream_df.writeStream.outputMode("append").format("console").start()
query.awaitTermination()

## 3. Select the stock that gained the most (between windows)

In [None]:
# Separate subscription to the "stock" topic.
# Query 3 joins `df` with the stream created here.

kafka_server = "kafka1:9092"

# Same source definition as for `df`: read the topic from the earliest offset
# and parse the JSON value of each message with `schema`.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", "stock")             # name of the Kafka topic
    .option("startingOffsets", "earliest")    # where to start when the query is started
    .option("maxOffsetsPerTrigger", 100)      # rate limit on offsets read per trigger
    .load()
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
)

# Flatten the parsed struct: one top-level column per schema field
new_df = lines.select("parsed_value.*")


In [None]:
# Query 3
# Stocks that gained value between two consecutive 1-minute windows.

from pyspark.sql.functions import expr, col, max, window

# Rename the columns of the 2 streams so they can be told apart after the join,
# and tag every record with the 1-minute window it falls in
df_old = df.withColumnRenamed("timestamp", "oldTime") \
        .withColumnRenamed("price", "oldPrice") \
        .withColumnRenamed("name", "oldName") \
        .withColumn("oldWindow", window("oldTime", "1 minutes"))

df_new = new_df.withColumnRenamed("timestamp", "newTime") \
        .withColumnRenamed("price", "newPrice") \
        .withColumnRenamed("name", "newName") \
        .withColumn("newWindow", window("newTime", "1 minutes"))

# Join every old record with the later, more expensive records of the same
# stock that fall in the window immediately following its own window.
# The adjacency is tested explicitly (newWindow starts where oldWindow ends):
# the 2-minute bound on the timestamps alone would also accept windows that
# are separated by a full window (e.g. 00:00:59 and 00:02:30).
joined_stream_df = (
    df_old.join(df_new, expr("""
        oldName = newName AND
        newWindow.start = oldWindow.end AND
        oldTime < newTime AND
        newTime <= oldTime + interval 2 minute AND
        oldPrice < newPrice """))
    # newPrice - oldPrice: always positive here, since only gains are joined
    .withColumn("difference", col("newPrice") - col("oldPrice"))
    .select(col("oldName").alias("name"), col("difference"), col("newWindow").alias("window"))
)

# HERE we would like to sort + filter(1) to keep only the biggest gain, but sorting
# requires the complete output mode, and complete mode is not possible with a streaming join

# Display results to Console
query = joined_stream_df.writeStream.outputMode("append").format("console").start()
query.awaitTermination()

## 4. Control of value loss

In [4]:
# Separate subscription to the "stock" topic.
# Query 4 joins `df` with the stream created here.

kafka_server = "kafka1:9092"

# Same source definition as for `df`: read the topic from the earliest offset
# and parse the JSON value of each message with `schema`.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", "stock")             # name of the Kafka topic
    .option("startingOffsets", "earliest")    # where to start when the query is started
    .option("maxOffsetsPerTrigger", 100)      # rate limit on offsets read per trigger
    .load()
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
)

# Flatten the parsed struct: one top-level column per schema field
new_df = lines.select("parsed_value.*")


In [5]:
# Query 4
# Report the stocks whose price drops faster than THRESHOLD % per minute.

from pyspark.sql.functions import expr, col, max, window
from pyspark.sql.types import LongType

# Loss rate, in percent of the old price per minute, above which a stock is reported
THRESHOLD = 10

# Give each side of the join its own column names
df_old = (
    df.withColumnRenamed("timestamp", "oldTime")
    .withColumnRenamed("price", "oldPrice")
    .withColumnRenamed("name", "oldName")
)

df_new = (
    new_df.withColumnRenamed("timestamp", "newTime")
    .withColumnRenamed("price", "newPrice")
    .withColumnRenamed("name", "newName")
)

# Single pipeline: join, derive the loss rates, project, filter.
# NOTE: the select spells the column "perCentLossbyMinute" (lower-case "b") while
# it is created and filtered as "perCentLossByMinute"; the query runs as is, and
# the spelling used in the select is the one shown in the console output header.
joined_stream_df = (
    # pair each record with the later, cheaper records of the same stock
    # that occurred at most 2 minutes afterwards
    df_old.join(df_new, expr("""
        oldName = newName AND
        oldTime < newTime AND
        newTime <= oldTime + interval 2 minute AND
        oldPrice > newPrice """))
    # price change between the two records (negative, since the price went down)
    .withColumn("difference", col("newPrice") - col("oldPrice"))
    # time elapsed between the two records
    .withColumn("intervalSeconds", col("newTime").cast(LongType()) - col("oldTime").cast(LongType()))
    # value lost, scaled to one minute
    .withColumn("lossByMinute", 60 * col("difference") / col("intervalSeconds"))
    # same loss, expressed as a percentage of the old price
    .withColumn("perCentLossByMinute", 100 * col("lossByMinute") / col("oldPrice"))
    .select(col("oldName").alias("name"), col("perCentLossbyMinute"), col("lossByMinute"))
    # filter on "lossByMinute" instead to detect a loss over a certain amount;
    # filtering on "perCentLossByMinute" detects a loss over a certain percentage of the stock value
    .filter(col("perCentLossByMinute") < -THRESHOLD)
)

# Display results to Console
query = joined_stream_df.writeStream.outputMode("append").format("console").start()
query.awaitTermination()

23/12/15 15:38:19 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-1ce50bab-3955-40a3-83e8-098eb09f7a9d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-------------------+-------------------+
|name|perCentLossbyMinute|       lossByMinute|
+----+-------------------+-------------------+
|FITB| -18.84145119262373|-3.1333333333333258|
+----+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----+-------------------+-------------------+
|name|perCentLossbyMinute|       lossByMinute|
+----+-------------------+-------------------+
|NFLX|-17.915982864084896| -5.043599999999995|
| DAL|-27.556644213104704| -4.695652173913041|
| DAL|-15.333939118582459|-2.6129032258064506|
| VAR|-18.685121107266557|-13.500000000000085|
+----+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+----+-------------------+------------------+
|name|perCentLossbyMinute|      lossByMinute|
+----+-------------------+------------------+
| GPC|-10.432114548708789|-8.160000000000014|
+----+-------------------+------------------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+----+-------------------+------------------+
|name|perCentLossbyMinute|      lossByMinute|
+----+-------------------+------------------+
| WMB|-10.842354813574508|-4.004081632653065|
| EXC|-12.955997372977489|-4.822222222222221|
| SIG|-13.175597309208952|-9.466666666666631|
+----+-------------------+------------------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+----+-------------------+-------------------+
|name|perCentLossbyMinute|       lossByMinute|
+----+-------------------+-------------------+
|  GS| -15.72617632070662|-26.451428571428533|
+----+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-----+-------------------+------------------+
| name|perCentLossbyMinute|      lossByMinute|
+-----+-------------------+------------------+
|GOOGL|-14.650932541027151|-66.50842105263169|
+-----+-------------------+------------------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-----+-------------------+------------------+
| name|perCentLossbyMinute|      lossByMinute|
+-----+-------------------+------------------+
|  ADM| -11.01533547634159|-4.138461538461535|
|  UAL|-20.501698244465356|-7.157142857142855|
|  UNM|-14.769071319068086|-4.717241379310347|
|GOOGL|-10.066631513350295|-45.69782608695663|
| VRSK|-11.435752434809885|-7.279999999999973|
|  MOS|-22.243421052631582|-9.965052631578947|
+-----+-------------------+------------------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+----+-------------------+-------------------+
|name|perCentLossbyMinute|       lossByMinute|
+----+-------------------+-------------------+
|   C|-29.824940566241622| -15.77142857142857|
| CAH| -17.66685361749867| -9.450000000000038|
|  IT|-19.543973941368055|-11.399999999999986|
+----+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+----+-------------------+-------------------+
|name|perCentLossbyMinute|       lossByMinute|
+----+-------------------+-------------------+
|CELG|-11.869220438156015| -9.568965517241379|
|AKAM|-21.738136600530293|-11.411999999999992|
+----+-------------------+-------------------+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+----+-------------------+------------+
|name|perCentLossbyMinute|lossByMinute|
+----+-------------------+------------+
+----+-------------------+------------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+----+-------------------+------------+
|name|perCentLossbyMinute|lossByMinute|
+----+-------------------+------------+
+----+-------------------+------------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+----+-------------------+------------+
|name|perCentLossbyMinute|lossByMinute|
+----+-------------------+------------+
+----+-------------------+------------+





KeyboardInterrupt: 

                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+----+-------------------+------------+
|name|perCentLossbyMinute|lossByMinute|
+----+-------------------+------------+
+----+-------------------+------------+





## 5. Correlation between stocks

In [None]:
# Query 5
# Correlation between GOOGL and the stocks quoted in the same minute.

from pyspark.sql.functions import max, expr, col, window, sum, avg, count, stddev, mean, corr

# Outline:
# - short windows (1 minute) are used to pair "simultaneous" records
# - longer windows (5 minutes) are used to aggregate the pairs


# compute a small window of each record to detect "simultaneous" events
windowed_df = df.withColumn("window", window("timestamp", "1 minutes"))

# copy of the stream for the self-join, with every column renamed
windowed_df_2 = windowed_df.withColumnRenamed("name", "name2") \
        .withColumnRenamed("price", "price2") \
        .withColumnRenamed("timestamp", "timestamp2") \
        .withColumnRenamed("window", "window2")

# Join events within the same time-window ("simultaneous" events)
# the correlation is computed for GOOGL only (left side of every pair)
joined_df = windowed_df.join(windowed_df_2, expr("""
        name = "GOOGL" AND
        window = window2
        """))
# result is a list of pairs of records happening in the same window

# now we can group by pair of stocks, in a larger window, for the computation.
# Each statistic is computed for both sides of the pair (price / price2), and
# `corr` gives the Pearson Correlation Coefficient (PCC) of the two prices in a
# single aggregation, so no aggregation has to be stacked on top of another.
final_df = joined_df.withWatermark("timestamp", "1 minutes") \
        .groupBy(window("timestamp", "5 minutes"), col("name"), col("name2")) \
        .agg(count(col("name")).alias("count"),
            avg(col("price")).alias("avg"),
            avg(col("price2")).alias("avg2"),
            stddev(col("price")).alias("stddev"),
            stddev(col("price2")).alias("stddev2"),
            corr(col("price"), col("price2")).alias("correlation"))

# the output is, for each pair of stocks and each 5-minute window, the number of
# paired records, the average and standard deviation of both prices, and their
# correlation coefficient

# Display results to Console
query = final_df.writeStream.outputMode("append").format("console").start()
query.awaitTermination()