## Project Template

In [1]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) the SparkSession for this notebook.
#  - spark.jars.packages pulls in the Kafka connector for Structured Streaming
#  - spark.sql.repl.eagerEval.enabled turns on eager DataFrame evaluation in the REPL
spark = (
    SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0')
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.9/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-01be8b2c-2386-42dc-9ff3-05d93bcce7fc;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 172ms :: artifacts dl 3m

Be sure to start the stream on Kafka!

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, TimestampType, DateType

# Schema of one stock record: stock name, price and event timestamp.
# Every field is declared non-nullable (third StructField argument is False).
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("name", StringType()),
        ("price", DoubleType()),
        ("timestamp", TimestampType()),
    )
])

In [3]:
# Kafka broker (host:port) the stream is read from
kafka_server = "kafka1:9092"
from pyspark.sql.functions import from_json

# Streaming read of the "stock" Kafka topic.  The message value is cast to a
# string and parsed as JSON with `schema` into one struct column.
lines = (
    spark.readStream
    # Source format: Kafka
    .format("kafka")
    # Kafka server name and port
    .option("kafka.bootstrap.servers", kafka_server)
    # Topic to subscribe to
    .option("subscribe", "stock")
    # Where to start reading when the query is started
    .option("startingOffsets", "earliest")
    # Rate limit: maximum number of offsets per trigger interval
    .option("maxOffsetsPerTrigger", 100)
    # Create the streaming DataFrame
    .load()
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))
)

# Promote the fields of the parsed struct to top-level columns
df = lines.select("parsed_value.*")


## The assignment starts here



## Select the N most valuable stocks in a window (TASK 1)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql import functions as F

# Number of stocks printed in each ranking
TOP_N_STOCKS = 5

# Average price per stock name within each 20-minute window
windowedDF = df.groupBy(
    window(col("timestamp"), "20 minutes"),
    "name"
).avg("price").withColumnRenamed("avg(price)", "avg_price")


def show_top_stocks(batch_df, epoch_id):
    """Print the TOP_N_STOCKS rows of one micro-batch with the highest avg_price.

    Args:
        batch_df: the aggregated result handed over by foreachBatch.
        epoch_id: micro-batch identifier supplied by Spark (unused).

    NOTE(review): the ordering is applied to the whole batch, so the ranking
    spans every window present in the result rather than one ranking per
    window — confirm that this is the intended reading of the task.
    """
    batch_df.orderBy(F.desc("avg_price")).limit(TOP_N_STOCKS).show()


# "complete" mode hands the full aggregated result to every micro-batch, so the
# ranking is computed over all aggregates rather than only the rows that changed.
# No .format(...) is set: foreachBatch is the sink, and a format chosen before it
# would be silently replaced.
topStocksQuery = (
    windowedDF.writeStream
    .outputMode("complete")
    .foreachBatch(show_top_stocks)
    .start()
)

# Wait at most 5 seconds, then always stop the query — also when the wait is
# interrupted — so it does not keep running in the background.
try:
    topStocksQuery.awaitTermination(timeout=5)
finally:
    topStocksQuery.stop()

## Select the stocks that lost value between two windows (TASK 2)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, last
from pyspark.sql.types import StringType, DoubleType, StructType, StructField, TimestampType


# StockState objects keyed by stock name
stock_states = {}
# Names of all stocks that were seen losing value at least once
stocks_that_lost_value = set()


class StockState:
    """Tracks the last accepted price of one stock and whether it just dropped.

    Attributes are the stock name, the price and window end of the most recent
    accepted update, and a flag telling whether that update was a loss.
    """

    def __init__(self, name):
        self.name = name
        self.last_price = None
        self.last_window_end = None
        self.lost_value = False

    def update(self, price, window_end):
        """Record `price` for the window ending at `window_end`.

        The update is only accepted for the first window seen or for a window
        ending strictly later than the last accepted one; otherwise the state
        is left untouched.  On an accepted update, a price below the previous
        one sets `lost_value` and adds the name to `stocks_that_lost_value`.

        Returns:
            Tuple (name, last_price, last_window_end, lost_value) describing
            the state after the call.
        """
        is_newer_window = (
            self.last_window_end is None or window_end > self.last_window_end
        )
        if is_newer_window:
            has_previous = self.last_price is not None
            if has_previous and price < self.last_price:
                self.lost_value = True
                stocks_that_lost_value.add(self.name)
            else:
                self.lost_value = False
            self.last_price, self.last_window_end = price, window_end

        return (self.name, self.last_price, self.last_window_end, self.lost_value)


# Per stock name and 20-minute window, keep the value returned by last("price").
# A 10-minute watermark on "timestamp" is set before the aggregation.
windowedDF = (
    df.withWatermark("timestamp", "10 minutes")
    .groupBy(col("name"), window(col("timestamp"), "20 minutes"))
    .agg(last("price").alias("last_price"))
)

def process_batch(df, epoch_id):
    """Feed one micro-batch into the per-stock states and print the losers so far.

    Args:
        df: micro-batch DataFrame with columns name, window and last_price.
        epoch_id: micro-batch identifier supplied by foreachBatch (unused).
    """
    # collect() returns rows in no particular order, while StockState.update
    # ignores any window that is not newer than the last one it accepted.
    # Processing oldest-first keeps an older window from being dropped just
    # because a newer one of the same stock happened to come first.
    rows = sorted(df.collect(), key=lambda row: row['window'].end)

    for row in rows:
        stock_name = row['name']

        # Create the state only when the stock is seen for the first time
        stock_state = stock_states.get(stock_name)
        if stock_state is None:
            stock_state = stock_states[stock_name] = StockState(stock_name)

        # update() records losers in stocks_that_lost_value as a side effect
        stock_state.update(row['last_price'], row['window'].end)

    print(stocks_that_lost_value)

# Run process_batch on every micro-batch of the windowed aggregation,
# using the "update" output mode.
query = (
    windowedDF.writeStream
    .outputMode("update")
    .foreachBatch(process_batch)
    .start()
)

# Let the stream run for at most 60 seconds, then always stop it — also when
# the wait is interrupted — so the query does not keep running in the background.
try:
    query.awaitTermination(timeout=60)
finally:
    query.stop()


## Select the stock that gained the most (between windows) (TASK 3)

## TASK 4

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, last

class StockState:
    """Remembers the last accepted price of one stock to compute gains between windows."""

    def __init__(self, name):
        self.name = name
        self.last_price = None
        self.last_window_end = None

    def update(self, new_price, window_end):
        """Accept `new_price` for the window ending at `window_end` and return the gain.

        Returns:
            new_price minus the previously accepted price when the window ends
            strictly later than the last accepted one; 0 when this is the first
            price for the stock or when the window is not newer (in which case
            the state is left unchanged).
        """
        # Guard: anything that is not a strictly newer window is ignored
        if self.last_window_end is not None and not (window_end > self.last_window_end):
            return 0

        # First observation has no reference price, hence zero gain
        gain = 0 if self.last_price is None else new_price - self.last_price
        self.last_price = new_price
        self.last_window_end = window_end
        return gain



# Expects the streaming DataFrame `df` with columns timestamp, name and price.
# Per stock name and 20-minute window, keep the value returned by last("price");
# a 10-minute watermark on "timestamp" is set before the aggregation.
windowedDF = (
    df.withWatermark("timestamp", "10 minutes")
    .groupBy(col("name"), window(col("timestamp"), "20 minutes"))
    .agg(last("price").alias("last_price"))
)

# StockState objects keyed by stock name
stock_states = dict()
# Accumulated (stock_name, gain) tuples across all processed batches
gains_set = set()

def process_batch(df, epoch_id):
    """Update the per-stock states with one micro-batch and print the top gainers.

    Positive gains are added to the module-level `gains_set`, which accumulates
    across batches; the ten largest entries are printed after every batch.

    Args:
        df: micro-batch DataFrame with columns name, window and last_price.
        epoch_id: micro-batch identifier supplied by foreachBatch (unused).
    """
    # Gains found in this batch only
    gains_list = []

    # collect() returns rows in no particular order, while StockState.update
    # ignores any window that is not newer than the last one it accepted.
    # Processing oldest-first keeps an older window from being dropped just
    # because a newer one of the same stock happened to come first.
    rows = sorted(df.collect(), key=lambda row: row['window'].end)

    for row in rows:
        stock_name = row['name']

        # Create the state only when the stock is seen for the first time
        stock_state = stock_states.get(stock_name)
        if stock_state is None:
            stock_state = stock_states[stock_name] = StockState(stock_name)

        gain = stock_state.update(row['last_price'], row['window'].end)
        if gain > 0:
            gains_list.append((stock_name, gain))

    gains_set.update(gains_list)
    # Largest gains first, limited to the top 10
    top_gainers = sorted(gains_set, key=lambda x: x[1], reverse=True)[:10]

    print(top_gainers)

# Run process_batch on every micro-batch of the windowed aggregation,
# using the "update" output mode.
query = (
    windowedDF.writeStream
    .outputMode("update")
    .foreachBatch(process_batch)
    .start()
)

# Let the stream run for at most 60 seconds, then always stop it — also when
# the cell is interrupted.  Without the stop() the query keeps running (and
# printing) in the background after this cell has finished.
try:
    query.awaitTermination(timeout=60)
finally:
    query.stop()

23/12/18 08:42:56 WARN StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-815fcdaa-3740-4d2a-a0de-517f28e6ef56. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[]


                                                                                

[('LYB', 19.099999999999994), ('A', 16.82), ('FISV', 13.925000000000004), ('ETFC', 11.35), ('ADM', 8.509999999999998), ('AFL', 8.350000000000001), ('MAR', 8.079999999999998), ('BBT', 6.669999999999998), ('CHK', 5.800000000000001), ('APH', 5.515000000000001)]


                                                                                

[('REGN', 104.67599999999999), ('ALXN', 72.17999999999999), ('BLK', 49.95999999999998), ('HII', 49.519999999999996), ('ORLY', 36.235), ('EOG', 32.010000000000005), ('MLM', 31.36), ('PKG', 30.0), ('CMI', 23.519899999999993), ('GWW', 23.430000000000007)]


                                                                                

[('AZO', 133.99), ('REGN', 104.67599999999999), ('ALXN', 72.17999999999999), ('BLK', 49.95999999999998), ('HII', 49.519999999999996), ('AAP', 45.59), ('NOC', 42.43000000000001), ('WDC', 40.5), ('ORLY', 36.235), ('FB', 32.56)]


                                                                                

[('AZO', 133.99), ('REGN', 104.67599999999999), ('ALXN', 72.17999999999999), ('BLK', 49.95999999999998), ('HII', 49.519999999999996), ('AAP', 45.59), ('NOC', 42.43000000000001), ('WDC', 40.5), ('LLL', 36.34), ('ORLY', 36.235)]


KeyboardInterrupt: 



## TASK 4

In [None]:
# Print the column names and types of the streaming DataFrame `df`
df.printSchema()

In [None]:
from pyspark.sql.functions import window, col, first, last

# Percentage change of each stock over a 1-hour window, signed so that a
# falling price gives a negative number: (last - first) / first * 100.
# (The previous form, first - last, was positive for a loss, which made the
# filter below keep heavy losers and drop gainers.)
# NOTE: Spark documents first()/last() as non-deterministic because their
# result depends on row order, which is not guaranteed after a shuffle.
windowed_data = (
    df.withWatermark("timestamp", "1 hour")
    .groupBy(window("timestamp", "1 hour"), col("name"))
    .agg((((last(df.price) - first(df.price)) / first(df.price)) * 100).alias("value_change"))
)

# Largest tolerated loss, in percent (5% in this example)
threshold = 5

# Keep the stocks whose change is not below -threshold, i.e. that lost at
# most `threshold` percent (or gained)
control_pass = windowed_data.filter(col("value_change") >= -threshold)

# Print the passing stocks to the console in "update" mode
query = (
    control_pass.writeStream
    .outputMode("update")
    .format("console")
    .start()
)

# Block until the query terminates; stop it if the wait is interrupted so the
# query does not keep running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()

## Bonus (InfluxDB & Grafana)

In [None]:
import hashlib

def generate_window_id(window_start):
    """Build an identifier from a window start string by stripping separators.

    Args:
        window_start: string such as '2023-10-29 14:40:00'.

    Returns:
        The input with every '-', ' ' and ':' removed, e.g. '20231029144000'.
        All other characters are kept as they are.
    """
    # A truncated hash of window_start would be an alternative, shorter id.
    separators = ('-', ' ', ':')
    return ''.join(ch for ch in window_start if ch not in separators)

In [None]:
import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS
from pyspark.sql import Row
import time

class InfluxDBForeachWriter:
    """Writes windowed average stock prices to InfluxDB, one point per row.

    Usage is open() -> process() for each row -> close().
    """

    def open(self, partition_id, epoch_id):
        """Create the InfluxDB client and a synchronous write API.

        Connection settings are read from the environment variables
        INFLUXDB_URL, INFLUXDB_TOKEN, INFLUXDB_ORG and INFLUXDB_BUCKET and fall
        back to the notebook's original values when a variable is not set.

        Args:
            partition_id: not used by this writer.
            epoch_id: not used by this writer.

        Returns:
            True once the client has been created.
        """
        import os  # local import so this cell does not rely on another cell's imports

        self.url = os.environ.get("INFLUXDB_URL", "http://influxdb:8086")
        # NOTE(security): a real token should not live in the notebook. The
        # literal is kept only as a fallback so existing setups keep working;
        # rotate it and supply INFLUXDB_TOKEN instead.
        self.token = os.environ.get(
            "INFLUXDB_TOKEN",
            "uX9_2aP6otzrpPG6XRwZ8LPShIEtFbtiwU2Yya0HfC8fCWyMbHZ7Xm-ivXo7on2MYaPqEEwJL2TAtRU-O_n56A==",
        )
        self.org = os.environ.get("INFLUXDB_ORG", "streaming_practice")
        self.bucket = os.environ.get("INFLUXDB_BUCKET", "stocks")
        self.client = influxdb_client.InfluxDBClient(
            url=self.url,
            token=self.token,
            org=self.org
        )
        self.write_api = self.client.write_api(write_options=SYNCHRONOUS)
        return True

    def process(self, row: "Row", window_id: int):
        """Write one aggregated row as a point of the "stock_testing" measurement.

        Failures are reported with print() and not re-raised, so one bad row
        does not abort the rest of the batch.

        Args:
            row: row exposing `name`, `avg_price` and `window.start`.
            window_id: value stored in the point's "window_id" tag.
        """
        try:
            # The window start is stored as a plain 'YYYY-MM-DD HH:MM:SS' string field
            window_start_str = row.window.start.strftime('%Y-%m-%d %H:%M:%S')

            # The point is stamped with the current wall-clock time in
            # nanoseconds, not with the window's own time
            data_point = influxdb_client.Point("stock_testing") \
                .tag("name", str(row.name)) \
                .tag("window_id", window_id) \
                .field("avg_price", float(row.avg_price)) \
                .field("window_start", window_start_str) \
                .time(time.time_ns())
            print(f"Attempting to write data point: {data_point.to_line_protocol()}")

            self.write_api.write(bucket=self.bucket, record=data_point)
            print("Write successful")
        except Exception as e:
            print(f"An error occurred: {e}")

    def close(self, error):
        """Close the write API and the client.

        The client is closed even when closing the write API raises, and
        calling close() on a writer that was never opened is a no-op.

        Args:
            error: not used by this writer.

        Returns:
            True when both resources were closed without an exception.
        """
        write_api = getattr(self, "write_api", None)
        client = getattr(self, "client", None)
        try:
            if write_api is not None:
                write_api.close()
        finally:
            if client is not None:
                client.close()
        return True


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql import functions as F
from pyspark.sql.streaming import DataStreamWriter



# Counter stored as the "window_id" tag; it advances once per processed batch
window_id = 0

def write_to_influxdb(df, epoch_id):
    """Write the five rows with the highest avg_price of a micro-batch to InfluxDB.

    Args:
        df: micro-batch DataFrame with columns window, name and avg_price.
        epoch_id: micro-batch identifier supplied by foreachBatch.
    """
    global window_id

    writer = InfluxDBForeachWriter()

    # open() takes (partition_id, epoch_id); there is no partition at batch level
    if not writer.open(None, epoch_id):
        print("Failed to open InfluxDB connection")
        return

    try:
        print(window_id)
        # Sort by average price and take the top 5
        top5_df = df.orderBy(F.desc("avg_price")).limit(5).collect()

        for row in top5_df:
            try:
                writer.process(row, window_id)
            except Exception as e:
                print(f"Error writing to InfluxDB: {e}")
                break  # or continue, depending on your requirement

        # Advance the counter for the next batch
        window_id += 1
    finally:
        # Always release the connection, also when collecting the rows fails;
        # close() takes an error argument, and there is none to report here
        writer.close(None)




# NOTE(review): this module-level instance does not appear to be used in this
# cell — write_to_influxdb creates its own writer for every batch.  Kept in case
# another cell relies on it.
writer = InfluxDBForeachWriter()

# Average price per stock name within each 20-minute window
windowedDF = df.groupBy(
    window(col("timestamp"), "20 minutes"),
    "name"
).avg("price").withColumnRenamed("avg(price)", "avg_price")

# "complete" mode hands the full aggregated result to write_to_influxdb on
# every micro-batch
topStocksQuery = (
    windowedDF.writeStream
    .outputMode("complete")
    .foreachBatch(write_to_influxdb)
    .start()
)

# Let the stream run for at most 50 seconds, then always stop it — also when
# the wait is interrupted — so the query does not keep running in the background.
try:
    topStocksQuery.awaitTermination(timeout=50)
finally:
    topStocksQuery.stop()
