In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

In [4]:
import os
import pandas as pd
# Connector packages handed to spark-submit through PYSPARK_SUBMIT_ARGS.
# The adjacent string literals below concatenate into one comma-separated
# "--packages" value followed by the "pyspark-shell" target.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,'
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,'
    'org.mongodb.spark:mongo-spark-connector_2.12:10.1.1'
    ' pyspark-shell'
)
from pymongo import MongoClient
import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, element_at, when, broadcast

from pyspark.sql.types import (
    StructType, StringType, IntegerType, DoubleType, TimestampType
)
from pyspark.sql.functions import (
    col, expr, from_json
)
import uuid

class SparkInst:
    """Bundles a local SparkSession with the camera-event schema and Kafka/broadcast helpers."""

    def __init__(self, app_name: str, batch_interval: int, kafka_output_topic: str):
        """
        Create (or reuse) a local SparkSession and define the camera-event schema.

        Args:
            app_name (str): The name of the Spark application.
            batch_interval (int): The interval (in seconds) at which streaming data is
                processed. Only stored on the instance; no method of this class reads it.
            kafka_output_topic (str): Name of the Kafka output topic. Only stored on the
                instance; no method of this class reads it.
        """
        self.batch_interval = batch_interval
        self.kafka_output_topic = kafka_output_topic
        # Layout of the JSON document carried in each Kafka message value;
        # used by attach_kafka_stream to parse the payload into columns.
        self.eventSchema = (
            StructType()
            .add("batch_id", IntegerType())
            .add("event_id", StringType())
            .add("car_plate", StringType())
            .add("camera_id", IntegerType())
            .add("timestamp", TimestampType())
            .add("speed_reading", DoubleType())
            .add("producer", StringType())
            .add("sent_at", TimestampType())
        )
        self.spark = SparkSession.builder.appName(app_name).master("local[*]").getOrCreate()

    def get_session(self):
        """Return the SparkSession held by this instance."""
        return self.spark

    def attach_kafka_stream(self, topic_name: str, hostip: str, watermark_time: str, port: int = 9092):
        """
        Subscribe to a Kafka topic and parse each message value as a camera event.

        Args:
            topic_name (str): Kafka topic to subscribe to.
            hostip (str): Host name or IP address of the Kafka broker, without the port.
            watermark_time (str): Delay threshold passed to ``withWatermark``
                (for example "5 minutes").
            port (int): Broker port. Defaults to 9092, the value that used to be hard-coded.

        Returns:
            A streaming DataFrame with one column per ``eventSchema`` field and a
            watermark declared on ``sent_at``.
        """
        bootstrap_servers = f"{hostip}:{port}"
        raw_stream = (
            self.spark.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", bootstrap_servers)
            .option("subscribe", topic_name)
            .load()
        )
        return (
            raw_stream
            # The Kafka value arrives as bytes; decode it before JSON parsing.
            .selectExpr("CAST(value AS STRING) as json")
            .select(from_json(col("json"), self.eventSchema).alias("data"))
            # Flatten the parsed struct into top-level columns.
            .select("data.*")
            .withWatermark("sent_at", watermark_time)
        )

    def essentialData_broadcast(self, sdf):
        """
        Collect the camera speed limits from a DataFrame and broadcast them.

        Args:
            sdf (DataFrame): Spark DataFrame that has ``camera_id`` and
                ``speed_limit`` columns.

        Returns:
            Broadcast variable containing a dictionary of camera_id to speed_limit
        """
        # Only the two lookup columns are brought back to the driver.
        rows = sdf.select("camera_id", "speed_limit").collect()
        limits_by_camera = {row["camera_id"]: row["speed_limit"] for row in rows}
        return self.spark.sparkContext.broadcast(limits_by_camera)

In [5]:
# Single shared SparkInst used by every later cell.
spark_job = SparkInst(
    app_name="AWAS SYSTEM",
    batch_interval=5,
    kafka_output_topic="violations",
)

In [8]:
import pandas as pd

# Load the camera reference table. errors='ignore' drops the '_id' column when
# it exists and is a no-op otherwise, so no in-place mutation or guard is needed.
df_pd = pd.read_csv("data/camera.csv").drop(columns=['_id'], errors='ignore')
spark_df = spark_job.get_session().createDataFrame(df_pd)

# Step 3: Broadcast your speed limit map
broadcast_map = spark_job.essentialData_broadcast(spark_df)
# Driver-side copy of the same camera_id -> speed_limit mapping. It is taken from
# the already-collected broadcast value instead of running a second Spark collect.
speed_limit_map = dict(broadcast_map.value)
# Step 4: Define your UDF using broadcast variable
def mark_speeding(camera_id, speed):
    """
    Classify one speed reading against the broadcast per-camera speed limit.

    Args:
        camera_id: Key looked up in ``broadcast_map.value``.
        speed: Measured speed for that camera; may be None when the incoming
            record carried no usable reading.

    Returns:
        "UNKNOWN" when the camera has no known limit or the reading is missing,
        "INSTANT_VIOLATION" when the reading is strictly above the limit,
        otherwise None.
    """
    limit = broadcast_map.value.get(camera_id)
    # A missing reading cannot be compared: `None > limit` raises TypeError,
    # which would fail the whole UDF task instead of flagging one row.
    if limit is None or speed is None:
        return "UNKNOWN"
    return "INSTANT_VIOLATION" if speed > limit else None

# Wrap the plain-Python classifier as a Spark UDF that produces a string column.
speeding_udf = udf(mark_speeding, returnType=StringType())


# Step 5: Apply UDF to each streaming dataframe
def add_speed_flag(df):
    """
    Return ``df`` with an extra ``speed_flag`` column.

    The flag is computed per row by ``speeding_udf`` from the ``camera_id``
    and ``speed_reading`` columns.
    """
    flag_column = speeding_udf(col("camera_id"), col("speed_reading"))
    return df.withColumn("speed_flag", flag_column)

  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [None]:
# Attach Kafka streams
# NOTE(review): this cell used to pass "9092" (the port) as the host, which made
# attach_kafka_stream build the bootstrap address "9092:9092". The value below is
# a placeholder -- replace it with the real Kafka broker host name or IP.
KAFKA_HOST_IP = "localhost"
WATERMARK_DELAY = "5 minutes"

stream_a = spark_job.attach_kafka_stream("a", KAFKA_HOST_IP, WATERMARK_DELAY)
stream_b = spark_job.attach_kafka_stream("b", KAFKA_HOST_IP, WATERMARK_DELAY)
stream_c = spark_job.attach_kafka_stream("c", KAFKA_HOST_IP, WATERMARK_DELAY)

# Apply transformation
stream_a_flagged = add_speed_flag(stream_a)
stream_b_flagged = add_speed_flag(stream_b)
stream_c_flagged = add_speed_flag(stream_c)

# Stream b takes part in both joins; one handle per join.
dup1_b = stream_b_flagged
dup2_b = stream_b_flagged

# A watermark has to be declared on an event-time (timestamp) column, so stream b
# is watermarked on "timestamp" like streams a and c, not on the string column
# "car_plate".
# NOTE(review): "car_plate" is assumed to be the intended join key -- confirm.
# The join type is passed as `how`; as the second positional argument "inner"
# would be interpreted as the name of the join column.
stream_a_watermarked = stream_a_flagged.withWatermark("timestamp", WATERMARK_DELAY)
joined_ab = dup1_b.withWatermark("timestamp", WATERMARK_DELAY).join(
    stream_a_watermarked,
    on="car_plate",
    how="inner",
)

# The second join pairs stream b with stream c (it previously repeated stream a).
stream_c_watermarked = stream_c_flagged.withWatermark("timestamp", WATERMARK_DELAY)
joined_bc = dup2_b.withWatermark("timestamp", WATERMARK_DELAY).join(
    stream_c_watermarked,
    on="car_plate",
    how="inner",
)












root
 |-- batch_id: integer (nullable = true)
 |-- event_id: string (nullable = true)
 |-- car_plate: string (nullable = true)
 |-- camera_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- speed_reading: double (nullable = true)
 |-- producer: string (nullable = true)
 |-- sent_at: timestamp (nullable = true)
 |-- speed_flag: string (nullable = true)



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# essentialData_broadcast requires the DataFrame to collect from; calling it with
# no argument raises TypeError. Pass the camera table built earlier in the notebook.
spark_job.essentialData_broadcast(spark_df)