In [5]:
from pyspark.sql.functions import udf, col, window, lit
from pyspark.sql.types import StringType
from util import kafkaConsumer
import pandas as pd
from operations import SparkInst


In [6]:
# Build the project's Spark job wrapper (SparkInst comes from the local `operations` module).
# NOTE(review): the meaning of the positional arguments ("AWAS SYSTEM", 5) is defined
# by SparkInst and is not visible here — presumably app name and a batch/trigger size.
spark_job = SparkInst(
    "AWAS SYSTEM",
    5,
    kafka_output_topic="violations",
)

In [7]:
import pandas as pd

# Load the camera reference table and discard the `_id` column when the CSV has one.
df_pd = pd.read_csv("data/camera.csv").drop(columns=['_id'], errors='ignore')

# Hand the pandas frame over to Spark through the job's session.
_camera_session = spark_job.get_session()
spark_df = _camera_session.createDataFrame(df_pd)

# Step 3: Broadcast your speed limit map
# camera_id -> speed_limit lookup, collected to the driver.
# NOTE(review): speed_limit_map is not referenced again in the visible code; the
# broadcast below is built from spark_df instead. mark_speeding() calls
# broadcast_map.value.get(camera_id), so confirm that essentialData_broadcast()
# returns a broadcast of a dict keyed by camera_id.
_limit_rows = spark_df.select("camera_id", "speed_limit").collect()
speed_limit_map = {}
for _limit_row in _limit_rows:
    speed_limit_map[_limit_row['camera_id']] = _limit_row['speed_limit']
broadcast_map = spark_job.essentialData_broadcast(spark_df)
# Step 4: Define your UDF using broadcast variable
def mark_speeding(camera_id:str, speed:float, ops:str):
    """Classify a speed reading against the broadcast speed limit of a camera.

    Args:
        camera_id: Key looked up in ``broadcast_map.value``.
        speed: Speed to compare with the limit; may be ``None`` when the
            source column is null.
        ops: ``"instant"`` or ``"average"``; selects the violation label.

    Returns:
        ``"INSTANT_VIOLATION"`` / ``"AVERAGE_VIOLATION"`` when ``speed`` is
        strictly greater than the limit, ``None`` when the reading was
        evaluated and is within the limit, and the string ``"NONE"`` when the
        reading cannot be evaluated (unknown camera, unknown ``ops``, or
        missing speed).
    """
    limit = broadcast_map.value.get(camera_id)
    # A null speed would make `speed > limit` raise TypeError inside the UDF and
    # fail the whole streaming batch, so treat it like any other unevaluable row.
    if limit is None or speed is None:
        return "NONE"

    violation_labels = {
        "instant": "INSTANT_VIOLATION",
        "average": "AVERAGE_VIOLATION",
    }
    label = violation_labels.get(ops)
    if label is None:
        return "NONE"
    return label if speed > limit else None

# Wrap mark_speeding as a Spark UDF whose result column is a string.
speeding_udf = udf(mark_speeding, returnType=StringType())

# Step 5: Apply UDF to each streaming dataframe
def add_speed_flag(df, ops: str):
    """Return ``df`` with a ``speed_flag`` column computed by ``speeding_udf``.

    The UDF receives the ``camera_id`` and ``speed_reading`` columns of ``df``
    together with ``ops`` passed as a literal.
    """
    flag_column = speeding_udf(
        col("camera_id"),
        col("speed_reading"),
        lit(ops),
    )
    return df.withColumn("speed_flag", flag_column)


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [8]:
from pyspark.sql.functions import expr, col, lit

# Attach Kafka streams: one topic per camera, all read from the same host.
# NOTE(review): the third argument ("5 minutes") is interpreted by
# attach_kafka_stream, which is not visible here.
_kafka_host = "172.17.0.1"
_stream_span = "5 minutes"
stream_a, stream_b, stream_c = (
    spark_job.attach_kafka_stream(_topic, _kafka_host, _stream_span)
    for _topic in ("camera_events_a", "camera_events_b", "camera_events_c")
)


def _flag_instant(stream):
    """Drop the batch_id/event_id columns and add the "instant" speed flag."""
    return add_speed_flag(stream.drop("batch_id", "event_id"), "instant")


def _rename_for_join(stream, timestamp_name, camera_name):
    """Rename ``timestamp`` and ``camera_id`` so both join sides stay distinguishable."""
    renamed = stream.withColumnRenamed("timestamp", timestamp_name)
    return renamed.withColumnRenamed("camera_id", camera_name)


# Drop unnecessary columns and apply speed flag
stream_a_flagged = _flag_instant(stream_a)
stream_b_flagged = _flag_instant(stream_b)
stream_c_flagged = _flag_instant(stream_c)

# Stream B takes part in two joins; both names point at the same DataFrame (no copy is made).
dup1_b = dup2_b = stream_b_flagged

# Rename timestamp and camera_id for join logic:
# A and the second B view act as the "start" side, the first B view and C as the "end" side.
stream_a_renamed = _rename_for_join(stream_a_flagged, "timestamp_start", "camera_id_start")
stream_b1_renamed = _rename_for_join(dup1_b, "timestamp_end", "camera_id_end")
stream_c_renamed = _rename_for_join(stream_c_flagged, "timestamp_end", "camera_id_end")
stream_b2_renamed = _rename_for_join(dup2_b, "timestamp_start", "camera_id_start")

# Apply watermarks on the renamed event-time column of each stream.
_watermark_delay = "10 hours"
stream_a_watermarked = stream_a_renamed.withWatermark("timestamp_start", _watermark_delay)
stream_b1_watermarked = stream_b1_renamed.withWatermark("timestamp_end", _watermark_delay)
stream_c_watermarked = stream_c_renamed.withWatermark("timestamp_end", _watermark_delay)
stream_b2_watermarked = stream_b2_renamed.withWatermark("timestamp_start", _watermark_delay)

# Join A and B
# A carries the *_start columns and B the *_end columns, so a match is a B event
# for the same plate that happens after the A event and at most 10 minutes later.
# (The predicate previously required the "start" event to come after the "end"
# event, contradicting the column names and the B -> C join.)
# A is placed on the left so the output column order is (start side, end side),
# the same as bc_join; DataFrame.union() pairs columns by position.
ab_join = stream_a_watermarked.alias("a").join(
    stream_b1_watermarked.alias("b"),
    (
        (col("a.car_plate") == col("b.car_plate")) &
        (col("b.timestamp_end") > col("a.timestamp_start")) &
        (col("b.timestamp_end") <= col("a.timestamp_start") + expr("interval 10 minutes"))
    ),
    "inner"
)

# avg_speed_reading is the mean of the two instantaneous readings; the "average"
# flag must be evaluated on that value, not on camera A's instantaneous reading.
# NOTE(review): the limit is looked up for the start camera here, while the
# B -> C join uses the end camera — confirm which one is intended.
ab_join = ab_join.withColumn(
    "avg_speed_reading",
    (col("a.speed_reading") + col("b.speed_reading")) / 2
).withColumn(
    "speed_flag",
    speeding_udf(col("a.camera_id_start"), col("avg_speed_reading"), lit("average"))
)

# Join B and C
# B carries the *_start columns and C the *_end columns: a match is a C event for
# the same plate that happens after the B event and at most 10 minutes later.
bc_join = stream_b2_watermarked.alias("b").join(
    stream_c_watermarked.alias("c"),
    (
        (col("b.car_plate") == col("c.car_plate")) &
        (col("c.timestamp_end") > col("b.timestamp_start")) &
        (col("c.timestamp_end") <= col("b.timestamp_start") + expr("interval 10 minutes"))
    ),
    "inner"
)

# avg_speed_reading is the mean of the two instantaneous readings; the "average"
# flag must be evaluated on that value, not on camera C's instantaneous reading.
# NOTE(review): the limit is looked up for the end camera here, while the
# A -> B join uses the start camera — confirm which one is intended.
bc_join = bc_join.withColumn(
    "avg_speed_reading",
    (col("b.speed_reading") + col("c.speed_reading")) / 2
).withColumn(
    "speed_flag",
    speeding_udf(col("c.camera_id_end"), col("avg_speed_reading"), lit("average"))
)

# Stack the two segment results and drop rows that are exact duplicates.
# union() pairs columns by position, not by name.
res = ab_join.union(bc_join).dropDuplicates()

# Write to the console
# StreamingQueryException was referenced below without ever being imported, so any
# streaming failure surfaced as a NameError instead of reaching the handler.
from pyspark.sql.utils import StreamingQueryException

query = (
    res.writeStream
    .format("console")
    .option("checkpointLocation", "./stream_checkpoints_3")
    .outputMode("append")
    .option("truncate", False)  # Optional: show full column contents
    .start()
)

# Block until the query ends; always stop it afterwards so the stream is released
# whether it finished, failed, or was interrupted from the notebook.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Interrupted by CTRL-C. Stopping query.")
except StreamingQueryException as exc:
    print(f"Streaming error: {exc}")
finally:
    query.stop()
