In [5]:
from pyspark.sql.functions import udf, col, window, lit
from pyspark.sql.types import StringType
from util import kafkaConsumer
import pandas as pd
from operations import SparkInst
import os


In [6]:
# Connector packages (Kafka streaming, Kafka SQL source, MongoDB) passed to
# spark-submit via --packages; the resulting value is identical to the
# single-string form.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages " + ",".join((
    "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
    "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1",
)) + " pyspark-shell"

# NOTE(review): the meaning of the positional 5 is defined by SparkInst in
# operations.py and is not visible from this notebook -- confirm there.
spark_job = SparkInst(
    "AWAS SYSTEM",
    5,
    kafka_output_topic="violations",
)

In [None]:
import pandas as pd

# Load the camera reference table; the '_id' column is discarded when present.
df_pd = pd.read_csv("data/camera.csv")
df_pd = df_pd.drop(columns=['_id'], errors='ignore')
spark_df = spark_job.get_session().createDataFrame(df_pd)

# Driver-side {camera_id: speed_limit} dict built from the collected rows.
speed_limit_map = dict(
    (row['camera_id'], row['speed_limit'])
    for row in spark_df.select("camera_id", "speed_limit").collect()
)
# NOTE(review): mark_speeding reads this through `.value.get(camera_id)`, so it
# presumably wraps a camera_id -> speed_limit mapping; confirm in SparkInst.
broadcast_map = spark_job.essentialData_broadcast(spark_df)

def mark_speeding(camera_id:str, speed:float, ops:str)-> str:
    """
    Classify a speed against the limit registered for a camera.

    The limit is looked up with ``broadcast_map.value.get(camera_id)``, where
    ``broadcast_map`` is the notebook-level broadcast variable.

    Args:
        camera_id: Key used for the limit lookup.
        speed: Speed to compare with the limit. May be ``None`` (e.g. a null
            column value reaching the UDF).
        ops: Kind of check, ``"instant"`` or ``"average"``.

    Returns:
        * ``"INSTANT_VIOLATION"`` / ``"AVERAGE_VIOLATION"`` when ``speed`` is
          strictly greater than the limit for the corresponding ``ops``.
        * ``None`` when the speed does not exceed the limit, or when ``speed``
          itself is ``None``.
        * The string ``"NONE"`` when no limit is found for ``camera_id`` or
          ``ops`` is neither ``"instant"`` nor ``"average"``.
    """
    labels = {"instant": "INSTANT_VIOLATION", "average": "AVERAGE_VIOLATION"}
    limit = broadcast_map.value.get(camera_id)
    label = labels.get(ops)
    if limit is None or label is None:
        return "NONE"
    # A null reading cannot be compared; `None > limit` would raise TypeError
    # and fail the task, so treat it as "no violation detected".
    if speed is None:
        return None
    return label if speed > limit else None

# Spark UDF wrapper around mark_speeding; the result column is a string.
speeding_udf = udf(mark_speeding, returnType=StringType())

# Step 5: Apply UDF to each streaming dataframe
def add_speed_flag(df, ops: str):
    """
    Append a ``speed_flag_<ops>`` column computed by ``speeding_udf``.

    Args:
        df: DataFrame exposing ``camera_id`` and ``speed_reading`` columns.
        ops: Check type forwarded to the UDF as a literal (e.g. ``"instant"``);
            it is also used as the suffix of the new column name.

    Returns:
        ``df`` with the extra column ``speed_flag_<ops>``.
    """
    # f-string is required: without the prefix the column would literally be
    # named "speed_flag_{ops}" and downstream selects of speed_flag_instant fail.
    flag_column = f"speed_flag_{ops}"
    return df.withColumn(
        flag_column,
        speeding_udf(col("camera_id"), col("speed_reading"), lit(ops)),
    )


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [None]:
from pyspark.sql.functions import expr, col, lit

# Attach one Kafka stream per camera topic (same host and "5 minutes" argument
# for each; the meaning of that argument is defined by attach_kafka_stream).
stream_a, stream_b, stream_c = (
    spark_job.attach_kafka_stream(topic, "172.17.0.1", "5 minutes")
    for topic in ("camera_events_a", "camera_events_b", "camera_events_c")
)
from pyspark.sql.functions import expr, col, lit

# Drop the fields that are not needed downstream, then add the instant-speed flag.
stream_a_flagged, stream_b_flagged, stream_c_flagged = [
    add_speed_flag(raw_stream.drop("batch_id", "event_id", "sent_at"), "instant")
    for raw_stream in (stream_a, stream_b, stream_c)
]

# Rename for joining
def _as_join_side(flagged_df, side):
    """
    Project a flagged stream onto the columns used by the joins, suffixing
    every column except ``car_plate`` with ``_<side>``.

    Args:
        flagged_df: DataFrame with car_plate, camera_id, timestamp,
            speed_reading, producer and speed_flag_instant columns.
        side: ``"start"`` or ``"end"``.

    Returns:
        The result of ``flagged_df.selectExpr(...)`` with the renamed columns.
    """
    renamed = ("camera_id", "timestamp", "speed_reading", "producer", "speed_flag_instant")
    return flagged_df.selectExpr(
        "car_plate",
        *[f"{name} as {name}_{side}" for name in renamed],
    )


# Stream B is projected twice: as the "end" of the A-B pair and as the
# "start" of the B-C pair.
a = _as_join_side(stream_a_flagged, "start")
b1 = _as_join_side(stream_b_flagged, "end")
b2 = _as_join_side(stream_b_flagged, "start")
c = _as_join_side(stream_c_flagged, "end")

# Add a 10-hour watermark on each side's event-time column
# ("start" sides use timestamp_start, "end" sides use timestamp_end).
a, b2 = (side.withWatermark("timestamp_start", "10 hours") for side in (a, b2))
b1, c = (side.withWatermark("timestamp_end", "10 hours") for side in (b1, c))

def _join_segment(start_df, end_df):
    """
    Pair each "start" reading with the "end" reading of the same car that was
    taken after it and at most 10 minutes later.

    Args:
        start_df: DataFrame with car_plate, camera_id_start, timestamp_start
            and speed_reading_start columns.
        end_df: DataFrame with car_plate, camera_id_end, timestamp_end and
            speed_reading_end columns.

    Returns:
        DataFrame with car_plate, camera_id_start, camera_id_end,
        timestamp_start, timestamp_end, avg_speed_reading (mean of the two
        readings) and speed_flag_average (speeding_udf evaluated against the
        start camera with ops "average").
    """
    start = start_df.alias("s")
    end = end_df.alias("e")
    avg_speed = (col("s.speed_reading_start") + col("e.speed_reading_end")) / 2
    return start.join(
        end,
        (col("s.car_plate") == col("e.car_plate")) &
        # The end reading must follow the start reading, within 10 minutes.
        (col("e.timestamp_end") > col("s.timestamp_start")) &
        (col("e.timestamp_end") <= col("s.timestamp_start") + expr("interval 10 minutes")),
        "inner"
    ).select(
        col("s.car_plate"),
        col("s.camera_id_start"),
        col("e.camera_id_end"),
        col("s.timestamp_start"),
        col("e.timestamp_end"),
        avg_speed.alias("avg_speed_reading"),
        # Explicit alias so the column does not get an auto-generated name.
        speeding_udf(
            col("s.camera_id_start"),
            avg_speed,
            lit("average")
        ).alias("speed_flag_average")
    )


# Join A & B
# The time predicate now uses the same ordering as the B-C join
# (end after start); it previously required start to be later than end.
ab_join = _join_segment(a, b1)

# Join B & C
bc_join = _join_segment(b2, c)

# Union result
res = ab_join.unionByName(bc_join).dropDuplicates()

# Needed by the except clause below; without this import any streaming failure
# surfaces as a NameError instead of being reported.
from pyspark.sql.utils import StreamingQueryException

# Write to the console
# (This block used to be pasted twice, which restarted the query on the same
# checkpoint as soon as the first run was stopped.)
query = (
    res.writeStream
    .format("console")
    .option("checkpointLocation", "./stream_checkpoints_3")
    .outputMode("append")
    .option("truncate", False)  # Optional: show full column contents
    .start()
)

# Run query and handle termination gracefully
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Interrupted by CTRL-C. Stopping query.")
except StreamingQueryException as exc:
    print(f"Streaming error: {exc}")
finally:
    # Always stop the query, whichever way the wait ended.
    query.stop()


In [5]:
from util import kafkaProducer_Violation
# Replay the historic violation CSV through Kafka.
# NOTE(review): argument meanings (presumably csv path, bootstrap server,
# producer id, topic, and a numeric setting) come from util.py -- confirm there.
producer = kafkaProducer_Violation(
    "data/camera_event_historic.csv",
    "172.17.0.1:9092",
    "producer_violation",
    "violation_input",
    5,
)
producer.produce_all()

[INFO] Successfully loaded data from data/camera_event_historic.csv
[INFO] Connected to Kafka server at 172.17.0.1:9092
[INFO] Starting to produce 50000 records...
[INFO] Finished producing all records and closed producer.


In [6]:
import pandas as pd
# Load the historic events, strip whitespace from the header names, order the
# rows by timestamp_start and renumber the index from zero.
df = (
    pd.read_csv("data/camera_event_historic.csv")
    .rename(columns=str.strip)
    .sort_values(by="timestamp_start")
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,violation_id,car_plate,camera_id_start,camera_id_end,timestamp_start,timestamp_end,speed_reading
0,a8924006-c018-43b6-970b-a1815065ba99,HW 5499,1,2,2015-01-01T08:00:57,2015-01-01T08:01:23.544166,135.6
1,bf2d854a-1034-4ebe-b532-ce13f3322683,RO 15,1,2,2015-01-01T08:06:06,2015-01-01T08:06:30.995556,144.0
2,ead81e5b-1d61-4ae5-8b2e-5055b266c93e,FK 4505,1,2,2015-01-01T08:08:11,2015-01-01T08:08:35.505799,146.9
3,501d53cd-1d7b-4aa2-9952-594256160f25,ZQQ 5,2,3,2015-01-01T08:08:54.588118,2015-01-01T08:09:28.860412,105.0
4,54c18253-b1e4-4f2c-b8df-937fad38de79,TUL 99,1,2,2015-01-01T08:09:16,2015-01-01T08:09:48.660471,110.2
