In [1]:
import pymongo
from pymongo import MongoClient
import csv
# Connect to the MongoDB server and select the database used by every cell below.
client = MongoClient(host="172.27.65.143", port=27017)
db = client["fit3182_db"]


In [None]:


# ── Camera collection handle ──
camera_coll = db.Camera

if camera_coll.estimated_document_count() > 0:
    # The collection already holds documents: leave it untouched.
    print("Camera collection already contains data. Skipping import.")
else:
    # Create the ascending index on 'pos' before loading the data.
    idx_name = camera_coll.create_index(
        [("pos", pymongo.ASCENDING)],
        name="pos_idx",
    )
    print(f"Ensured index on 'pos': {idx_name}")

    # ── Turn every CSV row into one Camera document ──
    # camera_id becomes the Mongo _id; numeric columns are converted from text.
    csv_path = 'data/camera.csv'
    with open(csv_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        docs = [
            {
                "_id": int(row['camera_id']),
                "lat": float(row['latitude']),
                "long": float(row['longitude']),
                "pos": float(row['position']),
                "speed_limit": int(row['speed_limit']),
            }
            for row in reader
        ]

    if not docs:
        print("No camera records found in CSV.")
    else:
        result = camera_coll.insert_many(docs)
        print(f"Inserted {len(result.inserted_ids)} camera documents.")
        print("Current indexes on Camera:")
        for name, info in camera_coll.index_information().items():
            print(f" • {name}: {info['key']}")

In [None]:
# ── Connection ──
# Imported in this cell because it runs before the later cell that imports
# datetime; without it a fresh kernel fails here with NameError.
from datetime import datetime

vehicle_coll = db.Vehicle

# Clear out the collection first so every run of this cell starts from empty.
# (Because of this, a "skip if already populated" check can never trigger,
# so none is performed.)
deleted = vehicle_coll.delete_many({})
print(f"Cleared collection. Deleted {deleted.deleted_count} documents.")

# ── Merge the CSV rows in memory ──
# A plate may occur more than once in the file. The first row seen for a plate
# creates its document; a later row for the same plate replaces the details
# only when its registration_date is strictly newer. Merging here instead of
# in MongoDB avoids one round trip per CSV row.
vehicles_by_plate = {}
update_count = 0

csv_path = 'data/vehicle.csv'
with open(csv_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        plate = row['car_plate']

        # Strip a trailing "Z" before parsing: datetime.fromisoformat() only
        # accepts it from Python 3.11 onwards.
        ts = row['registration_date'].rstrip("Z")
        reg_date = datetime.fromisoformat(ts)

        existing = vehicles_by_plate.get(plate)
        if existing is None:
            # First occurrence of this plate in the file.
            vehicles_by_plate[plate] = {
                "_id":               plate,
                "owner_name":        row['owner_name'],
                "owner_addr":        row['owner_addr'],
                "vehicle_type":      row['vehicle_type'],
                "registration_date": reg_date,
            }
        elif reg_date > existing['registration_date']:
            # Repeated plate with a newer registration: overwrite the details.
            existing.update({
                "registration_date": reg_date,
                "owner_name":        row['owner_name'],
                "owner_addr":        row['owner_addr'],
                "vehicle_type":      row['vehicle_type'],
            })
            update_count += 1

# ── Do the batch insert, if any ──
docs_to_insert = list(vehicles_by_plate.values())
added_count = len(docs_to_insert)
if docs_to_insert:
    vehicle_coll.insert_many(docs_to_insert)
    print(f"Inserted {added_count} new vehicle documents.")
else:
    print("No new vehicle records to insert.")

# ── Report how many duplicate rows replaced an earlier row ──
if update_count:
    print(f"Updated {update_count} existing vehicle document{'s' if update_count>1 else ''}.")

# ── (Optional) show your indexes ──
print("Current indexes on Vehicle:")
for name, info in vehicle_coll.index_information().items():
    print(f" • {name}: {info['key']}")

Cleared collection. Deleted 868 documents.
Inserted 9844 new vehicle documents.
Updated 69 existing vehicle documents.
Current indexes on Vehicle:
 • _id_: [('_id', 1)]


# Create the Violation collection and load data/camera_event_historic.csv into it

In [None]:
import csv
import uuid
import pandas as pd
from datetime import datetime
from pymongo import MongoClient

# ── MongoDB Connection ──
client = MongoClient("172.27.65.143", 27017)
db = client.fit3182_db
violation_coll = db.Violation

# ── Reset the collection ──
# drop() removes the documents and the indexes in one step. The size is read
# beforehand so the message reports what was actually removed; a delete_many()
# issued after the drop always reports 0.
removed_count = violation_coll.estimated_document_count()
violation_coll.drop()
print(f"Cleared collection. Deleted {removed_count} documents.")

# ── Index Creation ──
# idx_violation_id is deliberately NOT unique.
violation_coll.create_index([("violations.violation_id", 1)], name="idx_violation_id")
violation_coll.create_index([("date", 1)], name="idx_date")
violation_coll.create_index([("violations.camera_id_start", 1)], name="idx_camera_start")
violation_coll.create_index([("violations.camera_id_end", 1)], name="idx_camera_end")
violation_coll.create_index([("date", 1), ("violations.measured_speed", -1)], name="idx_measured_speed")
violation_coll.create_index([("violations.timestamp_start", 1)], name="idx_timestamp_start")


def _parse_timestamp(raw):
    """Parse an ISO-8601 string (trailing 'Z' stripped) into a datetime.

    Returns None when `raw` is missing or empty.
    """
    if not raw:
        return None
    return datetime.fromisoformat(raw.rstrip("Z"))


# ── CSV Read ──
csv_path = "data/camera_event_historic.csv"
docs = []

with open(csv_path, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        timestamp_start = _parse_timestamp(row.get('timestamp_start'))
        if timestamp_start is None:
            # A row without a start timestamp cannot be bucketed by date: skip it.
            continue

        timestamp_end = _parse_timestamp(row.get('timestamp_end'))

        # Date bucket: midnight of the day the violation started.
        date_bucket = datetime(timestamp_start.year, timestamp_start.month, timestamp_start.day)

        # One document per CSV row, holding a single-element violations array.
        # NOTE(review): camera ids are stored as the raw CSV strings, whereas
        # the Camera import cell stores Camera._id as int — confirm which type
        # readers of this collection expect.
        violation_doc = {
            "car_plate": row['car_plate'],
            "date": date_bucket,
            "violations": [
                {
                    "violation_id": str(uuid.uuid4()),
                    "type": "average",
                    "camera_id_start": row['camera_id_start'],
                    "camera_id_end": row['camera_id_end'] if row.get('camera_id_end') else None,
                    "timestamp_start": timestamp_start,
                    "timestamp_end": timestamp_end,
                    "measured_speed": float(row['speed_reading']) if row.get('speed_reading') else None,
                }
            ]
        }

        docs.append(violation_doc)

# ── Insert All Documents ──
if docs:
    violation_coll.insert_many(docs)
    print(f"Inserted {len(docs)} violation documents.")
else:
    print("No documents to insert.")

Cleared collection. Deleted 0 documents.
Inserted 50000 violation documents.


In [None]:
from Operations import SparkInst
import os
# Maven coordinates handed to spark-submit through --packages:
# the Kafka streaming and SQL connectors plus the MongoDB Spark connector.
_spark_packages = [
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0',
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0',
    'org.mongodb.spark:mongo-spark-connector_2.12:10.1.1',
]
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages ' + ','.join(_spark_packages) + ' pyspark-shell'

# The environment variable is set before the SparkInst wrapper is constructed.
spark_job = SparkInst("AWAS SYSTEM", 5, kafka_output_topic="violations")

In [None]:
# Spark SQL helpers and pandas used to build the speed-limit lookup and UDF below
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import BooleanType
import pandas as pd

# Pull every Camera document into a pandas frame.
camera_coll = client["fit3182_db"]["Camera"]
cursor = camera_coll.find()
df_pd = pd.DataFrame(list(cursor))

# Expose the Mongo primary key under the name camera_id.
if '_id' in df_pd.columns:
    df_pd = df_pd.rename(columns={'_id': 'camera_id'})

# Convert the pandas DataFrame into a Spark DataFrame.
spark_session = spark_job.get_session()
spark_df = spark_session.createDataFrame(df_pd)

# Driver-side dict of camera_id -> speed_limit.
speed_limit_map = {}
for limit_row in spark_df.select("camera_id", "speed_limit").collect():
    speed_limit_map[limit_row['camera_id']] = limit_row['speed_limit']

# Broadcast handle built by the project helper; mark_speeding reads its .value.
broadcast_map = spark_job.essentialData_broadcast(spark_df)

def mark_speeding(camera_id: str, speed: float, ops: str) -> bool:
    """Tell whether `speed` exceeds the speed limit registered for `camera_id`.

    The limit is looked up in the module-level `broadcast_map` (its `.value`
    is used as a mapping from camera id to limit).
    NOTE(review): `camera_id` must have the same type as the keys of that
    mapping — confirm against SparkInst.essentialData_broadcast.

    Args:
        camera_id: Camera whose limit applies.
        speed: Speed to test; may be None when the Spark column is null.
        ops: Kind of reading, "instant" or "average". Both are compared
            against the same limit; any other value yields False.

    Returns:
        True only when the limit is known, `speed` is not None, `ops` is a
        recognised kind and `speed` is strictly greater than the limit.
    """
    if ops not in ("instant", "average"):
        return False
    limit = broadcast_map.value.get(camera_id)
    # A missing limit or a null reading cannot be judged: treat as not speeding
    # instead of raising TypeError on `None > limit` inside the UDF.
    if limit is None or speed is None:
        return False
    return speed > limit

# Spark UDF wrapper around mark_speeding, declared to return a boolean.
speeding_udf = udf(mark_speeding, returnType=BooleanType())

def add_speed_flag(df, ops: str):
    """Return `df` with an added column `speed_flag_<ops>`.

    The column is the result of `speeding_udf` applied to the frame's
    `camera_id` and `speed_reading` columns together with the literal `ops`.
    """
    flag_name = f"speed_flag_{ops}"
    flag_value = speeding_udf(col("camera_id"), col("speed_reading"), lit(ops))
    return df.withColumn(flag_name, flag_value)


  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


In [14]:
from pyspark.sql.functions import expr, col, lit

# ── Kafka sources: one stream per camera topic ──
# NOTE(review): the third argument is presumably the watermark delay —
# confirm against SparkInst.attach_kafka_stream.
kafka_host = "172.17.0.1"
stream_delay = "24 hours"
stream_a = spark_job.attach_kafka_stream("camera_event_a", kafka_host, stream_delay)
stream_b = spark_job.attach_kafka_stream("camera_event_b", kafka_host, stream_delay)
stream_c = spark_job.attach_kafka_stream("camera_event_c", kafka_host, stream_delay)

# ── Drop the fields the joins do not need, then add the instant speed flag ──
_dropped_fields = ("event_id", "sent_at", "batch_id")
stream_a_flagged = add_speed_flag(stream_a.drop(*_dropped_fields), "instant")
stream_b_flagged = add_speed_flag(stream_b.drop(*_dropped_fields), "instant")
stream_c_flagged = add_speed_flag(stream_c.drop(*_dropped_fields), "instant")


def _suffix_columns(flagged_df, tag):
    """Project one flagged stream, suffixing every column except car_plate with `_<tag>`
    (producer_id becomes `producer_<tag>`)."""
    return flagged_df.selectExpr(
        "car_plate",
        f"camera_id as camera_id_{tag}",
        f"timestamp as timestamp_{tag}",
        f"speed_reading as speed_reading_{tag}",
        f"producer_id as producer_{tag}",
        f"speed_flag_instant as speed_flag_instant_{tag}",
    )


# ── Rename so the three streams can sit side by side after joining ──
a = _suffix_columns(stream_a_flagged, "a")
b = _suffix_columns(stream_b_flagged, "b")
c = _suffix_columns(stream_c_flagged, "c")

ten_minutes = expr("interval 10 minutes")

# ── Join A & B ──
# Same plate, B strictly after A and at most ten minutes later.
ab_condition = (
    (col("a.car_plate") == col("b.car_plate"))
    & (col("a.timestamp_a") < col("b.timestamp_b"))
    & (col("b.timestamp_b") <= col("a.timestamp_a") + ten_minutes)
)
# Mean of the two readings; it is flagged against camera A's limit.
avg_speed_ab = (col("a.speed_reading_a") + col("b.speed_reading_b")) / 2

ab_join = (
    b.alias("b")
    .join(a.alias("a"), ab_condition, "inner")
    .select(
        col("a.car_plate"),
        col("a.camera_id_a"),
        col("a.timestamp_a"),
        col("a.speed_reading_a"),
        col("a.speed_flag_instant_a"),
        avg_speed_ab.alias("avg_speed_reading_ab"),
        speeding_udf(col("a.camera_id_a"), avg_speed_ab, lit("average")).alias("speed_flag_average_ab"),
        col("b.camera_id_b"),
        col("b.timestamp_b"),
        col("b.speed_reading_b"),
        col("b.speed_flag_instant_b"),
    )
)

# ── Join AB & C ──
# Same plate, C strictly after B and at most ten minutes later.
bc_condition = (
    (col("ab.car_plate") == col("c.car_plate"))
    & (col("c.timestamp_c") > col("ab.timestamp_b"))
    & (col("c.timestamp_c") <= col("ab.timestamp_b") + ten_minutes)
)
# Mean of the B and C readings; it is flagged against camera B's limit.
avg_speed_bc = (col("ab.speed_reading_b") + col("c.speed_reading_c")) / 2

abc_join = (
    ab_join.alias("ab")
    .join(c.alias("c"), bc_condition, "inner")
    .select(
        col("ab.*"),
        avg_speed_bc.alias("avg_speed_reading_bc"),
        speeding_udf(col("ab.camera_id_b"), avg_speed_bc, lit("average")).alias("speed_flag_average_bc"),
        col("c.camera_id_c"),
        col("c.timestamp_c"),
        col("c.speed_reading_c"),
        col("c.speed_flag_instant_c"),
    )
)

In [None]:
import os
import shutil
from pyspark.sql.streaming import StreamingQueryException
from Operations import DbWriter
checkpoint_dir = "./stream_checkpoints"

# 1) Clean up any existing checkpoint directory before starting, so the query
#    always begins without saved state.
if os.path.isdir(checkpoint_dir):
    shutil.rmtree(checkpoint_dir)
    print(f"Deleted existing checkpoint directory: {checkpoint_dir}")

# 2) Send every joined row to MongoDB through DbWriter.
#    foreach() installs its own sink and replaces a format("console") set
#    earlier on the same writer, so the console format and its
#    numRows/truncate options had no effect and are not set here.
#    To inspect rows while debugging, swap .foreach(...) for .format("console").
#    NOTE(review): this host differs from the 172.27.65.143 that MongoClient
#    uses elsewhere in the notebook — confirm which address is correct.
query = (
    abc_join.writeStream
    .outputMode("append")
    .option("checkpointLocation", checkpoint_dir)
    .foreach(DbWriter(
        mongo_host="172.22.32.1",
        mongo_port=27017,
        mongo_db="fit3182_db",
        mongo_coll="Violation"
    ))
    .start()
)

# 3) Block until the query ends; always stop it on the way out.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Interrupted by CTRL-C. Stopping query.")
except StreamingQueryException as exc:
    print(f"Streaming error: {exc}")
finally:
    query.stop()

In [None]:
import os
import shutil
from pyspark.sql import DataFrame
from pyspark.sql.streaming import StreamingQueryException
from Operations import DbWriter

# DbWriter (imported above from Operations) is the custom writer whose
# open/process/close methods are used below.

# === Configuration ===
# Checkpoint location plus the MongoDB target read by process_batch.
checkpoint_dir = "./stream_checkpoints"
mongo_host, mongo_port = "172.27.65.143", 27017
mongo_db, mongo_coll = "fit3182_db", "new_violations"

# === 1. Clean Up Checkpoint Directory ===
# Remove saved streaming state, if any, and report that it was removed.
checkpoint_exists = os.path.isdir(checkpoint_dir)
if checkpoint_exists:
    shutil.rmtree(checkpoint_dir)
    print(f"Deleted existing checkpoint directory: {checkpoint_dir}")

# === 2. Define Batch Processing Function ===
def process_batch(batch_df: DataFrame, batch_id: int):
    """Write every row of one micro-batch to MongoDB through a fresh DbWriter.

    The writer is built from the module-level mongo_host / mongo_port /
    mongo_db / mongo_coll settings, opened with `batch_id` (as a string) for
    both partition and epoch, fed each collected row, and always closed.

    NOTE(review): nothing in this cell passes this function to foreachBatch();
    the query below uses foreach(DbWriter(...)) directly — confirm whether
    this function is still meant to be used.

    Args:
        batch_df: The micro-batch; all of its rows are collected to the driver.
        batch_id: Identifier of the micro-batch.

    Raises:
        Whatever `writer.process` raises; the writer is closed first and is
        given that exception.
    """
    writer = DbWriter(
        mongo_host=mongo_host,
        mongo_port=mongo_port,
        mongo_db=mongo_db,
        mongo_coll=mongo_coll
    )

    writer.open(partition_id=str(batch_id), epoch_id=str(batch_id))

    failure = None
    try:
        for row in batch_df.collect():
            writer.process(row)
    except Exception as exc:
        # Remember the error so close() is told why the batch ended early.
        failure = exc
        raise
    finally:
        # Close on success (failure is None) and on error alike, so the
        # writer is never left open.
        writer.close(failure)

# === 3. Start the Stream ===
# Every joined row is sent to MongoDB through DbWriter.
# foreach() installs its own sink and replaces a format("console") set earlier
# on the same writer, so the console format and its numRows/truncate options
# had no effect and are not set here.
# NOTE(review): the writer below targets host 172.22.32.1 and collection
# "Violation", not the mongo_host / mongo_coll values configured at the top of
# this cell — confirm which target is intended.
query = (
    abc_join.writeStream
    .outputMode("append")
    .option("checkpointLocation", checkpoint_dir)
    .foreach(DbWriter(
        mongo_host="172.22.32.1",
        mongo_port=27017,
        mongo_db="fit3182_db",
        mongo_coll="Violation"
    ))
    .start()
)

# === 4. Run the Stream and Handle Termination ===
# Block until the query ends; always stop it on the way out.
try:
    query.awaitTermination()
except KeyboardInterrupt:
    print("Interrupted by CTRL-C. Stopping query.")
except StreamingQueryException as exc:
    print(f"Streaming error: {exc}")
finally:
    query.stop()
    print("Query stopped.")


Deleted existing checkpoint directory: ./stream_checkpoints


In [11]:
# Prints the literal 5.
# NOTE(review): looks like a leftover kernel-liveness check — consider deleting this cell.
print(5)

5
