In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SmartCityTraffic").getOrCreate()

spark

In [0]:
# 1. Data Ingestion & Schema Analysis
# Load CSV using PySpark with schema inference
df_infer = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/traffic_logs.csv")
df_infer.printSchema()

# Manually define schema and compare
from pyspark.sql.types import *

schema = StructType([
    StructField("LogID", StringType(), True),
    StructField("VehicleID", StringType(), True),
    StructField("EntryPoint", StringType(), True),
    StructField("ExitPoint", StringType(), True),
    StructField("EntryTime", TimestampType(), True),
    StructField("ExitTime", TimestampType(), True),
    StructField("VehicleType", StringType(), True),
    StructField("SpeedKMH", IntegerType(), True),
    StructField("TollPaid", IntegerType(), True)
])

df_manual = spark.read.schema(schema).option("header", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/traffic_logs.csv")
df_manual.printSchema()

# Ensure EntryTime/ExitTime are timestamp
from pyspark.sql.functions import to_timestamp

df = df_infer.withColumn("EntryTime", to_timestamp("EntryTime", "dd-MM-yyyy HH:mm")) \
       .withColumn("ExitTime", to_timestamp("ExitTime", "dd-MM-yyyy HH:mm"))

df.select("EntryTime", "ExitTime").printSchema()
df.show()

root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: string (nullable = true)
 |-- ExitTime: string (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)

root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)

root
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|        

In [0]:
df_infer.select("EntryTime", "ExitTime").distinct().show(truncate=False)
from pyspark.sql.functions import to_timestamp

df = df_infer.withColumn("EntryTime", to_timestamp("EntryTime", "dd-MM-yyyy HH:mm")) \
             .withColumn("ExitTime", to_timestamp("ExitTime", "dd-MM-yyyy HH:mm"))

df.select("EntryTime", "ExitTime").show(truncate=False)

+----------------+----------------+
|EntryTime       |ExitTime        |
+----------------+----------------+
|01-05-2024 09:15|01-05-2024 09:35|
|01-05-2024 10:05|01-05-2024 10:40|
|01-05-2024 08:10|01-05-2024 08:45|
|01-05-2024 09:00|01-05-2024 09:18|
|01-05-2024 08:01|01-05-2024 08:20|
+----------------+----------------+

+-------------------+-------------------+
|EntryTime          |ExitTime           |
+-------------------+-------------------+
|2024-05-01 08:01:00|2024-05-01 08:20:00|
|2024-05-01 08:10:00|2024-05-01 08:45:00|
|2024-05-01 09:00:00|2024-05-01 09:18:00|
|2024-05-01 09:15:00|2024-05-01 09:35:00|
|2024-05-01 10:05:00|2024-05-01 10:40:00|
+-------------------+-------------------+



In [0]:
# 2. Derived Column Creation
# Calculate TripDurationMinutes = ExitTime - EntryTime
from pyspark.sql.functions import col, round, when

df = df.withColumn("TripDurationMinutes",
                   round((col("ExitTime").cast("long") - col("EntryTime").cast("long")) / 60, 2)) \
       .withColumn("IsOverspeed", when(col("SpeedKMH") > 60, True).otherwise(False))

df.select("EntryTime", "ExitTime", "TripDurationMinutes", "IsOverspeed").show(truncate=False)

+-------------------+-------------------+-------------------+-----------+
|EntryTime          |ExitTime           |TripDurationMinutes|IsOverspeed|
+-------------------+-------------------+-------------------+-----------+
|2024-05-01 08:01:00|2024-05-01 08:20:00|19.0               |false      |
|2024-05-01 08:10:00|2024-05-01 08:45:00|35.0               |false      |
|2024-05-01 09:00:00|2024-05-01 09:18:00|18.0               |false      |
|2024-05-01 09:15:00|2024-05-01 09:35:00|20.0               |true       |
|2024-05-01 10:05:00|2024-05-01 10:40:00|35.0               |false      |
+-------------------+-------------------+-------------------+-----------+



In [0]:
# 3. Vehicle Behavior Aggregations
# Average speed per VehicleType
df.groupBy("VehicleType").avg("SpeedKMH").show()

# Total toll collected per gate (EntryPoint)
df.groupBy("EntryPoint").sum("TollPaid").withColumnRenamed("sum(TollPaid)", "TotalToll").show()

# Most used ExitPoint
from pyspark.sql.functions import count

df.groupBy("ExitPoint").count().orderBy(col("count").desc()).show(1)

+-----------+-------------+
|VehicleType|avg(SpeedKMH)|
+-----------+-------------+
|       Bike|         55.0|
|        Car|         70.0|
|      Truck|         45.0|
|        Bus|         40.0|
+-----------+-------------+

+----------+---------+
|EntryPoint|TotalToll|
+----------+---------+
|     GateA|       80|
|     GateB|      170|
|     GateC|       50|
+----------+---------+

+---------+-----+
|ExitPoint|count|
+---------+-----+
|    GateD|    2|
+---------+-----+
only showing top 1 row



In [0]:
# 4. Window Functions
# Rank vehicles by speed within VehicleType
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, lag

window_by_type = Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())

df = df.withColumn("SpeedRank", rank().over(window_by_type))

#Find last exit time for each vehicle using lag()
window_by_vehicle = Window.partitionBy("VehicleID").orderBy("ExitTime")

df = df.withColumn("LastExitTime", lag("ExitTime").over(window_by_vehicle))

In [0]:
# 5. Session Segmentation
from pyspark.sql.window import Window
from pyspark.sql.functions import lag
window_spec = Window.orderBy("EntryTime")

# Add LastExitTime
df = df.withColumn("LastExitTime", lag("ExitTime").over(window_spec))

from pyspark.sql.functions import unix_timestamp

df = df.withColumn(
    "IdleTimeMinutes",
    (unix_timestamp("EntryTime") - unix_timestamp("LastExitTime")) / 60
)

df.select("LastExitTime", "EntryTime", "IdleTimeMinutes").show(truncate=False)

+-------------------+-------------------+---------------+
|LastExitTime       |EntryTime          |IdleTimeMinutes|
+-------------------+-------------------+---------------+
|NULL               |2024-05-01 08:01:00|NULL           |
|2024-05-01 08:20:00|2024-05-01 08:10:00|-10.0          |
|2024-05-01 08:45:00|2024-05-01 09:00:00|15.0           |
|2024-05-01 09:18:00|2024-05-01 09:15:00|-3.0           |
|2024-05-01 09:35:00|2024-05-01 10:05:00|30.0           |
+-------------------+-------------------+---------------+



In [0]:
# 6. Anomaly Detection
# Identify vehicles with speed > 70 and TripDuration < 10 minutes
df.filter((col("SpeedKMH") > 70) & (col("TripDurationMinutes") < 10)).show()

# Vehicles that paid less toll for longer trips
df.filter((col("TripDurationMinutes") > 30) & (col("TollPaid") < 50)).show()

# Suspicious backtracking (ExitPoint earlier than EntryPoint)
df.filter(col("EntryPoint") > col("ExitPoint")).show()

+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+
|LogID|VehicleID|EntryPoint|ExitPoint|EntryTime|ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExitTime|IdleTimeMinutes|
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+

+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+
|LogID|VehicleID|EntryPoint|ExitPoint|EntryTime|ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExitTime|IdleTimeMinutes|
+-----+---------+----------+---------+-------

In [0]:
# 7. Join with Metadata
# Prepare this small vehicle_registry.csv :
# VehicleID,OwnerName,Model,RegisteredCity
# V001,Anil,Hyundai i20,Delhi
# V002,Rakesh,Tata Truck,Chennai
# V003,Sana,Yamaha R15,Mumbai
# V004,Neha,Honda City,Bangalore
# V005,Zoya,Volvo Bus,Pune
# Join and group trips by RegisteredCity
vehicle_registry = spark.read.option("header", True).csv("file:/Workspace/Shared/Coding Assessment Datasets/vehicle_registry.csv")

df_joined = df.join(vehicle_registry, on="VehicleID", how="left")

df_joined.groupBy("RegisteredCity").count().show()

+--------------+-----+
|RegisteredCity|count|
+--------------+-----+
|     Bangalore|    1|
|       Chennai|    1|
|        Mumbai|    1|
|          Pune|    1|
|         Delhi|    1|
+--------------+-----+



In [0]:
# 8. Delta Lake Features
# Save Delta Table
df.write.format("delta").mode("overwrite").saveAsTable("traffic_logs")

# Read and verify
spark.sql("SELECT * FROM traffic_logs").show()

# Update toll for Bikes using MERGE
spark.sql("""
MERGE INTO traffic_logs AS target
USING (
    SELECT * FROM traffic_logs WHERE VehicleType = 'Bike'
) AS source
ON target.LogID = source.LogID
WHEN MATCHED THEN
UPDATE SET target.TollPaid = 40
""")

# Delete long trips
spark.sql("""
DELETE FROM traffic_logs
WHERE TripDurationMinutes > 60
""")

# View operation history
spark.sql("""
DESCRIBE HISTORY traffic_logs
""").show()

# Query an earlier version (version 0, before update/delete)
spark.sql("""
SELECT * FROM traffic_logs VERSION AS OF 0
""").show()

In [0]:
# 9. Advanced Conditions
# when/otherwise : Tag trip type as:
# "Short" <15min
# "Medium" 15-30min
# "Long" >30min
df = df.withColumn("TripType", when(col("TripDurationMinutes") < 15, "Short")
                   .when(col("TripDurationMinutes") <= 30, "Medium")
                   .otherwise("Long"))
df.show()

# Flag vehicles with more than 3 trips in a day
from pyspark.sql.functions import to_date

trip_count = df.withColumn("TripDate", to_date("EntryTime")) \
               .groupBy("VehicleID", "TripDate").count() \
               .filter(col("count") > 3)

trip_count.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+-------------------+---------------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|       LastExitTime|IdleTimeMinutes|TripType|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+-------------------+---------------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|        2|               NULL|           NULL|  Medium|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|        1|2024-05-01 08:20:00|          -10.0|    Long|
| L003|     V003|   

In [0]:
# Export & Reporting
# Write final enriched DataFrame to:
# Parquet partitioned by VehicleType
df.write.mode("overwrite").partitionBy("VehicleType").parquet("/output/traffic_partitioned/")

# CSV for dashboards
df.write.mode("overwrite").option("header", True).csv("/output/traffic_csv/")

# Create summary SQL View: total toll by VehicleType + ExitPoint
df.createOrReplaceTempView("traffic_view")

spark.sql("""
SELECT VehicleType, ExitPoint, SUM(TollPaid) as TotalToll
FROM traffic_view
GROUP BY VehicleType, ExitPoint
""").show()

+-----------+---------+---------+
|VehicleType|ExitPoint|TotalToll|
+-----------+---------+---------+
|        Car|    GateD|       50|
|      Truck|    GateC|      100|
|       Bike|    GateD|       30|
|        Bus|    GateA|       70|
|        Car|    GateC|       50|
+-----------+---------+---------+

