In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import *

from math import radians, cos, sin, asin, sqrt

In [2]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('Traffic Graph') \
    .getOrCreate()

In [3]:
spark.version

'2.3.0'

In [4]:
node_schema = StructType() \
    .add('node', StringType(), False) \
    .add('McsDsRefer', StringType(), False) \
    .add('Y', DoubleType(), False) \
    .add('X', DoubleType(), False) \
    .add('Valid_From', TimestampType(), False) \
    .add('Valid_To', TimestampType(), False) \
    .add('McsDsRefer_road', StringType(), True) \
    .add('McsDsRefer_meter', IntegerType(), False) 

edge_schema = StructType() \
    .add('src', StringType(), False) \
    .add('dest', StringType(), False) 

avg_sensor_speed_schema = StructType() \
    .add('node', StringType(), False) \
    .add('Average_Speed', DoubleType(), False) \

In [5]:
nodes = spark.read.csv(
    '../../data/nodes/*.csv', 
    sep=';', 
    schema=node_schema,
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
    header=True,
)

In [6]:
print("Number of nodes:", nodes.count())
nodes.show(1)

Number of nodes: 857
+-----------+------------+----------------+----------------+-------------------+-------------------+---------------+----------------+
|       node|  McsDsRefer|               Y|               X|         Valid_From|           Valid_To|McsDsRefer_road|McsDsRefer_meter|
+-----------+------------+----------------+----------------+-------------------+-------------------+---------------+----------------+
|E18_A-25940|E18_A 25,940|17.8788261183399|59.4008824050535|2016-05-09 00:00:00|9999-12-31 00:00:00|          E18_A|           25940|
+-----------+------------+----------------+----------------+-------------------+-------------------+---------------+----------------+
only showing top 1 row



In [7]:
edges = spark.read.csv(
    '../../data/edges/*.csv', 
    sep=';', 
    schema=edge_schema,
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
    header=True,
)

In [8]:
edges.show(1)

+----------+----------+
|       src|      dest|
+----------+----------+
|E182N-0005|E182N-0190|
+----------+----------+
only showing top 1 row



In [9]:
df_avg_sensor_speed = spark.read.csv(
    '../../data/avg_sensor_speed.csv', 
    sep=';', 
    schema=avg_sensor_speed_schema,
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
    header=True,
)

In [10]:
df_avg_sensor_speed.show(1)

+-----------+-------------+
|       node|Average_Speed|
+-----------+-------------+
|E425N-58125|         53.5|
+-----------+-------------+
only showing top 1 row



## Check for missing values

In [11]:
print(nodes.count(), df_avg_sensor_speed.count(), "Should be the same")

857 826 Should be the same


In [12]:
nodes.alias('n').join(
    df_avg_sensor_speed.alias('d'),
    col('n.node') == col('d.node'), "left_outer"
).where(col('d.node').isNull()).select('n.node').show(40)

+-----------+
|       node|
+-----------+
| E222W-5165|
|E226Z-22915|
|E226Z-23325|
|E265O-10950|
|E265O-11210|
|E265O-12330|
| E265W-0200|
| E265W-0570|
| E265W-1160|
|E265W-10950|
|E265W-11215|
|E265W-12270|
| E265W-4875|
| E4_C-61210|
| E4_C-66510|
|E426N-59530|
|  E4N-47800|
|  E4N-56165|
|  E4N-67230|
|  E4N-71440|
|  E4Z-39500|
|  E4Z-47635|
|  E4Z-48385|
|  E4Z-53595|
|  E4Z-57055|
|E73_G-52335|
| E75_C-6800|
| E75_E-2625|
| E75_U-4070|
| E75_U-4370|
|  E75W-6680|
| E4_M-58205|
| E4_M-57730|
+-----------+



In [13]:
# Ignore sensors not connected to the rest of the road network
nodes.alias('n').join(
    df_avg_sensor_speed.alias('d'),
    col('n.node') == col('d.node'), "right_outer"
).where(col('n.node').isNull()).select('d.node').show(40)

+----------+
|      node|
+----------+
|E18W-37625|
|E4_A-31975|
+----------+



## Fix missing average speed

In [14]:
# Use speed from nearest sensor
newRows = [\
    ["E222W-5165", 68.27130643868554],
    ["E226Z-22915", 63.23934818358499],
    ["E226Z-23325", 71.3023349436393],
    ["E265O-10950", 66.4693446088795],
    ["E265O-11210", 66.4693446088795],
    ["E265O-12330", 69.81336966040674],
    ["E265W-0200", 78.98594787940726],
    ["E265W-0570", 79.73280352252591],
    ["E265W-1160", 74.10397715252803],
    ["E265W-10950", 75.04028344010872],
    ["E265W-11215", 75.04028344010872],
    ["E265W-12270", 73.28029494518289],
    ["E265W-4875", 88.53025537132804],
    ["E4_C-61210", 66.55019463991616],
    ["E4_C-66510", 77.32215314136126],
    ["E426N-59530", 49.940901635270144],
    ["E4N-47800", 68.21320409782346],
    ["E4N-56165", 44.60272686983498],
    ["E4N-67230", 78.87379549991374],
    ["E4N-71440", 74.30119065869488],
    ["E4Z-39500", 72.76563630226165],
    ["E4Z-47635", 75.63683736333309],
    ["E4Z-48385", 79.1709561956707],
    ["E4Z-53595", 62.29304391916363],
    ["E4Z-57055", 48.35623931623932],
    ["E73_G-52335", 51.7314933464761],
    ["E75_C-6800", 61.999432697266634],
    ["E75_E-2625", 71.3023349436393],
    ["E75_U-4070", 45.566259021651966],
    ["E75_U-4370", 55.20294761999602],
    ["E75W-6680", 73.83707952255465],
    ["E4_M-58205", 50.580917642541756],
    ["E4_M-57730", 42.22751271494309]
]

newDF = spark.createDataFrame(newRows, avg_sensor_speed_schema)
fixed_avg_sensor_speed = df_avg_sensor_speed.union(newDF)

In [15]:
nodes_with_speed = nodes.alias('n').join(
    fixed_avg_sensor_speed.alias('d'),
    col('n.node') == col('d.node'), "left_outer"
).select(
    col("n.node").alias("node"),
    "McsDsRefer",
    "X",
    "Y",
    "Valid_From",
    "Valid_To",
    "McsDsRefer_road",
    "McsDsRefer_meter",
    "Average_Speed"
)

print(nodes_with_speed.count(), "should be:", nodes.count())

857 should be: 857


In [16]:
missing_speed = nodes_with_speed.where(isnull("node")).select("node").sort("node")
missing_speed.show()

+----+
|node|
+----+
+----+



In [17]:
nodes_with_speed.show(1)

+----------+-----------+----------------+----------------+-------------------+-------------------+---------------+----------------+-----------------+
|      node| McsDsRefer|               X|               Y|         Valid_From|           Valid_To|McsDsRefer_road|McsDsRefer_meter|    Average_Speed|
+----------+-----------+----------------+----------------+-------------------+-------------------+---------------+----------------+-----------------+
|E18O-37735|E18O 37,735|59.3823553194917|18.0415869015134|2001-01-01 00:00:00|9999-12-31 00:00:00|           E18O|           37735|59.29376224689745|
+----------+-----------+----------------+----------------+-------------------+-------------------+---------------+----------------+-----------------+
only showing top 1 row



## Calculate edge weight

In [18]:
# Uses the km references if sensors are on same road else the gps coordinates. 
# Based on: "https://stackoverflow.com/questions/15736995/"
@udf(DoubleType())
def extract_distance(start_name, end_name, start_lon, start_lat, end_lon, end_lat):
    start_r, start_km = start_name.split('-')
    end_r, end_km = end_name.split('-')
    if start_r == end_r:
        start_km_float = float(int(start_km) / 1000)
        end_km_float = float(int(end_km) / 1000) 
        if (start_km_float > end_km_float):
            return start_km_float - end_km_float
        return end_km_float - start_km_float
    
    earth_radius = 6371 
    # Degrees to radians 
    start_lon, start_lat, end_lon, end_lat = map(radians, [start_lon, start_lat, end_lon, end_lat])
    # Haversine formula 
    dlon = end_lon - start_lon 
    dlat = end_lat - start_lat 
    a = sin(dlat/2)**2 + cos(start_lat) * cos(end_lat) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = earth_radius * c
    return km

# Time it takes to travel from src sensor to dest sensor
@udf(DoubleType())
def calculate_time(distance, speed):
    return (distance / speed) * 60

edges_with_coords = edges.alias('a').join(
    nodes.select("node", "X", "Y")
    .withColumnRenamed("X", "src_X") 
    .withColumnRenamed("Y", "src_Y") 
    .alias('b'),
    col('a.src') == col('b.node'), "left_outer"
).select("src", "dest", "src_X", "src_Y").alias('c').join(
    nodes.select("node", "X", "Y")
    .withColumnRenamed("X", "dest_X") 
    .withColumnRenamed("Y", "dest_Y") 
    .alias('d'),
    col('c.dest') == col('d.node'), "left_outer"
).select('src', 'dest', 'src_X', 'src_Y', 'dest_X', 'dest_Y')

edges_with_distance = edges_with_coords.withColumn('distance', extract_distance('src', 'dest', 'src_X', 'src_Y', 'dest_X', 'dest_Y')) \
.select('src', 'dest', 'distance')

edges_with_avg_speed = edges_with_distance.alias('a').join(
    nodes_with_speed.alias('z'),
    col('a.dest') == col('z.node')
).select('src', 'dest', 'distance', 'Average_Speed')

edges_with_weight = edges_with_avg_speed.withColumn('weight', calculate_time('distance', 'Average_Speed'))\
    .select('src', 'dest', 'weight')

edges_with_weight.show(2)

+-----------+----------+-------------------+
|        src|      dest|             weight|
+-----------+----------+-------------------+
| E18O-37610|E18O-37735|  0.126488853393553|
|E182_V-3915|E18O-37735|0.08625730739048927|
+-----------+----------+-------------------+
only showing top 2 rows



In [19]:
edges_with_weight.where("weight == 0").show()

+---+----+------+
|src|dest|weight|
+---+----+------+
+---+----+------+



In [20]:
print("Total nodes:", nodes.count())
print("Total number of nodes with avg speed:", nodes_with_speed.count())
print("--------------------------------------")
print("Total edges:", edges.count())
print("Total edges with coords:", edges_with_coords.count())
print("Total edges with distance:", edges_with_distance.count())
print("Total edges with avg speed:", edges_with_avg_speed.count())
print("Total Edges with weight:", edges_with_weight.count())

Total nodes: 857
Total number of nodes with avg speed: 857
--------------------------------------
Total edges: 868
Total edges with coords: 868
Total edges with distance: 868
Total edges with avg speed: 868
Total Edges with weight: 868


In [21]:
# Fixes memory limitations
edges_with_weight.write.mode("overwrite").parquet("../../data/edges_with_weight-parquet")

In [22]:
spark.read.parquet("../../data/edges_with_weight-parquet/*").select("src","dest", "weight").coalesce(1).write \
    .option('sep', ';') \
    .format("com.databricks.spark.csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save("../../data/edges_with_weight")

## Statistics

In [25]:
print("Total edges:", edges_with_weight.count())
print(edges_with_weight.agg({"weight": "avg"}).collect()[0])
print(edges_with_weight.agg({"weight": "max"}).collect()[0])
print(edges_with_weight.agg({"weight": "min"}).collect()[0])
print(edges_with_weight.agg({"weight": "sum"}).collect()[0])

Total edges: 868
Row(avg(weight)=0.22417527307519705)
Row(max(weight)=1.612499999999999)
Row(min(weight)=0.03370598365322795)
Row(sum(weight)=194.58413702927103)
