In [1]:
# Run this if something is not working
#!pip install folium
#!pip install ipyleaflet
#!pip install geojson
# in terminal: jupyter nbextension enable --py --sys-prefix ipyleaflet

In [52]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import *
import pyspark.sql.functions as F

from math import radians, cos, sin, asin, sqrt

In [3]:
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('Traffic Graph') \
    .getOrCreate()

In [4]:
spark.version

'2.3.0'

## Load Data

In [5]:
metadata_schema = StructType() \
        .add('Y', DoubleType(), False) \
        .add('X', DoubleType(), False) \
        .add('DetectorId', ShortType(), False) \
        .add('McsDetecto', ShortType(), False) \
        .add('McsDsRefer', StringType(), False) \
        .add('LaneId', ShortType(), False) \
        .add('Bearing', ShortType(), True) \
        .add('Location', StringType(), True) \
        .add('RegionId', ShortType(), False) \
        .add('Entreprene', StringType(), True) \
        .add('StationId', ShortType(), False) \
        .add('SiteId', ShortType(), False) \
        .add('SiteValidF', TimestampType(), False) \
        .add('SiteValidT', TimestampType(), False) \
        .add('DetectorVa', TimestampType(), False) \
        .add('Detector_1', TimestampType(), False)
        
edge_schema = StructType() \
    .add('src', StringType(), False) \
    .add('src_road', StringType(), False) \
    .add('src_meter', IntegerType(), False) \
    .add('dest', StringType(), False) \
    .add('dest_road', StringType(), False) \
    .add('dest_meter', IntegerType(), False) 

In [6]:
df_metadata_raw = spark.read.csv(
    '../../data/StockholmDetectorMetadata.csv', 
    sep=';', 
    schema=metadata_schema,
    ignoreLeadingWhiteSpace=True,
    ignoreTrailingWhiteSpace=True,
    header=True,
    timestampFormat='yyyy/MM/dd HH:mm:ss.SSS'
)

In [7]:
df_metadata_raw.show(1)

+----------------+----------------+----------+----------+------------+------+-------+--------+--------+----------+---------+------+-------------------+-------------------+-------------------+-------------------+
|               Y|               X|DetectorId|McsDetecto|  McsDsRefer|LaneId|Bearing|Location|RegionId|Entreprene|StationId|SiteId|         SiteValidF|         SiteValidT|         DetectorVa|         Detector_1|
+----------------+----------------+----------+----------+------------+------+-------+--------+--------+----------+---------+------+-------------------+-------------------+-------------------+-------------------+
|59.4008824050535|17.8788261183399|         1|         1|E18_A 25,940|     1|   null|    null|       4|      null|        1|     1|2000-01-01 00:00:00|9999-12-31 00:00:00|2016-05-09 00:00:00|9999-12-31 00:00:00|
+----------------+----------------+----------+----------+------------+------+-------+--------+--------+----------+---------+------+-------------------+-

### Remove Invalid sensors

In [8]:
print("Before removal:", df_metadata_raw.count(), "should be 2059")
df_metadata_raw = df_metadata_raw.where(col('Detector_1').like('%9999%'))
print("After removal:", df_metadata_raw.count(), "should be 2037")

Before removal: 2059 should be 2059
After removal: 2037 should be 2037


In [9]:
# Split Ds_Reference into road id and meters from reference point
@udf(StringType())
def get_road(s):
    try:
        r, km = s.split(' ')
        return r
    except:
        return None
    
@udf(IntegerType())
def get_meter(s):
    try:
        r, km = s.split(' ')
        k, m = km.split(',')
        meter = int(k)*1000 + int(m) # calculate how many meters from reference
        return meter
    except:
        return None
    
@udf(StringType())
def generate_node_id(reference):
    return reference.replace(' ', '-').replace(',', '')
    
df_metadata = df_metadata_raw \
    .withColumn('McsDsRefer_road', get_road('McsDsRefer')) \
    .withColumn('McsDsRefer_meter', get_meter('McsDsRefer')) \
    .withColumn('node', generate_node_id('McsDsRefer'))

In [10]:
df_metadata.show(1)

+----------------+----------------+----------+----------+------------+------+-------+--------+--------+----------+---------+------+-------------------+-------------------+-------------------+-------------------+---------------+----------------+-----------+
|               Y|               X|DetectorId|McsDetecto|  McsDsRefer|LaneId|Bearing|Location|RegionId|Entreprene|StationId|SiteId|         SiteValidF|         SiteValidT|         DetectorVa|         Detector_1|McsDsRefer_road|McsDsRefer_meter|       node|
+----------------+----------------+----------+----------+------------+------+-------+--------+--------+----------+---------+------+-------------------+-------------------+-------------------+-------------------+---------------+----------------+-----------+
|59.4008824050535|17.8788261183399|         1|         1|E18_A 25,940|     1|   null|    null|       4|      null|        1|     1|2000-01-01 00:00:00|9999-12-31 00:00:00|2016-05-09 00:00:00|9999-12-31 00:00:00|          E18_A|  

## Extract Nodes

In [11]:
num_sensor_at_node = df_metadata.groupBy("node").count()

print(num_sensor_at_node.count(), "should be 859")

859 should be 859


In [12]:
# Some senor locations have a laneId that does not start from 1
nodes = df_metadata.groupBy("node").agg(min('LaneId').alias('MinLaneId')).alias("m").join(
    df_metadata.alias("n"),
    (col('m.MinLaneId') == col('n.LaneId')) & (col('m.node') == col('n.node')),
    "inner"
).select(
    col("n.node").alias("node"),
    "McsDsRefer", 
    "X", 
    "Y", 
    col("DetectorVa").alias("Valid_From"), 
    col("Detector_1").alias("Valid_To"), 
    "McsDsRefer_road", 
    "McsDsRefer_meter"
).alias('n').join(
    num_sensor_at_node.select(col("node"), col("count").alias("Sensors")).alias('e'), 
    col('n.node') == col('e.node'),
    "inner"
).select(
    col("n.node").alias("node"),
    "McsDsRefer", 
    "X", 
    "Y", 
    "Valid_From", 
    "Valid_To", 
    "McsDsRefer_road", 
    "McsDsRefer_meter",
    "Sensors"
)

print(nodes.count(), "should be 859")

859 should be 859


Remove sensor not connected to any other sensor by road

In [13]:
# E18W 37,625
nodes = nodes.where(col("node") != "E18W-37625")

# E4_A 31,975
nodes = nodes.where(col("node") != "E4_A-31975")
print(nodes.count(), "should be 857")

857 should be 857


## Extract Edges

In [14]:
# Sort edges by reference point
# Flip roads that have wrong direction based on refernece point
sort = nodes.orderBy("McsDsRefer_road", "McsDsRefer_meter").select("node", "McsDsRefer_road", "McsDsRefer_meter")
flip_south = sort.where(col("McsDsRefer_road").like('%Z%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = sort.where(~ col("McsDsRefer_road").like('%Z%'))

flip_west = sort.where(col("McsDsRefer_road").like('%W%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = tmp.where(~ col("McsDsRefer_road").like('%W%'))

flip_c = sort.where(col("McsDsRefer_road").like('%C%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = tmp.where(~ col("McsDsRefer_road").like('%C%'))

flip_d = sort.where(col("McsDsRefer_road").like('%D%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = tmp.where(~ col("McsDsRefer_road").like('%D%'))

flip_f = sort.where(col("McsDsRefer_road").like('%F%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = tmp.where(~ col("McsDsRefer_road").like('%F%'))

flip_h = sort.where(col("McsDsRefer_road").like('%H%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = tmp.where(~ col("McsDsRefer_road").like('%H%'))

flip_s = sort.where(col("McsDsRefer_road").like('%S%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = tmp.where(~ col("McsDsRefer_road").like('%S%'))

flip_u = sort.where(col("McsDsRefer_road").like('%U%')).orderBy("McsDsRefer_road", "McsDsRefer_meter", ascending=[0, 0])
tmp = tmp.where(~ col("McsDsRefer_road").like('%U%'))

print(tmp.count() + 
      flip_south.count() + 
      flip_west.count() + 
      flip_c.count() +
      flip_d.count() + 
      flip_f.count() +
      flip_h.count() + 
      flip_s.count() + 
      flip_u.count(), "should equal 857")

857 should equal 857


In [15]:
def createEdges(df, flipped):
    head = df.first()[0]
    if flipped:
        tail = df.orderBy("node").first()[0]
    else:
        tail = df.orderBy(desc("node")).first()[0]
    left = df.where(col("node") != tail) \
        .withColumnRenamed("node", "src") \
        .withColumnRenamed("McsDsRefer_road", "src_road") \
        .withColumnRenamed("McsDsRefer_meter", "src_meter")
    right = df.where(col("node") != head) \
        .withColumnRenamed("node", "dest") \
        .withColumnRenamed("McsDsRefer_road", "dest_road") \
        .withColumnRenamed("McsDsRefer_meter", "dest_meter")

    return left.coalesce(1).rdd.zip(right.coalesce(1).rdd).map(
        lambda args: args[0] + args[1]
    )

rows = createEdges(tmp, False)
rows = rows.union(createEdges(flip_south, True))
rows = rows.union(createEdges(flip_west, True))
rows = rows.union(createEdges(flip_c, True))
rows = rows.union(createEdges(flip_d, True))
rows = rows.union(createEdges(flip_f, True))
rows = rows.union(createEdges(flip_h, True))
rows = rows.union(createEdges(flip_s, True))
rows = rows.union(createEdges(flip_u, True))

sqlContext = SQLContext(spark)
edges = sqlContext.createDataFrame(rows, edge_schema)

print(edges.count(), "should be 848")

848 should be 848


In [16]:
# Remove edges that go in a circle
edges = edges.where("src != dest")
print(edges.count(), "should be 848")

848 should be 848


In [17]:
# Remove edges not on same road
edges = edges.where(col("src_road") == col("dest_road"))
print(edges.count(), "should be 791")

791 should be 791


In [18]:
# Remove edges that are too far apart
edges = edges.where(col("dest_meter") - col("src_meter") <= 1000)
edges = edges.where(col("src_meter") - col("dest_meter") <= 1000)
print(edges.count(), "should be 752")

752 should be 752


Remove wrong edges

In [19]:
# E4_C 37,840 to E4_C 38,465
edges = edges.where((col("src") != 'E4_C-38465') & (col("dest") != 'E4_C-37840'))
# E75_A 5,195 to E75_A 5,945
edges = edges.where((col("src") != 'E75_A-5195') & (col("dest") != 'E75_A-5945'))
# E182_C 2,615 to E182_C 1,620
edges = edges.where((col("src") != 'E182_C-2615') & (col("dest") != 'E182_C-1620'))

print(edges.count(), "should be 749")

749 should be 749


Add missing edges manually

In [20]:
newRows = [ \
    ["E20_B-23500", '', 0, "E20O-23710", '', 0], 
    ["E20O-23920", '', 0, "E4N-24325", '', 0],   
    ["E4Z-24245", '', 0, "E4_F-23950", '', 0],   
    ["E4Z-26795", '', 0, "E4_C-26570", '', 0],   
    ["E4Z-32540", '', 0, "E4_C-32345", '', 0],   
    ["E4N-37610", '', 0, "E4_A-37885", '', 0],   
    ["E4Z-38710", '', 0, "E4_C-38465", '', 0],   
    ["E4Z-39895", '', 0, "E4_C-39770", '', 0],   
    ["E4N-39215", '', 0, "E4_A-39960", '', 0],   
    ["E4Z-38060", '', 0, "E4_C-37840", '', 0],   
    ["E4N-49370", '', 0, "E4_M-49770", '', 0],   
    ["E4N-50890", '', 0, "E4_E-51085", '', 0],   
    ["E75_U-0235", '', 0, "E4Z-51040", '', 0],   
    ["E4_E-51610", '', 0, "E75O-0900", '', 0],   
    ["E4_H-51530", '', 0, "E75O-0750", '', 0],   
    ["E75W-1270", '', 0, "E75_U-0905", '', 0],   
    ["E75O-1075", '', 0, "E75_A-1170", '', 0],   
    ["E75_B-1585", '', 0, "E75O-1660", '', 0],   
    ["E75W-1885", '', 0, "E75_C-1670", '', 0],   
    ["E226N-23925", '', 0, "E75W-2170", '', 0],  
    ["E75O-2085", '', 0, "E75_E-2300", '', 0],   
    ["E75_E-2750", '', 0, "E226Z-23325", '', 0], 
    ["E75_H-2370", '', 0, "E226Z-23325", '', 0], 
    ["E226N-23375", '', 0, "E226_R-23495", '', 0],
    ["E226_R-23865", '', 0, "E75O-2975", '', 0], 
    ["E75W-3155", '', 0, "E75_H-3015", '', 0],  
    ["E75O-3215", '', 0, "E75_A-3335", '', 0], 
    ["E75_D-3460", '', 0, "E75W-3360", '', 0], 
    ["E73_G-53400", '', 0, "E75W-3710", '', 0],
    ["E75O-3705", '', 0, "E75_R-3870", '', 0], 
    ["E73_E-53135", '', 0, "E75O-4510", '', 0],
    ["E75_R-4230", '', 0, "E75_U-3880", '', 0],
    ["E75_U-3710", '', 0, "E75_S-3515", '', 0],
    ["E73N-51880", '', 0, "E73_G-52155", '', 0],
    ["E265W-2440", '', 0, "E265_C-2240", '', 0],
    ["E265O-1590", '', 0, "E265_A-1800", '', 0],
    ["E182_D-2285", '', 0, "E182Z-2060", '', 0],
    ["E182N-2015", '', 0, "E182_A-2325", '', 0],
    ["E75W-4795", '', 0, "E75_U-4610", '', 0],
    ["E75O-4810", '', 0, "E75_A-4990", '', 0],
    ["E75_D-5155", '', 0, "E75W-4970", '', 0],
    ["E75W-5600", '', 0, "E75_C-5505", '', 0],
    ["E75W-7280", '', 0, "E75_C-6800", '', 0],
    ["E75O-5860", '', 0, "E75_A-5945", '', 0],       
    ["E75_B-5415", '', 0, "E75O-5505", '', 0],
    ["E222W-5165", '', 0, "E75W-7580", '', 0],
    ["E20W-63410", '', 0, "E20_C-63260", '', 0],
    ["E20O-62330", '', 0, "E4_E-62660", '', 0],
    ["E20O-62120", '', 0, "E20_A-62330", '', 0],
    ["E20W-62835", '', 0, "E20_D-62630", '', 0],
    ["E75W-0005", '', 0, "E4N-52220", '', 0],
    ["E4Z-52030", '', 0, "E4_H-51890", '', 0],
    ["E4N-57420", '', 0, "E4_M-57730", '', 0],
    ["E6N-14990", '', 0, "E6_A-15170", '', 0],
    ["E6Z-13460", '', 0, "E6_C-13220", '', 0],
    ["E6N-12920", '', 0, "E6_A-13110", '', 0],
    ["E6Z-12170", '', 0, "E6_C-12001", '', 0],
    ["E6Z-11420", '', 0, "E6_F-11180", '', 0],
    ["E4_N-50225", '', 0, "E4Z-49710", '', 0],    
    ["E4_E-71550", '', 0, "E265O-0820", '', 0],
    ["E4Z-71400", '', 0, "E265O-0450", '', 0],
    ["E265W-0200", '', 0, "E4Z-70960", '', 0],
    ["E4Z-69390", '', 0, "E4_C-69130", '', 0 ],
    ["E4N-68340", '', 0, "E4_A-68620", '', 0 ],
    ["E4Z-66710", '', 0, "E4_C-66510", '', 0 ],
    ["E4Z-66710", '', 0, "E4_S-66525", '', 0 ],
    ["E4_S-66280", '', 0, "E18W-31635", '', 0 ],
    ["E18O-31635", '', 0, "E18_T-31975", '', 0 ],
    ["E18O-32360", '', 0, "E4Z-65780", '', 0 ],
    ["E4N-65420", '', 0, "E18W-32810", '', 0 ],
    ["E18_T-32440", '', 0,"E4N-67230", '', 0],
    ["E18O-25760", '', 0,"E18_A-25940", '', 0],
    ["E279_T-7795", '', 0,"E18W-29140", '', 0],
    ["E18O-29550", '', 0,"E18_E-29765", '', 0],
    ["E18_E-29765", '', 0,"E279Z-7195", '', 0],
    ["E279N-7445", '', 0,"E18O-30020", '', 0],
    ["E18W-30280", '', 0,"E279Z-7595", '', 0],
    ["E279N-7230", '', 0,"E279_T-7325", '', 0],
    ["E4N-64090", '', 0,"E4_A-64770", '', 0],
    ["E4N-62410", '', 0,"E4_M-62705", '', 0],
    ["E4_M-62705", '', 0,"E4N-63105", '', 0],
    ["E4Z-63225", '', 0,"E4_N-62890", '', 0],
    ["E4_N-62890", '', 0,"E4Z-62490", '', 0],
    ["E4N-61570", '', 0,"E4_A-61730", '', 0],
    ["E4_A-61730", '', 0,"E4_B-62065", '', 0],
    ["E4_B-62065", '', 0,"E4N-62410", '', 0],
    ["E4N-60510", '', 0,"E4_M-60590", '', 0],
    ["E182N-3805", '', 0,"E182_V-3915", '', 0],
    ["E182_V-3915", '', 0,"E18O-37735", '', 0],
    ["E18_E-37410", '', 0,"E182Z-3370", '', 0],
    ["E182_B-2690", '', 0,"E182N-2980", '', 0],
    ["E182Z-2940", '', 0,"E182_C-2615", '', 0],
    ["E182Z-1805", '', 0,"E182_C-1620", '', 0],
    ["E182Z-1150", '', 0,"E182_U-0960", '', 0],
    ["E182_U-0150", '', 0,"E20O-61110", '', 0],
    ["E20W-61060", '', 0,"E20_F-60910", '', 0],
    ["E20_F-60410", '', 0,"E182N-0960", '', 0],
    ["E20_B-60415", '', 0,"E20O-60595", '', 0],
    ["E20W-60600", '', 0,"E20_C-60490", '', 0],
    ["E182Z-0280", '', 0,"E20W-60100", '', 0],
    ["E20O-59970", '', 0,"E182N-0005", '', 0],
    ["E20_D-59245", '', 0,"E20W-59240", '', 0],
    ["E4_H-58860", '', 0,"E20O-59620", '', 0],
    ["E20O-58995", '', 0,"E20_A-59100", '', 0],
    ["E20W-58570", '', 0,"E4Z-58140", '', 0],
    ["E4_M-58480", '', 0,"E4N-58560", '', 0],
    ["E425N-58125", '', 0,"E4N-58480", '', 0],
    ["E4Z-59835", '', 0,"E4_H-59690", '', 0],
    ["E4Z-60055", '', 0,"E426Z-59740", '', 0],
    ["E4_N-58480", '', 0,"E4Z-58140", '', 0],
    ["E4N-59975", '', 0,"E4_C-61210", '', 0],
    ["E20_S-58940", '', 0, "E4N-59735", '', 0],
    ["E426N-59735", '', 0, "E4N-59975", '', 0],
    ["E4N-56780", '', 0, "E4_A-60170", '', 0],
    ["E4N-58730", '', 0, "E20O-58840",'', 0],
    ["E20W-59830", '', 0, "E20_S-59610", '', 0],
    ["E75W-7075", '', 0, "E75_C-6880", '', 0],
    ["E4N-71200", '', 0, "E4_G-71310", '', 0],
    ["E4_G-71310", '', 0, "E4_E-71550", '', 0]
]

total = len(newRows) + edges.count()

newDF = spark.createDataFrame(newRows, edge_schema)
edges = edges.union(newDF)
print(edges.count(), "should be " + str(total)) 

868 should be 868


### Validate nodes and edges

Find missing nodes

In [21]:
src_nodes = edges.select("src")
dest_nodes = edges.select("dest").withColumnRenamed("dest", "src")

edge_nodes_union = src_nodes.union(dest_nodes).distinct()

missing_nodes = nodes.select("node", "X", "Y").alias('n').join(
    edge_nodes_union.select("src").alias('e'), 
    col('n.node') == col('e.src'), 
    "leftouter"
).filter(isnull(col("src")))

missing_nodes.count()

0

In [22]:
missing_nodes.show()

+----+---+---+---+
|node|  X|  Y|src|
+----+---+---+---+
+----+---+---+---+



Find end nodes

In [23]:
src_nodes = edges.select("src")
end_nodes = nodes.select("node", "X", "Y").alias('n').join(
    src_nodes.select("src").alias('e'), 
    col('n.node') == col('e.src'), 
    "leftouter"
).filter(isnull(col("src")))

end_nodes.count()

59

In [24]:
end_nodes.show(1)

+----------+----------------+----------------+----+
|      node|               X|               Y| src|
+----------+----------------+----------------+----+
|E18O-37735|18.0415869015134|59.3823553194917|null|
+----------+----------------+----------------+----+
only showing top 1 row



Find start nodes

In [25]:
dest_nodes = edges.select("dest")
start_nodes = nodes.select("node", "X", "Y").alias('n').join(
    dest_nodes.select("dest").alias('e'), 
    col('n.node') == col('e.dest'), 
    "leftouter"
).filter(isnull(col("dest")))

start_nodes.count()

33

In [26]:
start_nodes.show(1)

+-----------+----------------+----------------+----+
|       node|               X|               Y|dest|
+-----------+----------------+----------------+----+
|E425N-58125|18.0265518477877|59.3440134702393|null|
+-----------+----------------+----------------+----+
only showing top 1 row



## Visualize nodes and edges on a map

In [27]:
# Docs: https://ipyleaflet.readthedocs.io/en/latest/
from ipyleaflet import Map, GeoJSON, TileLayer
from geojson import FeatureCollection, Feature, MultiPolygon
import ipywidgets as widgets
#url = 'http://a.tile.basemaps.cartocdn.com/light_all/{z}/{x}/{y}.png'
url = 'http://a.tile.basemaps.cartocdn.com/pitney-bowes-grey/{z}/{x}/{y}.png'

In [28]:
provider = TileLayer(url=url, opacity=1)
myMap = Map(
    default_tiles=provider, 
    center=[59.304591, 17.703240], 
    zoom=10, 
    layout=widgets.Layout(width='130%', height='1000px')
)
myMap.layout.height = '600px'
myMap.layout.width = '100%'

In [29]:
node_coords = nodes.select("X", "Y", "node")

In [30]:
node_features = []
for data in node_coords.toLocalIterator():
    node_p = MultiPolygon([ \
                      ([(data['X'], data['Y']), \
                        (data['X']+0.000001, data['Y']+0.000001), \
                        (data['X']-0.000001, data['Y']+0.000001), \
                        (data['X'], data['Y'])],) \
                     ])
    
    node_features.append(Feature(geometry=node_p, \
                    properties={'style':{
                        'color': '#3498db', 
                        'fillColor': '#3498db', 
                        'fillOpacity': 1.0, 
                        'weight': 10
                    }}))

node_data = FeatureCollection(node_features)
node_g = GeoJSON(data=node_data)

Draw edges

In [31]:
edge_coords = edges.alias('a').join(
    nodes.select("node", "X", "Y")
    .withColumnRenamed("X", "src_X") 
    .withColumnRenamed("Y", "src_Y") 
    .alias('b'),
    col('a.src') == col('b.node')
).select("src", "dest", "src_X", "src_Y").alias('c').join(
    nodes.select("node", "X", "Y")
    .withColumnRenamed("X", "dest_X") 
    .withColumnRenamed("Y", "dest_Y") 
    .alias('d'),
    col('c.dest') == col('d.node')
).select("src", "dest", "src_X", "src_Y", "dest_X", "dest_Y")

In [32]:
edge_features = []
for data in edge_coords.toLocalIterator():
    edge_p = MultiPolygon([ \
                      ([(data['src_X'], data['src_Y']), \
                        (data['src_X']+0.00001, data['src_Y']+0.000001), \
                        (data['dest_X']-0.0001, data['dest_Y']+0.00001), \
                        (data['dest_X'], data['dest_Y'])],\
                      )])
    
    edge_features.append(Feature(geometry=edge_p, \
                    properties={'style':{
                        'color': '#e74c3c', 
                        'fillColor': '#e74c3c', 
                        'fillOpacity': 1.0, 
                        'weight': 3
                    }}))
    
    

edge_data = FeatureCollection(edge_features)
edge_g = GeoJSON(data=edge_data)

Draw missing nodes

In [33]:
missing_node_coords = missing_nodes.select("X", "Y", "node")

In [34]:
missing_node_features = []
for data in missing_node_coords.toLocalIterator():
    missing_node_p = MultiPolygon([ \
                      ([(data['X'], data['Y']), \
                        (data['X']+0.000001, data['Y']+0.000001), \
                        (data['X']-0.000001, data['Y']+0.000001), \
                        (data['X'], data['Y'])],) \
                     ])
    
    missing_node_features.append(Feature(geometry=missing_node_p, \
                    properties={'style':{
                        'color': '#8e44ad', 
                        'fillColor': '#8e44ad', 
                        'fillOpacity': 1.0, 
                        'weight': 16
                    }}))

missing_node_data = FeatureCollection(missing_node_features)
missing_node_g = GeoJSON(data=missing_node_data)

Draw end nodes

In [35]:
end_node_coords = end_nodes.select("X", "Y", "node")

In [36]:
end_node_features = []
for data in end_node_coords.toLocalIterator():
    end_node_p = MultiPolygon([ \
                      ([(data['X'], data['Y']), \
                        (data['X']+0.000001, data['Y']+0.000001), \
                        (data['X']-0.000001, data['Y']+0.000001), \
                        (data['X'], data['Y'])],) \
                     ])
    
    end_node_features.append(Feature(geometry=end_node_p, \
                    properties={'style':{
                        'color': '#e67e22', 
                        'fillColor': '#e67e22', 
                        'fillOpacity': 1.0, 
                        'weight': 13
                    }}))

end_node_data = FeatureCollection(end_node_features)
end_node_g = GeoJSON(data=end_node_data)

Draw start nodes

In [37]:
start_node_coords = start_nodes.select("X", "Y", "node")

In [38]:
start_node_features = []
for data in start_node_coords.toLocalIterator():
    start_node_p = MultiPolygon([ \
                      ([(data['X'], data['Y']), \
                        (data['X']+0.000001, data['Y']+0.000001), \
                        (data['X']-0.000001, data['Y']+0.000001), \
                        (data['X'], data['Y'])],) \
                     ])
    
    start_node_features.append(Feature(geometry=start_node_p, \
                    properties={'style':{
                        'color': '#2ecc71', 
                        'fillColor': '#2ecc71', 
                        'fillOpacity': 1.0, 
                        'weight': 13
                    }}))

start_node_data = FeatureCollection(start_node_features)
start_node_g = GeoJSON(data=start_node_data)

In [39]:
myMap.add_layer(edge_g)
myMap.add_layer(node_g)
myMap.add_layer(end_node_g)
myMap.add_layer(start_node_g)
myMap.add_layer(missing_node_g)

In [40]:
myMap

In [41]:
#myMap.save("graph_map.html")
fig = myMap.figure()
embed_minimal_html('graph_map.html', views=[fig])

AttributeError: 'Map' object has no attribute 'figure'

## Write CSV Files

In [None]:
nodes.select("node", "McsDsRefer", "X", "Y", "Valid_From", "Valid_To", "McsDsRefer_road", "McsDsRefer_meter", "Sensors") \
    .coalesce(1).write \
    .option('sep', ';') \
    .format("com.databricks.spark.csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save("../../data/nodes")

In [None]:
edges.select("src","dest").coalesce(1).write \
    .option('sep', ';') \
    .format("com.databricks.spark.csv") \
    .option("header", "true") \
    .mode("overwrite") \
    .save("../../data/edges")

## Write Parquet Files

In [None]:
nodes.select(
    "node", 
    "McsDsRefer", 
    "X", "Y", 
    "Valid_From", "Valid_To", 
    "McsDsRefer_road", "McsDsRefer_meter", 
    "Sensors"
).coalesce(1).write.mode("overwrite").parquet("../../data/nodes-parquet")

In [None]:
edges.select("src","dest").coalesce(1).write.mode("overwrite").parquet("../../data/edges-parquet")

## Statistics

In [59]:
print(nodes.agg({"Sensors": "avg"}).collect()[0])
print(nodes.agg({"Sensors": "max"}).collect()[0])
print(nodes.agg({"Sensors": "min"}).collect()[0])
print(nodes.agg({"Sensors": "sum"}).collect()[0])
print("total nodes:", nodes.count())
print("total start nodes:", start_nodes.count())
print("total end nodes:", end_nodes.count())
print("total edges: ", edges.count())

Row(avg(Sensors)=2.3733955659276544)
Row(max(Sensors)=6)
Row(min(Sensors)=1)
Row(sum(Sensors)=2034)
total nodes: 857
total start nodes: 33
total end nodes: 59
total edges:  868
