In [1]:
# Remember to add tensorflow-hadoop-1.0-SNAPSHOT.jar to the hops cluster setup

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1293,application_1536227070932_0857,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

import tensorflow as tf

In [3]:
spark.version

u'2.3.0'

# Parameters

In [4]:
year = 2016
month = 11

# Import Data

In [5]:
file_path = "hdfs:///Projects/TrafficFlow/TrafficFlowParquet/TrafficFlowAll/Year=" + str(year) + "/Month=" + str(month) + "/*.parquet"
df_raw = spark.read.parquet(file_path)

In [6]:
df_raw.printSchema()

root
 |-- Timestamp: timestamp (nullable = true)
 |-- Date: date (nullable = true)
 |-- Day: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- Road: string (nullable = true)
 |-- Km_Ref: integer (nullable = true)
 |-- Detector_Number: integer (nullable = true)
 |-- Traffic_Direction: short (nullable = true)
 |-- Flow_In: short (nullable = true)
 |-- Average_Speed: short (nullable = true)
 |-- Density: double (nullable = true)
 |-- Sign_Aid_Det_Comms: short (nullable = true)
 |-- Status: short (nullable = true)
 |-- Legend_Group: short (nullable = true)
 |-- Legend_Sign: short (nullable = true)
 |-- Legend_SubSign: short (nullable = true)
 |-- Protocol_Version: string (nullable = true)

In [7]:
df_raw.show()

+-------------------+----------+---+---------+----+------+-----+------+---------------+-----------------+-------+-------------+------------------+------------------+------+------------+-----------+--------------+----------------+
|          Timestamp|      Date|Day|DayOfWeek|Hour|Minute| Road|Km_Ref|Detector_Number|Traffic_Direction|Flow_In|Average_Speed|           Density|Sign_Aid_Det_Comms|Status|Legend_Group|Legend_Sign|Legend_SubSign|Protocol_Version|
+-------------------+----------+---+---------+----+------+-----+------+---------------+-----------------+-------+-------------+------------------+------------------+------+------------+-----------+--------------+----------------+
|2016-11-11 07:19:00|2016-11-11| 11|        5|   6|    19|  E4Z| 71400|              1|               78|     14|           93|  9.03225806451613|                 0|     3|         255|          1|             1|               4|
|2016-11-11 23:52:00|2016-11-11| 11|        5|  22|    52|E265O|  1300|         

# Remove Errors

In [7]:
# Set 251 avg speed to 1 as it means less than 2
avg_speed_fix_udf = udf(lambda speed: 1 if speed == 251 else speed, ShortType())
df_raw = df_raw.withColumn('Average_Speed', avg_speed_fix_udf('Average_Speed'))

# Remove error codes
df_raw = df_raw.where(col('Average_Speed') <= 250)

# Only valid flow
df_raw = df_raw.where(col('Flow_In') >= 0)
df_raw = df_raw.where(col('Flow_In') <= 120)

# Generate Sensor Id

In [8]:
@udf(StringType())
def generate_sensor_id(road, km, lane):
    if km < 10:
        return road + "-000" + str(km)[-3:]  + "-" + str(lane)
    if km < 100:
        return road + "-00" + str(km)[-3:]  + "-" + str(lane)
    if km < 1000:
        return road + "-0" + str(km)[-3:]  + "-" + str(lane)
    return road + "-" + str(km)[:-3] + "" + str(km)[-3:] + "-" + str(lane)

df = df_raw.withColumn('Sensor_ID', generate_sensor_id('Road', 'Km_Ref', 'Detector_Number'))

df.show(1)

+-------------------+----------+---+---------+----+------+----+------+---------------+-----------------+-------+-------------+----------------+------------------+------+------------+-----------+--------------+----------------+-----------+
|          Timestamp|      Date|Day|DayOfWeek|Hour|Minute|Road|Km_Ref|Detector_Number|Traffic_Direction|Flow_In|Average_Speed|         Density|Sign_Aid_Det_Comms|Status|Legend_Group|Legend_Sign|Legend_SubSign|Protocol_Version|  Sensor_ID|
+-------------------+----------+---+---------+----+------+----+------+---------------+-----------------+-------+-------------+----------------+------------------+------+------------+-----------+--------------+----------------+-----------+
|2016-11-11 07:19:00|2016-11-11| 11|        5|   6|    19| E4Z| 71400|              1|               78|     14|           93|9.03225806451613|                 0|     3|         255|          1|             1|               4|E4Z-71400-1|
+-------------------+----------+---+--------

# Calculate Density

In [9]:
# Flow = Speed * Density (see chapter 3 in An Introduction to Traffic Flow Theory)
# veh/km = (veh/min) / (km/min)
density_udf = udf(lambda speed, flow: flow / (speed / 60), DoubleType())
df_raw = df_raw.withColumn('Density', avg_speed_fix_udf('Average_Speed', 'Flow_In'))

# Extract sensor density values for each timestamp

In [10]:
# Group rows by Timestamp 
# Each column is a sensor
# Take density value
# Order rows by Timestamp

df = df.groupBy(col('Timestamp')) \
    .pivot('Sensor_ID') \
    .agg(first("Density")) \
    .sort(col('Timestamp')) 

# Inspect data

In [11]:
# Check number of sensors
print("Number of sensors: ", len(df.columns))

('Number of sensors: ', 1942)

In [12]:
print(df.columns)

['Timestamp', 'E182N-0005-1', 'E182N-0190-1', 'E182N-0300-1', 'E182N-0410-1', 'E182N-0520-1', 'E182N-0630-1', 'E182N-0740-1', 'E182N-0830-1', 'E182N-0830-2', 'E182N-0960-1', 'E182N-0960-2', 'E182N-1080-1', 'E182N-1080-2', 'E182N-1325-1', 'E182N-1325-2', 'E182N-1580-1', 'E182N-1580-2', 'E182N-1810-1', 'E182N-1810-2', 'E182N-1810-3', 'E182N-1810-4', 'E182N-2015-1', 'E182N-2015-2', 'E182N-2015-3', 'E182N-2015-4', 'E182N-2325-1', 'E182N-2325-2', 'E182N-2325-3', 'E182N-2690-1', 'E182N-2690-2', 'E182N-2690-3', 'E182N-2980-1', 'E182N-2980-2', 'E182N-2980-3', 'E182N-3285-1', 'E182N-3285-2', 'E182N-3285-3', 'E182N-3615-1', 'E182N-3615-2', 'E182N-3615-3', 'E182N-3805-1', 'E182N-3805-2', 'E182Z-0280-1', 'E182Z-0390-1', 'E182Z-0500-1', 'E182Z-0610-1', 'E182Z-0720-1', 'E182Z-0830-1', 'E182Z-0960-1', 'E182Z-1150-1', 'E182Z-1150-2', 'E182Z-1325-1', 'E182Z-1325-2', 'E182Z-1620-1', 'E182Z-1620-2', 'E182Z-1805-1', 'E182Z-1805-2', 'E182Z-1805-3', 'E182Z-2060-1', 'E182Z-2060-2', 'E182Z-2060-3', 'E182Z-229

In [13]:
# Verify timestamp order is correct
df.select(col('Timestamp')).show(10)

+-------------------+
|          Timestamp|
+-------------------+
|2016-11-01 01:00:00|
|2016-11-01 01:01:00|
|2016-11-01 01:02:00|
|2016-11-01 01:03:00|
|2016-11-01 01:04:00|
|2016-11-01 01:05:00|
|2016-11-01 01:06:00|
|2016-11-01 01:07:00|
|2016-11-01 01:08:00|
|2016-11-01 01:09:00|
+-------------------+
only showing top 10 rows

In [14]:
# Find days with less datapoints than it should have
df.groupBy(dayofmonth('Timestamp').alias('day')).count() \
    .where(col('count') != 1440).show()

+---+-----+
|day|count|
+---+-----+
|  1| 1380|
| 30|   61|
+---+-----+

In [15]:
# Find hours for each day with less datapoints than it should have
df.groupBy(dayofmonth('Timestamp').alias('day'), hour('Timestamp').alias('hour')).count() \
    .orderBy('day', 'hour').where(col('count') != 60).show()

+---+----+-----+
|day|hour|count|
+---+----+-----+
| 30|   1|    1|
+---+----+-----+

In [16]:
# Examine day 1
df.select(
    dayofmonth('Timestamp').alias('day'), 
    hour('Timestamp').alias('hour')
).where(col('day') == 1).groupBy('hour').count().orderBy(col('hour')).show(40)

+----+-----+
|hour|count|
+----+-----+
|   1|   60|
|   2|   60|
|   3|   60|
|   4|   60|
|   5|   60|
|   6|   60|
|   7|   60|
|   8|   60|
|   9|   60|
|  10|   60|
|  11|   60|
|  12|   60|
|  13|   60|
|  14|   60|
|  15|   60|
|  16|   60|
|  17|   60|
|  18|   60|
|  19|   60|
|  20|   60|
|  21|   60|
|  22|   60|
|  23|   60|
+----+-----+

In [17]:
# Examine day 30
df.select(
    dayofmonth('Timestamp').alias('day'), 
    hour('Timestamp').alias('hour')
).where(col('day') == 30).groupBy('hour').count().show()

+----+-----+
|hour|count|
+----+-----+
|   1|    1|
|   0|   60|
+----+-----+

In [18]:
# Check if there is a row with no null values
df.select([first(x, ignorenulls=True).alias(x) for x in df.columns]).first()

Row(Timestamp=datetime.datetime(2016, 11, 2, 4, 51), E182N-0005-1=0.6451612903225806, E182N-0190-1=0.6185567010309279, E182N-0300-1=0.7317073170731707, E182N-0410-1=0.7228915662650602, E182N-0520-1=0.8571428571428571, E182N-0630-1=0.6451612903225806, E182N-0740-1=0.6741573033707865, E182N-0830-1=0.594059405940594, E182N-0830-2=0.6122448979591837, E182N-0960-1=1.2765957446808511, E182N-0960-2=0.6060606060606061, E182N-1080-1=0.5769230769230769, E182N-1080-2=1.0526315789473684, E182N-1325-1=0.8108108108108109, E182N-1325-2=0.5309734513274337, E182N-1580-1=0.6741573033707865, E182N-1580-2=1.518987341772152, E182N-1810-1=0.7142857142857143, E182N-1810-2=2.3376623376623376, E182N-1810-3=0.8, E182N-1810-4=1.1111111111111112, E182N-2015-1=0.6521739130434783, E182N-2015-2=0.7317073170731707, E182N-2015-3=0.7692307692307693, E182N-2015-4=0.8823529411764706, E182N-2325-1=0.5607476635514018, E182N-2325-2=0.631578947368421, E182N-2325-3=0.8333333333333334, E182N-2690-1=1.263157894736842, E182N-269

# Remove days with missing data

In [19]:
df = df.where(dayofmonth('Timestamp') != 1)
df = df.where(dayofmonth('Timestamp') != 30)

# Fix Null Values

In [20]:
# Replace null values with 0 as sensors usually give error when no cars are on the road 
df = df.na.fill(0)

# Export Data

In [21]:
# Export as Parquet
df.write.mode('overwrite').parquet("hdfs:///Projects/traffic_reginbald/processed_traffic_data/" + str(year) + "-" + str(month) + "_all-sensors-timeseries-parquet")