# Preprocessing dataset for single-sensor supervised learning
*Remember to add tensorflow-hadoop-1.0-SNAPSHOT.jar to the hops cluster setup*

*Run all-sensors-preprocessing.ipynb before this notebook*

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import *

import datetime

import tensorflow as tf
import numpy as np

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
392,application_1529568156751_0204,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Display the Spark version of the attached session (sanity check of the runtime).
spark.version

u'2.3.0'

## Parameters

In [3]:
# Year and month of the dataset to process. Both are interpolated with str()
# into the HDFS input and output paths used below (the month is not zero-padded).
year = 2016
month = 11

## Import Data

In [4]:
# Glob over the Parquet part files of the all-sensors time series for the
# selected year/month (presumably written by all-sensors-preprocessing.ipynb).
file_path = "".join([
    "hdfs:///Projects/traffic_reginbald/processed_traffic_data/",
    str(year), "-", str(month),
    "_all-sensors-timeseries-parquet/*.parquet",
])

# Load the data, then sort it chronologically.
df = spark.read.parquet(file_path)
df = df.orderBy('Timestamp')

## Transform dataset for supervised machine learning

In [5]:
def generate_timeseries_data(df, column, n_past=9, n_future=30):
    """Turn one sensor column into a sliding-window supervised-learning table.

    Rows are ordered by 'Timestamp'. Every output row contains ``n_past``
    past readings ("t-<n_past>" ... "t-1"), the current reading ("t") and
    ``n_future`` future readings ("t+1" ... "t+<n_future>"), taken from
    consecutive rows of ``column``. The row's 'Timestamp' is the timestamp of
    its oldest reading (the "t-<n_past>" column), because that column is the
    row's own, unshifted value.

    Args:
        df: Spark DataFrame with a 'Timestamp' column and ``column``.
        column: name of the sensor column to expand.
        n_past: number of readings before "t" to include (default 9).
        n_future: number of readings after "t" to include (default 30).

    Returns:
        DataFrame with the columns 'Timestamp', "t-<n_past>", ..., "t-1",
        "t", "t+1", ..., "t+<n_future>". Rows whose furthest-future column
        is null are dropped; this removes the trailing rows that do not have
        a complete window ahead of them.

    Raises:
        ValueError: if ``n_past`` or ``n_future`` is negative.
    """
    if n_past < 0 or n_future < 0:
        raise ValueError("n_past and n_future must be non-negative")

    def step_name(step):
        # Step relative to the current reading: -2 -> "t-2", 0 -> "t", 3 -> "t+3".
        if step == 0:
            return "t"
        return "t" + ("+" if step > 0 else "-") + str(abs(step))

    # The unshifted reading of each row becomes the oldest step of its window.
    oldest = step_name(-n_past)
    series = df.select('Timestamp', column)
    series = series.withColumnRenamed(series.columns[1], oldest)

    # NOTE: a window without partitionBy makes Spark evaluate the whole series
    # in a single partition; acceptable for one sensor column at a time.
    window = Window.orderBy(series['Timestamp'])

    # A reading `rows_ahead` rows after the current row is step
    # `rows_ahead - n_past`, so "t" sits exactly n_past rows ahead and "t+1"
    # one row further. (Previously "t" was read 10 rows ahead, which made it a
    # duplicate of "t+1" and left the reading 9 rows ahead out of the output.)
    # All shifted columns are built in one projection instead of a long chain
    # of withColumn calls to keep the query plan small.
    shifted = [
        lead(oldest, rows_ahead).over(window).alias(step_name(rows_ahead - n_past))
        for rows_ahead in range(1, n_past + n_future + 1)
    ]
    supervised_df = series.select('Timestamp', oldest, *shifted)

    # Keep only rows that have the complete look-ahead window.
    return supervised_df.where(col(step_name(n_future)).isNotNull())

## Export Data

In [6]:
# Export as Parquet
def export_data(df, sensor_name):
    """Write ``df`` as Parquet into the per-sensor supervised dataset folder.

    The target directory is derived from the notebook-level ``year`` and
    ``month`` values plus ``sensor_name``; anything already stored at that
    location is overwritten.

    Args:
        df: Spark DataFrame to persist.
        sensor_name: name of the sensor, used as the last path component.
    """
    dataset_dir = str(year) + "-" + str(month) + \
        "_single-sensor-30-min-supervised-parquet/"
    target = "hdfs:///Projects/traffic_reginbald/processed_traffic_data/" + \
        dataset_dir + sensor_name

    writer = df.write.mode('overwrite')
    writer.parquet(target)

## Generate supervised data for all sensors

In [7]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Every slice has length ``n`` except possibly the last one, which holds
    the remaining items. An empty ``l`` produces no slices.
    """
    total = len(l)
    for start in range(0, total, n):
        stop = start + n
        yield l[start:stop]

# Group the sensor columns into chunks of 150 so the export can be run one
# chunk at a time. The first column is skipped (presumably 'Timestamp' -
# confirm against the input schema).
columns = list(chunks(df.columns[1:], 150))

# Show how many chunks there are and which sensors each one contains.
print(len(columns))
print(columns)

13
[['E182N-0005-1', 'E182N-0190-1', 'E182N-0300-1', 'E182N-0410-1', 'E182N-0520-1', 'E182N-0630-1', 'E182N-0740-1', 'E182N-0830-1', 'E182N-0830-2', 'E182N-0960-1', 'E182N-0960-2', 'E182N-1080-1', 'E182N-1080-2', 'E182N-1325-1', 'E182N-1325-2', 'E182N-1580-1', 'E182N-1580-2', 'E182N-1810-1', 'E182N-1810-2', 'E182N-1810-3', 'E182N-1810-4', 'E182N-2015-1', 'E182N-2015-2', 'E182N-2015-3', 'E182N-2015-4', 'E182N-2325-1', 'E182N-2325-2', 'E182N-2325-3', 'E182N-2690-1', 'E182N-2690-2', 'E182N-2690-3', 'E182N-2980-1', 'E182N-2980-2', 'E182N-2980-3', 'E182N-3285-1', 'E182N-3285-2', 'E182N-3285-3', 'E182N-3615-1', 'E182N-3615-2', 'E182N-3615-3', 'E182N-3805-1', 'E182N-3805-2', 'E182Z-0280-1', 'E182Z-0390-1', 'E182Z-0500-1', 'E182Z-0610-1', 'E182Z-0720-1', 'E182Z-0830-1', 'E182Z-0960-1', 'E182Z-1150-1', 'E182Z-1150-2', 'E182Z-1325-1', 'E182Z-1325-2', 'E182Z-1620-1', 'E182Z-1620-2', 'E182Z-1805-1', 'E182Z-1805-2', 'E182Z-1805-3', 'E182Z-2060-1', 'E182Z-2060-2', 'E182Z-2060-3', 'E182Z-2295-1', 'E1

In [8]:
# Build and export the supervised dataset for every sensor, chunk by chunk.
# Iterating over `columns` itself (instead of one hand-written cell per chunk
# index) keeps this correct when the number of sensors - and therefore the
# number of chunks - changes: no chunk is silently skipped and no index can
# run past the end of the list.
# NOTE(review): the original ran each chunk in a separate cell; if that was
# done to keep individual statements short on the cluster, slice `columns`
# here (e.g. columns[3:6]) to process a subset per run.
for chunk_number, sensor_chunk in enumerate(columns, 1):
    for column in sensor_chunk:
        supervised_df = generate_timeseries_data(df, column)
        export_data(supervised_df, column)
    # Progress marker so a long run can be followed and resumed if interrupted.
    print("Finished chunk %d of %d" % (chunk_number, len(columns)))

### Verify

In [9]:
# Read back the exported dataset of the last sensor column as a spot check.
file_path = "".join([
    "hdfs:///Projects/traffic_reginbald/processed_traffic_data/",
    str(year), "-", str(month),
    "_single-sensor-30-min-supervised-parquet/",
    df.columns[-1],
])

sensor_df = spark.read.parquet(file_path)
sensor_df = sensor_df.orderBy('Timestamp')

In [10]:
# Preview the first rows to check the column layout ('Timestamp', past steps, "t", future steps).
sensor_df.show()

+-------------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|          Timestamp| t-9| t-8| t-7| t-6| t-5| t-4| t-3| t-2| t-1|   t| t+1| t+2| t+3| t+4| t+5| t+6| t+7| t+8| t+9|t+10|t+11|t+12|t+13|t+14|             t+15|             t+16|             t+17|             t+18|             t+19|             t+20|             t+21|             t+22|             t+23|             t+24|             t+25|             t+26|             t+27|             t+28|             t+29|             t+30|
+-------------------+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----

In [11]:
# Compare row counts. The supervised set is expected to be shorter than the
# source: the furthest-future column ("t+30") is read 39 rows ahead, so the
# last 39 rows have no complete window and are filtered out on export.
print("Original dataset number of rows:", df.count(), "Supervised dataset number of rows:", sensor_df.count())

('Original dataset number of rows:', 40320, 'Supervised dataset number of rows:', 40281)