<a href="https://colab.research.google.com/github/ppiont/tensor-flow-state/blob/master/data_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab-only setup: mount Google Drive at /gdrive so the project files stored
# there are reachable from this runtime. force_remount is passed as True so the
# cell can be re-run in a session where the drive was already mounted.
from google.colab import drive
drive.mount('/gdrive', force_remount = True)

In [2]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative 'data/...' paths used further down are resolved against it.
%cd '/gdrive/My Drive/tensor-flow-state/tensor-flow-state'

/gdrive/My Drive/tensor-flow-state/tensor-flow-state


In [0]:
# Raw per-sensor CSV file names to process, one entry per sensor.
sensor_list = [
    'RWS01_MONIBAS_0021hrl0403ra.csv',
    'RWS01_MONIBAS_0021hrl0409ra.csv',
    'RWS01_MONIBAS_0021hrl0414ra.csv',
    'RWS01_MONIBAS_0021hrl0420ra.csv',
    'RWS01_MONIBAS_0021hrl0426ra.csv',
]

In [0]:
import datetime
def dateparse(time_in_secs):
    """Convert a Unix/epoch timestamp in seconds to a naive ``datetime``.

    ``time_in_secs`` may be anything ``float()`` accepts (a number or a numeric
    string). No time zone is passed to ``fromtimestamp``, so the result is
    expressed in the local time zone of the machine running the notebook.
    """
    epoch_seconds = float(time_in_secs)
    return datetime.datetime.fromtimestamp(epoch_seconds)

import pandas as pd
def repair_datetime_index(df, freq = 'min'):
    """Return ``df`` on a regular datetime index without duplicates or gaps.

    Parameters
    ----------
    df : DataFrame indexed by timestamps.
    freq : pandas offset alias for the spacing of the repaired index.
        Defaults to one minute ('min', the non-deprecated spelling of 'T').

    Returns
    -------
    A new DataFrame. Rows whose timestamp repeats an earlier one are dropped
    (the first occurrence is kept), and the frame is reindexed onto the range
    from the earliest to the latest timestamp at ``freq``. Timestamps that
    were missing become all-NaN rows; rows that do not fall on that grid are
    not carried over.
    """
    # Nothing to repair; min()/max() of an empty index are NaT and would make
    # pd.date_range raise.
    if len(df.index) == 0:
        return df.copy()

    deduped = df.loc[~df.index.duplicated(keep = 'first')] # drop repeated timestamps
    full_range = pd.date_range(start = deduped.index.min(),
                               end = deduped.index.max(),
                               freq = freq)
    return deduped.reindex(full_range) # insert the missing timestamps as NaN rows

import numpy as np
def fix_values(df):
    """Clean the sentinel and out-of-range values in ``speed`` and ``flow``.

    ``df`` must have a datetime index and ``speed`` and ``flow`` columns. It is
    modified in place (a ``speed_limit`` column is added, ``speed`` and
    ``flow`` are overwritten) and also returned.

    Rules, applied in this order (the order matters):
      1. ``speed_limit`` is 100 for hours 06:00-18:59 and 120 otherwise.
      2. Negative ``flow`` (-2 marks missing data) becomes NaN.
      3. ``speed`` below -1 (-2 = missing, plus oddities such as -1.33)
         becomes NaN.
      4. ``speed`` equal to -1 (no cars) becomes the speed limit.
      5. ``speed`` strictly between -1 and 0 (some lanes with non-moving cars,
         others with no cars) becomes 0.
      6. ``speed`` above the speed limit is capped at the limit.

    Columns are reassigned explicitly rather than mutated through
    ``df.speed.mask(..., inplace=True)``: that chained in-place form does not
    write back to ``df`` under pandas Copy-on-Write, which would silently skip
    steps 4 and 6.
    """
    hours = df.index.hour
    df['speed_limit'] = np.where((hours >= 6) & (hours < 19), 100, 120)

    # flow is either -2 (missing data) or >= 0
    df['flow'] = df['flow'].mask(df['flow'] < 0)

    limit = df['speed_limit']
    speed = df['speed']
    speed = speed.mask(speed < -1)                    # missing data / oddities -> NaN
    speed = speed.mask(speed == -1, limit)            # no cars -> free flow at the limit
    speed = speed.mask((speed < 0) & (speed > -1), 0) # mix of stationary and empty lanes
    speed = speed.mask(speed > limit, limit)          # higher speeds add no information
    df['speed'] = speed
    return df

import os
def reduce_cols(sensors, path_in = "data/ndw_raw/", path_out = "data/"):
    """Reduce each raw sensor CSV to cleaned speed/flow columns and save it.

    Parameters
    ----------
    sensors : iterable of CSV file names located in ``path_in``.
    path_in : directory holding the raw, header-less CSV files.
    path_out : directory the reduced files are written to, under the same
        file names. It is created if it does not exist.

    For every file, columns 0, 86 and 87 are read as ``timestamp``, ``speed``
    and ``flow``. The epoch timestamps are converted with ``dateparse``, flow
    is divided by 60, the index is made regular with ``repair_datetime_index``
    and the values are cleaned with ``fix_values``. Returns ``None``.
    """
    if path_out:
        os.makedirs(path_out, exist_ok = True)

    for sensor in sensors:
        df = pd.read_csv(os.path.join(path_in, sensor), header = None,
                         usecols = [0, 86, 87], names = ['timestamp', 'speed', 'flow'],
                         index_col = 'timestamp')
        # Convert epoch seconds after reading; read_csv's date_parser argument
        # is deprecated. dateparse is still used so the result is unchanged.
        df.index = pd.DatetimeIndex([dateparse(ts) for ts in df.index], name = df.index.name)
        df['flow'] = df['flow'] / 60 # change flow unit to min^-1
        df = repair_datetime_index(df)
        df = fix_values(df)
        # Join like the input path so path_out works without a trailing slash.
        df.to_csv(os.path.join(path_out, sensor))


In [0]:
# Process every sensor file with the default input/output directories
# ("data/ndw_raw/" -> "data/").
reduce_cols(sensor_list)

In [0]:
# Read one of the reduced files back for a sanity check; the first column holds
# the timestamps written as the index by reduce_cols.
test = pd.read_csv('data/RWS01_MONIBAS_0021hrl0403ra.csv', index_col = 0, parse_dates = True)


In [7]:
# Show floats with two decimals in every DataFrame rendered from here on
# (this is a global pandas display option), then preview the first rows.
pd.options.display.float_format = '{:.2f}'.format
test.head()

Unnamed: 0,speed,flow,speed_limit
2011-01-03 00:00:00,100.0,8.0,120
2011-01-03 00:01:00,115.0,18.0,120
2011-01-03 00:02:00,112.67,14.0,120
2011-01-03 00:03:00,118.0,14.0,120
2011-01-03 00:04:00,105.5,21.0,120


In [8]:
# Summary statistics; 'count' excludes NaN, so it shows how many rows have a
# usable speed/flow value compared with the total number of rows.
test.describe()

Unnamed: 0,speed,flow,speed_limit
count,4287209.0,4288108.0,4727520.0
mean,101.35,60.86,109.17
std,12.01,44.92,9.97
min,0.0,0.0,100.0
25%,98.4,18.0,100.0
50%,100.0,59.0,100.0
75%,105.0,92.0,120.0
max,120.0,269.0,120.0
