<a href="https://colab.research.google.com/github/ppiont/tensor-flow-state/blob/master/data_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only setup: mount Google Drive at /gdrive; the notebook changes into a
# project directory under this mount point in the next cell.
from google.colab import drive
drive.mount('/gdrive', force_remount = True)

Mounted at /gdrive


In [2]:
# Switch the working directory to the project folder on the mounted Drive.
# The relative "data/..." paths used later in this notebook resolve against it.
# NOTE(review): this is a hardcoded absolute path; adjust it when running elsewhere.
%cd '/gdrive/My Drive/tensor-flow-state/tensor-flow-state'

/gdrive/My Drive/tensor-flow-state/tensor-flow-state


In [0]:
# File names of the per-sensor CSV files handed to reduce_cols().
sensor_list = [
    'RWS01_MONIBAS_0021hrl0403ra.csv',
    'RWS01_MONIBAS_0021hrl0409ra.csv',
    'RWS01_MONIBAS_0021hrl0414ra.csv',
    'RWS01_MONIBAS_0021hrl0420ra.csv',
    'RWS01_MONIBAS_0021hrl0426ra.csv',
]

In [0]:
import datetime
def dateparse (time_in_secs):
    """Turn a Unix/Epoch timestamp into a ``datetime.datetime``.

    Parameters
    ----------
    time_in_secs : str, int or float
        Seconds since the Unix epoch; anything accepted by ``float()``.

    Returns
    -------
    datetime.datetime
        Naive datetime. ``fromtimestamp`` is called without a ``tz`` argument,
        so the value is expressed in the local timezone of the machine running
        the notebook.

    Raises
    ------
    ValueError, TypeError
        If ``time_in_secs`` cannot be converted by ``float()``.
    """
    epoch_seconds = float(time_in_secs)
    return datetime.datetime.fromtimestamp(epoch_seconds)

import pandas as pd
def repair_datetime_index(df, freq = 'min'):
    """Return ``df`` on a gap-free, duplicate-free datetime index.

    Rows whose index value repeats are dropped (the first occurrence wins),
    then the frame is reindexed onto a regular ``pd.date_range`` running from
    the smallest to the largest index value. Timestamps that were missing in
    the input appear as all-NaN rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by datetimes. It is not modified.
    freq : str, default 'min'
        Frequency passed to ``pd.date_range``. The default is one minute;
        ``'min'`` is used instead of the alias ``'T'`` because ``'T'`` is
        deprecated in recent pandas releases. Callers may still pass any
        alias their pandas version accepts.

    Returns
    -------
    pandas.DataFrame
        New frame on the regular index. An input without rows is returned
        as an (empty) frame instead of failing on NaT range bounds.
    """
    deduped = df.loc[~df.index.duplicated(keep='first')]
    if len(deduped.index) == 0:
        # min()/max() of an empty index are NaT, which pd.date_range rejects.
        return deduped
    full_index = pd.date_range(start = deduped.index.min(), end = deduped.index.max(), freq = freq)
    return deduped.reindex(full_index)

import numpy as np
def convert_invalid_to_nans(df):
    """Replace the coded "invalid" readings in ``speed`` and ``flow``.

    The rules below follow the value conventions described by the original
    author of this notebook:

    * ``flow < 0``        -> NaN  (negative flow marks missing data)
    * ``speed < -1``      -> NaN  (missing data, plus odd lane averages such as -1.33)
    * ``speed == -1``     -> -99  (sentinel for "no cars")
    * ``-1 < speed < 0``  -> 0    (mix of lanes with stationary cars and lanes with no cars)

    All masks are computed from the incoming values before anything is
    written, so the result does not depend on the order of the assignments.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``speed`` and ``flow`` columns. It is left untouched;
        the cleaning is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``.
    """
    no_cars_sentinel = -99

    cleaned = df.copy()
    speed = cleaned['speed']

    # Classify every speed reading once, based on the original values.
    speed_missing = speed < -1
    speed_no_cars = speed == -1
    speed_partial = (speed > -1) & (speed < 0)
    flow_missing = cleaned['flow'] < 0

    cleaned.loc[flow_missing, 'flow'] = np.nan
    cleaned.loc[speed_missing, 'speed'] = np.nan
    cleaned.loc[speed_no_cars, 'speed'] = no_cars_sentinel
    cleaned.loc[speed_partial, 'speed'] = 0
    return cleaned

import os
def reduce_cols(sensors, path_in = "data/ndw_raw/", path_out = "data/"):
    """Reduce each raw sensor CSV to a cleaned speed/flow time series.

    For every file name in ``sensors`` the raw, header-less CSV is read from
    ``path_in`` keeping only columns 0, 86 and 87 (named ``timestamp``,
    ``speed`` and ``flow``). The result is indexed by the parsed timestamp,
    ``flow`` is divided by 60, the index is repaired with
    ``repair_datetime_index``, invalid readings are recoded with
    ``convert_invalid_to_nans``, and the frame is written to ``path_out``
    under the same file name.

    Parameters
    ----------
    sensors : iterable of str
        CSV file names, relative to ``path_in``.
    path_in : str
        Directory containing the raw files.
    path_out : str
        Directory the reduced files are written to. It is created if it does
        not exist. A trailing slash is optional.

    Returns
    -------
    None
        The function only writes files.
    """
    if path_out:
        # to_csv does not create missing directories.
        os.makedirs(path_out, exist_ok = True)

    for sensor in sensors:
        # NOTE(review): `date_parser` is deprecated in pandas >= 2.0. It is kept
        # because dateparse() yields local-time datetimes and a vectorised
        # replacement would need an explicit timezone decision.
        df = pd.read_csv(os.path.join(path_in, sensor), header = None,
                         usecols = [0, 86, 87], names = ['timestamp', 'speed', 'flow'],
                         index_col = 'timestamp', parse_dates = True, date_parser = dateparse)
        # Scale flow by 1/60 (presumably per-hour -> per-minute; confirm against the data spec).
        df.flow /= 60
        df = repair_datetime_index(df)
        df = convert_invalid_to_nans(df)
        # Join with os.path.join (as for the input path) so that a path_out
        # without a trailing separator still lands inside the directory.
        df.to_csv(os.path.join(path_out, sensor))


In [0]:
# Process every sensor file with the default input/output directories
# ("data/ndw_raw/" -> "data/"). This reads and rewrites each full CSV.
reduce_cols(sensor_list)

In [0]:
# Load one of the reduced files back for a sanity check, using the first column
# as the index. parse_dates is not passed, so the index is not converted to datetimes here.
test = pd.read_csv('data/RWS01_MONIBAS_0021hrl0403ra.csv', index_col = 0)


In [0]:
# Render floats with two decimals. This sets a pandas display option, so it
# also applies to the output of the cells that follow.
pd.options.display.float_format = '{:.2f}'.format
# First five rows of the reduced file.
test.head()

Unnamed: 0,speed,flow
2011-01-03 00:00:00,100.0,8.0
2011-01-03 00:01:00,115.0,18.0
2011-01-03 00:02:00,112.67,14.0
2011-01-03 00:03:00,118.0,14.0
2011-01-03 00:04:00,105.5,21.0


In [0]:
# Per-column summary statistics.
# NOTE: convert_invalid_to_nans() stores -99 in `speed` as a "no cars" sentinel;
# describe() treats it as an ordinary number, so it affects min/mean/std.
# NOTE(review): the stored output also shows a minimum of -99 for `flow`, which the
# current convert_invalid_to_nans() does not write (negative flow becomes NaN) -
# the saved output may predate the current code; re-run to confirm.
test.describe()

Unnamed: 0,speed,flow
count,4286397.0,4286342.0
mean,102.84,60.55
std,17.08,45.68
min,-99.0,-99.0
25%,98.4,18.0
50%,101.0,59.0
75%,111.6,92.0
max,205.0,269.0


In [10]:
# First five rows whose flow exceeds 30.
test.loc[test.flow > 30].head()

Unnamed: 0,speed,flow
2011-01-03 00:12:00,106.5,32.0
2011-01-03 00:23:00,125.5,36.0
2011-01-03 00:34:00,105.25,36.0
2011-01-03 00:35:00,116.25,32.0
2011-01-03 00:36:00,112.75,48.0
