<a href="https://colab.research.google.com/github/ppiont/tensor-flow-state/blob/master/3_data_impute.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /gdrive (requires the google.colab package).
from google.colab import drive
drive.mount("/gdrive", force_remount = True)

Mounted at /gdrive


In [0]:
%cd "/gdrive/My Drive/tensor-flow-state/tensor-flow-state"

/gdrive/My Drive/tensor-flow-state/tensor-flow-state


In [0]:
import pandas as pd
# Load the combined dataset; the first CSV column becomes the index and is parsed as dates.
df = pd.read_csv("data/combined_df.csv", index_col = 0, parse_dates = True)

In [0]:
# Every column handled by the week-based imputation steps further down.
cols = [
    "speed",
    "flow",
    "speed_-2",
    "speed_-1",
    "speed_+1",
    "speed_+2",
    "flow_-2",
    "flow_-1",
    "flow_+1",
    "flow_+2",
    "speed_limit",
]

# Speed columns: these are row-averaged together as one group.
speed_cols = [
    "speed",
    "speed_-2",
    "speed_-1",
    "speed_+1",
    "speed_+2",
]

# Flow columns: these are row-averaged together as one group.
flow_cols = [
    "flow",
    "flow_-2",
    "flow_-1",
    "flow_+1",
    "flow_+2",
]

In [0]:
def nans(df, title = ""):
    """Print a short report of how many rows of ``df`` contain a null.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. A row counts as a "null record" when at least
        one of its columns is null.
    title : str, optional
        Heading printed on the first line of the report.

    Returns
    -------
    None
        The report is written to stdout.
    """
    records = len(df)
    # Count flagged rows straight from the boolean mask instead of
    # materialising a filtered copy of the frame (twice).
    null_records = int(df.isnull().any(axis=1).sum())
    # An empty frame has no nulls; report 0 rather than dividing by zero.
    null_ratio = null_records / records if records else 0.0

    print(f"\
    {title}\n\
    ---------------------------------\n\
    Number of records:      {records:,}\n\
    Number of null records:   {null_records:,}\n\
    ---------------------------------\n\
    Null ratio:                 {null_ratio:.3f}\
    ")

In [0]:
# Baseline: how many rows contain at least one null before any imputation.
nans(df, title = "Status before imputation")

    Status before imputation
    ---------------------------------
    Number of records:      4,727,520
    Number of null records:   448,433
    ---------------------------------
    Null ratio:                 0.095    


In [0]:
def fill_na_row_mean(df):
    """Return a copy of ``df`` where each null is replaced by its row's mean.

    The row mean is taken over the non-null values of that row. A row that
    is entirely null has no mean and therefore stays null. The input frame
    is not modified.
    """
    # Boolean grid marking the cells that need a replacement value.
    missing_mask = df.isna().to_numpy()
    # One mean per row, shaped as a column so it broadcasts across columns.
    row_means = df.mean(axis=1).to_numpy().reshape(-1, 1)
    zero_filled = df.fillna(0)
    # Nulls were zeroed above, so adding the masked means fills only those cells.
    return zero_filled + missing_mask * row_means

In [0]:
# Impute speed and flow separately so a null is filled from the row mean of its own column group only.
speed_df = fill_na_row_mean(df[speed_cols])
flow_df = fill_na_row_mean(df[flow_cols])

In [0]:
# Reassemble on the shared index: imputed speed columns, imputed flow columns, then speed_limit taken as-is from df.
df = speed_df.join(flow_df, how = 'inner').join(df[['speed_limit']], how = 'inner')

In [0]:
# Re-check the null count after the row-mean pass; rows that are still null are handled further down.
nans(df, title = "Status after row meaning nans")

    Status after row meaning nans
    ---------------------------------
    Number of records:      4,727,520
    Number of null records:   289,265
    ---------------------------------
    Null ratio:                 0.061    


In [0]:
# Save the intermediate result of the row-mean pass.
df.to_csv("data/df_NAs_row_meaned.csv")

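Now fill the remaining nulls by week shifting: first interpolate the gaps inside the first week of data, then copy each value that is still missing from the row exactly one week (7 × 24 × 60 rows) earlier.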

In [0]:
# Interpolate null vals for the first week of data of speed and flow cols
def interpolate_week(df, cols, week = 7 * 24 * 60):
    """Time-interpolate nulls in the first ``week`` rows of the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; modified in place. ``method='time'`` interpolation
        needs a DatetimeIndex.
    cols : iterable of str
        Names of the columns to interpolate.
    week : int, optional
        Number of leading rows to treat as the first week. The default,
        ``7 * 24 * 60``, is one week at one row per minute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    for col in cols:
        col_index = df.columns.get_loc(col)
        first_week = df.iloc[:week, col_index]
        # Write back as a plain array so the assignment is purely positional
        # and does not depend on index alignment.
        df.iloc[:week, col_index] = first_week.interpolate(method = 'time').to_numpy()
    return df

import numpy as np
# Replace remaining nulls with value from 1 week previous
def shift_week(df, cols, week = 7 * 24 * 60):
    """Fill nulls with the value found ``week`` rows earlier in the same column.

    Weeks are processed in order, so a value filled in one week can itself
    be the source for the following week; a gap longer than one week is
    therefore filled from the most recent week that has a value.

    Rows in the first ``week`` positions have no earlier row to copy from
    and are left unchanged (a negative position is never used, so nothing
    is pulled from the end of the frame).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill; modified in place. Its index is left untouched.
    cols : iterable of str
        Names of the columns to fill.
    week : int, optional
        Row offset that corresponds to one week. The default,
        ``7 * 24 * 60``, is one week at one row per minute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.

    Raises
    ------
    ValueError
        If ``week`` is not a positive integer offset.
    """
    if week <= 0:
        raise ValueError("week must be a positive number of rows")

    n_rows = len(df)
    for col in cols:
        values = df[col].to_numpy(copy = True)
        # Walk the column one week at a time; each block is filled from the
        # (already processed) block directly before it.
        for start in range(week, n_rows, week):
            current = values[start:start + week]
            previous = values[start - week:start - week + len(current)]
            missing = pd.isna(current)
            # `current` is a view, so this writes straight into `values`.
            current[missing] = previous[missing]
        df[col] = values
    return df

In [0]:
# Step 1: time-interpolate gaps within the first week. Step 2: fill remaining nulls from one week earlier.
# Both steps run over every name in `cols`, which includes speed_limit.
df = interpolate_week(df, cols)
df = shift_week(df, cols)

In [0]:
# Final check: report the number of rows that still contain a null after week shifting.
nans(df, title = "Status after week shifting")

    Status after week shifting
    ---------------------------------
    Number of records:      4,727,520
    Number of null records:   0
    ---------------------------------
    Null ratio:                 0.000    


In [0]:
# Preview the first rows of the imputed frame.
df.head()

Unnamed: 0,speed,speed_-2,speed_-1,speed_+1,speed_+2,flow,flow_-2,flow_-1,flow_+1,flow_+2,speed_limit
2011-01-03 00:00:00,120.0,100.0,120.0,118.5,120.0,8.0,8.0,6.0,4.0,6.0,120.0
2011-01-03 00:01:00,120.0,115.0,120.0,118.0,104.0,5.0,18.0,6.0,5.0,2.0,120.0
2011-01-03 00:02:00,111.0,112.666667,104.0,91.0,112.666667,2.0,14.0,2.0,4.0,8.0,120.0
2011-01-03 00:03:00,105.333333,118.0,108.666667,120.0,120.0,10.0,14.0,6.0,9.0,10.0,120.0
2011-01-03 00:04:00,120.0,105.5,120.0,120.0,117.5,8.0,21.0,13.0,7.0,5.0,120.0


In [0]:
# Save the fully imputed frame.
df.to_csv("data/df_imputed_week_shift.csv")