In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import datetime
from datetime import timedelta
import time
import math
import warnings
import os
warnings.filterwarnings("ignore")
import glob
import traces
import json
from tqdm import tqdm

# REDD:

In [35]:
# Root folder of the low-frequency REDD data; the per-house "house_<n>/" folders are resolved relative to it.
base_path = "../data/redd/low_freq/"
# Appliance names to load for every house. "dishwaser" (sic) is intentionally left
# misspelled: it has to match the names derived from labels.dat (see the appliance
# count output further down, which lists 'dishwaser').
applicance_list = ["mains", "dishwaser", "refrigerator"]

In [36]:
# Read the labels & create the label map
def read_label(base_path, num_houses=6):
    """Read the ``labels.dat`` file of every house and build the label map.

    Each non-empty line of ``labels.dat`` is split on a single space into a
    channel number and an appliance name.

    Args:
        base_path: Folder that contains the ``house_<n>/`` sub-folders.
        num_houses: Number of houses to read; houses are numbered from 1 to
            ``num_houses`` inclusive. Defaults to 6.

    Returns:
        dict: ``{house_idx: {channel_idx: "<appliance name>_<channel_idx>"}}``.
    """
    label = {}
    for h_idx in range(1, num_houses + 1):
        h_path = os.path.join(base_path, "house_{}/labels.dat".format(h_idx))
        label[h_idx] = {}
        with open(h_path) as f:
            for line in f:
                # A blank (e.g. trailing) line has no channel/name pair to parse.
                if not line.strip():
                    continue
                splitted_line = line.split(' ')
                label[h_idx][int(splitted_line[0])] = splitted_line[1].strip() + '_' + splitted_line[0]
    return label


# Read the data
def read_data(base_path, h_idx, labels, applicance_list):
    """Load the requested appliance channels of one house into a single frame.

    Channels are read from ``house_<h_idx>/channel_<n>.dat`` (space separated
    ``unix_time value`` rows) and inner-joined on ``unix_time``, so only
    timestamps present in every selected channel are kept.

    Args:
        base_path: Folder that contains the ``house_<n>/`` sub-folders.
        h_idx: House number.
        labels: Label map as produced by ``read_label``
            (``labels[h_idx][channel] == "<appliance name>_<channel>"``).
        applicance_list: Appliance names to load. A channel is selected when
            either its full appliance name (everything before the trailing
            ``_<channel>``, e.g. ``washer_dryer``) or its first ``_``-separated
            token is in this list.

    Returns:
        pandas.DataFrame indexed by timestamp (sorted), one column per
        selected channel named after its label.

    Raises:
        ValueError: If no channel of the house matches ``applicance_list``.
    """
    h_path = os.path.join(base_path, "house_{}/".format(h_idx))
    num_apps = len(glob.glob(h_path + 'channel*'))
    df = None
    for ch_i in range(1, num_apps + 1):
        col_name = labels[h_idx][ch_i]
        # The label is "<name>_<channel>"; the name itself may contain "_"
        # (e.g. "washer_dryer"), so strip only the trailing channel number.
        full_name = col_name.rsplit("_", 1)[0]
        # Matching on the first token alone is kept for backward compatibility.
        first_token = col_name.split("_")[0]
        if full_name not in applicance_list and first_token not in applicance_list:
            continue

        file = h_path + 'channel_{}.dat'.format(ch_i)
        data = pd.read_table(file, sep = ' ', names = ['unix_time', col_name],
                             dtype = {'unix_time': 'int64', col_name: 'float64'})
        if df is None:
            df = data
        else:
            df = pd.merge(df, data, how = 'inner', on = 'unix_time')

    if df is None:
        raise ValueError(
            "No channel of house {} matches the appliance list {}".format(h_idx, list(applicance_list))
        )

    df['timestamp'] = df['unix_time'].astype("datetime64[s]")
    df = df.set_index(df['timestamp'].values)
    df = df.drop(['unix_time', 'timestamp'], axis=1)
    df = df.sort_index()
    return df

In [37]:
%%time
# Build the label map once, then load houses 1..6 into a dict keyed by house number.
labels = read_label(base_path)
df = {}
for i in range(1, 7):
    df[i] = read_data(base_path, i, labels, applicance_list)

CPU times: user 5.17 s, sys: 1.68 s, total: 6.86 s
Wall time: 6.9 s


In [38]:
# Preview the first rows of the merged channels of house 1.
df[1].head()

Unnamed: 0,mains_1,mains_2,refrigerator_5,dishwaser_6
2011-04-18 13:22:13,222.2,118.83,6.0,0.0
2011-04-18 13:22:16,223.17,119.19,6.0,0.0
2011-04-18 13:22:20,223.6,118.92,6.0,0.0
2011-04-18 13:22:23,222.91,119.16,6.0,1.0
2011-04-18 13:22:26,222.94,118.83,6.0,0.0


## Number of houses having a particular appliance: 

In [39]:
# Count, for every appliance name, the number of houses that have at least one
# channel with that name.
app_cnt = {}
for house_channels in labels.values():
    # Labels look like "<name>_<channel>": keep everything before the last "_".
    # A set is used so that a house with several channels of one appliance counts once.
    cur_app_list = {ch_label.rpartition("_")[0] for ch_label in house_channels.values()}
    for cur_app in cur_app_list:
        app_cnt[cur_app] = app_cnt.get(cur_app, 0) + 1

# Most common appliances first.
app_cnt = dict(sorted(app_cnt.items(), key=lambda x: x[1], reverse=True))

In [40]:
# Show the appliance -> number-of-houses mapping (already sorted by count, descending).
print(app_cnt)

{'washer_dryer': 6, 'mains': 6, 'kitchen_outlets': 6, 'dishwaser': 6, 'lighting': 6, 'refrigerator': 5, 'bathroom_gfi': 5, 'microwave': 4, 'stove': 4, 'outlets_unknown': 4, 'electric_heat': 3, 'disposal': 3, 'electronics': 3, 'furance': 3, 'smoke_alarms': 2, 'air_conditioning': 2, 'oven': 1, 'miscellaeneous': 1, 'outdoor_outlets': 1, 'subpanel': 1}


## Data Preprocessing:

### 1. Add zero boundary records around gaps where no data is available for 180 seconds (3 min) or more:

In [41]:
# A gap of at least `second_threshold` seconds (3 min) between consecutive records
# is treated as missing data and gets padded with zero records.
second_threshold = 180
# Distance (in seconds) between the last/first real record of a gap and the
# zero record that is inserted next to it.
boundary_second_interval = 5
# NOTE(review): not referenced by the code visible in this notebook (the number of
# value columns is derived from the frame itself) - confirm before relying on it.
num_input_vals = 4

In [42]:
def createZeroRecordsDF(cur_timestamp_list, col_list, cur_input_vals=4, cur_boundary_second_interval=5):
    """Build a frame of all-zero records placed at a fixed offset from given timestamps.

    Args:
        cur_timestamp_list: Timestamps the zero records are derived from.
        col_list: Column names of the result; the first one holds the shifted
            timestamp and a column named "time_stamp" must be present.
        cur_input_vals: Number of zero-valued columns per record.
        cur_boundary_second_interval: Offset in seconds added to every
            timestamp (negative values shift backwards).

    Returns:
        pandas.DataFrame with one row per input timestamp, indexed by the
        shifted timestamp.
    """
    shift = timedelta(seconds=cur_boundary_second_interval)
    zero_values = [0] * cur_input_vals
    rows = [[ts + shift] + zero_values for ts in cur_timestamp_list]

    zero_df = pd.DataFrame(rows, columns=col_list)
    return zero_df.set_index(zero_df['time_stamp'].values)

def addZeroBoundaryRecords(cur_df, gap_threshold_seconds=None, boundary_seconds=None):
    """Pad every long gap in a timestamp-indexed frame with all-zero records.

    For every pair of consecutive records that are at least
    ``gap_threshold_seconds`` apart, one zero record is inserted
    ``boundary_seconds`` after the record preceding the gap and one
    ``boundary_seconds`` before the record following it.

    The input frame is not modified.

    Args:
        cur_df: Frame indexed by timestamp; assumed to be sorted by index
            (the gaps are computed between consecutive rows).
        gap_threshold_seconds: Minimum gap length (seconds) that gets padded.
            Defaults to the notebook-level ``second_threshold``.
        boundary_seconds: Offset (seconds) of the inserted zero records.
            Defaults to the notebook-level ``boundary_second_interval``.

    Returns:
        A new pandas.DataFrame with the same columns, including the zero
        records, sorted by timestamp.
    """
    if gap_threshold_seconds is None:
        gap_threshold_seconds = second_threshold
    if boundary_seconds is None:
        boundary_seconds = boundary_second_interval

    # Nothing to pad (and pd.concat / .dt would be fragile on an empty frame).
    if cur_df.empty:
        return cur_df.copy()

    value_cols = list(cur_df.columns)
    col_list = ["time_stamp"] + value_cols
    cur_input_vals = len(value_cols)

    # Work on a copy so the helper columns never leak into the caller's frame.
    work_df = cur_df.copy()
    work_df["time_stamp"] = work_df.index
    # Seconds since the previous record ...
    work_df["time_diff"] = work_df["time_stamp"].diff(periods = 1).dt.total_seconds()
    # ... and, despite the column name, seconds until the *next* record.
    work_df["prev_time_diff"] = work_df["time_diff"].shift(periods = -1)

    # Records directly before a gap get a zero record shortly after them.
    before_gap = list(work_df.loc[work_df["prev_time_diff"] >= gap_threshold_seconds, "time_stamp"])
    # Records directly after a gap get a zero record shortly before them.
    after_gap = list(work_df.loc[work_df["time_diff"] >= gap_threshold_seconds, "time_stamp"])

    zero_frames = [
        createZeroRecordsDF(before_gap, col_list, cur_input_vals, boundary_seconds),
        createZeroRecordsDF(after_gap, col_list, cur_input_vals, -1 * boundary_seconds),
    ]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # Empty frames are left out so they cannot influence the resulting dtypes.
    frames = [work_df] + [z for z in zero_frames if not z.empty]
    out_df = pd.concat(frames)

    # Sort by Timestamp
    out_df = out_df.sort_index()
    return out_df.drop(columns=["time_stamp", "time_diff", "prev_time_diff"])

In [43]:
%%time
# Pad the long gaps of every house with zero boundary records.
# NOTE: this overwrites df[i] in place, so re-running the cell applies
# addZeroBoundaryRecords again to data that was already padded.
for i in range(1, 7):
    df[i] = addZeroBoundaryRecords(df[i].copy())

CPU times: user 406 ms, sys: 154 ms, total: 559 ms
Wall time: 559 ms


In [44]:
# Preview house 1 after the zero boundary records were added.
df[1].head()

Unnamed: 0,dishwaser_6,mains_1,mains_2,refrigerator_5
2011-04-18 13:22:13,0.0,222.2,118.83,6.0
2011-04-18 13:22:16,0.0,223.17,119.19,6.0
2011-04-18 13:22:20,0.0,223.6,118.92,6.0
2011-04-18 13:22:23,1.0,222.91,119.16,6.0
2011-04-18 13:22:26,0.0,222.94,118.83,6.0


### 2. Check the sampling rate and resample everything to a 3-second interval:

In [45]:
# Target sampling period, in seconds, passed to resampleDF for every house.
sampling_rate_seconds = 3

In [46]:
def resampleDF(cur_df, cur_sampling_rate):
    """Resample every column of a timestamp-indexed frame onto a regular grid.

    Each column is turned into a ``traces.TimeSeries`` and sampled with linear
    interpolation between the first and the last timestamp of the frame;
    the sampled values are rounded to 3 decimals.

    Note: a "time_stamp" column is added to ``cur_df`` itself, so callers
    should pass a copy if they need the original untouched.

    Args:
        cur_df: Frame indexed by timestamp.
        cur_sampling_rate: Sampling period in seconds.

    Returns:
        pandas.DataFrame indexed by "time_stamp" with the original columns.
    """
    value_cols = list(cur_df.columns)
    cur_df["time_stamp"] = cur_df.index
    period = timedelta(seconds=cur_sampling_rate)

    # Same column layout as the (extended) input; filled column by column below.
    out_df = pd.DataFrame(columns=cur_df.columns)

    for cur_col in value_cols:
        # List of [timestamp, value] pairs for this column.
        points = cur_df[["time_stamp", cur_col]].values.tolist()
        first_ts = points[0][0]
        last_ts = points[-1][0]

        # Create the traces TimeSeries Object and resample
        series = traces.TimeSeries(points)
        sampled = series.sample(
            sampling_period=period,
            start=first_ts,
            end=last_ts,
            interpolate='linear',
        )

        out_df["time_stamp"] = [pt[0] for pt in sampled]
        out_df[cur_col] = [round(pt[1], 3) for pt in sampled]

        print("Finished interpolating the column =", cur_col)

        # Free up memory
        del series

    out_df.set_index("time_stamp", inplace=True)
    return out_df

In [47]:
%%time
# Resample every house to the common sampling period; results are kept in a
# separate dict so the padded frames in `df` stay available.
df_resampled = {}
for i in range(1, 7):
    print("House " + str(i) + ":")
    # A copy is passed because resampleDF adds a "time_stamp" column to its input.
    df_resampled[i] = resampleDF(df[i].copy(), sampling_rate_seconds)
    print("\n")

House 1:
Finished interpolating the column = dishwaser_6
Finished interpolating the column = mains_1
Finished interpolating the column = mains_2
Finished interpolating the column = refrigerator_5


House 2:
Finished interpolating the column = dishwaser_10
Finished interpolating the column = mains_1
Finished interpolating the column = mains_2
Finished interpolating the column = refrigerator_9


House 3:
Finished interpolating the column = dishwaser_9
Finished interpolating the column = mains_1
Finished interpolating the column = mains_2
Finished interpolating the column = refrigerator_7


House 4:
Finished interpolating the column = dishwaser_15
Finished interpolating the column = mains_1
Finished interpolating the column = mains_2


House 5:
Finished interpolating the column = dishwaser_20
Finished interpolating the column = mains_1
Finished interpolating the column = mains_2
Finished interpolating the column = refrigerator_18


House 6:
Finished interpolating the column = dishwaser_9


In [48]:
# Preview the resampled data of house 1.
df_resampled[1].head()

Unnamed: 0_level_0,dishwaser_6,mains_1,mains_2,refrigerator_5
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-04-18 13:22:13,0.0,222.2,118.83,6.0
2011-04-18 13:22:16,0.0,223.17,119.19,6.0
2011-04-18 13:22:19,0.0,223.493,118.987,6.0
2011-04-18 13:22:22,0.667,223.14,119.08,6.0
2011-04-18 13:22:25,0.333,222.93,118.94,6.0


### 3. Normalize the data

In [49]:
def normalizeDF(cur_df, input_cols = ["mains_1", "mains_2"], return_params=True):
    """Standardize the input columns of a frame (zero mean, unit std).

    The input frame is left untouched; a normalized copy is returned.

    Args:
        cur_df: Frame containing at least ``input_cols``.
        input_cols: Columns to normalize; all other columns are copied as is.
        return_params: When True, also return the normalization parameters.

    Returns:
        ``out_df`` or ``(out_df, params)`` where ``params`` holds the JSON
        encoded per-column mean ("mean_vector") and standard deviation
        ("std_vector") that were computed from ``cur_df``.
    """
    cur_df_mean = cur_df[input_cols].mean(axis=0)
    cur_df_std = cur_df[input_cols].std(axis=0)

    # A constant column has std == 0; dividing by 1 instead maps it to all
    # zeros rather than NaN. The stored std stays 0, so renormalizing
    # (x * 0 + mean) still recovers the original constant.
    safe_std = cur_df_std.replace(0, 1.0)

    # Copy so that the caller's frame is not normalized as a side effect.
    out_df = cur_df.copy()
    out_df[input_cols] = (out_df[input_cols] - cur_df_mean) / safe_std

    if not return_params:
        return out_df

    params = {
        "mean_vector": cur_df_mean.to_json(),
        "std_vector": cur_df_std.to_json(),
    }
    return out_df, params
    
def renormalizeDF(cur_df, input_cols = ["mains_1", "mains_2"], params=None):
    """Undo the standardization applied by ``normalizeDF``.

    The input frame is left untouched; a rescaled copy is returned.

    Args:
        cur_df: Frame whose ``input_cols`` hold normalized values.
        input_cols: Columns to bring back to the original scale.
        params: Dict with the JSON encoded "mean_vector" and "std_vector"
            (as returned by ``normalizeDF``). Required.

    Returns:
        pandas.DataFrame with ``input_cols`` computed as ``x * std + mean``.

    Raises:
        ValueError: If ``params`` is not provided.
    """
    if params is None:
        raise ValueError("params (mean_vector / std_vector) are required to renormalize")

    # Parse the JSON explicitly: passing a literal JSON string to pd.read_json
    # is deprecated in recent pandas versions. Selecting `input_cols` makes
    # sure only the requested columns take part in the arithmetic.
    std_vector = pd.Series(json.loads(params["std_vector"]))[input_cols]
    mean_vector = pd.Series(json.loads(params["mean_vector"]))[input_cols]

    # Copy so that the caller's frame is not rescaled as a side effect.
    out_df = cur_df.copy()
    out_df[input_cols] = (out_df[input_cols] * std_vector) + mean_vector
    return out_df

In [50]:
%%time
# Normalize the mains columns of every house and keep the per-house
# mean/std so the scaling can be reverted later.
normalization_params = {}
df_normalized = {}
for i in range(1, 7):
    df_normalized[i], normalization_params[i] = normalizeDF(df_resampled[i].copy())

CPU times: user 577 ms, sys: 227 ms, total: 804 ms
Wall time: 650 ms


In [51]:
# Preview house 1 after normalization (only the mains columns are rescaled).
df_normalized[1].head()

Unnamed: 0_level_0,dishwaser_6,mains_1,mains_2,refrigerator_5
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011-04-18 13:22:13,0.0,0.347528,0.128814,6.0
2011-04-18 13:22:16,0.0,0.350646,0.129969,6.0
2011-04-18 13:22:19,0.0,0.351684,0.129318,6.0
2011-04-18 13:22:22,0.667,0.35055,0.129616,6.0
2011-04-18 13:22:25,0.333,0.349875,0.129167,6.0


In [55]:
# Show the JSON encoded mean/std vectors stored for house 1.
normalization_params[1]

{'mean_vector': '{"mains_1":114.0950544201,"mains_2":78.6753711835}',
 'std_vector': '{"mains_1":311.06848957,"mains_2":311.7258815882}'}

### 4. Store the normalized data and normalization params: 

In [53]:
# Folder that receives the normalized per-house CSV files and the matching
# normalization parameter JSON files.
out_base_dir = "../data/redd_processed/low_freq/"

In [54]:
# Create the output directory if it doesn't exist.
# exist_ok=True avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(out_base_dir, exist_ok=True)

for i in range(1, 7):
    cur_house_data_path = os.path.join(out_base_dir, "house_{}_data.csv".format(i))
    cur_house_params_path = os.path.join(out_base_dir, "house_{}_norm_params.json".format(i))
    
    # Store the house data (the "time_stamp" index is written as the first CSV column)
    df_normalized[i].to_csv(cur_house_data_path)
    
    # Store the house normalization params (useful to renormalize the data if needed after prediction)
    with open(cur_house_params_path, "w") as f:
        json.dump(normalization_params[i], f)

## Train/test split:

### 1. Create windows of data points with a given window size.

In [112]:
def createSamples(cur_df, cur_window_segment=33, target_col="dishwaser", causal=False, input_cols=None):
    """Turn a per-house time series into fixed-size window samples.

    For every record ``idx`` that has enough neighbours, one sample is built
    from the input columns over
    ``[idx - cur_window_segment + 1, idx + cur_window_segment - 1]``
    (non-causal) or ``[idx - cur_window_segment + 1, idx]`` (causal), together
    with the target value at ``idx``. All values are rounded to 4 decimals.

    Windows whose inputs contain no more distinct values than there are input
    columns are dropped (e.g. every input column is constant over the window).
    NOTE(review): the distinct values are counted over all input columns
    together, so a non-constant window can be dropped too if the columns share
    their values - confirm this is intended.

    Args:
        cur_df: Frame with a "time_stamp" column, the input columns and a
            target column.
        cur_window_segment: Number of records on each side of the window,
            including the current one (must be >= 1).
        target_col: The first column whose name contains this string is used
            as the target.
        causal: When True, only past and current records are used as input.
        input_cols: Input column names; defaults to ["mains_1", "mains_2"].

    Returns:
        pandas.DataFrame with the columns "time_stamp", the
        ``<input>_prev_k`` / ``<input>`` / ``<input>_next_k`` window columns
        of every input column and "output".

    Raises:
        ValueError: If the window size is < 1, an input column is missing or
            no column matches ``target_col``.
    """
    if cur_window_segment < 1:
        raise ValueError("cur_window_segment must be >= 1, got {}".format(cur_window_segment))

    input_cols = ["mains_1", "mains_2"] if input_cols is None else list(input_cols)
    cur_df_cols = list(cur_df.columns)

    missing_inputs = [col for col in input_cols if col not in cur_df_cols]
    if missing_inputs:
        raise ValueError("Input column(s) {} not found in {}".format(missing_inputs, cur_df_cols))
    input_col_idx = [cur_df_cols.index(col) for col in input_cols]

    # Fail early and explicitly: previously a missing target only surfaced as an
    # incidental IndexError inside the loop - or not at all when no window
    # survived, in which case an empty frame was returned silently.
    target_col_idx = next((idx for idx, col in enumerate(cur_df_cols) if target_col in col), None)
    if target_col_idx is None:
        raise ValueError("No column matching target '{}' in {}".format(target_col, cur_df_cols))

    cur_df_timestamps = cur_df["time_stamp"]
    cur_df_list = cur_df.round(4).to_numpy()

    # Column names of the samples
    x_cols = ["time_stamp"]
    for inp_col in input_cols:
        # Oldest record first: prev_{w-1}, ..., prev_1, current, next_1, ..., next_{w-1}
        prev_list = [inp_col + "_prev_" + str(k) for k in range(cur_window_segment - 1, 0, -1)]
        if causal:
            next_list = []
        else:
            next_list = [inp_col + "_next_" + str(k) for k in range(1, cur_window_segment)]
        x_cols += prev_list + [inp_col] + next_list
    cols = x_cols + ["output"]

    # Only indices whose whole window lies inside the data produce a sample.
    num_records = len(cur_df_list)
    look_back = cur_window_segment - 1
    look_ahead = 0 if causal else cur_window_segment - 1

    data = []
    for idx in range(look_back, num_records - look_ahead):
        cur_start = idx - look_back
        cur_end = idx + look_ahead + 1

        cur_input = []
        for inp_col_idx in input_col_idx:
            cur_input += cur_df_list[cur_start:cur_end, inp_col_idx].tolist()

        # Filter the samples containing all same values in every input window
        # (so at most one unique value per input column):
        if len(np.unique(cur_input)) <= len(input_cols):
            continue

        # Positional lookup, so the frame does not need a 0..n-1 index.
        cur_timestamp = cur_df_timestamps.iloc[idx]
        cur_output = cur_df_list[idx, target_col_idx]
        data.append([cur_timestamp] + cur_input + [cur_output])

    df_out = pd.DataFrame(data=data, columns=cols)
    return df_out

def _collect_house_samples(data_dir, house_list, appliance, window_segment, causal):
    """Build the window samples of one appliance for several houses and stack them.

    Returns None when no house produced samples (every house was skipped).
    """
    house_frames = []
    for cur_house_idx in tqdm(house_list):
        try:
            house_data_path = os.path.join(data_dir, "house_{}_data.csv".format(cur_house_idx))
            cur_df = pd.read_csv(house_data_path)

            samples = createSamples(cur_df, cur_window_segment=window_segment, target_col=appliance, causal=causal)
            samples["house_idx"] = cur_house_idx
            house_frames.append(samples)
        except Exception as e:
            # Best effort: a house without the appliance (or without data) is
            # skipped, but the reason is reported so real errors stay visible.
            print("Skipped House =", cur_house_idx, " for appliance =", appliance, "| reason:", repr(e))

    if not house_frames:
        return None
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(house_frames, ignore_index=True)


def create_train_test_split(data_dir, out_dir, window_segment = 3, train_house_list = [1, 2, 3, 4], test_house_list = [5, 6],
                            appliance_list = None, causal = False):
    """Create and store the train/test window samples for every appliance.

    For each appliance the per-house CSV files ``house_<n>_data.csv`` in
    ``data_dir`` are turned into window samples (see ``createSamples``) and
    written to ``<out_dir>/window_<window_segment>/<appliance>/train.csv``
    and ``.../test.csv``, indexed by "time_stamp" and tagged with a
    "house_idx" column. Houses that fail to load or to produce samples are
    skipped with a message; a split without any samples is not written.

    Args:
        data_dir: Folder containing the normalized per-house CSV files.
        out_dir: Root folder of the generated train/test files.
        window_segment: Window segment passed to ``createSamples``.
        train_house_list: Houses used for the training set.
        test_house_list: Houses used for the test set.
        appliance_list: Target appliances; defaults to
            ["refrigerator", "dishwaser"].
        causal: Passed to ``createSamples``; defaults to False.
    """
    # Note: Window segment should be of the form 2^n + 1
    if appliance_list is None:
        appliance_list = ["refrigerator", "dishwaser"]

    for cur_appliance in appliance_list:
        print("Appliance =", cur_appliance)

        out_app_folder = os.path.join(out_dir, "window_{}".format(window_segment), cur_appliance)
        os.makedirs(out_app_folder, exist_ok=True)

        for split_name, house_list in (("train", train_house_list), ("test", test_house_list)):
            split_df = _collect_house_samples(data_dir, house_list, cur_appliance, window_segment, causal)

            out_path = os.path.join(out_app_folder, split_name + ".csv")
            if split_df is None:
                # Every house was skipped: there is nothing to write for this split.
                print("No samples for the", split_name, "split of appliance =", cur_appliance, "- nothing written to", out_path)
                continue

            # Write the output to folder
            print("Writing data to ", out_path)
            split_df.set_index("time_stamp").to_csv(out_path)

In [113]:
# Folder with the normalized per-house CSV files written in the storing step above.
inp_dir = "../data/redd_processed/low_freq/"
# Root folder for the generated window_<size>/<appliance>/{train,test}.csv files.
out_dir = "../data/redd_processed/"

In [None]:
# Sample data: quick run with a window segment of 3, house 1 for training and house 5 for testing.
create_train_test_split(inp_dir, out_dir, window_segment = 3, train_house_list = [1], test_house_list = [5])

In [None]:
%%time
# Full run: one train/test split per window segment (each of the form 2^n + 1),
# with houses 1-4 for training and houses 5-6 for testing.
window_seg_list = [33, 65, 129, 257]
for cur_seg in window_seg_list:
    create_train_test_split(inp_dir, out_dir, window_segment = cur_seg, train_house_list = [1, 2, 3, 4], test_house_list = [5, 6])