In [230]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import datetime
from datetime import timedelta
import time
import math
import warnings
import os
warnings.filterwarnings("ignore")
import glob
import traces

# REDD:

In [231]:
# Root folder of the REDD low-frequency data; the readers below look for
# house_<n>/labels.dat and house_<n>/channel_<k>.dat underneath it.
base_path = "../data/redd/low_freq/"
# Appliance names to load for every house. "dishwaser" is intentionally left
# misspelled: it has to match the label text as it appears in the data (see the
# `dishwaser_6` column in the preview output further down).
applicance_list = ["mains", "dishwaser", "refrigerator"]

In [232]:
# Read the labels & create the label map
def read_label(base_path, num_houses=6):
    """Build the channel -> column-name map for every house.

    Each ``house_<n>/labels.dat`` file is expected to hold one
    ``<channel> <appliance>`` pair per line. The appliance name is suffixed
    with its channel number (``mains_1``, ``mains_2``) so that several
    channels with the same appliance stay distinguishable.

    Parameters
    ----------
    base_path : str
        Directory that contains the ``house_<n>`` folders.
    num_houses : int, optional
        Houses ``1 .. num_houses`` are read. Defaults to 6.

    Returns
    -------
    dict[int, dict[int, str]]
        ``{house_index: {channel_index: "<appliance>_<channel>"}}``.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two fields or a non-integer
        channel number.
    """
    label = {}
    for h_idx in range(1, num_houses + 1):
        h_path = os.path.join(base_path, "house_{}/labels.dat".format(h_idx))
        house_labels = {}
        with open(h_path) as f:
            for line_no, line in enumerate(f, start=1):
                # Whitespace split tolerates tabs / repeated spaces and turns
                # blank (e.g. trailing) lines into an empty list.
                fields = line.split()
                if not fields:
                    continue
                if len(fields) < 2:
                    raise ValueError(
                        "{}:{}: expected '<channel> <appliance>', got {!r}".format(
                            h_path, line_no, line
                        )
                    )
                channel, appliance = fields[0], fields[1]
                house_labels[int(channel)] = appliance + '_' + channel
        label[h_idx] = house_labels
    return label


# Read the data
def read_data(base_path, h_idx, labels, applicance_list):
    """Load the selected appliance channels of one house into a single frame.

    Channels ``1 .. N`` (``N`` = number of ``channel*`` files in the house
    folder) are inspected; a channel is loaded when its appliance name is in
    ``applicance_list``. The loaded channels are inner-joined on their unix
    timestamp, so only instants present in every selected channel survive.

    Parameters
    ----------
    base_path : str
        Directory that contains the ``house_<n>`` folders.
    h_idx : int
        House number to read.
    labels : dict
        Mapping ``{house: {channel: "<appliance>_<channel>"}}`` as produced by
        ``read_label``.
    applicance_list : list of str
        Appliance names (without the channel suffix) to keep.

    Returns
    -------
    pandas.DataFrame
        One float column per selected channel, indexed by timestamp and
        sorted chronologically.

    Raises
    ------
    ValueError
        If none of the house's channels matches ``applicance_list``.
    """
    h_path = os.path.join(base_path, "house_{}/".format(h_idx))
    num_apps = len(glob.glob(h_path + 'channel*'))
    df = None
    for ch_i in range(1, num_apps + 1):
        col_name = labels[h_idx][ch_i]
        # Strip only the trailing "_<channel>" so multi-word appliances such as
        # "washer_dryer" or "kitchen_outlets" can be matched as well.
        if col_name.rsplit("_", 1)[0] not in applicance_list:
            continue
        file = h_path + 'channel_{}.dat'.format(ch_i)
        data = pd.read_table(file, sep=' ', names=['unix_time', col_name],
                             dtype={'unix_time': 'int64', col_name: 'float64'})
        if df is None:
            df = data
        else:
            df = pd.merge(df, data, how='inner', on='unix_time')

    if df is None:
        raise ValueError(
            "house {}: no channel matches any of {}".format(h_idx, list(applicance_list))
        )

    # Unix seconds -> timestamps; the index is left unnamed.
    timestamps = pd.to_datetime(df['unix_time'], unit='s')
    df = df.drop(columns=['unix_time']).set_index(timestamps.values)
    return df.sort_index()

In [233]:
%%time
labels = read_label(base_path)
df = {}
for i in range(1, 7):
    df[i] = read_data(base_path, i, labels, applicance_list)

CPU times: user 5.23 s, sys: 1.79 s, total: 7.03 s
Wall time: 7.07 s


In [235]:
# Preview house 1 right after loading: one column per selected channel,
# indexed by timestamp.
df[1].head()

Unnamed: 0,mains_1,mains_2,refrigerator_5,dishwaser_6
2011-04-18 13:22:13,222.2,118.83,6.0,0.0
2011-04-18 13:22:16,223.17,119.19,6.0,0.0
2011-04-18 13:22:20,223.6,118.92,6.0,0.0
2011-04-18 13:22:23,222.91,119.16,6.0,1.0
2011-04-18 13:22:26,222.94,118.83,6.0,0.0


## Number of houses that have each appliance:

In [236]:
# Count, for every appliance name, how many houses have at least one channel
# carrying it; the trailing "_<channel>" suffix is removed before counting.
app_cnt = {}
for house_labels in labels.values():
    house_apps = {
        "_".join(channel_label.split("_")[:-1])
        for channel_label in house_labels.values()
    }
    for app_name in house_apps:
        app_cnt[app_name] = app_cnt.get(app_name, 0) + 1

# Most widespread appliances first.
app_cnt = dict(sorted(app_cnt.items(), key=lambda item: item[1], reverse=True))

In [237]:
# Appliance -> number of houses that have it, most common first.
print(app_cnt)

{'kitchen_outlets': 6, 'dishwaser': 6, 'washer_dryer': 6, 'lighting': 6, 'mains': 6, 'refrigerator': 5, 'bathroom_gfi': 5, 'stove': 4, 'microwave': 4, 'outlets_unknown': 4, 'electric_heat': 3, 'disposal': 3, 'electronics': 3, 'furance': 3, 'smoke_alarms': 2, 'air_conditioning': 2, 'oven': 1, 'miscellaeneous': 1, 'subpanel': 1, 'outdoor_outlets': 1}


## Data Preprocessing:

### 1. Add Zero Boundary Records when data is not available for more than 180 seconds (3min):

In [238]:
# Parameters of the zero-boundary step.
# A gap is a pause of at least `second_threshold` seconds (3 min) between two
# consecutive samples; the comparison used by addZeroBoundaryRecords is `>=`.
second_threshold = 180
# Offset, in seconds, of the inserted zero rows: one this long after the last
# sample before a gap, one this long before the first sample after it.
boundary_second_interval = 5
# Number of value columns in a zero row.
# NOTE(review): not referenced by the functions visible in this section --
# addZeroBoundaryRecords derives the count from the frame's columns.
num_input_vals = 4

In [239]:
def createZeroRecordsDF(cur_timestamp_list, col_list, cur_input_vals=4, cur_boundary_second_interval=5):
    """Create all-zero rows at a fixed offset from the given timestamps.

    Parameters
    ----------
    cur_timestamp_list : iterable of datetime-like
        Reference instants; one row is produced per entry.
    col_list : list of str
        Column names of the result. Must contain ``"time_stamp"`` and have
        ``cur_input_vals + 1`` entries, with the timestamp column first.
    cur_input_vals : int, optional
        Number of zero-valued columns that follow the timestamp.
    cur_boundary_second_interval : int, optional
        Offset in seconds added to every reference instant; negative values
        shift backwards in time.

    Returns
    -------
    pandas.DataFrame
        Frame with columns ``col_list``, indexed by the shifted timestamps.
    """
    shift = timedelta(seconds=cur_boundary_second_interval)
    zero_values = [0] * cur_input_vals
    rows = [[stamp + shift] + zero_values for stamp in cur_timestamp_list]

    frame = pd.DataFrame(rows, columns=col_list)
    # The shifted timestamp doubles as the index so the rows can later be
    # sorted together with the real samples.
    stamp_values = frame['time_stamp'].values
    return frame.set_index(stamp_values)

def addZeroBoundaryRecords(cur_df, gap_threshold_seconds=None, boundary_offset_seconds=None):
    """Surround every long data gap with all-zero rows.

    Whenever two consecutive samples are at least ``gap_threshold_seconds``
    apart, one zero row is inserted ``boundary_offset_seconds`` after the last
    sample before the gap and another one the same distance before the first
    sample after it.

    The input frame is not modified.

    Parameters
    ----------
    cur_df : pandas.DataFrame
        Frame indexed by timestamp (datetime-like index), sorted in time.
    gap_threshold_seconds : float, optional
        Minimum pause that counts as a gap. Defaults to the module-level
        ``second_threshold``.
    boundary_offset_seconds : float, optional
        Distance of the zero rows from the gap edges. Defaults to the
        module-level ``boundary_second_interval``.

    Returns
    -------
    pandas.DataFrame
        Original rows plus the zero rows, sorted by index, with the same
        columns (and column order) as ``cur_df``.
    """
    if gap_threshold_seconds is None:
        gap_threshold_seconds = second_threshold
    if boundary_offset_seconds is None:
        boundary_offset_seconds = boundary_second_interval

    if len(cur_df) == 0:
        return cur_df.copy()

    value_cols = list(cur_df.columns)
    col_list = ["time_stamp"] + value_cols
    cur_input_vals = len(value_cols)

    # Work on a copy so the helper columns never leak into the caller's frame.
    work_df = cur_df.copy()
    work_df["time_stamp"] = work_df.index
    # Seconds elapsed since the previous sample ...
    work_df["time_diff"] = work_df["time_stamp"].diff(periods=1).dt.total_seconds()
    # ... and seconds until the next sample (the next row's time_diff).
    work_df["prev_time_diff"] = work_df["time_diff"].shift(periods=-1)

    gap_starts = list(work_df.loc[work_df["prev_time_diff"] >= gap_threshold_seconds, "time_stamp"])
    gap_ends = list(work_df.loc[work_df["time_diff"] >= gap_threshold_seconds, "time_stamp"])

    # DataFrame.append was removed in pandas 2.0; gather the pieces and concat
    # once. Empty pieces are skipped so they cannot disturb the column dtypes.
    pieces = [work_df]
    if gap_starts:
        pieces.append(createZeroRecordsDF(gap_starts, col_list, cur_input_vals, boundary_offset_seconds))
    if gap_ends:
        pieces.append(createZeroRecordsDF(gap_ends, col_list, cur_input_vals, -1 * boundary_offset_seconds))
    out_df = pd.concat(pieces) if len(pieces) > 1 else work_df

    # Sort by timestamp and remove the helper columns again.
    out_df = out_df.sort_index()
    return out_df.drop(columns=["time_stamp", "time_diff", "prev_time_diff"])

In [240]:
%%time
# Insert the zero boundary rows around every data gap, house by house.
# NOTE(review): the result overwrites df[i], so re-running this cell feeds the
# already padded frame back in and can insert further zero rows. Reload `df`
# before running it again.
for i in range(1, 7):
    df[i] = addZeroBoundaryRecords(df[i].copy())

CPU times: user 429 ms, sys: 163 ms, total: 591 ms
Wall time: 591 ms


In [244]:
# Preview house 1 after the zero boundary rows have been added.
df[1].head()

Unnamed: 0,dishwaser_6,mains_1,mains_2,refrigerator_5
2011-04-18 13:22:13,0.0,222.2,118.83,6.0
2011-04-18 13:22:16,0.0,223.17,119.19,6.0
2011-04-18 13:22:20,0.0,223.6,118.92,6.0
2011-04-18 13:22:23,1.0,222.91,119.16,6.0
2011-04-18 13:22:26,0.0,222.94,118.83,6.0


### 2. Check the sampling rate and resample everything to a 3-second interval:

In [83]:
# Target sampling period in seconds; handed to resampleDF as `cur_sampling_rate`.
sampling_rate_seconds = 3

In [105]:
def resampleDF(cur_df, cur_sampling_rate):
    """Resample every column of a timestamp-indexed frame onto a regular grid.

    Each column is turned into a ``traces.TimeSeries`` and sampled every
    ``cur_sampling_rate`` seconds between the first and the last row of the
    frame, using linear interpolation; sampled values are rounded to three
    decimals.

    Note that a ``time_stamp`` column is added to ``cur_df`` itself, so callers
    that need the input untouched should pass a copy.

    Parameters
    ----------
    cur_df : pandas.DataFrame
        Frame indexed by timestamp; the first and last rows define the
        sampling window.
    cur_sampling_rate : int or float
        Sampling period in seconds.

    Returns
    -------
    pandas.DataFrame
        Frame with the resampled value columns plus a ``time_stamp`` column
        (the timestamps are a regular column, not the index).
    """
    value_columns = list(cur_df.columns)
    cur_df["time_stamp"] = cur_df.index
    resampled = pd.DataFrame(columns=cur_df.columns)
    period = timedelta(seconds=cur_sampling_rate)

    for col_name in value_columns:
        points = cur_df[["time_stamp", col_name]].values.tolist()

        # Sampling window: first and last row of this column's data.
        window_start = points[0][0]
        window_end = points[-1][0]

        # Build the traces series and sample it on the regular grid.
        series = traces.TimeSeries(points)
        samples = series.sample(
            sampling_period=period,
            start=window_start,
            end=window_end,
            interpolate='linear',
        )
        resampled["time_stamp"] = [sample[0] for sample in samples]
        resampled[col_name] = [round(sample[1], 3) for sample in samples]

        print("Finished interpolating the column =", col_name)

        # Release the series before moving on to the next column.
        del series

    return resampled

In [None]:
%%time
# Resample every house to the common sampling period. The results go into a
# separate dict, so `df` keeps the original (irregular) timestamps.
# NOTE(review): this cell has no execution count ("In [None]") and the
# normalisation cell further down reads `df`, not `df_resampled` -- confirm
# whether the resampled data is meant to be used downstream.
df_resampled = {}
for i in range(1, 7):
    print("House " + str(i) + ":")
    df_resampled[i] = resampleDF(df[i].copy(), sampling_rate_seconds)
    print("\n")

### 3. Normalize the data

In [245]:
def normalizeDF(cur_df):
    """Standardise every column to zero mean and unit standard deviation.

    Columns whose standard deviation is exactly zero (constant columns, e.g.
    an appliance that never switches on) are only centred: they come out as
    all zeros instead of the NaN values a division by zero would produce.

    Parameters
    ----------
    cur_df : pandas.DataFrame
        Numeric frame; statistics are computed per column over all rows.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns; ``cur_df`` is not modified.
    """
    cur_df_mean = cur_df.mean(axis=0)
    cur_df_std = cur_df.std(axis=0)
    # Dividing by 1 leaves the (all-zero) centred values of a constant column
    # untouched and avoids 0/0.
    safe_std = cur_df_std.replace(0, 1.0)
    out_df = (cur_df - cur_df_mean) / safe_std
    return out_df

In [249]:
%%time
# Standardise each house on its own: normalizeDF uses the mean and standard
# deviation of the frame it receives, so every house is scaled with its own
# statistics.
# NOTE(review): this reads `df`, not `df_resampled`, so the 3-second resampling
# step is not reflected in the normalised data -- confirm this is intended.
for i in range(1, 7):
    df[i] = normalizeDF(df[i].copy())

CPU times: user 188 ms, sys: 58.2 ms, total: 246 ms
Wall time: 148 ms


In [251]:
# Preview house 1 after normalisation.
df[1].head()

Unnamed: 0,dishwaser_6,mains_1,mains_2,refrigerator_5
2011-04-18 13:22:13,-0.166425,-0.012543,-0.087618,-0.555263
2011-04-18 13:22:16,-0.166425,-0.010175,-0.086772,-0.555263
2011-04-18 13:22:20,-0.166425,-0.009126,-0.087406,-0.555263
2011-04-18 13:22:23,-0.159746,-0.01081,-0.086843,-0.555263
2011-04-18 13:22:26,-0.166425,-0.010737,-0.087618,-0.555263


## Train/test split:

In [292]:
# Split by house rather than by time: the two lists are disjoint, so the test
# houses (5 and 6) are never part of the training set (houses 1-4).
train_houses = [1,2,3,4]
test_houses = [5,6]