In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import datetime
from datetime import timedelta
import time
import math
import warnings
import os
warnings.filterwarnings("ignore")
import glob
import traces
import json
from tqdm import tqdm

# REDD:

In [77]:
base_path = "../data/redd/low_freq/"
applicance_list = ["dishwaser", "refrigerator"]

In [78]:
# Read the labels & create the label map
def read_label(base_path):
    label = {}
    for h_idx in range(1, 7):
        h_path = os.path.join(base_path, "house_{}/labels.dat".format(h_idx))
        label[h_idx] = {}
        with open(h_path) as f:
            for line in f:
                splitted_line = line.split(' ')
                label[h_idx][int(splitted_line[0])] = splitted_line[1].strip() + '_' + splitted_line[0]
    return label


# Read the data
def read_data(base_path, h_idx, labels, applicance_list):
    h_path = os.path.join(base_path, "house_{}/".format(h_idx))
    num_apps = len(glob.glob(h_path + 'channel*'))
    df = None
    for ch_i in range(1, num_apps + 1):
        file = h_path + 'channel_{}.dat'.format(ch_i)
        if labels[h_idx][ch_i].split("_")[0] in applicance_list:
            if df is None:
                df = pd.read_table(file, sep = ' ', names = ['unix_time', labels[h_idx][ch_i]], 
                                                  dtype = {'unix_time': 'int64', labels[h_idx][ch_i]: 'float64'})
            else:
                data = pd.read_table(file, sep = ' ', names = ['unix_time', labels[h_idx][ch_i]], 
                                                  dtype = {'unix_time': 'int64', labels[h_idx][ch_i]: 'float64'})
                df = pd.merge(df, data, how = 'inner', on = 'unix_time')
    df['timestamp'] = df['unix_time'].astype("datetime64[s]")
    df = df.set_index(df['timestamp'].values)
    df.drop(['unix_time','timestamp'], axis=1, inplace=True)
    df.sort_index(inplace=True)
    return df

In [79]:
%%time
labels = read_label(base_path)
df = {}
for i in range(1, 7):
    df[i] = read_data(base_path, i, labels, applicance_list)

CPU times: user 1.15 s, sys: 310 ms, total: 1.46 s
Wall time: 1.48 s


In [80]:
df[1].head()

Unnamed: 0,refrigerator_5,dishwaser_6
2011-04-18 13:22:13,6.0,0.0
2011-04-18 13:22:16,6.0,0.0
2011-04-18 13:22:20,6.0,0.0
2011-04-18 13:22:23,6.0,1.0
2011-04-18 13:22:26,6.0,0.0


## Number of houses having a particular appliance: 

In [81]:
app_cnt = {}
for k, v in labels.items():
    cur_app_list = set()
    for kk, vv in v.items():
        app = "_".join(vv.split("_")[:-1])
        cur_app_list.add(app)
    for cur_app in cur_app_list:
        if app_cnt.get(cur_app) is None:
            app_cnt[cur_app] = 0
        app_cnt[cur_app] += 1
        
app_cnt = dict(sorted(app_cnt.items(), key=lambda x: x[1], reverse=True))

In [82]:
print(app_cnt)

{'washer_dryer': 6, 'mains': 6, 'lighting': 6, 'kitchen_outlets': 6, 'dishwaser': 6, 'bathroom_gfi': 5, 'refrigerator': 5, 'stove': 4, 'microwave': 4, 'outlets_unknown': 4, 'electric_heat': 3, 'disposal': 3, 'electronics': 3, 'furance': 3, 'smoke_alarms': 2, 'air_conditioning': 2, 'oven': 1, 'miscellaeneous': 1, 'outdoor_outlets': 1, 'subpanel': 1}


## Data Preprocessing:

### 1. Add Zero Boundary Records when data is not available for more than 180 seconds (3min):

In [83]:
# If there is no data for more than 180 seconds (3min), fill zeros
second_threshold = 180
boundary_second_interval = 5
num_input_vals = 4

In [84]:
def createZeroRecordsDF(cur_timestamp_list, col_list, cur_input_vals=4, cur_boundary_second_interval=5):
    zero_end_records = []
    for cur_ts in cur_timestamp_list:
        cur_ts_new = cur_ts + timedelta(seconds=cur_boundary_second_interval)
        cur_record = [cur_ts_new] + [0] * cur_input_vals
        zero_end_records.append(cur_record)

    zero_end_records_df = pd.DataFrame(zero_end_records, columns=col_list)
    zero_end_records_df = zero_end_records_df.set_index(zero_end_records_df['time_stamp'].values)
    return zero_end_records_df

def addZeroBoundaryRecords(cur_df):
    col_list = ["time_stamp"] + list(cur_df.columns)
    cur_df["time_stamp"] = cur_df.index
    
    cur_df["time_diff"] = cur_df["time_stamp"].diff(periods = 1).dt.total_seconds()
    cur_df["prev_time_diff"] = cur_df["time_diff"].shift(periods = -1)
    
    cur_input_vals = len(col_list) - 1
    prev_timestamps = list(cur_df[cur_df["prev_time_diff"] >= second_threshold]["time_stamp"])
    next_zero_boundary_records = createZeroRecordsDF(prev_timestamps, col_list, cur_input_vals, boundary_second_interval)
    cur_df = cur_df.append(next_zero_boundary_records)
    
    next_timestamps = list(cur_df[cur_df["time_diff"] >= second_threshold]["time_stamp"])
    prev_zero_boundary_ecords = createZeroRecordsDF(next_timestamps, col_list, cur_input_vals, -1 * boundary_second_interval)
    cur_df = cur_df.append(prev_zero_boundary_ecords)
    
    # Sort by Timestamp
    cur_df.sort_index(inplace=True)
    cur_df.drop(columns=["time_stamp", "time_diff", "prev_time_diff"], inplace=True)
    
    return cur_df

In [85]:
%%time
for i in range(1, 7):
    df[i] = addZeroBoundaryRecords(df[i].copy())

CPU times: user 463 ms, sys: 213 ms, total: 676 ms
Wall time: 677 ms


In [86]:
df[1].head()

Unnamed: 0,dishwaser_6,refrigerator_5
2011-04-18 13:22:13,0.0,6.0
2011-04-18 13:22:16,0.0,6.0
2011-04-18 13:22:20,0.0,6.0
2011-04-18 13:22:23,1.0,6.0
2011-04-18 13:22:26,0.0,6.0


### 2. Check the sampling rate and resample everything to 1 hr interval:

In [90]:
sampling_rate_seconds = 60

In [91]:
def resampleDF(cur_df, cur_sampling_rate):
    col_list = list(cur_df.columns)
    cur_df["time_stamp"] = cur_df.index
    out_df = pd.DataFrame(columns=cur_df.columns)
    for cur_col in col_list:
        cur_val_list = cur_df[["time_stamp", cur_col]].values.tolist()
        
        start_time = cur_val_list[0][0]
        end_time = cur_val_list[-1][0]
        
        # Create the traces TimeSeries Object and resample
        cur_ts = traces.TimeSeries(cur_val_list)
        cur_ts_sampled = cur_ts.sample(
            sampling_period=timedelta(seconds=cur_sampling_rate),
            start=start_time,
            end=end_time,
            interpolate='linear',
        )
        out_df["time_stamp"] = [x[0] for x in cur_ts_sampled]
        out_df[cur_col] = [round(x[1],3) for x in cur_ts_sampled]
        
        print("Finished interpolating the column =", cur_col)
        
        # Free up memory
        del cur_ts
        
    out_df.set_index("time_stamp", inplace=True)
    return out_df

In [92]:
%%time
df_resampled = {}
for i in range(1, 7):
    print("House " + str(i) + ":")
    df_resampled[i] = resampleDF(df[i].copy(), sampling_rate_seconds)
    df_resampled[i] = df_resampled[i].resample("1H").mean()
    print("\n")

House 1:
Finished interpolating the column = dishwaser_6
Finished interpolating the column = refrigerator_5


House 2:
Finished interpolating the column = dishwaser_10
Finished interpolating the column = refrigerator_9


House 3:
Finished interpolating the column = dishwaser_9
Finished interpolating the column = refrigerator_7


House 4:
Finished interpolating the column = dishwaser_15


House 5:
Finished interpolating the column = dishwaser_20
Finished interpolating the column = refrigerator_18


House 6:
Finished interpolating the column = dishwaser_9
Finished interpolating the column = refrigerator_8


CPU times: user 42.7 s, sys: 728 ms, total: 43.5 s
Wall time: 43.5 s


In [102]:
df_resampled[1].head()

Unnamed: 0_level_0,dishwaser_6,refrigerator_5
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-04-18 13:00:00,0.142395,5.991237
2011-04-18 14:00:00,0.4463,139.541667
2011-04-18 15:00:00,0.00555,120.904883
2011-04-18 16:00:00,0.016667,59.741267
2011-04-18 17:00:00,0.008333,49.729117


### 3. Store the resampled data (Split into train/test)

In [109]:
def split_Store(cur_df, cur_path="", appliance="refrigerator", train_percent=0.70):
    target_col_list = []
    for cur_col in cur_df.columns:
        if appliance in cur_col:
            target_col_list.append(cur_col)
            
    if len(target_col_list) == 0:
        print("Column doesn't exist!")
        return
    
    cur_df_copy = cur_df.copy()
    for target_col in target_col_list:
        cur_df = cur_df_copy.copy()
        cur_df.rename(columns={target_col:"output"}, inplace=True)
        cols = ["output"]
            
        # Write full data
        cur_full_path = cur_path + ".csv"
        cur_full_seg = cur_df[cols]
        print("Writing to ", cur_full_path)
        cur_full_seg.to_csv(cur_full_path)
        
        # Write Train data
        num_train_samples = int(cur_df.shape[0] * train_percent)
        cur_train_path = cur_path + "_train.csv"
        cur_train_seg = cur_df.iloc[:num_train_samples][cols]
        print("Writing to ", cur_train_path)
        cur_train_seg.to_csv(cur_train_path)
        
        # Write Test data
        cur_test_path = cur_path + "_test.csv"
        cur_test_seg = cur_df.iloc[num_train_samples:][cols]
        print("Writing to ", cur_test_path)
        cur_test_seg.to_csv(cur_test_path)

In [None]:
# out_base_dir = "../data/redd_forecast_processed/"
    
# applicance_list = ["dishwaser", "refrigerator"]
# for cur_app in applicance_list:
#     for i in range(1, 7):
#         cur_path = os.path.join(out_base_dir, cur_app)
        
#         # Create the output directory if it doesn't exist
#         if not os.path.exists(cur_path):
#             os.makedirs(cur_path)
    
#         split_Store(df_resampled[i], cur_path=os.path.join(cur_path, "h" + str(i)), appliance=cur_app, train_percent=0.70)
#         print("\n")