In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import datetime
from datetime import timedelta
import time
import math
import warnings
import os
warnings.filterwarnings("ignore")
import glob
import traces
import json
from tqdm import tqdm

# REDD:

In [15]:
# Root folder of the raw data; read_label / read_data look for `house_<n>/` sub-folders below it.
base_path = "../data/redd/low_freq/"
# Appliance names to load. read_data compares these against each channel label
# with its trailing `_<channel number>` suffix removed.
applicance_list = ["mains", "kitchen_outlets"]

In [16]:
# Read the labels & create the label map
def read_label(base_path):
    """Build the channel-label lookup for houses 1 through 6.

    For every house the file ``house_<n>/labels.dat`` below ``base_path`` is
    read line by line. Each line is split on single spaces; the first field is
    the channel number and the second the appliance name.

    Args:
        base_path: Directory that contains the ``house_<n>`` folders.

    Returns:
        dict: ``{house index: {channel number (int): "<name>_<channel>"}}``.
    """
    house_labels = {}
    for house in range(1, 7):
        labels_file = os.path.join(base_path, "house_{}/labels.dat".format(house))
        channel_names = {}
        house_labels[house] = channel_names
        with open(labels_file) as handle:
            for raw_line in handle:
                fields = raw_line.split(' ')
                channel, name = fields[0], fields[1].strip()
                # Suffix the channel number so repeated appliance names stay unique.
                channel_names[int(channel)] = name + '_' + channel
    return house_labels


# Read the data
def read_data(base_path, h_idx, labels, applicance_list):
    """Load the selected channels of one house and merge them on their timestamps.

    The number of channels is taken from the count of ``channel*`` files in
    ``house_<h_idx>/``; channels ``1..count`` are then looked up in ``labels``.
    A channel is loaded when its label, with the trailing ``_<channel>`` suffix
    removed, appears in ``applicance_list``. Channels are inner-joined on
    ``unix_time``, so only timestamps present in every selected channel remain.

    Args:
        base_path: Directory that contains the ``house_<n>`` folders.
        h_idx: House index used in the folder name and as key into ``labels``.
        labels: Mapping ``{house index: {channel number: column label}}``.
        applicance_list: Appliance names (without channel suffix) to load.

    Returns:
        pandas.DataFrame: one column per loaded channel (named by its label),
        indexed by the timestamp (``unix_time`` interpreted as seconds) and
        sorted by that index.

    Raises:
        ValueError: if no channel of the house matches ``applicance_list``.
    """
    h_path = os.path.join(base_path, "house_{}/".format(h_idx))
    num_apps = len(glob.glob(h_path + 'channel*'))
    df = None
    for ch_i in range(1, num_apps + 1):
        col_name = labels[h_idx][ch_i]
        cur_app_name = "_".join(col_name.split("_")[:-1])
        if cur_app_name not in applicance_list:
            continue

        file = h_path + 'channel_{}.dat'.format(ch_i)
        data = pd.read_table(file, sep=' ', names=['unix_time', col_name],
                             dtype={'unix_time': 'int64', col_name: 'float64'})
        # The first matching channel seeds the frame; later ones are inner-joined onto it.
        df = data if df is None else pd.merge(df, data, how='inner', on='unix_time')

    if df is None:
        # Previously this fell through to `df['unix_time']` on None and raised an opaque TypeError.
        raise ValueError(
            "No channel of house {} matches any of {}".format(h_idx, list(applicance_list))
        )

    timestamps = df['unix_time'].astype("datetime64[s]")
    df = df.drop(columns='unix_time').set_index(timestamps.values).sort_index()
    return df

In [17]:
%%time
# Build the label map once, then load houses 1-6 into a dict keyed by house index.
labels = read_label(base_path)
df = {}
for i in range(1, 7):
    df[i] = read_data(base_path, i, labels, applicance_list)

CPU times: user 5.52 s, sys: 1.76 s, total: 7.28 s
Wall time: 7.33 s


In [18]:
# Preview the first rows of the merged frame for house 1.
df[1].head()

Unnamed: 0,mains_1,mains_2,kitchen_outlets_7,kitchen_outlets_8,kitchen_outlets_15,kitchen_outlets_16
2011-04-18 13:22:13,222.2,118.83,34.0,21.0,2.0,0.0
2011-04-18 13:22:16,223.17,119.19,34.0,21.0,2.0,0.0
2011-04-18 13:22:20,223.6,118.92,34.0,22.0,2.0,0.0
2011-04-18 13:22:23,222.91,119.16,35.0,21.0,1.0,0.0
2011-04-18 13:22:26,222.94,118.83,34.0,21.0,2.0,0.0


## Number of houses having a particular appliance: 

In [19]:
# Count, for every appliance name, how many houses have at least one channel with that name.
app_cnt = {}
for k, v in labels.items():
    # Distinct appliance names of this house (label minus its trailing `_<channel>` suffix),
    # so several channels of the same appliance count the house only once.
    cur_app_list = set()
    for kk, vv in v.items():
        app = "_".join(vv.split("_")[:-1])
        cur_app_list.add(app)
    for cur_app in cur_app_list:
        if app_cnt.get(cur_app) is None:
            app_cnt[cur_app] = 0
        app_cnt[cur_app] += 1
        
# Re-order by house count, most common first.
app_cnt = dict(sorted(app_cnt.items(), key=lambda x: x[1], reverse=True))

In [20]:
# Show the appliance -> number-of-houses mapping.
print(app_cnt)

{'mains': 6, 'lighting': 6, 'dishwaser': 6, 'washer_dryer': 6, 'kitchen_outlets': 6, 'bathroom_gfi': 5, 'refrigerator': 5, 'stove': 4, 'microwave': 4, 'outlets_unknown': 4, 'electric_heat': 3, 'disposal': 3, 'electronics': 3, 'furance': 3, 'smoke_alarms': 2, 'air_conditioning': 2, 'oven': 1, 'miscellaeneous': 1, 'subpanel': 1, 'outdoor_outlets': 1}


## Data Preprocessing:

### 1. Normalize the data

In [21]:
def normalizeDF(cur_df, input_cols = ["mains_1", "mains_2"], return_params=True):
    """Standardize the input columns to zero mean and unit standard deviation.

    The input frame is left untouched; a normalized copy is returned. Columns
    not listed in ``input_cols`` are copied through unchanged.

    Args:
        cur_df: Frame containing at least the columns in ``input_cols``.
        input_cols: Columns to standardize.
        return_params: When True, also return the statistics that were used.

    Returns:
        The normalized frame, or ``(frame, params)`` when ``return_params`` is
        True. ``params`` holds the per-column mean and standard deviation as
        JSON strings under ``"mean_vector"`` and ``"std_vector"``.
    """
    cur_df_mean = cur_df[input_cols].mean(axis=0)
    cur_df_std = cur_df[input_cols].std(axis=0)

    # A constant column has std 0; divide by 1 there so it maps to 0.0 instead of NaN.
    # The reported std_vector keeps the true value (0).
    safe_std = cur_df_std.where(cur_df_std != 0, 1.0)

    # Work on a copy: writing into `cur_df` directly would overwrite the caller's data.
    out_df = cur_df.copy()
    out_df[input_cols] = (cur_df[input_cols] - cur_df_mean) / safe_std

    params = {
        "mean_vector": cur_df_mean.to_json(),
        "std_vector": cur_df_std.to_json(),
    }

    if return_params:
        return out_df, params
    return out_df
    
def renormalizeDF(cur_df, input_cols = ["mains_1", "mains_2"], params=None):
    """Undo the standardization: ``value * std + mean`` for each input column.

    The input frame is left untouched; a rescaled copy is returned.

    Args:
        cur_df: Frame containing the normalized ``input_cols``.
        input_cols: Columns to rescale.
        params: Dict with ``"mean_vector"`` and ``"std_vector"`` entries, each a
            JSON object string mapping column name to value (the format
            produced by ``Series.to_json()``).

    Returns:
        pandas.DataFrame: copy of ``cur_df`` with ``input_cols`` rescaled.

    Raises:
        TypeError: if ``params`` is not supplied.
    """
    if params is None:
        raise TypeError("renormalizeDF() requires `params` with 'mean_vector' and 'std_vector'")

    # Parse with the json module: handing a literal JSON string to pd.read_json is
    # deprecated in recent pandas. Reindexing aligns the statistics to `input_cols`.
    std_vector = pd.Series(json.loads(params["std_vector"]), dtype="float64").reindex(input_cols)
    mean_vector = pd.Series(json.loads(params["mean_vector"]), dtype="float64").reindex(input_cols)

    # Work on a copy so the caller's normalized frame is not overwritten.
    out_df = cur_df.copy()
    out_df[input_cols] = cur_df[input_cols] * std_vector + mean_vector
    return out_df

In [22]:
%%time
# Normalize every house separately and keep its statistics alongside.
# A copy is passed so the raw frames in `df` stay untouched.
normalization_params = {}
df_normalized = {}
for i in range(1, 7):
    df_normalized[i], normalization_params[i] = normalizeDF(df[i].copy())

CPU times: user 113 ms, sys: 46 ms, total: 159 ms
Wall time: 151 ms


In [23]:
# Preview the normalized frame for house 1.
df_normalized[1].head()

Unnamed: 0,mains_1,mains_2,kitchen_outlets_7,kitchen_outlets_8,kitchen_outlets_15,kitchen_outlets_16
2011-04-18 13:22:13,-0.012578,-0.087639,34.0,21.0,2.0,0.0
2011-04-18 13:22:16,-0.01021,-0.086793,34.0,21.0,2.0,0.0
2011-04-18 13:22:20,-0.009161,-0.087427,34.0,22.0,2.0,0.0
2011-04-18 13:22:23,-0.010845,-0.086864,35.0,21.0,1.0,0.0
2011-04-18 13:22:26,-0.010772,-0.087639,34.0,21.0,2.0,0.0


In [24]:
# Mean / std (JSON strings) that were used to normalize house 1.
normalization_params[1]

{'mean_vector': '{"mains_1":227.3531298494,"mains_2":156.1432875884}',
 'std_vector': '{"mains_1":409.6954279752,"mains_2":425.762416359}'}

In [25]:
out_base_dir = "../data/redd_processed/"

# Create the output directory if it doesn't exist.
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
os.makedirs(out_base_dir, exist_ok=True)

In [26]:
# Summary statistics of the normalized frame for house 1.
df_normalized[1].describe()

Unnamed: 0,mains_1,mains_2,kitchen_outlets_7,kitchen_outlets_8,kitchen_outlets_15,kitchen_outlets_16
count,406748.0,406748.0,406748.0,406748.0,406748.0,406748.0
mean,-4.658697e-15,-5.510896e-15,21.236535,28.331126,5.501908,2.028563
std,1.0,1.0,1.958581,16.799793,68.599415,54.460008
min,-0.4283258,-0.2804928,7.0,3.0,0.0,0.0
25%,-0.3300821,-0.2753491,20.0,21.0,1.0,0.0
50%,-0.2366957,-0.2688666,21.0,22.0,1.0,0.0
75%,0.01734672,-0.08801455,22.0,23.0,1.0,0.0
max,14.28868,14.40305,59.0,1550.0,1118.0,1585.0


### 2. Split into continuous segments and store:

In [31]:
def splitSegments_Store(cur_df, cur_path="", appliance="refrigerator", split_second_threshold=10800):
    """Split a frame into gap-free segments and write one CSV per segment.

    A new segment starts at every row whose index lies more than
    ``split_second_threshold`` seconds after the previous row. This is done
    once for every column whose name contains ``appliance``; that column is
    written under the name ``output`` next to ``mains_1`` and ``mains_2``.

    Rows are expected to be in chronological order with a datetime-like index.
    The input frame is not modified.

    Args:
        cur_df: Frame with ``mains_1``, ``mains_2`` and the appliance column(s).
        cur_path: Path template with one ``{}`` placeholder. It is filled with
            ``k * 100 + segment number``, where ``k`` is the 1-based position
            of the target column and segments are numbered from 1.
        appliance: Substring that selects the target column(s).
        split_second_threshold: Gap in seconds above which a new segment
            starts (default 10800 s = 3 h).

    Returns:
        None. Prints a message and returns early if no column matches.
    """
    target_col_list = [cur_col for cur_col in cur_df.columns if appliance in cur_col]

    if len(target_col_list) == 0:
        print("Column doesn't exist!")
        return

    # Segment id per row: running count of gaps larger than the threshold.
    # The first row has no predecessor (NaN gap), so it falls in segment 0.
    gap_seconds = cur_df.index.to_series().diff().dt.total_seconds()
    segment_ids = (gap_seconds > split_second_threshold).cumsum().to_numpy()

    cols = ["mains_1", "mains_2", "output"]
    for col_idx, target_col in enumerate(target_col_list, start=1):
        renamed_df = cur_df.rename(columns={target_col: "output"})[cols]
        # Grouping by segment id keeps every row, including the very last one,
        # which the former `< end_timestamp` range filter always dropped.
        for seg_id, cur_seg_df in renamed_df.groupby(segment_ids):
            cur_seg_path = cur_path.format(col_idx * 100 + int(seg_id) + 1)
            print("Writing to ", cur_seg_path)
            cur_seg_df.to_csv(cur_seg_path)

In [None]:
# Store the normalized data:

# train_house_list = [1, 2, 3, 4]
# test_house_list = [5, 6]
# appliance_list = ["kitchen_outlets"]

# out_base_dir = "../data/redd_processed/"
# for h_idx in train_house_list:
#     for cur_app in appliance_list:
#         cur_out_dir = os.path.join(out_base_dir, "original", "normalized", cur_app, "train")
        
#         if not os.path.exists(cur_out_dir):
#             os.makedirs(cur_out_dir)
        
#         cur_path = os.path.join(cur_out_dir, "h" + str(h_idx) + "_p{}.csv")
#         splitSegments_Store(df_normalized[h_idx].copy(), cur_path=cur_path, appliance=cur_app)
#         print("\n")
#     print("\n")


# for h_idx in test_house_list:
#     for cur_app in appliance_list:
#         cur_out_dir = os.path.join(out_base_dir, "original", "normalized", cur_app, "test")
        
#         if not os.path.exists(cur_out_dir):
#             os.makedirs(cur_out_dir)
        
#         cur_path = os.path.join(cur_out_dir, "h" + str(h_idx) + "_p{}.csv")
#         splitSegments_Store(df_normalized[h_idx].copy(), cur_path=cur_path, appliance=cur_app)
#         print("\n")
#     print("\n")

In [None]:
# # Store the raw data:

# train_house_list = [1, 2, 3, 4]
# test_house_list = [5, 6]
# appliance_list = ["kitchen_outlets", "stove"]

# out_base_dir = "../data/redd_processed/"
# for h_idx in train_house_list:
#     for cur_app in appliance_list:
#         cur_out_dir = os.path.join(out_base_dir, "original", "raw", cur_app, "train")
        
#         if not os.path.exists(cur_out_dir):
#             os.makedirs(cur_out_dir)
        
#         cur_path = os.path.join(cur_out_dir, "h" + str(h_idx) + "_p{}.csv")
#         splitSegments_Store(df[h_idx].copy(), cur_path=cur_path, appliance=cur_app)
#         print("\n")
#     print("\n")


# for h_idx in test_house_list:
#     for cur_app in appliance_list:
#         cur_out_dir = os.path.join(out_base_dir, "original", "raw", cur_app, "test")
        
#         if not os.path.exists(cur_out_dir):
#             os.makedirs(cur_out_dir)
        
#         cur_path = os.path.join(cur_out_dir, "h" + str(h_idx) + "_p{}.csv")
#         splitSegments_Store(df[h_idx].copy(), cur_path=cur_path, appliance=cur_app)
#         print("\n")
#     print("\n")

## Train/test split:

### 1. Create windows of data points with a given window size.

In [112]:
def createSamples(cur_df, cur_window_segment=33, target_col="dishwaser", causal=False):
    """Turn a time series frame into one row per sliding window.

    For every row ``idx`` that has a complete window, the sample holds the
    row's ``time_stamp``, the windowed values of ``mains_1`` followed by those
    of ``mains_2``, and the target value at ``idx`` as ``output``. Values are
    rounded to 4 decimals first.

    * ``causal=True``: window is rows ``idx - w + 1 .. idx`` (``w`` values per input).
    * ``causal=False``: window is rows ``idx - w + 1 .. idx + w - 1``
      (``2 * w - 1`` values per input),

    where ``w`` is ``cur_window_segment``. Windows whose combined input values
    contain at most two distinct values are dropped.

    Args:
        cur_df: Frame with ``time_stamp``, ``mains_1``, ``mains_2`` and a target column.
        cur_window_segment: Window parameter ``w`` described above.
        target_col: Substring identifying the target column; the first column
            whose name contains it is used.
        causal: Whether to use only past and current rows.

    Returns:
        pandas.DataFrame with columns ``time_stamp``, the ``<input>_prev_k`` /
        ``<input>`` / ``<input>_next_k`` columns for each input, and ``output``.

    Raises:
        ValueError: if no column matches ``target_col`` or an input column is missing.
        KeyError: if ``time_stamp`` is missing.
    """
    input_cols = ["mains_1", "mains_2"]
    cur_df_cols = list(cur_df.columns)

    # Fail early and clearly. Previously a missing target only surfaced as an index
    # error inside the loop, or not at all when every window was skipped.
    target_name = next((cur_col for cur_col in cur_df_cols if target_col in cur_col), None)
    if target_name is None:
        raise ValueError("No column containing '{}' found in {}".format(target_col, cur_df_cols))
    for inp_col in input_cols:
        if inp_col not in cur_df_cols:
            raise ValueError("Required input column '{}' not found in {}".format(inp_col, cur_df_cols))

    # Plain Python lists: positional access regardless of the frame's index, and
    # no conversion of the whole (mixed-type) frame into an object array.
    timestamps = cur_df["time_stamp"].tolist()
    rounded_df = cur_df.round(4)
    input_values = [rounded_df[inp_col].tolist() for inp_col in input_cols]
    target_values = rounded_df[target_name].tolist()

    # Column layout: oldest previous value first, then the current value, then future values.
    x_cols = ["time_stamp"]
    for inp_col in input_cols:
        prev_list = [inp_col + "_prev_" + str(k) for k in range(cur_window_segment - 1, 0, -1)]
        next_list = []
        if not causal:
            next_list = [inp_col + "_next_" + str(k) for k in range(1, cur_window_segment)]
        x_cols += prev_list + [inp_col] + next_list
    cols = x_cols + ["output"]

    data = []
    num_records = len(timestamps)
    for idx in range(num_records):
        cur_start = idx - cur_window_segment + 1
        cur_end = idx + 1 if causal else idx + cur_window_segment

        # Skip rows whose window would reach outside the data.
        if cur_start < 0 or cur_end > num_records:
            continue

        cur_input = []
        for values in input_values:
            cur_input += values[cur_start:cur_end]

        # Filter the samples containing all same values for mains_1 window and mains_2 window (so only 2 unique values):
        if len(np.unique(cur_input)) <= 2:
            continue

        data.append([timestamps[idx]] + cur_input + [target_values[idx]])

    return pd.DataFrame(data=data, columns=cols)

def _collect_house_samples(data_dir, house_list, appliance, window_segment, causal):
    """Build the windowed samples of each house in `house_list` and stack them into one frame (None if every house was skipped)."""
    frames = []
    for cur_house_idx in tqdm(house_list):
        try:
            house_data_path = os.path.join(data_dir, "house_{}_data.csv".format(cur_house_idx))
            cur_df = pd.read_csv(house_data_path)
            c_df = createSamples(cur_df, cur_window_segment=window_segment, target_col=appliance, causal=causal)
            c_df["house_idx"] = cur_house_idx
            frames.append(c_df)
        except Exception as e:
            # Best effort: a house that cannot be processed is skipped, but the
            # reason is printed so real errors are not mistaken for "appliance absent".
            print("Skipped House =", cur_house_idx, " for appliance =", appliance, "| reason:", repr(e))

    if not frames:
        return None
    # One concat at the end replaces repeated DataFrame.append, which pandas 2.0 removed;
    # its AttributeError used to be swallowed above, dropping every house after the first.
    return pd.concat(frames, ignore_index=True)


def create_train_test_split(data_dir, out_dir, window_segment = 3, train_house_list = [1, 2, 3, 4], test_house_list = [5, 6]):
    """Create windowed train/test CSVs for the refrigerator and dishwasher targets.

    For each appliance, ``house_<n>_data.csv`` is read from ``data_dir`` for
    every listed house, converted with ``createSamples`` (non-causal windows)
    and tagged with a ``house_idx`` column. The stacked result is written,
    indexed by ``time_stamp``, to
    ``<out_dir>/window_<window_segment>/<appliance>/train.csv`` and ``test.csv``.

    Houses that fail to load or convert are skipped with a printed message. If
    no house yields data for a split, that file is not written.

    Args:
        data_dir: Directory containing the ``house_<n>_data.csv`` files.
        out_dir: Root directory for the generated datasets.
        window_segment: Window parameter forwarded to ``createSamples``.
        train_house_list: House indices used for ``train.csv``.
        test_house_list: House indices used for ``test.csv``.

    Returns:
        None.
    """
    # Note: Window segment should be of the form 2^n + 1
    appliance_list = ["refrigerator", "dishwaser"]
    causal_type = False

    for cur_appliance in appliance_list:
        print("Appliance =", cur_appliance)

        out_app_folder = os.path.join(out_dir, "window_{}".format(window_segment), cur_appliance)
        os.makedirs(out_app_folder, exist_ok=True)

        for split_name, house_list in (("train", train_house_list), ("test", test_house_list)):
            df_split = _collect_house_samples(data_dir, house_list, cur_appliance, window_segment, causal_type)

            out_path = os.path.join(out_app_folder, split_name + ".csv")
            if df_split is None:
                # Previously this case crashed on `None.set_index`.
                print("No data for appliance =", cur_appliance, "- nothing written to", out_path)
                continue

            # Write the output to folder
            print("Writing data to ", out_path)
            df_split.set_index("time_stamp").to_csv(out_path)

In [113]:
# Folder passed as `data_dir` below; create_train_test_split reads `house_<n>_data.csv` from it.
inp_dir = "../data/redd_processed/low_freq/"
# Root folder below which create_train_test_split writes `window_<w>/<appliance>/{train,test}.csv`.
out_dir = "../data/redd_processed/"

In [None]:
# Sample data: a small run with window 3, house 1 for training and house 5 for testing.
create_train_test_split(inp_dir, out_dir, window_segment = 3, train_house_list = [1], test_house_list = [5])

In [None]:
%%time
# Full run: one dataset per window size, houses 1-4 for training and 5-6 for testing.
window_seg_list = [33, 65, 129, 257]
for cur_seg in window_seg_list:
    create_train_test_split(inp_dir, out_dir, window_segment = cur_seg, train_house_list = [1, 2, 3, 4], test_house_list = [5, 6])