In [1]:
import os
import pandas as pd
import numpy as np
import os
import psutil 
from datetime import datetime
import random


label = {'1':['Timestamp', 
                     'Fridge',
                     'Freezer(1)',
                     'Freezer(2)',
                     'Washer Dryer',
                     'Washing Machine',
                    'Dishwasher',
                     'Computer',
                    'Television Site',
                    'Electric Heater' ],
                 '2':['Timestamp', 
                    'Fridge-Freezer',
                    'Washing Machine',
                    'Dishwasher',
                    'Television Site',
                     'Microwave',
                    'Toaster',
                    'Hi-Fi',
                    'Kettle',
                    'Overhead Fan'],
                }




def read_channel(filename, app_name):
    """Method to read home channel data from .dat file into panda dataframe
        Args:
                filename: path to a specific channel_(m) from house(n)
        return:
                [pandas.Dataframe] of a signle channel_(m) from house(n)
    """
    channel_to_read = pd.read_feather(filename)[['Timestamp',app_name]]
    return channel_to_read


def resample_data(main, meter):
    '''Convernt time column into index and downsampling aggregate data 
    from 1s data to ~6s data as it was sampled for each individual appliance

    Attributes
    ----------
    main :  Aggregate dataframe
    meter : appliance dataframe

    Return : 
       [pandas.Dataframe] main, meter - so it can be  possibly for some other transformations)

    '''
    main.index = main['Timestamp']
    meter.index = meter['Timestamp']

    del meter['Timestamp']
    del main['Timestamp']
    
    main = main.sort_index()
    meter = meter.sort_index()

    meter = meter[meter.index.isin(main.index)]
    main = main[main.index.isin(meter.index)]
    
    return main, meter

def generate_clean_data(path, appliance, window_size, threshold, proportion=[1,1],  test=False, test_on='All'):
    
    activation_proportion = proportion[0]
    non_activation_proportion = proportion[1]
    aggregate_channels = []
    individual_channels = []
    aggregate_channels_test = []
    individual_channels_test = []
    activations = []
    non_activations = []
    
    houses = []
    for key, values in label.items():
        if app_name in values:
            houses.append(key)

    for house in houses:
        csv_filename = path + "house_"+house
        #print(csv_filename)
        iam = read_channel(filename=csv_filename, app_name=appliance)
        aggregate = read_channel(filename=csv_filename, app_name="Aggregate")
        print("Reading house_: {}".format(house))
        aggregate, iam = resample_data(aggregate, iam)
        aggregate, iam = np.array(aggregate['Aggregate']), np.array(iam[appliance])
        
    if test:
            split = round(len(aggregate) * 0.8)
            aggregate_test = aggregate[split:]
            iam_test = iam[split:]
            if test_on == 'All':
                aggregate_channels_test.append(aggregate_test)
                individual_channels_test.append(iam_test)
                aggregate = aggregate[:split]
                iam = iam[:split]
            elif test_on == house:
                aggregate_channels_test.append(aggregate_test)
                individual_channels_test.append(iam_test)
                aggregate = aggregate[:split]
                iam = iam[:split]
                
    aggregate_channels.append(aggregate) #appending the aggregate to aggregate list and iam to iam list so that their indices match
    individual_channels.append(iam)    
    
    for channel in individual_channels: #iterating through frigde power usage in each house
        activations_for_house = [] #buffer list to fill all activations detected in iam
        non_activations_for_house = []
        non_activation_samples = 0
        #print("Creating activations for house : {}".format(channel))
        for i in range(len(channel)):
            start = 0
            end = 0
            if channel[i] > threshold: #if power is above threshold power that may possibly be an activation
                if non_activation_samples > window_size:
                    non_activations_for_house.append([i - non_activation_samples, i-1])
                non_activation_samples = 0
                start = i
                while channel[i] > threshold and i < len(channel) - 1:
                    i += 1 #increasing index indicator until it reaches the value below threshold
                end = i
                activation = [start, end]
                activations_for_house.append(activation) #appending activation start and end time to buffer of activations for house
            else:
                non_activation_samples +=1
        activations.append(activations_for_house) #appending whole bufer to list of activations of specific appliance in all houses used for loading activations
        non_activations.append(non_activations_for_house)
        
    agg, iam = [], []
    
    for i in range(len(aggregate_channels)):
        #iterating through aggregate data of each house
        print('Number of activations in this channel: ', len(activations[i]))
        print('Number of non-activations in this channel: ', len(non_activations[i]))
        agg_windows, iam_windows = create_overlap_windows(aggregate_channels[i], individual_channels[i], window_size, stride=2)
        agg.extend(agg_windows)
        iam.extend(iam_windows)
        for start, end in activations[i]:
            #then iterating through activation positions in specified house [i]
            for j in range(activation_proportion):
                #randomly generate windows #n times with one activation
                activation_size = end - start
                #randomly positioning activation in window
                if (window_size - activation_size)> 0:
                    start_aggregate = start - np.random.randint(0, (window_size - activation_size))
                
                    #if start_aggregate + window_size < len(aggregate_channels[i]):
                    agg_buff, iam_buff = aggregate_channels[i][start_aggregate: start_aggregate + window_size], individual_channels[i][start_aggregate: start_aggregate + window_size]
                    agg_buff, iam_buff = np.copy(agg_buff), np.copy(iam_buff)
                    agg.append(agg_buff)
                    iam.append(iam_buff)

        for start, end in non_activations[i]:
            for j in range(non_activation_proportion):
                if (end - window_size)>0:
                    window_start = np.random.randint(start, (end - window_size))
                    agg_buff, iam_buff = aggregate_channels[i][window_start: window_start + window_size], individual_channels[i][window_start: window_start + window_size]
                    agg_buff, iam_buff = np.copy(agg_buff), np.copy(iam_buff)
                    agg.append(agg_buff)
                    iam.append(iam_buff)

    zipper = list(zip(agg, iam))
    random.shuffle(zipper)
    agg, iam = zip(*zipper)
    agg, iam = np.array(agg), np.array(iam)
    dataset = [agg, iam]


    #Creating test set if test==True
    agg_test = []
    iam_test = []
    isFirst = True
    if test:
        for i in range(len(aggregate_channels_test)):
            agg_buff_test, iam_buff_test = create_windows(aggregate_channels_test[i], individual_channels_test[i], window_size=window_size)
            if isFirst:
                agg_test = agg_buff_test
                iam_test = iam_buff_test
                isFirst = False
            else:
                print(agg_test)
                print(agg_buff_test)
                agg_test = np.concatenate((agg_test, agg_buff_test), axis=0)
                iam_test = np.concatenate((iam_test, iam_buff_test), axis=0)
        testset = [agg_test, iam_test]
        return dataset, testset
    print("Finish finding activation with {}".format(len(dataset)))
    return dataset
    

    
    
def create_overlap_windows(agg, iam, window_size, stride = 10):
    position = 0
    agg_windows = []
    iam_windows = []
    while position < len(agg) - window_size -1:
        agg_buffer = agg[position: position + window_size]
        iam_buffer = iam[position: position + window_size]
        agg_windows.append(agg_buffer)
        iam_windows.append(iam_buffer)
        position += stride
    return agg_windows, iam_windows


    
               

In [2]:
path="/home/ibcn079/data/REFIT/house_1"
app_name = "Fridge"
df = read_channel(path, app_name)

In [3]:
import timeit

In [5]:
start = timeit.default_timer()
path="/home/ibcn079/data/REFIT/"
app_name = "Washing Machine"
dataset=generate_clean_data(path, appliance=app_name, window_size=500, threshold=10, proportion=[1,1],  test=False, test_on='All')
stop = timeit.default_timer()
print( stop - start )

Reading house_: 1
Reading house_: 2
Number of activations in this channel:  239399
Number of non-activations in this channel:  340
Finish finding activation with 2
22.748541730998113


In [10]:
aggregate=dataset[0]
meter = dataset[1]

In [None]:
p