In [1]:
!pip install tqdm
import csv
import os
import datetime
import pandas as pd
from tqdm import tqdm
import numpy as np
import pickle
import matplotlib.pyplot as plt 

# Sensor id -> appliance / circuit name, as used to label the readings
device_dict = {
    2: 'Dishwasher',
    3: 'Treadmill',
    4: 'Car charger',
    5: 'Microwave',
    6: 'Fridge',
    7: 'Air exchanger',
    8: 'Heat bedroom #1',
    9: 'Heat bedroom #2',
    10: 'Heat kitchen',
    11: 'Heat living room',
    12: 'Heat bedroom #3',
    13: 'Heat dining room',
    14: 'Water heater',
    15: 'Office outlet #3',
    16: 'Heat basement',
    17: 'Office outlet #2',
    18: 'Basement bathroom',
    19: 'Office outlet #1',
    20: 'Heat garage',
    21: 'Garage/Basement outlets',
    22: 'Oven',
    23: 'Outdoor outlets',
    24: 'Whirlpool bath',
    25: 'Powder room outlets',
    26: 'Lights Powder room/kitchen',
    27: 'Lights bedroom #1 and #3',
    28: 'Basement outlets',
    29: 'Living/dining room outlets',
    30: 'Bedroom #2 outlets',
    31: 'Secondary outlets',
    32: 'Washing machine',
    33: 'Air Conditioning',
    34: 'Basement/powder room outlets',
    35: 'Kitchen outlet #1',
    36: 'Kitchen outlet #2',
    37: 'Kitchen outlet #3',
    38: 'Kitchen outlet #4',
    39: 'Dryer',
    40: 'Towel dryer',
}



In [3]:
def process_file(filename, device_dict):
    """Parse one sensor log file into per-appliance 7-second power series.

    Each usable line has the form ``<time>;<sensor id>;<value>``. The date is
    taken from the file name (the part of the base name before the first
    ``.``) and prepended to ``<time>`` to build the timestamp of a reading.

    Rows are skipped, instead of aborting the whole file, when they are blank,
    do not have exactly three non-empty fields, carry a non-numeric sensor id
    or value, or reference a sensor id that is missing from ``device_dict``.

    Args:
        filename: Path of the log file to read.
        device_dict: Mapping of integer sensor id to appliance name.

    Returns:
        dict mapping the lower-cased appliance name to a single-column
        DataFrame (the column is named after the appliance) resampled to a
        7-second grid and indexed by a second-resolution PeriodIndex.
        Appliances with fewer than two samples, or with no sample greater
        than 20, are left out.
    """
    # One {timestamp string: value} mapping per appliance (lower-cased name)
    power_dict = {name.lower(): {} for name in device_dict.values()}

    # The file name (without extension) supplies the date part of every timestamp
    date_str = os.path.basename(filename).split(".")[0]

    with open(filename, errors='ignore') as csv_file:
        # Strip NUL characters lazily, line by line, before csv parsing
        cleaned_lines = (line.replace('\x00', '') for line in csv_file)

        for row in csv.reader(cleaned_lines):
            # csv.reader yields [] for a blank line (e.g. one that held only NULs)
            if not row:
                continue

            parts = row[0].split(";")
            if len(parts) != 3 or not all(parts):
                continue

            time_str, sensor_field, value_field = parts
            try:
                # int() accepts leading zeros itself; stripping them first turned
                # an all-zero id such as "00" into "" and raised ValueError.
                sensor_number = int(sensor_field)
                # Dots are removed before conversion, so "12.5" is read as 125.
                # NOTE(review): confirm this matches the unit the logger writes.
                sensor_value = int(value_field.replace(".", ""))
            except ValueError:
                # Garbled numeric field: treat like any other malformed row
                continue

            device_name = device_dict.get(sensor_number)
            if device_name is None:
                # Sensor id not described in device_dict
                continue

            timestamp = date_str + ' ' + time_str
            power_dict[device_name.lower()][timestamp] = sensor_value

    processed_data = {}
    for appliance, readings in power_dict.items():
        if not readings:
            # Nothing recorded for this appliance in this file
            continue

        frame = pd.DataFrame.from_dict(readings, orient='index', columns=[appliance])
        frame.index = pd.to_datetime(frame.index)
        # Forward-fill resampling needs a monotonic index
        frame = frame.sort_index()

        # Resample to a 7-second grid, forward-filling (limit=1) and setting
        # whatever is still missing to 0
        frame = frame.resample('7s').ffill(limit=1).fillna(0)
        # Convert the index to second-resolution periods
        frame.index = frame.index.to_period('s')

        # Keep only appliances with at least 2 samples and some value above 20
        values = frame.to_numpy()
        if len(values) < 2 or not np.any(values > 20):
            continue

        processed_data[appliance] = frame

    return processed_data


In [4]:
# Get the list of log files in the path. os.listdir returns names in arbitrary
# order and may include sub-directories (e.g. .ipynb_checkpoints), so keep
# regular files only and sort them for a reproducible order.
path = 'HES_july'
files = sorted(name for name in os.listdir(path)
               if os.path.isfile(os.path.join(path, name)))

# Process every file exactly once and store the per-file results in a list
samples = [process_file(os.path.join(path, filename), device_dict) for filename in tqdm(files)]

# process_file leaves out appliances that fail its per-file checks, so the key
# set can differ between files: gather every appliance seen in any file
# (first-seen order) instead of trusting the first file only.
keys = list(dict.fromkeys(key for sample in samples for key in sample))

# Create dictionary where each key maps to the list of that appliance's frames,
# taken from the files that actually contain it
processed_data = {key: [sample[key] for sample in samples if key in sample] for key in keys}

100%|██████████| 29/29 [02:07<00:00,  4.38s/it]


In [7]:
# Serialize processed_data into 'HES_PROCESSED.pickle' with the highest pickle protocol
with open('HES_PROCESSED.pickle', mode='wb') as handle:
    pickle.dump(
        processed_data,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )