In [1]:
# Converter for the REFIT release; the same recipe works for the other nilmtk
# dataset converters.
# Only change this to code if you want to convert everything again
from nilmtk.dataset_converters import convert_refit

# Folder holding the processed REFIT CSV files (converter input)
source_dir = 'C:\\Users\\nicho\\OneDrive - The University of Western Ontario\\Ecolux\\Databases\\REFIT\\Processed_Data_CSV'
# Despite its name this is the path of the .h5 FILE handed to the converter,
# not a directory; later cells open it with DataSet(out_dir)
out_dir = 'C:\\Users\\nicho\\OneDrive - The University of Western Ontario\\Ecolux\\Databases\\REFIT\\refit.h5'

convert_refit(source_dir, out_dir)

In [2]:
from nilmtk import DataSet
from nilmtk.utils import print_dict

# Open the .h5 store whose path is held in out_dir (set in the conversion cell)
# and print the dataset-level metadata it carries. `refit` is the handle every
# later cell reads from.
refit = DataSet(out_dir)
print_dict(refit.metadata)

In [3]:
# Print the dataset's `buildings` mapping (keyed by house id; later cells read
# each entry's `.elec` meter group).
print_dict(refit.buildings)

In [4]:
import pandas as pd


def _elapsed_in_hour(ts):
    """Return the time elapsed since the top of ts's hour (down to microseconds)."""
    return pd.Timedelta(minutes=ts.minute, seconds=ts.second, microseconds=ts.microsecond)


# house_id -> (start, end) of the whole-hour window covered by the site meter
date_ranges = {}

for house_id, building in refit.buildings.items():
    # Whole-house (site) meter of this building
    site_meter = building.elec.mains()

    # First and last sample timestamps, expressed in UTC
    first_sample = pd.to_datetime(site_meter.get_timeframe().start).tz_convert('UTC')
    last_sample = pd.to_datetime(site_meter.get_timeframe().end).tz_convert('UTC')

    # Start: top of the hour after the first sample's hour.
    # End: top of the hour containing the last sample.
    start = first_sample + pd.Timedelta(hours=1) - _elapsed_in_hour(first_sample)
    end = last_sample - _elapsed_in_hour(last_sample)

    date_ranges[house_id] = (start, end)

# Ordered by house id so downstream loops and the printout run 1, 2, 3, ...
sorted_date_ranges = sorted(date_ranges.items())

# Report the window found for every house
for house_id, (start, end) in sorted_date_ranges:
    print(f"Building {house_id}: {start} to {end}")

Building 1: 2013-10-09 14:00:00+00:00 to 2014-01-02 16:00:00+00:00
Building 2: 2013-09-17 23:00:00+00:00 to 2015-05-28 08:00:00+00:00
Building 3: 2013-09-25 20:00:00+00:00 to 2015-06-02 10:00:00+00:00
Building 4: 2013-10-11 11:00:00+00:00 to 2015-07-07 09:00:00+00:00
Building 5: 2013-09-26 10:00:00+00:00 to 2015-07-06 17:00:00+00:00
Building 6: 2013-11-28 13:00:00+00:00 to 2015-06-28 22:00:00+00:00
Building 7: 2013-11-01 23:00:00+00:00 to 2015-07-08 02:00:00+00:00
Building 8: 2013-11-01 23:00:00+00:00 to 2015-05-10 23:00:00+00:00
Building 9: 2013-12-17 18:00:00+00:00 to 2015-07-08 18:00:00+00:00
Building 10: 2013-11-20 12:00:00+00:00 to 2015-06-30 11:00:00+00:00
Building 11: 2014-06-03 12:00:00+00:00 to 2015-06-30 15:00:00+00:00
Building 12: 2014-03-07 11:00:00+00:00 to 2015-07-08 02:00:00+00:00
Building 13: 2014-01-17 23:00:00+00:00 to 2015-05-31 11:00:00+00:00
Building 14: 2013-12-17 18:00:00+00:00 to 2015-07-08 02:00:00+00:00
Building 15: 2014-01-10 11:00:00+00:00 to 2015-07-08 02:0

In [5]:
# Display the meter group of house 1: its site meter plus one ElecMeter per
# monitored appliance (rendered by the notebook as the cell's last expression).
refit.buildings[1].elec

MeterGroup(meters=
  ElecMeter(instance=1, building=1, dataset='REFIT', site_meter, appliances=[])
  ElecMeter(instance=2, building=1, dataset='REFIT', appliances=[Appliance(type='fridge', instance=1)])
  ElecMeter(instance=3, building=1, dataset='REFIT', appliances=[Appliance(type='freezer', instance=1)])
  ElecMeter(instance=4, building=1, dataset='REFIT', appliances=[Appliance(type='freezer', instance=2)])
  ElecMeter(instance=5, building=1, dataset='REFIT', appliances=[Appliance(type='washer dryer', instance=1)])
  ElecMeter(instance=6, building=1, dataset='REFIT', appliances=[Appliance(type='washing machine', instance=1)])
  ElecMeter(instance=7, building=1, dataset='REFIT', appliances=[Appliance(type='dish washer', instance=1)])
  ElecMeter(instance=8, building=1, dataset='REFIT', appliances=[Appliance(type='computer', instance=1)])
  ElecMeter(instance=9, building=1, dataset='REFIT', appliances=[Appliance(type='television', instance=1)])
  ElecMeter(instance=10, building=1, data

In [6]:
# List the appliance type behind every submeter of house 1
house1_submeters = refit.buildings[1].elec.submeters()
for submeter in house1_submeters.meters:
    for attached in submeter.appliances:
        # `type` may be absent from the metadata, in which case None is printed
        print(attached.metadata.get('type'))

fridge
freezer
freezer
washer dryer
washing machine
dish washer
computer
television
electric space heater


In [7]:
# Show the site (mains) meter of every building, in the dataset's own order
for building in refit.buildings.values():
    print(building.elec.mains())

ElecMeter(instance=1, building=1, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=10, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=11, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=12, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=13, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=14, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=15, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=16, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=17, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=18, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=19, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=2, dataset='REFIT', site_meter, appliances=[])
ElecMeter(instance=1, building=20, dataset='REFIT', si

In [8]:
# appliance type -> list of (house, appliance number) tuples.
# The appliance number is the 1-based position of the meter among the house's
# submeters, not the ElecMeter instance id.
device_types = {}

for house_id, building in refit.buildings.items():
    submeters = building.elec.submeters().meters
    for num, meter in enumerate(submeters, start=1):
        for appliance in meter.appliances:
            appliance_type = appliance.metadata.get('type')

            # Appliances without a declared type cannot be grouped
            if appliance_type is None:
                print('Nothing there')
                continue

            device_types.setdefault(appliance_type, []).append((meter.building(), num))

# Print out the device types and where each of them was found
for appliance_type, house in device_types.items():
    print(f"{appliance_type}: {house}")

fridge: [(1, 1), (11, 1), (17, 1), (19, 1), (4, 1), (7, 1), (8, 1)]
freezer: [(1, 2), (1, 3), (10, 3), (13, 2), (16, 1), (17, 2), (19, 2), (3, 3), (4, 2), (6, 1), (7, 2), (7, 3), (8, 2)]
washer dryer: [(1, 4), (17, 4), (8, 3), (9, 2)]
washing machine: [(1, 5), (10, 5), (11, 3), (13, 3), (14, 3), (15, 5), (16, 4), (17, 5), (18, 2), (19, 4), (2, 2), (20, 3), (3, 6), (4, 5), (4, 6), (5, 3), (6, 2), (7, 5), (8, 4), (9, 3)]
dish washer: [(1, 6), (10, 6), (11, 4), (13, 4), (14, 4), (15, 6), (17, 6), (19, 5), (2, 3), (20, 4), (3, 5), (5, 4), (6, 3), (7, 6), (9, 4)]
computer: [(1, 7), (11, 5), (12, 4), (14, 5), (15, 7), (16, 5), (17, 7), (19, 6), (5, 5), (6, 4), (6, 9), (8, 6)]
television: [(1, 8), (10, 7), (12, 8), (13, 1), (14, 6), (15, 8), (16, 6), (16, 9), (17, 8), (18, 3), (19, 7), (2, 4), (20, 6), (3, 7), (4, 7), (5, 6), (6, 5), (7, 7), (8, 7), (9, 5)]
electric space heater: [(1, 9), (15, 3), (15, 4), (9, 9)]
food processor: [(10, 1), (10, 9), (20, 5)]
toaster: [(10, 2), (12, 7), (14, 9)

In [9]:
# Group the appliances into load categories. Possibly separate by water
# consumption after.
# Any type that is not listed here is treated as intermittent.
# Add more if you notice more above in different datasets
always_on_types = {'fridge', 'freezer', 'fridge freezer', 'broadband router', 'pond pump'}

# category -> {house: set of appliance numbers}
condensed_types = {'AlwaysOn': {}, 'Intermit': {}}

for appliance_type, locations in device_types.items():
    category = 'AlwaysOn' if appliance_type in always_on_types else 'Intermit'
    per_house = condensed_types[category]
    for house, num in locations:
        per_house.setdefault(house, set()).add(num)

# Print out the separated categories
for cat, info in condensed_types.items():
    print(f"{cat}: {info}")

AlwaysOn: {1: {1, 2, 3}, 11: {8, 1, 2}, 17: {1, 2, 3}, 19: {1, 2}, 4: {1, 2, 3}, 7: {1, 2, 3}, 8: {1, 2}, 10: {3, 4}, 13: {2, 6}, 16: {1, 2}, 3: {2, 3}, 6: {1}, 12: {1}, 14: {1}, 15: {1, 2}, 18: {1}, 2: {1}, 20: {1, 9}, 5: {1}, 9: {1}}
Intermit: {1: {4, 5, 6, 7, 8, 9}, 17: {4, 5, 6, 7, 8, 9}, 8: {3, 4, 5, 6, 7, 8, 9}, 9: {2, 3, 4, 5, 6, 7, 8, 9}, 10: {1, 2, 5, 6, 7, 8, 9}, 11: {3, 4, 5, 6, 7, 9}, 13: {1, 3, 4, 5, 7, 8, 9}, 14: {2, 3, 4, 5, 6, 7, 8, 9}, 15: {3, 4, 5, 6, 7, 8, 9}, 16: {3, 4, 5, 6, 7, 8, 9}, 18: {2, 3, 4, 5, 6, 7, 8, 9}, 19: {3, 4, 5, 6, 7, 8, 9}, 2: {2, 3, 4, 5, 6, 7, 8, 9}, 20: {2, 3, 4, 5, 6, 7, 8}, 3: {1, 4, 5, 6, 7, 8, 9}, 4: {4, 5, 6, 7, 8, 9}, 5: {2, 3, 4, 5, 6, 7, 8, 9}, 6: {2, 3, 4, 5, 6, 7, 8, 9}, 7: {4, 5, 6, 7, 8, 9}, 12: {2, 3, 4, 5, 6, 7, 8, 9}}


In [30]:
# Row helper for DataFrame.apply(axis=1): picks the rank-th largest reading
def process_row(row, rank=None):
    """Return the ``rank``-th largest value of ``row``.

    Parameters
    ----------
    row : pandas.Series
        One row of meter readings.
    rank : int, optional
        1 selects the largest value, 2 the second largest, and so on.
        When omitted, the notebook-level variable ``k`` is used, which keeps
        existing ``apply(process_row, axis=1)`` calls working unchanged.

    Raises
    ------
    NameError
        If ``rank`` is omitted and no global ``k`` has been defined yet.
    """
    if rank is None:
        # Backward compatibility: the correction loop sets `k` at notebook scope
        rank = k
    return row.nlargest(rank).iloc[-1]

In [31]:
# house_id -> mains readings after the correction below
corrected_mains = {}

# I want to prevent mismeasurements by adding the W of any column:
# wherever the submeters together read more than the mains, the mains reading
# is raised by the largest submeter value of that row, then by the second
# largest, and so on until no such row is left.
for house_id in sorted(refit.buildings):
    elec = refit.buildings[house_id].elec
    mains = elec.mains()
    submeters = elec.submeters().meters

    # Only the first chunk yielded by load() is used, for mains and submeters alike
    mains_df = pd.DataFrame(next(mains.load()))

    # Collect the frames first and join them once; concatenating inside the loop
    # re-copies everything gathered so far for every additional meter
    meter_frames = [pd.DataFrame(next(meter.load())) for meter in submeters]
    all_meters_df = pd.concat(meter_frames, axis=1) if meter_frames else pd.DataFrame()

    # Sum of all submeters for each row (computed once; only mains_df changes below)
    sum_of_meters = all_meters_df.sum(axis=1)

    # Rare case where sum is still less so get the highest val iteratively.
    # NOTE: `k` has to stay a notebook-level name, process_row() reads it.
    for k in range(1, len(submeters) + 1):
        condition = sum_of_meters > mains_df.squeeze()  # Assuming mains_df is a single column DataFrame
        print(f"Total invalid rows: {condition.sum()}")

        # No offending row left, so the next highest meter need not be checked
        if not condition.any():
            break

        print(f"Found incorrect mains sum for house {house_id} (iter {k})")

        # This is to take advantage of the added speed of max
        if k == 1:
            increment = all_meters_df[condition].max(axis=1)
        else:
            increment = all_meters_df[condition].apply(process_row, axis=1)

        # Reshape to a column so it lines up with the selected mains rows
        mains_df[condition] += increment.values.reshape(-1, 1)

    # Add to corrected
    corrected_mains[house_id] = mains_df
    print(f"Done checking all of house {house_id}")

print('Finished checking all houses')
        

Total invalid rows: 23850
Found incorrect mains sum for house 1 (iter 1)
Total invalid rows: 224
Found incorrect mains sum for house 1 (iter 2)
Total invalid rows: 14
Found incorrect mains sum for house 1 (iter 3)
Total invalid rows: 3
Found incorrect mains sum for house 1 (iter 4)
Total invalid rows: 0
Done checking all of house 1
Total invalid rows: 28444
Found incorrect mains sum for house 2 (iter 1)
Total invalid rows: 124
Found incorrect mains sum for house 2 (iter 2)
Total invalid rows: 9
Found incorrect mains sum for house 2 (iter 3)
Total invalid rows: 0
Done checking all of house 2
Total invalid rows: 408627
Found incorrect mains sum for house 3 (iter 1)
Total invalid rows: 35960
Found incorrect mains sum for house 3 (iter 2)
Total invalid rows: 4575
Found incorrect mains sum for house 3 (iter 3)
Total invalid rows: 550
Found incorrect mains sum for house 3 (iter 4)
Total invalid rows: 3
Found incorrect mains sum for house 3 (iter 5)
Total invalid rows: 0
Done checking all of 

In [33]:
import os

# Directory where you want to save the CSV files
csv_dir = 'C:\\Users\\nicho\\OneDrive - The University of Western Ontario\\Ecolux\\Databases\\REFIT\\Hourly CSV\\'
os.makedirs(csv_dir, exist_ok=True)

# A 30-second mean power (W) lasts 30 s = 1/120 h, so W * (1/120) is that slot's Wh
HOURS_PER_SLOT = 1 / 120


def _hourly_energy(power):
    """Turn a power time series (W) into hourly energy (Wh), rounded to 3 decimals.

    The readings are averaged over 30-second slots, each slot is converted to
    Wh, and the slots are summed per hour.
    """
    slot_energy = power.resample('30S').mean() * HOURS_PER_SLOT
    return slot_energy.resample('H').sum().round(3)


# The stored (start, end) hour bounds are not applied here; only the house order is used
for house_id, _date_range in sorted_date_ranges:
    # Access electricity data for the house
    elec = refit.buildings[house_id].elec

    # Hourly whole-house energy from the corrected mains readings
    mains_df = _hourly_energy(corrected_mains[house_id])

    # Running hourly total for each category
    cat_df = {'AlwaysOn': pd.DataFrame(), 'Intermit': pd.DataFrame()}

    # Go through each meter
    for num, meter in enumerate(elec.submeters().meters, start=1):
        print(f"Processing appiance {num} for house {house_id}")

        # Finding the category. It is reset for every meter so that a meter
        # missing from condensed_types cannot inherit the previous meter's
        # category. If a meter is listed under both, the last one wins.
        meter_cat = None
        for cat, house in condensed_types.items():
            if house_id in house and num in house[house_id]:
                meter_cat = cat
        if meter_cat is None:
            print(f"No category found for Meter{num} for House {house_id}; skipping it.")
            continue

        try:
            # Load the meter data with hour sampling
            # TODO CHANGE TO HANDLE MULTIPLE CHUNKS
            meter_data = next(meter.load())

            # Nothing recorded: skip, rather than re-adding the previous
            # meter's hourly values
            if meter_data.empty:
                print(f"No data for Meter{num} for House {house_id}; skipping it.")
                continue

            hourly_data = _hourly_energy(meter_data)

            # Putting the meter in a category. fill_value=0 keeps an hour that
            # only one side covers instead of turning it into NaN.
            if cat_df[meter_cat].empty:
                cat_df[meter_cat] = hourly_data
            else:
                cat_df[meter_cat] = cat_df[meter_cat].add(hourly_data, fill_value=0)

        except Exception as e:
            print(f"Error processing Meter{num} for House {house_id}. Error: {e}")

    # Combine the category DataFrames and handle missing data
    house_df = pd.DataFrame()
    for cat, df in cat_df.items():
        # GOTTA HANDLE THE ZERO VALUES CORRECTLY
        # Remaining NaNs are forward filled, then the columns are summed per hour
        house_df[cat] = df.ffill().sum(axis=1)

    # Adding the mains data; whatever the two categories do not explain is
    # attributed to HVAC
    house_df['Total'] = mains_df
    house_df['HVAC'] = house_df['Total'] - house_df['AlwaysOn'] - house_df['Intermit']

    # Converting time zones only if needed
    #house_df.index = house_df.index.tz_convert('Europe/London')
    house_df['DayNum'] = house_df.index.dayofweek
    house_df['Time'] = house_df.index.time
    house_df['Month'] = house_df.index.month
    cols = ['DayNum', 'Time', 'Month', 'Total', 'AlwaysOn', 'Intermit', 'HVAC']
    house_df = house_df[cols]

    # Write the combined building data to CSV
    csv_file_path = os.path.join(csv_dir, f"house{house_id}_clean.csv")
    house_df.to_csv(csv_file_path, index=True)
    print(f"Combined hourly data for house {house_id} written to CSV.")

print("Finished processing all buildings.")

Processing appiance 1 for house 1
Processing appiance 2 for house 1
Processing appiance 3 for house 1
Processing appiance 4 for house 1
Processing appiance 5 for house 1
Processing appiance 6 for house 1
Processing appiance 7 for house 1
Processing appiance 8 for house 1
Processing appiance 9 for house 1
Combined hourly data for house 1 written to CSV.
Processing appiance 1 for house 2
Processing appiance 2 for house 2
Processing appiance 3 for house 2
Processing appiance 4 for house 2
Processing appiance 5 for house 2
Processing appiance 6 for house 2
Processing appiance 7 for house 2
Processing appiance 8 for house 2
Processing appiance 9 for house 2
Combined hourly data for house 2 written to CSV.
Processing appiance 1 for house 3
Processing appiance 2 for house 3
Processing appiance 3 for house 3
Processing appiance 4 for house 3
Processing appiance 5 for house 3
Processing appiance 6 for house 3
Processing appiance 7 for house 3
Processing appiance 8 for house 3
Processing appianc

In [41]:
# Path prefixes; the house number and file suffix are appended per house
houses_path = 'C:\\Users\\nicho\\OneDrive - The University of Western Ontario\\Ecolux\\Databases\\REFIT\\Hourly CSV\\house'
weather_path = 'C:\\Users\\nicho\\OneDrive - The University of Western Ontario\\Ecolux\\Databases\\REFIT\\Weather CSV\\house'
output_path = 'C:\\Users\\nicho\\OneDrive - The University of Western Ontario\\Ecolux\\Databases\\REFIT\\Completed CSV\\house'

for house_id in sorted(refit.buildings):
    cur_house = f"{houses_path}{house_id}_clean.csv"
    cur_weather = f"{weather_path}{house_id}.csv"
    cur_output = f"{output_path}{house_id}.csv"

    # Reading the hourly energy CSV. The 'Unix' column is parsed explicitly
    # with utc=True: if the stored timestamps carry different UTC offsets
    # (e.g. across a clock change), parse_dates alone does not give a
    # timezone-aware DatetimeIndex.
    house_df = pd.read_csv(cur_house, index_col='Unix')
    house_df.index = pd.to_datetime(house_df.index, utc=True).tz_convert('Europe/London')

    # Reading the weather CSV (its first three lines are skipped)
    weather_df = pd.read_csv(cur_weather, skiprows=3)

    # Convert the epoch seconds to timezone-aware datetimes and index by them
    weather_df['time'] = pd.to_datetime(weather_df['time'], unit='s').dt.tz_localize('UTC').dt.tz_convert('Europe/London')
    weather_df = weather_df.set_index('time')

    # Merge the dataframes based on the index; only timestamps present in both are kept
    merged_df = house_df.join(weather_df, how='inner')

    # Label the index. Naming it before reset_index gives the column the name
    # 'Timestamp' regardless of what name the joined index ended up with.
    merged_df.index.name = 'Timestamp'
    merged_df = merged_df.reset_index()

    # Send to csv
    merged_df.to_csv(cur_output, index=False)
    print(f"Done formatting house {house_id}")

Done formatting house 1
Done formatting house 2
Done formatting house 3
Done formatting house 4
Done formatting house 5
Done formatting house 6
Done formatting house 7
Done formatting house 8
Done formatting house 9
Done formatting house 10
Done formatting house 11
Done formatting house 12
Done formatting house 13
Done formatting house 14
Done formatting house 15
Done formatting house 16
Done formatting house 17
Done formatting house 18
Done formatting house 19
Done formatting house 20
