In [None]:
# Environment setup cell: installs third-party packages with pip and mamba.
# Every command runs in quiet mode (-q); the mamba commands also auto-confirm (-y).

# Optional clean-up of an existing NILMTK install (left disabled).
#!pip uninstall -y -q nilmtk nilm_metadata
# General scientific stack from PyPI. Presumably this also covers NILMTK's
# runtime dependencies, since NILMTK itself is installed with --no-deps below.
!python3 -m pip install -q pandas numpy networkx scipy tables scikit-learn hmmlearn pyyaml matplotlib xgboost pyts h5py

# Trick to install NILMTK regardless of its declared dependencies:
# both packages come straight from the master branch on GitHub, and --no-deps
# stops pip from resolving (or up/downgrading) anything they depend on.
!python3 -m pip install -q --no-deps git+https://github.com/nilmtk/nilmtk@master
!python3 -m pip install -q --no-deps git+https://github.com/nilmtk/nilm_metadata@master

# Time-series packages: tslearn and py-xgboost from conda-forge, tsfresh from PyPI.
# NOTE(review): none of these are imported in the cells visible here - confirm they are still needed.
!mamba install -c conda-forge tslearn py-xgboost -y -q
!pip install -q tsfresh

# PyTorch 2.x built against CUDA 11.7, plus Lightning and torchmetrics.
# NOTE(review): not imported in the cells visible here - presumably used by a later notebook.
!mamba install -y -q pytorch=2.* torchvision pytorch-cuda=11.7 -c pytorch -c nvidia -c anaconda
!pip install -q pytorch-lightning torchmetrics

In [None]:
import h5py
import numpy as np
import nilmtk
import os
import pickle
import pandas as pd

from pyts.image import RecurrencePlot

from pathlib import Path

In [None]:
# ---------------------------------------------------------------------------
# Static experiment settings
# ---------------------------------------------------------------------------
n_mins = 60            # window length in minutes
step_ratio = 0.8       # sliding-window step, as a fraction of the window length
method = "zero fill"   # NaN handling: "stitch", "zero fill" or "interpolate"
min_watts = 2          # threshold applied to the mean power of each window

# Input HDF5 files, one per dataset. Entry i of `names` is paired with entry i
# of `LabelNames`, so enable/disable a dataset in BOTH lists at the same position.
names = [
    # "/home/jovyan/datasets/NILM/iawe.h5",
    # "/home/jovyan/datasets/NILM/eco.h5",
    # "/home/jovyan/datasets/NILM/redd.h5",
    # "/home/jovyan/datasets/NILM/refit.h5",
    "/home/jovyan/datasets/NILM/ukdale.h5",
]

# Appliance types to keep for each dataset; meters of any other type are skipped.
# The commented-out lists follow the same order as the commented-out files above.
LabelNames = [
    # ["air conditioner","computer","fridge","television"],
    # ["HTPC", "broadband router","coffee maker","computer","freezer","fridge","kettle","lamp","laptop computer","washing machine","microwave", "audio system"],
    # ["electric furnace" ,"fridge" ,"light" ,"microwave","sockets"],
    # ["audio system","breadmaker","broadband router","computer","dehumidifier","dish washer","electric space heater","food processor","freezer","fridge","fridge freezer","kettle","pond pump","television","toaster","tumble dryer","washer dryer","washing machine"],
    [
        "HTPC",
        "boiler",
        "computer monitor",
        "desktop computer",
        "freezer",
        "fridge",
        "fridge freezer",
        "kettle",
        "laptop computer",
        "microwave",
        "server computer",
        "television",
        "toaster",
        "washer dryer",
        "light",
    ],
]



# Samples per minute once a series has been resampled to a 6 s period (60 s / 6 s).
_SAMPLES_PER_MINUTE = 10

# Appliance types that are pooled into a single class label.
_MERGED_LABELS = {
    "fridge": "fridge/freezer",
    "freezer": "fridge/freezer",
    "fridge freezer": "fridge/freezer",
    "kettle": "HEKA",
    "toaster": "HEKA",
}


def _fill_gaps(ts, method):
    """Handle NaN values in a 1-D power series.

    Parameters
    ----------
    ts : np.ndarray
        1-D float array. For ``"zero fill"`` it is modified in place, so the
        caller must pass an array it owns.
    method : str
        ``"stitch"`` drops NaN points (shortening the series), ``"zero fill"``
        replaces them with 0, ``"interpolate"`` uses pandas' default
        ``Series.interpolate()``.

    Returns
    -------
    np.ndarray
        The cleaned series.

    Raises
    ------
    ValueError
        If ``method`` is not one of the three supported names (previously an
        unknown name silently left the NaNs in the data).
    """
    if method == "stitch":
        return ts[~np.isnan(ts)]
    if method == "zero fill":
        ts[np.isnan(ts)] = 0
        return ts
    if method == "interpolate":
        # NOTE(review): with pandas defaults, NaNs located before the first
        # valid value are not filled - confirm this is acceptable.
        return np.array(pd.Series(ts).interpolate())
    raise ValueError(
        f"unknown method {method!r}; expected 'stitch', 'zero fill' or 'interpolate'"
    )


def _sliding_windows(ts, sample_length, step):
    """Cut `ts` into full-length windows of `sample_length` points, `step` apart.

    Only complete windows are returned; a trailing partial window is dropped.
    The windows are slices of `ts`, in order of increasing start index.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1 (the window would never advance).
    """
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")
    if len(ts) < sample_length:
        return []
    n_windows = (len(ts) - sample_length) // step + 1
    return [ts[start:start + sample_length] for start in range(0, n_windows * step, step)]


# NOTE: data, labels, total, mapping and num_labels are rebuilt on every pass,
# so after the loop they describe only the LAST entry of `names`.
for idx, originname in enumerate(names):

    # Window length and step, both expressed in 6 s samples.
    step_mins = n_mins * step_ratio
    step = int(step_mins * _SAMPLES_PER_MINUTE)
    sample_length = n_mins * _SAMPLES_PER_MINUTE
    name = Path(originname).stem

    originpath = ""
    destinationpath = ""

    dataset = nilmtk.DataSet(originname)
    # Appliance types to keep; paired with `names` by position.
    wanted_labels = LabelNames[idx]

    # Sorted unique appliance types over all submeters, with an index for each.
    devices = []
    for building in dataset.buildings:
        for meter in dataset.buildings[building].elec.submeters().meters:
            devices.append(str(meter.appliances[0].metadata.get("type")))
    devices = np.unique(np.array(devices))  # np.unique already returns sorted values
    devicedict = {device: position for position, device in enumerate(devices)}

    data = []            # one window of power values per sample
    labels = []          # class label of each window
    stattotal = []       # per meter: series length after resampling
    statnotnan = []      # per meter: series length after NaN handling
    statretained = []    # per meter: number of data points covered by windows

    for building in dataset.buildings:
        for meter in dataset.buildings[building].elec.submeters().meters:
            label = meter.appliances[0].metadata.get("type")
            if label not in wanted_labels:
                continue

            try:
                # Only the first chunk yielded by the loader is used.
                df = next(meter.load(physical_quantity='power'))
                # Resample to 6 s, as that is the most commonly shared sample
                # rate; gaps of up to 10 periods are forward-filled, longer
                # gaps stay NaN and are handled by `method` below.
                df = df.resample("6s").ffill(limit=10)
            except Exception as exc:
                # Skip only this meter. The previous `break` silently dropped
                # every remaining meter of the building as well.
                print(f"skipping {label!r} in building {building}: could not load power data ({exc!r})")
                continue

            # Prefer active power, fall back to apparent power.
            power = df["power"]
            if "active" in power.columns:
                ts = np.array(power["active"].values)
            elif "apparent" in power.columns:
                ts = np.array(power["apparent"].values)
            else:
                raise ValueError(
                    f"meter {label!r} in building {building} has neither active "
                    f"nor apparent power (columns: {list(power.columns)})"
                )

            stattotal.append(len(ts))

            # Clean the raw series according to the selected method.
            ts = _fill_gaps(ts, method)
            statnotnan.append(len(ts))

            # Sliding-window sample creation.
            windows = _sliding_windows(ts, sample_length, step)
            statretained.append(len(windows) * sample_length)

            # Similar appliances share one class label; computed once per meter.
            merged_label = _MERGED_LABELS.get(label, label)
            data.extend(windows)
            labels.extend([merged_label] * len(windows))

    # data has shape (number of samples, minutes*10), labels (number of samples,)
    data = np.array(data)
    labels = np.array(labels)

    # `total` holds the sorted class names; `y_weight` is the index of each
    # sample's class within `total`.
    total, y_weight = np.unique(labels, return_inverse=True)
    print(total)

    # Class name -> integer code, and the integer-encoded label of every sample.
    mapping = {class_name: code for code, class_name in enumerate(total)}
    num_labels = y_weight.astype(np.uint8)

In [None]:
# A window counts as "active" when its mean power exceeds the min_watts threshold.
activity_labels = data.mean(axis=1) > min_watts

In [None]:
# Sanity check: report the shape and dtype of every array that will be exported.
for arr_name, arr in (
    ("data", data),
    ("num_labels", num_labels),
    ("activity_labels", activity_labels),
):
    print(f"{arr_name}.shape={arr.shape!r} {arr_name}.dtype={arr.dtype!r};")

In [None]:
# Storage options shared by all exported datasets (gzip level 9 with byte shuffling).
h5_options = dict(chunks=True, compression='gzip', shuffle=True, compression_opts=9)

# (dataset key, array to store, message printed once it has been written)
h5_outputs = (
    ('X', data, 'Done X!'),
    ('y', num_labels, 'Done y!'),
    ('activity', activity_labels, 'Done labels!'),
)

# mode='w' overwrites any existing file at this path.
with h5py.File('./ukdale-processed.h5', mode='w') as f:
    for key, array, done_message in h5_outputs:
        f.create_dataset(key, shape=array.shape, dtype=array.dtype, data=array, **h5_options)
        print(done_message)

In [None]:
# List the working directory (long format, human-readable sizes, hidden files
# included) - presumably to check that the HDF5 file was written and how large it is.
!ls -lah