Load Dataset

In [6]:
import os
import warnings

import numpy as np
import pandas as pd
from nilmtk import DataSet

# Suppress warnings (note: this blanket filter also hides deprecation warnings)
warnings.filterwarnings('ignore')

# Use data from houses 1, 2 and 5 only
house_indicies = [1, 2, 5]

# Location of the UK-DALE .h5 dataset.
# Set the UKDALE_H5_PATH environment variable to point at the file on another
# machine; the original local path is kept as the default.
ukdale_path = os.environ.get(
    'UKDALE_H5_PATH',
    r'C:\Users\Raymond Tie\Desktop\NILM\datasets\ukdale.h5',
)

# Load UK-DALE .h5 dataset
ukdale = DataSet(ukdale_path)

# Hyperparameters
sample_period = 6    # Resampling period in seconds
noise_threshold = 5  # Noise threshold in Watts



In [8]:

# ---------------------------------------------------------------------------------------------------
#                                        Main Meter data capturing
#
# ---------------------------------------------------------------------------------------------------

# Accumulator: one processed DataFrame is appended per house
all_house_data = list()

# Canonical appliance labels; these become the output column names
appliance_name2 = [
    'kettle',
    'microwave',
    'fridge',
    'dishwasher',
    'washing_machine',
]

# Lookup from a dataset appliance label to its canonical label.
# Labels that are absent from this table are used unchanged by the caller.
appliance_mapping = {
    **dict.fromkeys(['fridge freezer', 'fridge'], 'fridge'),
    **dict.fromkeys(['dish washer', 'dishwasher'], 'dishwasher'),
    **dict.fromkeys(
        ['washer dryer', 'washing machine', 'washing_machine'],
        'washing_machine',
    ),
}

# Dataset appliance labels to load, one inner list per building.
# The outer list is positional: entry i belongs to the i-th selected house.
appliance_name = [
    ['kettle', 'microwave', 'fridge freezer', 'dish washer', 'washer dryer'],
    ['kettle', 'microwave', 'fridge', 'dish washer', 'washing machine'],
    ['kettle', 'microwave', 'fridge freezer', 'dish washer', 'washer dryer'],
]

# Load power data for the selected houses.
#
# For every house this loop:
#   1. loads the mains active power and puts it on a regular time grid,
#   2. fills short gaps, drops the remaining gaps and non-positive readings,
#   3. zeroes readings below the noise threshold,
#   4. aligns each target appliance onto the mains timestamps,
#   5. appends the resulting frame to `all_house_data`.

resample_rule = f"{sample_period}S"  # regular grid used for mains and appliances
max_gap_samples = 30                 # longest gap (in samples) bridged by forward fill
max_invalid_fill = 5                 # longest run of blanked appliance samples re-filled

for idx, house_id in enumerate(house_indicies):
    print("************************************************")
    print(f"Loading power data for Building {house_id}")

    # elec is a MeterGroup object that represents all the meters in the building
    elec = ukdale.buildings[house_id].elec

    # Load mains (aggregated power); only the first chunk yielded by load() is used
    mains = elec.mains()
    df_mains = next(mains.load(sample_period=sample_period))

    # Keep the active power only, as a single-column frame named P_mains
    df_mains = df_mains['power']['active'].to_frame(name='P_mains')

    # ---------------------------------------------------------------------------------------------------
    #  Data Quality Filtering for mains data
    # ---------------------------------------------------------------------------------------------------

    # Regularize timestamps, then bridge short gaps with the last valid observation
    df_mains = df_mains.resample(resample_rule).mean()
    df_mains = df_mains.ffill(limit=max_gap_samples)

    # Drop whatever is still missing after the limited forward fill
    df_mains = df_mains.dropna()

    # Drop non-positive readings (the strict comparison also removes exact zeros).
    # .copy() makes the result an independent frame so the column assignments
    # below do not operate on a slice of the previous frame.
    df_mains = df_mains[df_mains["P_mains"] > 0].copy()

    # Treat readings below the noise threshold as zero
    df_mains.loc[df_mains["P_mains"] < noise_threshold, "P_mains"] = 0

    # ---------------------------------------------------------------------------------------------------
    #                                        Appliances data capturing
    #
    # ---------------------------------------------------------------------------------------------------

    # Initialize standardized appliance columns
    for std_app in appliance_name2:
        df_mains[std_app] = 0.0

    # Appliance types present in this building, collected once per house
    available_types = {appliance.identifier.type for appliance in elec.appliances}

    for app in appliance_name[idx]:
        # Get standardized name (labels without a mapping are used as-is)
        std_app = appliance_mapping.get(app, app)

        # Only process if it's one of our target appliances
        if std_app not in appliance_name2:
            continue

        if app not in available_types:
            # Appliance not found, keep zeros
            print(f"  - {app} -> {std_app} not found, filled with zeros")
            continue

        # Load appliance data (first chunk only, same as for the mains)
        df_app = next(elec[app].load(sample_period=sample_period))
        df_app = df_app['power']['active'].to_frame(name=std_app)

        # Data Quality Filtering: regular grid plus limited forward fill
        df_app = df_app.resample(resample_rule).mean().ffill(limit=max_gap_samples)

        # Merge into the mains frame; assignment aligns on the timestamp index.
        # NOTE(review): mains timestamps with no appliance sample become NaN here,
        # and only runs up to `max_invalid_fill` are re-filled below, so NaNs can
        # remain in this column -- confirm how downstream steps should treat them.
        df_mains[std_app] = df_app[std_app]

        # Identify impossible cases where appliance > total power
        mask_invalid = df_mains[std_app] > df_mains['P_mains']

        # Replace invalid samples with NaN first
        df_mains.loc[mask_invalid, std_app] = np.nan

        # Forward fill to replace with last valid power value
        df_mains[std_app] = df_mains[std_app].ffill(limit=max_invalid_fill)

        print(f"  - Loaded {app} -> {std_app} data")

    # Keep only P_mains and the 5 standardized appliances
    final_columns = ['P_mains'] + appliance_name2
    df_mains_final = df_mains[final_columns].copy()

    # Store processed data
    all_house_data.append(df_mains_final)
    print(f"Building {house_id} processed successfully")
    print(f"  Final columns: {list(df_mains_final.columns)}")
    print(f"  Data shape: {df_mains_final.shape}")
            


# Stack the per-house frames into a single table.
# ignore_index=True replaces the timestamp index with a fresh 0..N-1 range index.
if all_house_data:
    entire_data = pd.concat(all_house_data, ignore_index=True)
    # A bare expression nested inside an `if` is never echoed by Jupyter,
    # so the row count is printed explicitly.
    print(f"Total samples across all houses: {len(entire_data)}")
else:
    # Nothing to concatenate: say so here instead of leaving it unexplained.
    print("No house data was loaded; `entire_data` has not been created.")




************************************************
Loading power data for Building 1
  - Loaded kettle -> kettle data
  - Loaded microwave -> microwave data
  - Loaded fridge freezer -> fridge data
  - Loaded dish washer -> dishwasher data
  - Loaded washer dryer -> washing_machine data
Building 1 processed successfully
  Final columns: ['P_mains', 'kettle', 'microwave', 'fridge', 'dishwasher', 'washing_machine']
  Data shape: (9443254, 6)
************************************************
Loading power data for Building 2
  - Loaded kettle -> kettle data
  - Loaded microwave -> microwave data
  - Loaded fridge -> fridge data
  - Loaded dish washer -> dishwasher data
  - Loaded washing machine -> washing_machine data
Building 2 processed successfully
  Final columns: ['P_mains', 'kettle', 'microwave', 'fridge', 'dishwasher', 'washing_machine']
  Data shape: (2020410, 6)
************************************************
Loading power data for Building 5
  - Loaded kettle -> kettle data
  - L

In [None]:
#****************************************************************************
#                                Normalize the data
#****************************************************************************
from sklearn.preprocessing import StandardScaler
import numpy as np

# Raw feature matrix taken from the combined frame, shape (samples, features)
X = entire_data.values
print("Data shape:", X.shape)
print("Original data stats:")
print(entire_data.describe())

# Z-score scaling: fit the per-column statistics, then apply them to the same data
scaler_z = StandardScaler()
X_norm_z = scaler_z.fit(X).transform(X)

# Wrap the scaled matrix in a frame that reuses the original labels
entire_data_norm = pd.DataFrame(
    data=X_norm_z,
    columns=entire_data.columns,
    index=entire_data.index,
)

print("\nZ-score normalized data stats:")
print(entire_data_norm.describe())

# Keep the fitted statistics so the scaling can be reapplied or undone later
normalization_params = dict(mean=scaler_z.mean_, std=scaler_z.scale_)

print("\nNormalization parameters:")
print("Mean:", normalization_params['mean'])
print("Std:", normalization_params['std'])

# Sanity check: average the per-column mean and std of the scaled frame
column_means = entire_data_norm.mean()
column_stds = entire_data_norm.std()
print("\nVerification:")
print(f"Normalized data mean: {column_means.mean():.6f} (should be ~0)")
print(f"Normalized data std: {column_stds.mean():.6f} (should be ~1)")


Data shape: (13366214, 6)
Original data stats:
            P_mains        kettle     microwave        fridge    dishwasher  \
count  1.336621e+07  1.229551e+07  1.285430e+07  1.285260e+07  1.285339e+07   
mean   3.517151e+02  1.710460e+01  1.395020e+01  4.101673e+01  1.915114e+01   
std    4.688719e+02  2.004570e+02  9.951019e+01  4.944275e+01  1.895236e+02   
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%    1.524933e+02  1.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00   
50%    2.313117e+02  1.000000e+00  1.000000e+00  0.000000e+00  1.000000e+00   
75%    3.688850e+02  1.000000e+00  1.000000e+00  8.800000e+01  1.000000e+00   
max    8.399210e+03  3.996000e+03  3.054000e+03  1.998000e+03  3.480000e+03   

       washing_machine  
count     1.272984e+07  
mean      2.806045e+01  
std       1.911348e+02  
min       0.000000e+00  
25%       0.000000e+00  
50%       0.000000e+00  
75%       3.000000e+00  
max       3.888000e+03  

Z-score normali