In [2]:
%pip install pyarrow
%pip install tqdm

Collecting pyarrow
  Using cached pyarrow-13.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.0 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-13.0.0
Note: you may need to restart the kernel to use updated packages.
Collecting tqdm
  Using cached tqdm-4.66.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.1
Note: you may need to restart the kernel to use updated packages.


In [3]:

import pandas as pd
import os
from tqdm import tqdm
from collections import defaultdict
import concurrent.futures

# DEKN

https://data.open-power-system-data.org/household_data/

In [None]:
df = pd.read_csv("./Energy_graph/data/temp/household_data_15min_singleindex_filtered.csv")
df =df.drop(columns=["utc_timestamp", "interpolated"])
df

In [None]:
df["cet_cest_timestamp"] = df["cet_cest_timestamp"].apply(lambda x: x.split("+")[0])
df["cet_cest_timestamp"] = pd.to_datetime(df["cet_cest_timestamp"], format="%Y-%m-%dT%H:%M:%S")
df = df.set_index("cet_cest_timestamp")
df = df[~df.index.duplicated(keep='first')]

df

In [None]:

# Extract household identifiers
households = set(column.split('_')[2] for column in df.columns)

# Create a dictionary of dataframes, one for each household
dfs = {}

for household in households:
    # Filter columns relevant to this household
    relevant_columns = [col for col in df.columns if household in col]
    temp_df = df[relevant_columns].copy()

    # Rename columns to remove the prefix and retain the device name
    rename_dict = {col: col.replace(f"DE_KN_{household}_", "") for col in relevant_columns}
    temp_df.rename(columns=rename_dict, inplace=True)
    temp_df.rename(columns={'cet_cest_timestamp': 'timestamp', "grid_import": "aggregate"}, inplace=True)
    if "grid_export" in temp_df.columns:
        temp_df.drop(columns=['grid_export'], inplace=True)
    if "pv" in temp_df.columns:
        temp_df.drop(columns=['pv'], inplace=True)
    # temp_df.drop(columns=['grid_export', 'pv'], inplace=True)
    data = {}
    name ="DEKN_" +str(household[-1])
    for c in temp_df.columns:
        data[c] = pd.DataFrame(temp_df[c].dropna())
        
    dfs[name] = data

In [None]:
df2 = pd.read_excel("./Energy_graph/data/temp/household_data.xlsx")
df2 

# GREEND


https://sourceforge.net/projects/greend/



GREEND download form
Great to get to know you! 

Here are our dataset snapshots and the associated password:

v0.1: 
http://sourceforge.net/projects/greend/files/GREEND_0-1_311014.zip/download

PWD:"Vienna"


https://www.academia.edu/7794767/GREEND_An_Energy_Consumption_Dataset_of_Households_in_Italy_and_Austria

http://www.andreatonello.com/wp-content/uploads/PAPERS/CONFERENCES/SGC2014_2.pdf



In [None]:
df = pd.read_csv("./Energy_graph/data/temp/GREEND/building0/dataset_2013-12-07.csv", on_bad_lines="skip")
df


# TODO either fix NILMTK if possible or try to get id to device mapping from somewhere else

# ENERTALK

In [None]:
data_path = "./Energy_graph/data/temp/ENERTALK/enertalk"
def convert2KRtime(df):
    """
    convert dateframe's unix timestamp into Asia/Seoul Timezone
    
    input
    ----
        df: dataframe (columns: timestamp, active_power, reactive_power, appliance)
    
    output
    ----
        df_kr: dataframe (columns: timestamp, active_power, reactive_power, appliance, KR timezone)
    """ 

    df_kr = df
    df_kr['timestamp'] = df_kr['timestamp'].dt.tz_localize('UTC').dt.tz_convert('Asia/Seoul')
    df_kr = df_kr.set_index(pd.DatetimeIndex(df_kr['timestamp']))
    return df_kr


def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Parse the name of the file to get the device name"
    """
    df.drop(columns=["reactive_power"], inplace=True)
    # convert unix timestamp to datetime and set as index
    df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms").dt.tz_localize('UTC').dt.tz_convert('Asia/Seoul')
    df.set_index("timestamp", inplace=True)
    # convert to kWh
    df  = df/1000 * (1/15)/3600
    # resample to 1 second
    df = df.resample("1S").sum()

    return df

def parse_name(file_name: str):
    """
    Parse the name of the file to get the device name"
    """
    # remove the extension
    file_name = file_name.split(".")[0]
    # get the device name
    file_name = file_name.split("_")[1]
 

    return file_name



def process_house(house):
    house_path = os.path.join(data_path, house)
    house_dict = defaultdict(list)
    house_name = "ENERTALK_" + str(int(house))
    
    for day in os.listdir(house_path):
        day_path = os.path.join(house_path, day)
        
        for device in os.listdir(day_path):
            device_path = os.path.join(day_path, device)
            name = parse_name(device)
            
            df = preprocess_dataframe(pd.read_parquet(device_path))
            house_dict[name].append(df)

    for key in house_dict:
        house_dict[key] = pd.concat(house_dict[key], axis=0)
    
    return house_name, house_dict


Serial program

In [None]:
from tqdm import tqdm
from collections import defaultdict
data_path = "./Energy_graph/data/temp/ENERTALK/enertalk"
data_dict = {}
for house in os.listdir(data_path):
    house_dict = defaultdict(list)
    house_name = "ENERTALK_" + str(int(house))
    for day in tqdm(os.listdir(data_path + "/" + house)):
        for device in os.listdir(data_path + "/" + house + "/" + day):
            name = parse_name(device)
            df = preprocess_dataframe(pd.read_parquet(data_path + "/" + house + "/" + day + "/" + device))
            house_dict[name].append(df)

    for key in house_dict:
        house_dict[key] = pd.concat(house_dict[key], axis=0)
    
    data_dict[house_name] = house_dict
    break
    



Multithreaded

In [None]:
import os
import pandas as pd
from collections import defaultdict
import concurrent.futures
from tqdm import tqdm

data_path = "./Energy_graph/data/temp/ENERTALK/enertalk"
data_dict = {}

def process_house(house, progress_bar=None):
    house_path = os.path.join(data_path, house)
    house_dict = defaultdict(list)
    house_name = "ENERTALK_" + str(int(house))
    
    for day in os.listdir(house_path):
        day_path = os.path.join(house_path, day)
        for device in os.listdir(day_path):
            device_path = os.path.join(day_path, device)
            name = parse_name(device)
            df = preprocess_dataframe(pd.read_parquet(device_path))
            house_dict[name].append(df)

    for key in house_dict:
        house_dict[key] = pd.concat(house_dict[key], axis=0)

    if progress_bar:
        progress_bar.update(1)  # Increment the progress bar when a house is processed

    return house_name, house_dict

houses = os.listdir(data_path)
# Create a progress bar with a total equal to the number of houses
with tqdm(total=len(houses), desc="Processing houses", unit="house") as progress_bar:
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Pass the progress_bar to the worker function
        futures = [executor.submit(process_house, house, progress_bar) for house in houses]
        for future in concurrent.futures.as_completed(futures):
            house_name, house_dict = future.result()
            data_dict[house_name] = house_dict


Multiprocessed

In [None]:
import os
import pandas as pd
from collections import defaultdict
import concurrent.futures
from tqdm import tqdm
import multiprocessing


data_dict = {}

def process_house(house_path, queue):
    house = os.path.basename(house_path)  # Extract house name from the path
    house_dict = defaultdict(list)
    house_name = "ENERTALK_" + str(int(house))
    
    for day in os.listdir(house_path):
        day_path = os.path.join(house_path, day)
        for device in os.listdir(day_path):
            device_path = os.path.join(day_path, device)
            name = parse_name(device)
            df = preprocess_dataframe(pd.read_parquet(device_path))
            house_dict[name].append(df)

    for key in house_dict:
        house_dict[key] = pd.concat(house_dict[key], axis=0)

    queue.put(1)  # Indicate that one house has been processed
    return house_name, house_dict

# Construct full paths for each house directory
data_path = "./Energy_graph/data/temp/ENERTALK/"
house_paths = [os.path.join(data_path, house) for house in os.listdir(data_path)]
queue = multiprocessing.Manager().Queue()

with tqdm(total=len(house_paths), desc="Processing houses", unit="house") as progress_bar:
    with concurrent.futures.ProcessPoolExecutor(max_workers=os.cpu_count()/2) as executor:
        futures = [executor.submit(process_house, house_path, queue) for house_path in house_paths]
        
        # Update progress bar based on queue
        for _ in concurrent.futures.as_completed(futures):
            progress_bar.update(queue.get())

        for future in futures:
            house_name, house_dict = future.result()
            data_dict[house_name] = house_dict


In [None]:
data_dict["ENERTALK_0"]
# save with pickle
import pickle
with open("./Energy_graph/data/processed/ENERTALK.pkl", "wb") as f:
    pickle.dump(data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:

df = pd.read_parquet("./Energy_graph/data/temp/ENERTALK/enertalk/00/20161101/02_washing-machine.parquet.gzip").drop(columns=["reactive_power"])
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms").dt.tz_localize('UTC').dt.tz_convert('Asia/Seoul')
df.set_index("timestamp", inplace=True)
df  = df/1000 * (1/15)/3600
df.resample("1S").sum()

In [None]:
test_str = "02_washing-machine.parquet.gzip"
def parse_name(file_name: str):
    """
    Parse the name of the file to get the device name"
    """
    # remove the extension
    file_name = file_name.split(".")[0]
    # get the device name
    file_name = file_name.split("_")[1]
 

    return file_name

print(parse_name(test_str))

# DEDDIAG

In [None]:
# get map of item_id to label for appliance
labels = pd.read_csv("./Energy_graph/data/temp/DEDDIAG/house_08/items.tsv", sep="\t")
labels.set_index("item_id", inplace=True)
id_label_map = labels["category"].to_dict()
id_label_map

In [None]:
def parse_id(file_name : str) -> int:
    return int(file_name.split('_')[1])

# watts to kWh given data frequency as a fraction of an hour (e.g. 0.5 for half-hourly data)
def watts2kwh(df, data_frequency):
    df = df/1000 * data_frequency
    return df


In [None]:
data_path = "./Energy_graph/data/temp/DEDDIAG/house_08/"
from tqdm import tqdm
data = {}

for device in tqdm([d for d in os.listdir(data_path) if "data" in d]):
    label = id_label_map[parse_id(device)]
    if "Phase" not in label:
        if "Total" in label:
            label = "aggregate"
        df = pd.read_csv(data_path + device, sep="\t")
        df["time"] = pd.to_datetime(df["time"])
        df.drop(columns=["item_id"], inplace=True)
        df.set_index("time", inplace=True)
        df = df[~df.index.duplicated(keep='first')]
        df = df.resample("1s").ffill()
        df.dropna(inplace=True)
        df = watts2kwh(df, 1/3600)
        print(label)
        data[label] = df

    
data_dict = {
    "DEDDIAG_8": data,
}


In [None]:
data["aggregate"].resample("D").sum()

# SUSTData2


In [None]:
path = "./Energy_graph/data/temp/SUSTData/"
# aggregate consumption data
df_aggregate = pd.DataFrame()
for file in os.listdir(path + "aggregate"):
    if file.endswith(".csv"):
        df_aggregate = pd.concat([df_aggregate,(pd.read_csv(path+"aggregate/" + file))])

df_aggregate["timestamp"] = pd.to_datetime(df_aggregate["timestamp"])
df_aggregate.set_index("timestamp", inplace=True)
df_aggregate.drop(columns=['Unnamed: 0', "Q","V","I"], inplace=True)
df_aggregate.rename(columns={"P":"power"}, inplace=True)
data_dict = {"aggregate":df_aggregate}

In [None]:
def parse_name(file_name: str):
    """
    Parse the file name to get the appliance name
    """
    # appliance name
    appliance_name = file_name.split(".")[0].split("_")[1]
    # date
    return appliance_name


# appliance consumption data
for file in os.listdir(path+"appliances/"):
    if file.endswith(".csv"):
        print(parse_name(file))
        data_dict[parse_name(file)] = pd.read_csv(path + "appliances/" + file).set_index("timestamp")

In [None]:
data_dict["aggregate"]

# SUSTData1

In [97]:
# watts to kWh given data frequency as a fraction of an hour (e.g. 0.5 for half-hourly data)
def watts2kwh(df, data_frequency):
    df = df/1000 * data_frequency
    return df

def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the dataframe
    """
   
    df = df.drop(columns=['Imin', 'Imax', 'Iavg', 'Vmin', 'Vmax',
        'Vavg', 'Pmin', 'Pmax', 'Qmin', 'Qmax', 'Qavg', 'PFmin',
        'PFmax', 'PFavg', 'miss_flag', 'iid', 'deploy']).dropna().set_index("tmstp").sort_index()
    df = df[~df.index.duplicated(keep='first')]
    df = df.resample("min").fillna(method="nearest", limit=5).dropna()# if there is data within 5 minutes, fill it in else drop it
    df = watts2kwh(df, 1/60)
    return df

path = "./Energy_graph/data/temp/SUST/SUST1/aggregate/"
data = {}
for house in range(1,51):
    name = "SUST1_" + str(house)
    tmp = {"aggregate" : pd.DataFrame()}
    data[name] = tmp
homes = set()
for folder in os.listdir(path):
    for file in os.listdir(path + folder):
        print(file)
        if file.endswith(".csv"):
            df = pd.read_csv(path + folder + "/" + file)
            # drop rows with missing data
            df = df[df['miss_flag'] == 0]
            # convert timestamp to datetime
            df["tmstp"] = pd.to_datetime(df["tmstp"])
            df.rename(columns={"Pavg":"aggregate"}, inplace=True)
            for iid in df["iid"].unique():
                name = "SUST1_" + str(iid)
                data[name]["aggregate"] = pd.concat([data[name]["aggregate"], preprocess_df(df[df["iid"] == iid])], axis=0)
        # break
    # break

for house in data:
    data[house]["aggregate"] = data[house]["aggregate"].sort_index()
    data[house]["aggregate"] = data[house]["aggregate"][~data[house]["aggregate"].index.duplicated(keep='first')]
# homes = list(homes).sort()

# df.columns

power_samples_d4_1.csv
power_samples_d4_2.csv
power_samples_d3_2.csv
power_samples_d3_3.csv
power_samples_d3_4.csv
power_samples_d3_1.csv
.ipynb_checkpoints
power_samples_d1_1.csv
.DS_Store
power_samples_d1_2.csv
power_samples_d2_2.csv
power_samples_d2_5.csv
.ipynb_checkpoints
power_samples_d2_4.csv
power_samples_d2_3.csv
.DS_Store
power_samples_d2_1.csv


In [None]:
# df = pd.read_csv("./Energy_graph/data/temp/SUST/SUST1/aggregate/power_samples_d1/power_samples_d1_1.csv")
# df = df[df['miss_flag'] == 0]
# df["tmstp"] = pd.to_datetime(df["tmstp"])
# df = df[(df["iid"] == 2)].drop(columns=['Imin', 'Imax', 'Iavg', 'Vmin', 'Vmax',
#        'Vavg', 'Pmin', 'Pmax', 'Qmin', 'Qmax', 'Qavg', 'PFmin',
#        'PFmax', 'PFavg', 'miss_flag', 'iid', 'deploy']).dropna().set_index("tmstp").sort_index()
# df.resample("1min").fillna(method="nearest", limit=5).dropna()[:180].plot()# if there is data within 5 minutes, fill it in else drop it
data["SUST1_50"]["aggregate"][:600].plot()


In [7]:
df.drop(columns=['Imin', 'Imax', 'Iavg', 'Vmin', 'Vmax',
       'Vavg', 'Pmin', 'Pmax', 'Qmin', 'Qmax', 'Qavg', 'PFmin',
       'PFmax', 'PFavg']).dropna()

Unnamed: 0,iid,tmstp,deploy,Pavg,miss_flag
1233,1,2010-11-25 20:33:34,2,823.51100,0
1234,1,2010-11-25 20:34:36,2,838.49400,0
1235,1,2010-11-25 20:35:39,2,831.41400,0
1236,1,2010-11-25 20:36:41,2,831.80400,0
1237,1,2010-11-25 20:37:44,2,839.35400,0
...,...,...,...,...,...
2958532,5,2012-04-10 19:52:34,2,541.82800,0
2958533,5,2012-04-10 19:53:37,2,537.85700,0
2958534,5,2012-04-10 19:54:09,2,559.01700,0
2959613,5,2012-05-12 13:53:46,2,6.78973,0


In [8]:
# drop rows where missing flag is true
df = df[df['miss_flag'] == 0]
df["iid"].unique()

array([1, 2, 3, 4, 5])

# MFRED
unused for now because of aggregated apartments might be used later

In [None]:
df = pd.read_csv("./Energy_graph/data/temp/MFRED/MFRED_Aggregates_15min_2019Q1-Q4.csv")

df

# EMBED
TODO
http://embed-dataset.org/

In [6]:
path = "./Energy_graph/data/temp/EMBED/Apt1_GT_Plug/Hair Dryer.csv"

df = pd.read_csv(path, header=None)
df['datetime'] = pd.to_datetime(df[1].str.cat(df[2], sep=' '))

df

Unnamed: 0,0,1,2,3,datetime
0,Tue,Aug 06 2013,13:29:48,0.16,2013-08-06 13:29:48
1,Tue,Aug 06 2013,13:29:49,0.15,2013-08-06 13:29:49
2,Tue,Aug 06 2013,13:29:49,0.15,2013-08-06 13:29:49
3,Tue,Aug 06 2013,13:29:50,0.16,2013-08-06 13:29:50
4,Tue,Aug 06 2013,13:29:50,0.16,2013-08-06 13:29:50
...,...,...,...,...,...
1243973,Mon,Aug 19 2013,10:57:24,0.25,2013-08-19 10:57:24
1243974,Mon,Aug 19 2013,10:57:25,0.23,2013-08-19 10:57:25
1243975,Mon,Aug 19 2013,10:57:27,0.23,2013-08-19 10:57:27
1243976,Mon,Aug 19 2013,10:57:28,0.25,2013-08-19 10:57:28


In [16]:
from scipy.io import loadmat



# Load the .mat file
mat_data = loadmat('./Energy_graph/data/temp/EMBED/Apt1_data (1).mat')

# The data could be stored under various keys. Usually, it's not under meta keys like '__header__', '__version__', '__globals__'.
# Let's say your data is under the key 'dataKey'. If you're unsure about the key, print mat_data.keys() to inspect.
print(mat_data.keys())
data = mat_data['data']

# The shape (1, 1) suggests that data is essentially a 2D array with a single element. This single element could itself be an array or another complex datatype.

# To further inspect this, let's access the inner contents of this 2D array:

# python
# Copy code
inner_content = data[0, 0]
# print(type(inner_content))
# print(inner_content.shape)
# print(inner_content.dtype.names)

field_names = inner_content.dtype.names
for name in field_names:
    print(f"Shape of {name}: {inner_content[name].shape}")


dict_keys(['__header__', '__version__', '__globals__', 'data'])
Shape of t_power: (587654, 1)
Shape of tt_power: (587654, 1)
Shape of Qa: (9, 587654)
Shape of Qb: (9, 587654)
Shape of Pa: (9, 587654)
Shape of Pb: (9, 587654)
Shape of startDate: (1, 1)
Shape of startTime: (1, 1)


In [17]:
# Extracting 1D arrays for t_power and tt_power
data_dict = {
    't_power': inner_content['t_power'].squeeze(),
    'tt_power': inner_content['tt_power'].squeeze(),
    'startDate': [inner_content['startDate'][0,0]] * inner_content['t_power'].shape[0],
    'startTime': [inner_content['startTime'][0,0]] * inner_content['t_power'].shape[0],
}

# Extracting columns for Qa, Qb, Pa, Pb
for i in range(inner_content['Qa'].shape[0]):
    data_dict[f'Qa_{i}'] = inner_content['Qa'][i, :]
    data_dict[f'Qb_{i}'] = inner_content['Qb'][i, :]
    data_dict[f'Pa_{i}'] = inner_content['Pa'][i, :]
    data_dict[f'Pb_{i}'] = inner_content['Pb'][i, :]

# Creating the dataframe
df = pd.DataFrame(data_dict)


In [18]:


df

Unnamed: 0,t_power,tt_power,startDate,startTime,Qa_0,Qb_0,Pa_0,Pb_0,Qa_1,Qb_1,...,Pa_6,Pb_6,Qa_7,Qb_7,Pa_7,Pb_7,Qa_8,Qb_8,Pa_8,Pb_8
0,735452.562383,0.000000,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],-43.508073,78.877231,703.465276,715.219317,0.041146,0.039373,...,0.014270,-0.015485,0.000294,0.000065,-0.000131,-0.000047,0.003548,0.005824,-0.000030,-0.003185
1,735452.562384,0.016665,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],-44.122336,78.802943,703.344349,711.424172,0.022737,0.021345,...,0.019564,-0.005687,0.000274,0.000086,-0.000008,0.000078,0.003150,0.004905,0.000098,-0.002874
2,735452.562384,0.033330,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],-44.184718,74.937676,704.097946,703.786992,0.018148,0.017963,...,0.017582,-0.038401,0.000102,0.000084,-0.000126,0.000068,0.002345,0.007185,-0.001763,-0.005700
3,735452.562384,0.049995,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],-43.558715,75.549708,703.883711,701.394935,0.000925,0.001153,...,0.022956,-0.025042,-0.000189,0.000555,-0.000327,-0.000216,0.002190,0.007141,-0.002171,-0.006395
4,735452.562384,0.066660,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],-43.025862,71.860145,703.007584,697.097498,-0.031575,-0.029193,...,0.025455,-0.063981,-0.000932,0.000897,-0.000242,-0.001925,0.000285,0.009445,-0.004158,-0.011050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
587649,735452.675912,9245.825325,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],17.471694,57.984443,20.281512,95.579867,-0.001957,0.002833,...,0.003424,0.039465,-0.000505,0.000107,0.000202,-0.000419,0.005693,0.007678,-0.003815,-0.015920
587650,735452.675912,9245.841990,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],17.451093,50.329093,20.159681,88.707211,-0.000790,-0.001924,...,0.000714,-0.022520,0.000091,0.000266,0.000301,-0.000106,0.003128,0.013823,-0.007928,-0.024756
587651,735452.675912,9245.858655,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],17.363645,55.373979,20.036955,97.431764,-0.000943,-0.001010,...,0.001774,0.013797,0.000029,0.000257,0.000336,-0.000218,0.004908,0.011371,-0.006064,-0.020717
587652,735452.675912,9245.875320,[[[2013/08/06]]],[[[13:29:49.934163324999489551]]],17.378752,55.292825,20.016533,91.867555,-0.001335,-0.000311,...,0.002663,0.027379,-0.000113,0.000456,0.000475,-0.000241,0.005837,0.010365,-0.004861,-0.018995


# HEART


In [None]:
# watts to kWh given data frequency as a fraction of an hour (e.g. 0.5 for half-hourly data)
def watts2kwh(df, data_frequency):
    df = df/1000 * data_frequency
    return df
def parse_name(file_name: str):
    """
    Parse the file name to get the house name
    """
    # appliance name
    appliance_name = file_name.split(".")[0]

    # date
    return appliance_name[:5] + "_" + appliance_name[5:]


# df = pd.read_csv("./Energy_graph/data/temp/HEART/HERON33.csv")
# df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="ms")

# df = df.set_index("Timestamp").drop(columns=["dw", "wm"])
# df = watts2kwh(df, 1/3600)



In [None]:
data_path = "./Energy_graph/data/temp/HEART/"
data_dict = {}
for file in os.listdir(data_path):
    if file.endswith(".csv"):
        # 
        df = pd.read_csv(data_path + file)
        # convert unix timestamp to datetime
        df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="ms")
        # set datetime as index and drop unnecessary columns
        df = df.set_index("Timestamp").drop(columns=["dw", "wm"])
        
        df.rename(columns={"Value": "aggregate"}, inplace=True)
        # convert watts to kilowatt hours
        df = watts2kwh(df, 1/3600)
        df.dropna(inplace=True)
        # create a dictionary of dataframes for each device
        devices_dict = {}
        for device in df.columns:
                devices_dict[device] = pd.DataFrame(df[device])
        # add the device dictionary to the data dictionary
        data_dict[parse_name(file)] = devices_dict



# IDEAL

In [None]:
# watts to kWh given data frequency as a fraction of an hour (e.g. 0.5 for half-hourly data)
def watts2kwh(df, data_frequency):
    df = df/1000 * data_frequency
    return df
def read_and_preprocess_df(path):
    df = pd.read_csv(path, header=None, names=["timestamp", "value"])
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    # set timestamp as index
    df = df.set_index("timestamp")
    df.sort_index(inplace=True)
    # resample to 7s and forward fill up to 35s
    df = df.resample("7s").ffill(limit=7).dropna()

    # convert to kWh
    df = watts2kwh(df, 7/3600)
    return df
# get house name and appliance name from file name
def parse_name(file_name : str):
    file_name = file_name.split("_")
    house_name = file_name[0].replace("home", "IDEAL_")
    appliance_name = file_name[3]
    if appliance_name == "electric-mains":
        appliance_name = "aggregate"

    if appliance_name == "electric-appliance":
        appliance_name = file_name[4].split(".")[0]
    return house_name, appliance_name

def process_house(house, file_list, data_path):
    house_data = {}
    for file in file_list:
        _, label, df = process_file(file, data_path)
        house_data[label] = df
    return house, house_data


def process_file(file,data_path):
    house, label = parse_name(file)
    return house, label, read_and_preprocess_df(data_path + "data_merged/" + file)

def process_files_for_home(house, file_list, data_path):
    house_data = {}
    for file in file_list:
        _, label, df = process_file(file, data_path)
        house_data[label] = df
    return house, house_data



In [None]:
test_str = "home168_kitchen1534_sensor12520_electric-appliance_washingmachinetumbledrier.csv.gz"


parse_name(test_str)

Serial program

In [None]:
data = {}
data_path = "./Energy_graph/data/temp/IDEAL/"
files = [file for file in os.listdir(data_path + "data_merged/") if ("electric-appliance" in file or "electric-mains" in file) and "home223" not in file]

for file in tqdm(files):
    house, label = parse_name(file)
    data.setdefault(house, {})[label] = read_and_preprocess_df(data_path+"data_merged/" + file)
        



Multiprocessed(takes around 1m:30s with 64 cores) 

In [None]:
from concurrent.futures import ProcessPoolExecutor
from tqdm.notebook import tqdm
from collections import defaultdict
def unpack_and_process(p):
    return process_house(*p)
# Main script body
data_path = "./Energy_graph/data/temp/IDEAL/"
data_dict = {}
files_grouped_by_home = defaultdict(list)
files = [file for file in os.listdir(data_path + "data_merged/") if ("electric-appliance" in file or "electric-mains" in file) and "home223" not in file]
for file in files:
    house, _ = parse_name(file)
    files_grouped_by_home[house].append(file)

total_houses = len(files_grouped_by_home)

print("Processing houses...")
with ProcessPoolExecutor(max_workers=int(os.cpu_count()/2)) as executor, tqdm(total=total_houses, desc="Processing houses", unit="house") as t:
    args = ((house, files_grouped_by_home[house], data_path) for house in files_grouped_by_home)
    
    for house_name, house_data in executor.map(unpack_and_process, args):
        data_dict[house_name] = house_data
        t.update(1)

In [None]:
data_dict

In [None]:
# save with pickle to: energy-knowledge-graph\data\processed\IDEAL.pkl

import pickle

with open('./Energy_graph/data/processed/IDEAL.pkl', 'wb') as f:
    pickle.dump(data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

# RAE
Room data not appliance

needs to be cited
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ZJW4LC

In [7]:
path = "./Energy_graph/data/temp/RAE/test/house1_subs_blk2.csv"

df = pd.read_csv(path)

In [13]:
test = df.copy()
df


Unnamed: 0,unix_ts,sub,V,f,I,dPF,aPF,P,Q,S,Pt,Qt,St
0,1457251200,1,119.7,60.0,0.0,0.99,0.03,0,0,3,1950,46,4074
1,1457251200,2,119.1,60.0,0.0,0.98,0.36,1,0,3,3139,526,4699
2,1457251200,3,119.7,60.0,0.0,0.26,0.01,0,0,1,39,41,806
3,1457251200,4,119.1,60.0,0.0,0.59,0.53,1,2,3,7238,1840,8348
4,1457251200,5,119.7,60.0,0.0,0.99,0.21,0,0,3,18935,4462,21632
...,...,...,...,...,...,...,...,...,...,...,...,...,...
126974635,1462690799,20,120.2,60.0,1.5,0.63,0.62,115,141,183,74782,77334,112016
126974636,1462690799,21,119.9,60.0,0.0,0.77,0.45,2,2,6,268792,19868,300476
126974637,1462690799,22,120.2,60.0,0.0,0.58,0.36,2,3,7,279871,54855,304911
126974638,1462690799,23,119.9,60.0,0.0,0.91,0.48,4,1,8,3408,3898,9459


In [14]:
test["unix_ts"] = pd.to_datetime(test["unix_ts"], unit="ms")


In [16]:
test.set_index("unix_ts").groupby("sub")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1d0ac7c340>

In [28]:
with open("./Energy_graph/data/temp/RAE/house1_labels.txt", "r") as f:
    lines = f.readlines()
    

# Process each line to build the dictionary
data_dict = {}
for line in lines:
    line = line.strip()
    if not line:
        continue
    parts = line.split(' ')
    print(parts)
    key = int(parts[0])
    value = parts[1].strip()
    data_dict[key] = value

print(data_dict)

['1', 'Kitchen', 'Wall', 'Oven', 'L1']
['2', 'Kitchen', 'Wall', 'Oven', 'L2']
['3', 'Kitchen', 'Counter', 'Plugs', 'L1']
['4', 'Kitchen', 'Counter', 'Plugs', 'L2']
['5', 'Clothes', 'Dryer', 'L1']
['6', 'Clothes', 'Dryer', 'L2']
['7', 'Upstairs', 'Bedroom', 'AFCI', 'Arc-Fault', 'Plugs']
['8', 'Kitchen', 'Fridge']
['9', 'Clothes', 'Washer']
['10', 'Kitchen', 'Dishwasher']
['11', 'Furnace', '&', 'Hot', 'Water', 'Unit', '(incl.', 'Furnace', 'Room', 'Plug)']
['12', 'Basement', 'Plugs', '&', 'Lights', '(incl.', 'Outside', 'Plugs)']
['13', 'Heat', 'Pump', 'L1']
['14', 'Heat', 'Pump', 'L2']
['15', 'Garage', 'Sub-Panel', 'L1']
['16', 'Garage', 'Sub-Panel', 'L2']
['17', 'Upstairs', 'Plugs', '&', 'Lights', 'L1', '(incl.', 'Bathroom', 'Lights', 'and', 'Vent', 'Fan,', 'Smoke', 'Alarms,', 'Living', 'Room', 'Plugs)']
['18', 'Upstairs', 'Plugs', '&', 'Lights', 'L2', '(incl.', 'Bathroom', 'Lights', 'and', 'Vent', 'Fan,', 'Smoke', 'Alarms,', 'Living', 'Room', 'Plugs)']
['19', 'Basement', 'Blue', 'Plugs'

# ECD-UY

In [6]:

def process_file(file):
    file_path = os.path.join(DATA_PATH, 'consumption_data', file)
    
    df = pd.read_csv(file_path)
    # pivot the dataframe so that each column is a different house with timestamps as the index and the values are the consumption
    df = df.pivot(index="datetime", columns="id", values="value")
    # convert the timestamps to datetime objects and set the correct timezone
    df.index = pd.to_datetime(df.index, unit='s', utc=True).tz_convert('America/Montevideo')
    
    temp_data = defaultdict(lambda: {"aggregate": []})
    # iterate over each column and add the data to the dictionary and drop missing values
    for col in df.columns:
        name = "ECDUY_" + str(col)
        temp_data[name]["aggregate"].append(df[col].dropna())
    
    return dict(temp_data)

In [10]:
from concurrent.futures import ProcessPoolExecutor 
data_path = './Energy_graph/data/temp/ECD-UY/'
# set this to the number of cores you want to use
cpu_count = 32



files = [f for f in os.listdir(os.path.join(data_path, 'consumption_data')) if f.endswith('.csv')]
len(files)
batch_size = 11  # or whatever size you deem fit, based on your system's number of cores and memory there is 22 file total for size 11 around 500gb is
data = defaultdict(lambda: {"aggregate": []})

for i in tqdm(range(0, len(files), batch_size)):
    batch_files = files[i:i + batch_size]
    with ProcessPoolExecutor(max_workers=cpu_count) as executor:
        results = list(executor.map(process_file, batch_files, data_path))
    for result in results:
        for key, value in result.items():
            data[key]["aggregate"].extend(value["aggregate"])

# Convert defaultdict back to a normal dictionary if needed
data = dict(data)

  0%|          | 0/2 [01:17<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: './consumption_data/consumption_data_201906.csv'

In [4]:
for key in tqdm(data):
    data[key]["aggregate"] = pd.concat(data[key]["aggregate"])
    

100%|██████████| 110952/110952 [01:19<00:00, 1394.48it/s]


In [5]:
# save  with pickle
import pickle

with open("./Energy_graph/data/processed/ECDUY.pkl", 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    

In [19]:
data = defaultdict(lambda: {"aggregate": []})
data_path = './Energy_graph/data/temp/ECD-UY/'

# Load data from csv files
for file in (os.listdir(os.path.join(data_path, 'consumption_data'))):
    if file.endswith('.csv'):
        file_path = os.path.join(data_path, 'consumption_data', file)
        
        df = pd.read_csv(file_path)
        df = df.pivot(index="datetime", columns="id", values="value")
        df.index = pd.to_datetime(df.index, unit='s').tz_localize('UTC').tz_convert('America/Montevideo')
        
        for col in tqdm(df.columns):
            name = "ECDUY_" + str(col)
            data[name]["aggregate"].append(df[col])

    break
data = dict(data)
print("Processed files")

100%|██████████| 75338/75338 [00:02<00:00, 30842.11it/s]

Processed files



