In [11]:
%pip install pyarrow


Collecting pyarrow
  Using cached pyarrow-12.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.9 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-12.0.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import os
import numpy as np
from datetime import date

# HUE dataset

In [48]:
HUE_metadata = pd.read_parquet("./Energy_graph/data/energy-forecast/data/processed/HUE_metadata.parquet")

# metadata_refit = pd.read_parquet("./Energy_graph/data/energy-forecast/data/processed/refit_metadata.parquet")

HUE_metadata["name"] = "HUE_" + HUE_metadata["residential_id"].astype(str)



HUE_metadata.reset_index(drop=True, inplace=True)
HUE_metadata.drop(columns=["residential_id", "region", "tz"], inplace=True)
HUE_metadata["first_reading"] = HUE_metadata["first_reading"].dt.date
HUE_metadata["last_reading"] = HUE_metadata["last_reading"].dt.date
col = HUE_metadata.pop("name")
HUE_metadata.insert(0,"name", col)

HUE_metadata.rename(columns={"RUs": "rental_units"}, inplace=True)
HUE_metadata["AC"] = 1 - HUE_metadata["NAC"]

In [49]:
HUE_metadata["heating"] = np.where(HUE_metadata['GEOTH'] == 1, "geothermal", 'natural gas')


In [50]:
HUE_metadata = HUE_metadata.drop(columns=['SN', 'FAGF', 'HP', 'FPG', 'FPE', 'IFRHG', 'NAC', 'FAC', 'PAC',
       'BHE', 'IFRHE', 'WRHIR', 'GEOTH'])
HUE_metadata

Unnamed: 0,name,first_reading,last_reading,house_type,facing,rental_units,EVs,country,lat,lon,AC,heating
0,HUE_1,2012-06-01,2015-10-03,bungalow,S,1.0,0.0,Canada,49.083333,-122.35,1,natural gas
1,HUE_2,2016-06-09,2019-11-20,duplex,N,0.0,0.0,Canada,49.083333,-122.35,0,natural gas
2,HUE_3,2015-01-27,2018-01-29,modern,S,2.0,0.0,Canada,49.083333,-122.35,0,natural gas
3,HUE_4,2015-01-30,2018-01-29,character,W,1.0,0.0,Canada,49.083333,-122.35,0,natural gas
4,HUE_5,2015-01-30,2018-01-29,modern,S,1.0,0.0,Canada,49.083333,-122.35,0,natural gas
5,HUE_6,2015-01-30,2018-01-29,apartment,SW,0.0,0.0,Canada,49.083333,-122.35,0,natural gas
6,HUE_8,2015-02-21,2018-02-20,character,S,0.0,0.0,Canada,49.083333,-122.35,1,natural gas
7,HUE_9,2015-05-01,2018-02-21,special,S,0.0,0.0,Canada,49.083333,-122.35,0,natural gas
8,HUE_10,2015-02-21,2018-02-20,special,S,0.0,0.0,Canada,49.083333,-122.35,0,natural gas
9,HUE_11,2015-02-21,2018-02-20,duplex,N,0.0,0.0,Canada,49.083333,-122.35,0,natural gas


# REFIT

In [51]:
REFIT_metadata = pd.read_parquet("./Energy_graph/data/energy-forecast/data/processed/refit_metadata.parquet")

REFIT_metadata.drop(columns=["tz", "location"], inplace=True)
REFIT_metadata['name'] = 'REFIT_' + REFIT_metadata['house'].astype(str)

In [52]:

# Suppose we have two dataframes df1 and df2
df1 = pd.DataFrame({
   'A': ['A0', 'A1', 'A2', 'A3'],
   'B': ['B0', 'B1', 'B2', 'B3'],
   'C': ['C0', 'C1', 'C2', 'C3'],
   'D': ['D0', 'D1', 'D2', 'D3']},
   index=[0, 1, 2, 3])

df2 = pd.DataFrame({
   'B': ['B2', 'B3', 'B6', 'B7'],
   'D': ['D2', 'D3', 'D6', 'D7'],
   'F': ['F2', 'F3', 'F6', 'F7']},
   index=[2, 3, 6, 7])

# Using concat
result = pd.concat([df1, df2], axis=0) # axis=0 means concatenate along the row
result

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [53]:

REFIT_metadata.reset_index(drop=True, inplace=True)
REFIT_metadata.drop(columns=["house","appliances"], inplace=True)
col = REFIT_metadata.pop("name")
REFIT_metadata.insert(0, col.name, col)

In [54]:
REFIT_metadata["house_type"] = REFIT_metadata["house_type"].replace(" Detached   ", "house")
REFIT_metadata["country"] = REFIT_metadata["country"].replace("GB", "United Kingdom")


In [55]:

data = pd.read_pickle("./Energy_graph/data/processed/REFIT.pkl")
data.keys()

dict_keys(['REFIT_1', 'REFIT_2', 'REFIT_3', 'REFIT_4', 'REFIT_5', 'REFIT_6', 'REFIT_7', 'REFIT_8', 'REFIT_9', 'REFIT_10', 'REFIT_11', 'REFIT_12', 'REFIT_13', 'REFIT_15', 'REFIT_16', 'REFIT_17', 'REFIT_18', 'REFIT_19', 'REFIT_20', 'REFIT_21'])

In [56]:
start_end = {}
for house in data.keys():
    # print(house)
    start_end[house] = {}
    start_end[house]['first_reading'] = data[house]["aggregate"].index.min().date()
    start_end[house]['last_reading'] = data[house]["aggregate"].index.max().date()



In [57]:
first_readings = [start_end[h]["first_reading"] for h in start_end]
last_readings = [start_end[h]["last_reading"] for h in start_end]
REFIT_metadata["first_reading"] = first_readings
REFIT_metadata["last_reading"] = last_readings
REFIT_metadata.drop(columns=["appliances_owned"], inplace=True)

In [58]:
REFIT_metadata

Unnamed: 0,name,occupancy,construction_year,house_type,house_size,country,lat,lon,first_reading,last_reading
0,REFIT_1,2,1975-1980,house,4 bed,United Kingdom,52.7709,-1.2097,2013-10-09,2015-07-10
1,REFIT_2,4,-,Semi-detached,3 bed,United Kingdom,52.7709,-1.2097,2013-09-17,2015-05-28
2,REFIT_3,2,1988,house,3 bed,United Kingdom,52.7709,-1.2097,2013-09-25,2015-06-02
3,REFIT_4,2,1850-1899,house,4 bed,United Kingdom,52.7709,-1.2097,2013-10-11,2015-07-07
4,REFIT_5,4,1878,Mid-terrace,4 bed,United Kingdom,52.7709,-1.2097,2013-09-26,2015-07-06
5,REFIT_6,2,2005,house,4 bed,United Kingdom,52.7709,-1.2097,2013-11-28,2015-06-28
6,REFIT_7,4,1965-1974,house,3 bed,United Kingdom,52.7709,-1.2097,2013-11-01,2015-07-08
7,REFIT_8,2,1966,house,2 bed,United Kingdom,52.7709,-1.2097,2013-11-01,2015-05-11
8,REFIT_9,2,1919-1944,house,3 bed,United Kingdom,52.7709,-1.2097,2013-12-17,2015-07-08
9,REFIT_10,4,1919-1944,house,3 bed,United Kingdom,52.7709,-1.2097,2013-11-20,2015-06-30


# UCIML

In [59]:
from datetime import date
data_uciml = pd.read_parquet("./Energy_graph/data/energy-forecast/data/processed/uciml_household.parquet")
# 2006-12-16
data_uciml.drop(columns=["global_active_power", "global_reactive_power", "voltage", "global_intensity", "sub_metering_1", "sub_metering_2", "sub_metering_3", "unmetered"], inplace=True)

In [60]:
# meta data for uciml
first_reading = data_uciml["timestamp"].min().date()
last_reading = data_uciml["timestamp"].max().date()
country = data_uciml["country"].iloc[0]
region = data_uciml["region"].iloc[0]
lat = data_uciml["lat"].iloc[0]
lon = data_uciml["lon"].iloc[0]
tz = data_uciml["tz"].iloc[0]
   
data = {
    "name" : "UCIML_1",
    "first_reading" :first_reading,
    "last_reading" :last_reading,
    "house_type" : "house",
    "country" :country,
    "lat" :lat,
    "lon" :lon,
    }

In [61]:
UCIML_metadata = pd.DataFrame(data, index=[0])
UCIML_metadata

Unnamed: 0,name,first_reading,last_reading,house_type,country,lat,lon
0,UCIML_1,2006-12-16,2010-11-26,house,France,48.77644,2.29026


# HES

In [62]:
# data from https://github.com/ETSSmartRes/HES-Dataset

data = {
    "name" : "HES_1",
    "first_reading" : date(2018, 5, 12),
    "last_reading" : date(2018,10, 10),
    "lat": 	45.508888,
    "lon": -73.561668,
    "house_type": "house",
    "country": "Canada",
}

HES_meta = pd.DataFrame(data, index=[0])
HES_meta

Unnamed: 0,name,first_reading,last_reading,lat,lon,house_type,country
0,HES_1,2018-05-12,2018-10-10,45.508888,-73.561668,house,Canada


# ECO

In [63]:
# lat: 47.36667 
# lon: 8.55
houses = {
    'ECO_1': {
        'first_reading': date(2012, 6, 1),
        'last_reading': date(2013, 1, 31),
        'country': 'Switzerland'
    },
    'ECO_2': {
        'first_reading': date(2012, 6, 1),
        'last_reading': date(2013, 1, 31),
        'country': 'Switzerland'
    },
    'ECO_3': {
        'first_reading': date(2012, 7, 26),
        'last_reading': date(2013, 1, 31),
        'country': 'Switzerland'
    },
    'ECO_4': {
        'first_reading': date(2012, 7, 26),
        'last_reading': date(2013, 1, 31),
        'country': 'Switzerland'
    },
    'ECO_5': {
        'first_reading': date(2012, 7, 26),
        'last_reading': date(2013, 1, 31),
        'country': 'Switzerland'
    },
    'ECO_6': {
        'first_reading': date(2012, 7, 26),
        'last_reading': date(2013, 1, 31),
        'country': 'Switzerland'
    }
}

ECO_metadata = pd.DataFrame(houses).T
ECO_metadata.reset_index(inplace=True)
ECO_metadata.rename(columns={'index': 'name'}, inplace=True)
ECO_metadata

Unnamed: 0,name,first_reading,last_reading,country
0,ECO_1,2012-06-01,2013-01-31,Switzerland
1,ECO_2,2012-06-01,2013-01-31,Switzerland
2,ECO_3,2012-07-26,2013-01-31,Switzerland
3,ECO_4,2012-07-26,2013-01-31,Switzerland
4,ECO_5,2012-07-26,2013-01-31,Switzerland
5,ECO_6,2012-07-26,2013-01-31,Switzerland


# LERTA


In [4]:
lerta = pd.read_pickle("./Energy_graph/data/processed/LERTA.pkl")
lerta

{'LERTA_1': {'AGGREGATE':                            AGGREGATE
  Time                                
  2020-02-27 00:00:36+00:00   0.000107
  2020-02-27 00:00:42+00:00   0.000107
  2020-02-27 00:00:48+00:00   0.000107
  2020-02-27 00:00:54+00:00   0.000107
  2020-02-27 00:01:00+00:00   0.000107
  ...                              ...
  2021-02-07 05:54:18+00:00   0.003595
  2021-02-07 05:54:24+00:00   0.003595
  2021-02-07 05:54:30+00:00   0.003595
  2021-02-07 05:54:36+00:00   0.003595
  2021-02-07 05:54:42+00:00   0.003595
  
  [4985942 rows x 1 columns],
  '5c600454ff5adf000142cdc8':                            5c600454ff5adf000142cdc8
  Time                                               
  2020-02-27 00:00:36+00:00                       0.0
  2020-02-27 00:00:42+00:00                       0.0
  2020-02-27 00:00:48+00:00                       0.0
  2020-02-27 00:00:54+00:00                       0.0
  2020-02-27 00:01:00+00:00                       0.0
  ...                         

In [8]:



houses = {
    'LERTA_1': {
        'first_reading': pd.to_datetime(lerta["LERTA_1"]["AGGREGATE"].index).min().date(),
        'last_reading': pd.to_datetime(lerta["LERTA_1"]["AGGREGATE"].index).max().date(),
        'country': 'Poland',
        
    },
    'LERTA_2': {
        'first_reading': pd.to_datetime(lerta["LERTA_2"]["AGGREGATE"].index).min().date(),
        'last_reading': pd.to_datetime(lerta["LERTA_2"]["AGGREGATE"].index).max().date(),
        'country': 'Poland'
    },
    'LERTA_3': {
        'first_reading': pd.to_datetime(lerta["LERTA_3"]["AGGREGATE"].index).min().date(),
        'last_reading': pd.to_datetime(lerta["LERTA_3"]["AGGREGATE"].index).max().date(),
        'country': 'Poland'
    },
    'LERTA_4': {
        'first_reading': pd.to_datetime(lerta["LERTA_4"]["AGGREGATE"].index).min().date(),
        'last_reading': pd.to_datetime(lerta["LERTA_4"]["AGGREGATE"].index).max().date(),
        'country': 'Poland'
    },

}


In [9]:
LERTA_metadata = (pd.DataFrame(houses).T).reset_index()
LERTA_metadata.rename(columns={'index': 'name'}, inplace=True)
LERTA_metadata

Unnamed: 0,name,first_reading,last_reading,country
0,LERTA_1,2020-02-27,2021-02-07,Poland
1,LERTA_2,2020-02-27,2021-07-19,Poland
2,LERTA_3,2020-02-27,2021-07-19,Poland
3,LERTA_4,2020-02-27,2020-03-10,Poland


# UKDALE

In [67]:

import yaml
path = "./shared/Energy_graph_datasets/raw/UK-DALE/metadata/dataset.yaml"

with open(path, 'r') as file:
    data = yaml.safe_load(file)

# Now data contains the contents of your YAML file
print(data["geo_location"])

lat  = data["geo_location"]["latitude"]
lon = data["geo_location"]["longitude"]




{'country': 'GB', 'latitude': 51.464462, 'locality': 'London', 'longitude': -0.076544}


In [68]:
os.listdir("./shared/Energy_graph_datasets/raw/UK-DALE/metadata/")

house_data = {}

for file in os.listdir("./shared/Energy_graph_datasets/raw/UK-DALE/metadata/"):
    if file.endswith(".yaml") and "building" in file:
        # print(file)
        with open("./shared/Energy_graph_datasets/raw/UK-DALE/metadata/" + file, 'r') as stream:
            try:
                data = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print(exc)
        
        start = data["timeframe"]["start"].split("T")[0]
        end = data["timeframe"]["end"].split("T")[0]
        heating = np.nan
        occupants = np.nan
        if "heating" in data:
            heating = data["heating"][0]

        if "n_occupants" in data:
            occupants = data["n_occupants"]

        name = file.split(".")[0]
        name = "UKDALE_"+name[-1]
        house_data[name] = {
            "first_reading": start,
            "last_reading": end,
            "heating": heating,
            "occupancy": occupants,
            "lat": lat,
            "lon": lon,
            "country": "United Kingdom",
        }
        

In [69]:
UKDALE_metadata = pd.DataFrame(house_data).transpose()
UKDALE_metadata.sort_index(inplace=True)
UKDALE_metadata.reset_index(inplace=True)
UKDALE_metadata.rename(columns={'index': 'name'}, inplace=True)
UKDALE_metadata

Unnamed: 0,name,first_reading,last_reading,heating,occupancy,lat,lon,country
0,UKDALE_1,2012-11-09,2017-04-26,natural gas,4.0,51.464462,-0.076544,United Kingdom
1,UKDALE_2,2013-02-17,2013-10-10,natural gas,2.0,51.464462,-0.076544,United Kingdom
2,UKDALE_3,2013-02-27,2013-04-08,,,51.464462,-0.076544,United Kingdom
3,UKDALE_4,2013-03-09,2013-10-01,natural gas,2.0,51.464462,-0.076544,United Kingdom
4,UKDALE_5,2014-06-29,2014-11-13,natural gas,2.0,51.464462,-0.076544,United Kingdom


In [70]:
df = pd.concat([HUE_metadata, REFIT_metadata, UCIML_metadata, ECO_metadata, HES_meta, LERTA_metadata, UKDALE_metadata], axis=0, ignore_index=True)


In [71]:
df["name"].unique()

# columns = []

array(['HUE_1', 'HUE_2', 'HUE_3', 'HUE_4', 'HUE_5', 'HUE_6', 'HUE_8',
       'HUE_9', 'HUE_10', 'HUE_11', 'HUE_12', 'HUE_13', 'HUE_14',
       'HUE_15', 'HUE_16', 'HUE_17', 'HUE_18', 'HUE_19', 'HUE_20',
       'HUE_21', 'HUE_22', 'HUE_23', 'HUE_24', 'HUE_25', 'HUE_26',
       'HUE_27', 'HUE_28', 'REFIT_1', 'REFIT_2', 'REFIT_3', 'REFIT_4',
       'REFIT_5', 'REFIT_6', 'REFIT_7', 'REFIT_8', 'REFIT_9', 'REFIT_10',
       'REFIT_11', 'REFIT_12', 'REFIT_13', 'REFIT_15', 'REFIT_16',
       'REFIT_17', 'REFIT_18', 'REFIT_19', 'REFIT_20', 'REFIT_21',
       'UCIML_1', 'ECO_1', 'ECO_2', 'ECO_3', 'ECO_4', 'ECO_5', 'ECO_6',
       'HES_1', 'LERTA_1', 'LERTA_2', 'LERTA_3', 'LERTA_4', 'UKDALE_1',
       'UKDALE_2', 'UKDALE_3', 'UKDALE_4', 'UKDALE_5'], dtype=object)

In [72]:
df['first_reading'] = pd.to_datetime(df['first_reading'])
df['last_reading'] = pd.to_datetime(df['last_reading'])

In [73]:

# df to parquet
# df.to_parquet('./Energy_graph/data/metadata/residential_metadata.parquet')

In [14]:
import pandas as pd 
SAVE_PATH = "../../data/metadata/"

test_old = pd.read_parquet(SAVE_PATH+'residential_metadata.parquet')
test_new = pd.read_parquet(SAVE_PATH+'residential_metadata_test.parquet')

df1 = test_old.sort_values(by=['name'])
df2 = test_new.sort_values(by=['name'])

df1 = df1.reset_index(drop=True).sort_values(by=['name'])
df2 = df2.reset_index(drop=True).sort_values(by=['name'])

df1.equals(df2)

True

In [75]:
print(test["lat"].unique())
print(test["lon"].unique())


[49.083333 48.511    52.7709   48.77644        nan 45.508888 51.464462]
[-1.2235000e+02 -1.2341300e+02 -1.2097000e+00  2.2902600e+00
            nan -7.3561668e+01 -7.6544000e-02]


python generate_metadata.py <data_path> <save_path> [--save]

Where:

- <data_path> is the path to your dataset file.
- <save_path> is the path to the directory where you want to save the metadata file.
- --save is an optional argument. If you include it, the metadata will be saved to the <save_path>; otherwise, the metadata will be generated but not saved.

`python generate_metadata.py <data_path> <save_path> [--save]`
Where:
* `<data_path>` is the path to your dataset file.
* `<save_path>` is the path to the directory where you want to save the metadata file.
* `--save` is an optional argument. If you include it, the metadata will be saved to the `<save_path>`; otherwise, the metadata will be generated but not saved.

For example:



# SMART
TODO
