In [1]:
import pandas as pd
import os

In [2]:
# convert to kWh
def convert2kWh(df, interval_hours=0.25):
    """Convert power readings in kW to energy per sampling interval in kWh.

    Energy is power multiplied by the time it was drawn for, so every
    reading is scaled by the length of one sampling interval in hours.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with exactly two columns, in the order (timestamp,
        ``'power[kW]'``). The frame is left untouched.
    interval_hours : float, default 0.25
        Length of one sampling interval in hours (0.25 h = 15 min).

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``'time'`` and ``'power[kWh]'``.
    """
    # Work on a copy: mutating the caller's frame renamed its columns, so a
    # second call on the same frame failed with KeyError('power[kW]').
    converted = df.copy()
    # kW * h = kWh; a 15 min reading therefore contributes a quarter of its kW value
    converted['power[kW]'] = converted['power[kW]'] * interval_hours
    converted.columns = ['time', 'power[kWh]']
    return converted

# read the file and convert to kWh 
def process_file(file_path):
    """Load one meter CSV and return its energy readings indexed by time.

    The file is read without a header row as two columns (timestamp and
    power in kW), passed through ``convert2kWh``, and the timestamp column
    is parsed to datetimes and used as the index.

    Parameters
    ----------
    file_path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``'time'`` with a single ``'power[kWh]'`` column.
    """
    readings = pd.read_csv(file_path, names=["time", "power[kW]"], header=None)
    return (
        convert2kWh(readings)
        .assign(time=lambda frame: pd.to_datetime(frame["time"]))
        .set_index("time")
    )

# get apartment name from file path
def apartment_name(file_path : str):
    """Return the apartment name encoded in a data file's name.

    The name is everything in the file name before its first underscore,
    e.g. ``"Apt12_2015.csv"`` gives ``"Apt12"``.

    Parameters
    ----------
    file_path : str
        A bare file name or a path to the file.

    Returns
    -------
    str
        The part of the file name before the first ``"_"``; the whole file
        name if it contains no underscore.
    """
    # Strip directories first so an underscore in a folder name cannot be
    # mistaken for the separator inside the file name.
    return os.path.basename(file_path).split("_")[0]


def sort_key(s):
    """Sort key that orders names by the integer after their first three characters.

    The first three characters are skipped and the remainder is parsed
    with ``int``, so ``"Apt12"`` maps to ``12``.
    """
    numeric_part = s[3:]
    return int(numeric_part)


def calculate_loadprofiles(df):
    """Compute average load profiles on three time scales.

    Parameters
    ----------
    df : pandas.DataFrame
        Energy readings indexed by a ``DatetimeIndex``.

    Returns
    -------
    dict
        ``"daily"``   - mean hourly total for each hour of the day,
        ``"weekly"``  - mean daily total for each day of the week
        (0 = Monday),
        ``"monthly"`` - mean daily total for each day of the month.
        Each value is a DataFrame with the same columns as ``df``.
    """
    # min_count=1 leaves hours/days without any reading as NaN instead of a
    # fabricated 0 kWh, so gaps in the data are skipped by mean() below rather
    # than dragging the averages down.
    # 'h' is used because the upper-case 'H' alias is deprecated in pandas.
    hourly = df.resample('h').sum(min_count=1)
    daily = df.resample('D').sum(min_count=1)

    return {
        # average over all days, per hour of day
        "daily": hourly.groupby(hourly.index.hour).mean(),
        # average over all weeks, per day of week
        "weekly": daily.groupby(daily.index.dayofweek).mean(),
        # average over all months, per day of month
        "monthly": daily.groupby(daily.index.day).mean(),
    }



In [6]:
# path to the folder including the year folders
folder_path = "../../data/SMART/"

# apartment names taken from the CSV files in the 2015 folder,
# sorted by the number in the name
apartments = sorted(
    (
        apartment_name(csv_file)
        for csv_file in os.listdir(folder_path + "2015")
        if csv_file.endswith(".csv")
    ),
    key=sort_key,
)


In [4]:
# print apartments that appear in 2015 but not in 2014
# Both name sets are built here from the year folders so the cell runs on a
# fresh kernel; previously it relied on variables that no cell defines.
apartments_2014 = {
    apartment_name(csv_file)
    for csv_file in os.listdir(folder_path + "2014")
    if csv_file.endswith(".csv")
}
apartments_2015 = {
    apartment_name(csv_file)
    for csv_file in os.listdir(folder_path + "2015")
    if csv_file.endswith(".csv")
}
print("2015 but not 2014:", apartments_2015.difference(apartments_2014))

In [7]:
# stores load profiles for all apartments with the following structure: {apartment_name: load_profiles}
loadprofiles_for_all_apartments = {}
# these 6 apartments are missing in 2014 data but appear in 2015 and 2016
apartments_missing2014 = ['Apt65', 'Apt6', 'Apt21', 'Apt112', 'Apt94', 'Apt3']
from tqdm import tqdm
for apt in tqdm(apartments):
    # apartments without 2014 data start with the 2015 file
    if apt in apartments_missing2014:
        years = ["2015", "2016"]
    else:
        years = ["2014", "2015", "2016"]

    # read every available year and stack them into one frame
    df = pd.concat(
        [process_file(folder_path + year + "/" + apt + "_" + year + ".csv") for year in years]
    )

    loadprofiles_for_all_apartments[apt] = calculate_loadprofiles(df)
    





100%|██████████| 114/114 [01:36<00:00,  1.18it/s]


In [8]:
# write the load profiles of all apartments to a pickle file
import pickle

output_path = '../../data/SMART/loadprofiles.pickle'
with open(output_path, 'wb') as pickle_file:
    pickle.dump(
        loadprofiles_for_all_apartments,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )