In [1]:
import os # built-in functions for working with the Windows system
import pandas as pd # most popular data science library for working with panel/tabular data
import numpy as np

In [2]:
# Define the data and output locations for the person running the notebook.

def resstock_paths(onedrive_root):
    """Build the ResStock input and output folders that live under ``onedrive_root``.

    Parameters
    ----------
    onedrive_root : str
        Prefix placed in front of the shared "General - NEB Decarb" folder
        (an empty string yields paths that start at "/General - NEB Decarb").

    Returns
    -------
    tuple of str
        ``(path_2024_1, path_2024_2, path_out_2024_1, path_out_2024_2)``.
    """
    project_dir = f"{onedrive_root}/General - NEB Decarb"
    return (
        f"{project_dir}/Datasets/ResStock/2024.1",
        f"{project_dir}/Datasets/ResStock/2024.2",
        f"{project_dir}/Analysis/resstock_results/2024_1",
        f"{project_dir}/Analysis/resstock_results/2024_2",
    )


# os.getlogin() raises OSError when the process has no controlling terminal
# (which can happen for notebook kernels), so fall back to the environment.
try:
    username = os.getlogin()
except OSError:
    username = None
if not username or username == "root":
    username = os.getenv("USER") or os.getenv("USERNAME")

# Per-user prefix in front of the shared project folder.
# NOTE(review): "kieren_username" / "max_username" and their empty prefixes look like placeholders - confirm.
onedrive_roots = {
    "rose775": "/Users/rose775/Library/CloudStorage/OneDrive-PNNL",
    "kieren_username": "",
    "max_username": "",
}

if username in onedrive_roots:
    path_2024_1, path_2024_2, path_out_2024_1, path_out_2024_2 = resstock_paths(onedrive_roots[username])
else:
    # Unknown user: no paths are defined, so later cells that need them will fail.
    print("Who are you and why are you doing ResStock analysis?")

In [3]:
# Every results file name is a scenario label followed by this shared suffix.
results_suffix = "_metadata_and_annual_results.parquet"

# 2024.1 release: baseline plus upgrades 2.01, 2.02, 3.03 and 3.07.
scenarios_2024_1 = ["baseline", "upgrade2.01", "upgrade2.02", "upgrade3.03", "upgrade3.07"]
files_2024_1 = [scenario + results_suffix for scenario in scenarios_2024_1]

# 2024.2 release: baseline plus upgrades 01-10, 14 and 15 (two-digit, zero-padded labels).
scenarios_2024_2 = ["baseline"] + [f"upgrade{number:02d}" for number in (*range(1, 11), 14, 15)]
files_2024_2 = [scenario + results_suffix for scenario in scenarios_2024_2]

In [4]:
def read_resstock_files(files, path_in):
    """Read a release's ResStock parquet files into a dict of DataFrames.

    Parameters
    ----------
    files : list of str
        Must be the module-level ``files_2024_1`` or ``files_2024_2`` list
        itself (checked by identity); this decides the release suffix.
    path_in : str
        Folder that contains the parquet files.

    Returns
    -------
    dfs : dict
        Maps ``"<scenario>_<release>"`` (e.g. ``"baseline_2024_2"``) to the
        file's DataFrame after ``reset_index()``.
    keys : list of str
        The keys of ``dfs`` in read order.

    Raises
    ------
    ValueError
        If ``files`` is neither known list. Previously this case silently
        returned an empty dict, which only surfaced as errors in later cells.
    """
    if files is files_2024_1:
        release = "2024_1"
    elif files is files_2024_2:
        release = "2024_2"
    else:
        raise ValueError("files must be the files_2024_1 or files_2024_2 list")

    dfs = {}
    for file in files:
        # Text before the first underscore is the scenario label ("baseline" or "upgradeXX").
        scenario = file.split('_')[0]
        # reset_index() moves the stored index into ordinary columns
        # (presumably bldg_id, which later steps expect as a column - confirm).
        dfs[f"{scenario}_{release}"] = pd.read_parquet(f"{path_in}/{file}").reset_index()

    return dfs, list(dfs)

In [5]:
def new_resstock_dataframes(dfs, files):
    """Trim each loaded frame to its id columns plus the energy/emissions outputs.

    For every frame the result keeps the columns from ``"bldg_id"`` through
    ``"upgrade"`` and the columns from
    ``"out.site_energy.net.energy_consumption.kwh"`` through the release's last
    emissions column. Both ranges are label slices, so they are inclusive and
    follow the column order of the input frame.

    Parameters
    ----------
    dfs : dict
        Maps a name to a DataFrame, as returned by ``read_resstock_files``.
    files : list of str
        Must be the module-level ``files_2024_1`` or ``files_2024_2`` list
        itself (checked by identity); the two releases end the emissions block
        on different column names.

    Returns
    -------
    new_dfs : dict
        Maps ``"<name>_new"`` to the trimmed DataFrame.
    new_keys : list of str
        The keys of ``new_dfs``.

    Raises
    ------
    ValueError
        If ``files`` is neither known list. Previously this case failed with
        an ``UnboundLocalError`` on ``new_keys``.
    """
    first_output = "out.site_energy.net.energy_consumption.kwh"
    if files is files_2024_1:
        last_output = "out.emissions.all_fuels.lrmer_mid_case_2030_boxavg.co2e_kg"
    elif files is files_2024_2:
        last_output = "out.emissions.all_fuels.lrmer_mid_case_15.co2e_kg"
    else:
        raise ValueError("files must be the files_2024_1 or files_2024_2 list")

    new_dfs = {}
    for key, df in dfs.items():
        id_columns = df.loc[:, "bldg_id":"upgrade"]
        output_columns = df.loc[:, first_output:last_output]
        # join() matches rows on the shared index and already returns a new DataFrame.
        new_dfs[key + "_new"] = id_columns.join(output_columns)

    return new_dfs, list(new_dfs)


In [6]:
def _align_on_bldg_id(frames):
    """Re-index every frame by its ``bldg_id`` column so rows match by building.

    Returns the frames unchanged (same order) if any of them lacks exactly one
    ``bldg_id`` column or holds missing/duplicate ids, so the caller keeps the
    positional row alignment in that case.
    """
    keyed = []
    for frame in frames:
        if list(frame.columns).count("bldg_id") != 1:
            return list(frames)
        ids = pd.to_numeric(frame["bldg_id"], errors="coerce")
        if ids.isna().any() or not ids.is_unique:
            return list(frames)
        keyed_frame = frame.drop(columns="bldg_id")  # new object; the caller's frame is not modified
        keyed_frame.index = pd.Index(ids, name="bldg_id")
        keyed.append(keyed_frame)
    return keyed


def resstock_analysis(new_dfs, baseline_new, df_metadata, path_out):
    """Write climate-zone averages of upgrade-minus-baseline results, one CSV per upgrade.

    For every entry of ``new_dfs`` except the baseline keys, the frame and the
    baseline are coerced to numeric, subtracted, combined with ``df_metadata``,
    averaged (unweighted mean) per ``in.ashrae_iecc_climate_zone_2004`` and
    written to ``"<path_out>/<key>_results.csv"``.

    Rows are matched by ``bldg_id`` whenever the upgrade frame, the baseline
    and the metadata each carry one column of unique, non-missing ids. Matching
    by row position (the fallback) is only correct if every file lists the same
    buildings in the same order.

    Parameters
    ----------
    new_dfs : dict
        Maps ``"<scenario>_<release>_new"`` to a trimmed results DataFrame.
    baseline_new : pandas.DataFrame
        Trimmed baseline results that are subtracted from every upgrade.
    df_metadata : pandas.DataFrame
        Building characteristics; must contain the climate-zone column.
    path_out : str
        Existing folder that receives the CSV files.

    Returns
    -------
    None
    """
    # The baseline is the same for every upgrade, so coerce it once instead of per iteration.
    baseline_numeric = baseline_new.apply(pd.to_numeric, errors='coerce')

    for key, df in new_dfs.items():
        if key in ("baseline_2024_1_new", "baseline_2024_2_new"):
            continue

        upgrade_numeric = df.apply(pd.to_numeric, errors='coerce')
        upgrade_rows, baseline_rows, metadata_rows = _align_on_bldg_id(
            [upgrade_numeric, baseline_numeric, df_metadata]
        )

        df_diff = upgrade_rows - baseline_rows
        df_full = pd.concat([metadata_rows, df_diff], axis=1)

        # Select numerical columns excluding the specific non-numerical or grouping column
        numerical_cols = df_full.select_dtypes(include=np.number).columns.drop('in.ashrae_iecc_climate_zone_2004', errors='ignore')
        df_averaged_cz = df_full.groupby('in.ashrae_iecc_climate_zone_2004')[numerical_cols].mean().reset_index()

        keeper_cols = ["in.ashrae_iecc_climate_zone_2004", "in.sqft", "out.site_energy.net.energy_consumption.kwh",
                       "out.emissions.all_fuels.lrmer_high_re_cost_2030_boxavg.co2e_kg",
                       "out.emissions.all_fuels.lrmer_low_re_cost_2030_boxavg.co2e_kg",
                       "out.emissions.all_fuels.lrmer_mid_case_2030_boxavg.co2e_kg",
                       "out.emissions.all_fuels.lrmer_high_re_cost_15.co2e_kg",
                       "out.emissions.all_fuels.lrmer_low_re_cost_15.co2e_kg",
                       "out.emissions.all_fuels.lrmer_mid_case_15.co2e_kg"]
        # Only the columns that exist are kept, because v1 and v2 name their emissions columns differently.
        present_cols = [col for col in keeper_cols if col in df_averaged_cz.columns]
        df_averaged_cz = df_averaged_cz.loc[:, present_cols].round(0)

        # to .csv (the row index is written as the first, unnamed column)
        df_averaged_cz.to_csv(f"{path_out}/{key}_results.csv")

In [7]:
# Set `release` to "2024_1" or "2024_2" to choose the run; no other modifications needed
release = "2024_2"

# The list objects are handed over as-is because later steps recognise the
# release by checking `files is files_2024_1` / `files is files_2024_2`.
run_settings = {
    "2024_1": (files_2024_1, path_2024_1, path_out_2024_1),
    "2024_2": (files_2024_2, path_2024_2, path_out_2024_2),
}
files, path_in, path_out = run_settings[release]

In [8]:
# Read every parquet listed in `files` from `path_in`: `dfs` maps "<scenario>_<release>" to its DataFrame, `keys` lists those names
dfs, keys = read_resstock_files(files, path_in=path_in)

In [9]:
# Trim each loaded frame to the bldg_id..upgrade columns plus the energy/emissions outputs; results are keyed "<name>_new"
new_dfs, new_keys = new_resstock_dataframes(dfs, files)

In [10]:
# The analysis function takes the baseline results and the building metadata as
# separate arguments, so pull both out of the dictionaries here.
metadata_cols = ['bldg_id', 'in.sqft', 'weight', 'in.ashrae_iecc_climate_zone_2004', 'in.census_division', 'in.census_region', 'in.county']

# Work out which release is loaded; stays None if `files` is neither known list.
if files is files_2024_1:
    release_tag = "2024_1"
elif files is files_2024_2:
    release_tag = "2024_2"
else:
    release_tag = None

if release_tag is not None:
    df_metadata = dfs[f"baseline_{release_tag}"][metadata_cols]
    baseline_new = new_dfs[f"baseline_{release_tag}_new"]

In [11]:
# Write one CSV per upgrade into `path_out`: upgrade-minus-baseline results averaged by climate zone
resstock_analysis(new_dfs, baseline_new=baseline_new, df_metadata=df_metadata, path_out=path_out)