In [1]:
import os # built-in functions for working with the Windows system
from datetime import datetime as dt # library for getting and working with dates and time
from glob import glob # useful module for pattern matching and batch operations
import pandas as pd # most popular data science library for working with panel/tabular data
import numpy as np # most popular computational library, used in the backend of pandas
import scipy as sp # for more complex computational operations
import statsmodels.api as sm # for more advanced statistical testing
import matplotlib.pyplot as plt # popular plotting library
import seaborn as sns # wrapper for matplotlib to make plotting much easier
from sklearn import linear_model # scikit-learn is a very useful machine learning library with many models built in
from unidecode import unidecode

In [2]:
# define paths for users
try:
    username = os.getlogin()
except OSError:
    # os.getlogin() reports the user of the controlling terminal; a notebook
    # kernel started without one (service, container, hub) makes it fail, so
    # fall back to the environment instead of crashing the cell.
    username = os.getenv("USER") or os.getenv("USERNAME")
if username == "root":
    username = os.getenv("USER")

# Location of the shared "General - NEB Decarb" folder for each known user.
shared_roots = {
    "rose775": "/Users/rose775/Library/CloudStorage/OneDrive-PNNL/General - NEB Decarb",
    "kieren_username": "/General - NEB Decarb",
    "max_username": "/General - NEB Decarb",
}

if username in shared_roots:
    shared_root = shared_roots[username]
    # Inputs and outputs sit at the same relative location under every user's root.
    path_2024_1 = f"{shared_root}/Datasets/ResStock/2024.1"
    path_2024_2 = f"{shared_root}/Datasets/ResStock/2024.2"
    path_out_2024_1 = f"{shared_root}/Analysis/resstock_results/2024_1"
    path_out_2024_2 = f"{shared_root}/Analysis/resstock_results/2024_2"
else:
    # The path variables stay undefined for unknown users; add an entry to
    # shared_roots above before running the rest of the notebook.
    print("Who are you and why are you doing ResStock analysis?")

In [3]:
# Annual-results parquet files to load for each ResStock release. Every file
# name is "<run>_metadata_and_annual_results.parquet".
files_2024_1 = [
    f"{run}_metadata_and_annual_results.parquet"
    for run in ("baseline", "upgrade2.01", "upgrade2.02", "upgrade3.03", "upgrade3.07")
]

files_2024_2 = [
    f"{run}_metadata_and_annual_results.parquet"
    for run in (
        "baseline",
        "upgrade01", "upgrade02", "upgrade03", "upgrade04", "upgrade05",
        "upgrade06", "upgrade07", "upgrade08", "upgrade09", "upgrade10",
        "upgrade14", "upgrade15",
    )
]

In [4]:
def read_resstock_files(files, path_in, release=None):
    """Read ResStock annual-results parquet files into a dict of DataFrames.

    Parameters
    ----------
    files : list of str
        Parquet file names located in ``path_in``.
    path_in : str
        Directory that holds the files.
    release : str, optional
        Tag appended to every key (e.g. ``"2024_1"``). When omitted it is
        looked up by comparing ``files`` with the notebook-level
        ``files_2024_1`` / ``files_2024_2`` lists.

    Returns
    -------
    dfs : dict
        ``{"<prefix>_<release>": DataFrame}`` where ``<prefix>`` is the text of
        the file name before its first underscore. ``reset_index()`` is applied
        to every frame after reading.
    keys : list of str
        The keys of ``dfs`` in insertion order.

    Raises
    ------
    ValueError
        If ``release`` is omitted and ``files`` matches neither known list.
    """
    if release is None:
        # Compare by value, not identity: re-running the cell that defines the
        # file lists (or passing a copy) creates a new list object, and an `is`
        # test would then silently load nothing.
        if list(files) == files_2024_1:
            release = "2024_1"
        elif list(files) == files_2024_2:
            release = "2024_2"
        else:
            raise ValueError(
                "files matches neither files_2024_1 nor files_2024_2; "
                "pass release= explicitly"
            )

    dfs = {}
    for file in files:
        prefix = file.split('_')[0]
        dfs[f"{prefix}_{release}"] = pd.read_parquet(f"{path_in}/{file}").reset_index()

    return dfs, list(dfs)

In [5]:
def new_resstock_dataframes(dfs, files, release=None):
    """Trim every frame to its identifier columns plus the energy/emissions outputs.

    For each frame the columns from ``"bldg_id"`` through ``"upgrade"`` are
    joined with the columns from
    ``"out.site_energy.net.energy_consumption.kwh"`` through the release's last
    emissions column (both label slices are inclusive).

    Parameters
    ----------
    dfs : dict
        ``{name: DataFrame}`` of full ResStock result frames.
    files : list of str
        File list the frames were read from; used only to identify the release
        when ``release`` is not given.
    release : str, optional
        ``"2024_1"`` or ``"2024_2"``. When omitted it is looked up by comparing
        ``files`` with the notebook-level ``files_2024_1`` / ``files_2024_2``.

    Returns
    -------
    new_dfs : dict
        ``{"<name>_new": trimmed DataFrame}``.
    new_keys : list of str
        The keys of ``new_dfs`` in insertion order.

    Raises
    ------
    ValueError
        If the release cannot be determined or is not a known one.
    """
    # Last output column of the slice differs between the two releases.
    last_output_col = {
        "2024_1": "out.emissions.all_fuels.lrmer_mid_case_2030_boxavg.co2e_kg",
        "2024_2": "out.emissions.all_fuels.lrmer_mid_case_15.co2e_kg",
    }

    if release is None:
        # Compare by value, not identity: a re-created or copied file list is
        # a different object, and an `is` test would match neither branch and
        # leave the result undefined.
        if list(files) == files_2024_1:
            release = "2024_1"
        elif list(files) == files_2024_2:
            release = "2024_2"
        else:
            raise ValueError(
                "files matches neither files_2024_1 nor files_2024_2; "
                "pass release= explicitly"
            )
    if release not in last_output_col:
        raise ValueError(f"unknown release {release!r}; expected one of {sorted(last_output_col)}")

    first_col = "out.site_energy.net.energy_consumption.kwh"
    last_col = last_output_col[release]

    new_dfs = {
        f"{key}_new": df.loc[:, "bldg_id":"upgrade"].join(df.loc[:, first_col:last_col])
        for key, df in dfs.items()
    }

    return new_dfs, list(new_dfs)


In [6]:
def resstock_analysis(new_dfs, baseline_new, df_metadata, path_out):
    """Write climate-zone averages of (upgrade - baseline) for every upgrade frame.

    For each entry of ``new_dfs`` other than the two baseline keys, the frame
    and the baseline are coerced to numeric (non-numeric values become NaN),
    subtracted, concatenated column-wise with ``df_metadata``, grouped by
    ``'in.ashrae_iecc_climate_zone_2004'`` and averaged. The result is saved as
    ``"{path_out}/{key}_results.csv"``.

    Parameters
    ----------
    new_dfs : dict
        ``{key: DataFrame}`` of trimmed result frames; the keys
        ``"baseline_2024_1_new"`` and ``"baseline_2024_2_new"`` are skipped.
    baseline_new : DataFrame
        Baseline frame subtracted from every upgrade frame.
    df_metadata : DataFrame
        Columns placed in front of the differences; the combined frame must
        contain ``'in.ashrae_iecc_climate_zone_2004'``.
    path_out : str
        Existing directory that receives the CSV files.

    Returns
    -------
    None
    """
    zone_col = 'in.ashrae_iecc_climate_zone_2004'

    # The baseline is the same for every upgrade, so coerce it once up front
    # instead of repeating the conversion on every loop iteration.
    baseline_numeric = baseline_new.apply(pd.to_numeric, errors='coerce')

    for key, df in new_dfs.items():
        if key in ("baseline_2024_1_new", "baseline_2024_2_new"):
            continue

        upgrade_numeric = df.apply(pd.to_numeric, errors='coerce')

        # NOTE(review): the subtraction aligns on index and column labels, so it
        # presumes upgrade and baseline rows are in the same building order --
        # confirm against the input files.
        df_diff = upgrade_numeric - baseline_numeric

        # NOTE(review): the notebook's df_metadata also carries 'bldg_id', so
        # df_full can hold that label twice -- confirm this is intended.
        df_full = pd.concat([df_metadata, df_diff], axis=1)

        # Average every numeric column, leaving out the grouping column itself.
        numerical_cols = df_full.select_dtypes(include=np.number).columns.drop(zone_col, errors='ignore')
        df_averaged_cz = df_full.groupby(zone_col)[numerical_cols].mean().reset_index()

        # to .csv
        df_averaged_cz.to_csv(f"{path_out}/{key}_results.csv")

In [7]:
# Choose the ResStock release for this run: keep exactly one of the two
# assignments below active (comment out the other); nothing else needs editing.

# files, path_in, path_out = files_2024_1, path_2024_1, path_out_2024_1
files, path_in, path_out = files_2024_2, path_2024_2, path_out_2024_2

In [8]:
# Load every parquet file of the selected release; `keys` lists the names in `dfs`.
dfs, keys = read_resstock_files(files=files, path_in=path_in)

In [10]:
# Trim each loaded frame down to its identifier and output columns.
new_dfs, new_keys = new_resstock_dataframes(dfs=dfs, files=files)

In [11]:
# need to hold some data out of the function for analysis
# The baseline frame is located by its key prefix rather than by testing
# `files is files_2024_x`: that identity test fails once the file-list cell has
# been re-run, leaving df_metadata / baseline_new undefined or stale.
metadata_cols = ['bldg_id', 'in.sqft', 'weight', 'in.ashrae_iecc_climate_zone_2004', 'in.census_division', 'in.census_region', 'in.county']

baseline_key = next((key for key in dfs if key.startswith("baseline_")), None)
if baseline_key is None:
    raise KeyError("no 'baseline_*' frame found in dfs; re-run the cell that reads the ResStock files")

df_metadata = dfs[baseline_key][metadata_cols]
baseline_new = new_dfs[f"{baseline_key}_new"]

In [12]:
# Write one climate-zone summary CSV per upgrade into `path_out`.
resstock_analysis(
    new_dfs=new_dfs,
    baseline_new=baseline_new,
    df_metadata=df_metadata,
    path_out=path_out,
)