In [1]:
# In this script, we set up the basics of downloading the data files for 300,000 buildings from the NREL repository

# The full data lake, which runs on an Amazon S3 server, is hosted at this URL: https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=nrel-pds-building-stock%2F
# Comstock 2024 AMY-1, the release we are looking for, is hosted in this URL: https://data.openei.org/s3_viewer?bucket=oedi-data-lake&prefix=nrel-pds-building-stock%2Fend-use-load-profiles-for-us-building-stock%2F2024%2Fcomstock_amy2018_release_1%2F

# first, we use boto3 to print the directory structure of this release bucket

import boto3 
from botocore import UNSIGNED
from botocore.config import Config

s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
bucket_name = 'oedi-data-lake'
prefix = 'nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/'

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
directories = [content['Prefix'] for content in response.get('CommonPrefixes', [])]

for dir in directories:
    print(dir)

nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/building_energy_models/
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/comparison_plots/
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/geographic_information/
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata_and_annual_results/
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/timeseries_aggregates/
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/timeseries_individual_buildings/
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/weather/


In [2]:
# we now explore the files present in metadata folder

prefix = 'nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/'

paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

files = []
for page in pages:
    for obj in page.get('Contents', []):
        files.append(obj['Key'])

for file in files:
    print(file)

nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/baseline.parquet
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upgrade01.parquet
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upgrade02.parquet
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upgrade03.parquet
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upgrade05.parquet
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upgrade06.parquet
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upgrade07.parquet
nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upg

In [3]:
# We now explore the metadata of the latest update. While doing the same, we will also store the latest update number which might be useful later.

import pandas as pd
import s3fs
import os

fs = s3fs.S3FileSystem(anon=True)
file_path = 'nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/metadata/upgrade32.parquet'
base_path = '/lcrc/project/NEXTGENOPT/NREL_COMSTOCK_DATA' # base path where the files are downloaded

update_version = ''.join(filter(str.isdigit, os.path.basename(file_path))) # version of the update which we are using

with fs.open(f's3://{bucket_name}/{file_path}') as f:
    df = pd.read_parquet(f)
    
# Also save the metadata file to disk
fs.get(f's3://{bucket_name}/{file_path}', base_path)

[None]

In [4]:
# We now explore the columns of the parquet file we just downloaded

col_values = df.columns.tolist()

for cval in col_values:
    print(cval)

metadata_index
upgrade
weight
in.sqft
calc.weighted.sqft
in.upgrade_name
applicability
in.building_america_climate_zone
in.cambium_grid_region
in.census_division_name
in.census_region_name
in.cluster_id
in.cluster_name
in.county_name
in.iso_rto_region
in.nhgis_county_gisjoin
in.nhgis_puma_gisjoin
in.nhgis_tract_gisjoin
in.reeds_balancing_area
in.state
in.state_name
in.airtightness..m3_per_m2_s
in.ashrae_iecc_climate_zone_2006
in.aspect_ratio
in.building_subtype
in.cejst_is_disadvantaged
in.comstock_building_type
in.comstock_building_type_group
in.economizer_changeover_temperature_fault_applicable
in.economizer_damper_stuck_fault_applicable
in.economizer_damper_stuck_fault_timing
in.ejscreen_census_tract_percentile_for_demographic_index
in.ejscreen_census_tract_percentile_for_less_than_hs_educ
in.ejscreen_census_tract_percentile_for_low_income
in.ejscreen_census_tract_percentile_for_people_in_ling_isol
in.ejscreen_census_tract_percentile_for_people_of_color
in.ejscreen_census_tract_perc

In [5]:
# List of unique state names (will be important later)
unique_states = list(set(df['in.state'].tolist()))
    
print(unique_states)

['NH', 'IN', 'NM', 'CA', 'GA', 'IL', 'MT', 'NE', 'AL', 'WI', 'IA', 'AR', 'AK', 'MD', 'NJ', 'VA', 'MO', 'CO', 'NV', 'SD', 'WV', 'NC', 'FL', 'TN', 'ID', 'MS', 'MI', 'NY', 'VT', 'OR', 'HI', 'KY', 'OH', 'UT', 'DE', 'MA', 'KS', 'RI', 'WA', 'LA', 'ND', 'AZ', 'OK', 'MN', 'TX', 'SC', 'PA', 'WY', 'ME', 'DC', 'CT']


In [8]:
# Create new folders for each state (plus weather)

bidx, state_name = df.index.tolist(), df['in.state'].tolist()
# We convert each into a dictionary, grouped by state name
grouped_by_state = {}
for a,b in zip(bidx,state_name):
    if b not in grouped_by_state:
        grouped_by_state[b] = []
    grouped_by_state[b].append(a)
    
# We now create an empty folder for each state and place a file called links.txt in each which contains the S3 prefix of the relevant files to be downloaded.
for key, value in grouped_by_state.items():
    os.makedirs(base_path+'/'+key, exist_ok = True)
    # populate the download list
    dlist = [f'nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/timeseries_individual_buildings/by_state/upgrade={update_version}/state={key}/{v}-{update_version}.parquet'
            for v in value]
    with open(base_path+'/'+key+'/links.txt', 'w') as file:
        for line in dlist:
            file.write(line + '\n')
            
# We now create an empty folder called weather where we load the weather files
pumas = list(set(df['in.nhgis_county_gisjoin'].tolist()))
os.makedirs(base_path+'/weather', exist_ok = True)
dlist = [f'nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2024/comstock_amy2018_release_1/weather/amy2018/{p}_2018.csv' for p in pumas]
with open(base_path+'/weather/links.txt', 'w') as file:
    for line in dlist:
        file.write(line + '\n')