In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
import io

In [2]:
# Only these two columns are loaded from each per-building parquet file.
columns_to_read = [
    "timestamp",
    "out.zone_mean_air_temp.conditioned_space.c",
]

# Directory-listing URL template; {upgrade} and {state} are substituted per request.
# Adjacent string literals are concatenated into one URL (path separators are
# already percent-encoded as %2F, '=' as %3D).
base_url = (
    "https://data.openei.org/s3_viewer?bucket=oedi-data-lake"
    "&prefix=nrel-pds-building-stock"
    "%2Fend-use-load-profiles-for-us-building-stock"
    "%2F2024"
    "%2Fresstock_amy2018_release_2"
    "%2Ftimeseries_individual_buildings"
    "%2Fby_state"
    "%2Fupgrade%3D{upgrade}"
    "%2Fstate%3D{state}%2F"
)

# Upgrade identifiers 0 through 15 inclusive.
upgrades = list(range(16))

# Fifty two-letter state codes, in the order they will be requested.
states = (
    "AL AK AZ AR CA CO CT DE FL GA HI ID "
    "IL IN IA KS KY LA ME MD MA MI MN MS "
    "MO MT NE NV NH NJ NM NY NC ND OH OK "
    "OR PA RI SC SD TN TX UT VT VA WA WV "
    "WI WY"
).split()

In [3]:
# Accumulator for the downloaded data: one DataFrame per string key.
dataframes_dict = dict()

In [4]:
# Download the selected columns of every per-building parquet file for each
# (upgrade, state) combination and store them in ``dataframes_dict`` under the
# key "upgrade_<upgrade>_state_<state>_bldg_id_<bldg_id>".
#
# Failures (network errors or non-200 responses) are reported and skipped so
# that one bad request does not abort the whole run.
# NOTE(review): every DataFrame is kept in memory; across all upgrades and
# states this may be very large -- confirm the machine can hold it.

# Seconds to wait for a server response; without a timeout a stalled
# connection would block the loop indefinitely.
REQUEST_TIMEOUT_S = 60

# One session for all requests so HTTP connections are pooled and reused.
with requests.Session() as session:
    for upgrade in upgrades:
        for state in states:
            # Directory-listing page for this upgrade/state combination.
            url = base_url.format(upgrade=upgrade, state=quote(state, safe=""))

            try:
                response = session.get(url, timeout=REQUEST_TIMEOUT_S)
            except requests.RequestException as exc:
                print(f"Failed to access URL: {url}. Error: {exc}")
                continue

            if response.status_code != 200:
                print(f"Failed to access URL: {url}. Status code: {response.status_code}")
                continue

            # Every anchor on the listing page is a candidate file link.
            soup = BeautifulSoup(response.content, "html.parser")

            for link in soup.find_all("a"):
                file_url = link.get("href")
                if not file_url:
                    # Anchors without an href would make os.path.basename raise.
                    continue

                file_name = os.path.basename(file_url)
                if not file_name.endswith(".parquet"):
                    # Only parquet files are of interest; skip navigation links etc.
                    continue

                try:
                    file_response = session.get(file_url, timeout=REQUEST_TIMEOUT_S)
                except requests.RequestException as exc:
                    print(f"Failed to download file: {file_url}. Error: {exc}")
                    continue

                if file_response.status_code != 200:
                    print(f"Failed to download file: {file_url}. Status code: {file_response.status_code}")
                    continue

                # Parse the parquet bytes in memory, loading only the needed columns.
                df = pd.read_parquet(io.BytesIO(file_response.content), columns=columns_to_read)

                # The building id is the part of the file name before the first '-'.
                bldg_id = file_name.split('-')[0]
                key = f"upgrade_{upgrade}_state_{state}_bldg_id_{bldg_id}"
                dataframes_dict[key] = df
                print(f"Downloaded and processed: {key}")


Downloaded and processed: upgrade_0_state_AL_bldg_id_1
Downloaded and processed: upgrade_0_state_AL_bldg_id_100137
Downloaded and processed: upgrade_0_state_AL_bldg_id_100168
Downloaded and processed: upgrade_0_state_AL_bldg_id_100354
Downloaded and processed: upgrade_0_state_AL_bldg_id_100480
Downloaded and processed: upgrade_0_state_AL_bldg_id_100501
Downloaded and processed: upgrade_0_state_AL_bldg_id_100612
Downloaded and processed: upgrade_0_state_AL_bldg_id_100676
Downloaded and processed: upgrade_0_state_AL_bldg_id_10073
Downloaded and processed: upgrade_0_state_AL_bldg_id_100746
Downloaded and processed: upgrade_0_state_AL_bldg_id_100818
Downloaded and processed: upgrade_0_state_AL_bldg_id_100872
Downloaded and processed: upgrade_0_state_AL_bldg_id_10093
Downloaded and processed: upgrade_0_state_AL_bldg_id_100934
Downloaded and processed: upgrade_0_state_AL_bldg_id_101013
Downloaded and processed: upgrade_0_state_AL_bldg_id_101085
Downloaded and processed: upgrade_0_state_AL_bl

In [5]:
# Show the keys collected so far (format: upgrade_<n>_state_<XX>_bldg_id_<id>).
dataframes_dict.keys()

dict_keys(['upgrade_0_state_AL_bldg_id_1', 'upgrade_0_state_AL_bldg_id_100137', 'upgrade_0_state_AL_bldg_id_100168', 'upgrade_0_state_AL_bldg_id_100354', 'upgrade_0_state_AL_bldg_id_100480', 'upgrade_0_state_AL_bldg_id_100501', 'upgrade_0_state_AL_bldg_id_100612', 'upgrade_0_state_AL_bldg_id_100676', 'upgrade_0_state_AL_bldg_id_10073', 'upgrade_0_state_AL_bldg_id_100746', 'upgrade_0_state_AL_bldg_id_100818', 'upgrade_0_state_AL_bldg_id_100872', 'upgrade_0_state_AL_bldg_id_10093', 'upgrade_0_state_AL_bldg_id_100934', 'upgrade_0_state_AL_bldg_id_101013', 'upgrade_0_state_AL_bldg_id_101085', 'upgrade_0_state_AL_bldg_id_101134', 'upgrade_0_state_AL_bldg_id_10116', 'upgrade_0_state_AL_bldg_id_101213', 'upgrade_0_state_AL_bldg_id_101252', 'upgrade_0_state_AL_bldg_id_101304', 'upgrade_0_state_AL_bldg_id_101369', 'upgrade_0_state_AL_bldg_id_101394', 'upgrade_0_state_AL_bldg_id_101424', 'upgrade_0_state_AL_bldg_id_101464', 'upgrade_0_state_AL_bldg_id_101501', 'upgrade_0_state_AL_bldg_id_101509'

In [8]:
import pickle

In [9]:
# File path where the dictionary will be saved.
# Defaults to the original OneDrive location; set the ALL_8760_DFS_PATH
# environment variable to write somewhere else without editing this cell
# (the default is an absolute path specific to one user's machine).
file_path = os.environ.get(
    "ALL_8760_DFS_PATH",
    '/Users/rose775/Library/CloudStorage/OneDrive-PNNL/General - NEB Decarb/Datasets/all_8760_dfs.pkl',
)

# Save the dictionary of dataframes to a file.
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file back from a location you trust.
with open(file_path, 'wb') as file:
    pickle.dump(dataframes_dict, file)