In [268]:
import os
import time
import glob
import pandas as pd
import zipfile as zp
from functools import reduce
from selenium import webdriver

In [269]:
# Columns retained in the aggregated output, in output order.
usecols = [
    'OPR_DT', 'OPR_HR', 'Timestamp', 'NODE_ID',
    'MARKET_RUN_ID', 'LMP_TYPE', 'XML_DATA_ITEM', 'MW',
]

# Start-of-month date strings for formatting URLs (a year is prepended by the caller).
# Every month ends where the following month starts, so only the starts are listed;
# the hour flips between 08:00 and 07:00 -- presumably tracking the Pacific DST offset.
_month_starts = [
    ('Jan', '0101T08:00-0000'),
    ('Feb', '0201T08:00-0000'),
    ('Mar', '0301T08:00-0000'),
    ('Apr', '0401T07:00-0000'),
    ('May', '0501T07:00-0000'),
    ('Jun', '0601T07:00-0000'),
    ('Jul', '0701T07:00-0000'),
    ('Aug', '0801T07:00-0000'),
    ('Sep', '0901T07:00-0000'),
    ('Oct', '1001T07:00-0000'),
    ('Nov', '1101T07:00-0000'),
    ('Dec', '1201T08:00-0000'),
]

# month name -> [start string, end string]; December's end wraps around to January's start.
month_dict = {
    name: [start, _month_starts[(position + 1) % len(_month_starts)][1]]
    for position, (name, start) in enumerate(_month_starts)
}

# TH (Trading Hub) nodes.
th_node_list = ['TH_NP15_GEN-APND', 'TH_SP15_GEN-APND', 'TH_ZP26_GEN-APND']

# Default LAP (Load Aggregation Point) nodes corresponding to utility territories.
lap_node_list = ['DLAP_PGAE-APND', 'DLAP_SCE-APND', 'DLAP_SDGE-APND', 'DLAP_VEA-APND']

# Sub-LAP nodes, grouped by name prefix.
sub_lap_node_list = [
    # SLAP_PG*
    'SLAP_PGCC-APND',
    'SLAP_PGEB-APND',
    'SLAP_PGF1-APND',
    'SLAP_PGFG-APND',
    'SLAP_PGHB-APND',
    'SLAP_PGKN-APND',
    'SLAP_PGLP-APND',
    'SLAP_PGNB-APND',
    'SLAP_PGNC-APND',
    'SLAP_PGNP-APND',
    'SLAP_PGNV-APND',
    'SLAP_PGP2-APND',
    'SLAP_PGSA-APND',
    'SLAP_PGSB-APND',
    'SLAP_PGSF-APND',
    'SLAP_PGSI-APND',
    'SLAP_PGSN-APND',
    'SLAP_PGST-APND',
    'SLAP_PGZP-APND',
    # SLAP_SC*
    'SLAP_SCEC-APND',
    'SLAP_SCEN-APND',
    'SLAP_SCEW-APND',
    'SLAP_SCHD-APND',
    'SLAP_SCLD-APND',
    'SLAP_SCNW-APND',
    # SLAP_SD*
    'SLAP_SDG1-APND',
    # SLAP_VEA
    'SLAP_VEA-APND',
]

In [270]:
def extract_hourly_data(year, nodes,
                        base_dir='/Users/parkerwild/GitHub/ca_nem/data/CAISO_LMPs',
                        wait_seconds=15):
    '''Downloads one ZIP per month of `year` from the OASIS API via a Chrome driver.

    For each entry of `month_dict` a `PRC_LMP` / `market_run_id=DAM` SingleZip URL is
    opened in Chrome, which saves the response into `<returned dir>/ZIPs`. This function
    only downloads; unzipping and aggregation are done by `combine_hourly_data`.

    Parameters
    ----------
    year : int
        Year to download. December's end date rolls over to `year + 1`.
    nodes : iterable of str
        Node IDs; joined with ',' for the URL and with ', ' for the folder name.
    base_dir : str, optional
        Root folder under which `<year>/<node names>` is created.
    wait_seconds : int or float, optional
        Fixed pause after each request. Nothing here verifies that the download has
        actually finished, so raise this value on slow connections.

    Returns
    -------
    str
        The directory `<base_dir>/<year>/<node names>`.

    Raises
    ------
    ValueError
        If `nodes` is empty.
    '''

    nodes = list(nodes)
    if not nodes:
        raise ValueError('nodes must contain at least one node ID.')

    node_entry = ','.join(nodes)
    name_entry = ', '.join(nodes)

    # Directory path (local name avoids shadowing the builtin `dir`).
    out_dir = f'{base_dir}/{year}/{name_entry}'
    zip_dir = f'{out_dir}/ZIPs'

    # Make sure the download target exists before Chrome is pointed at it.
    os.makedirs(zip_dir, exist_ok=True)

    print('Accessing Chrome driver...')

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', {'download.default_directory': zip_dir})

    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a Service
    # object -- confirm the installed Selenium version before upgrading.
    driver = webdriver.Chrome(
        options=chrome_options,
        executable_path='/Users/parkerwild/GitHub/ca_nem/chromedriver_mac64/chromedriver.exe'
    )

    try:
        for month, (start_stamp, end_stamp) in month_dict.items():

            # Handle last day of year: December's window ends on Jan 1 of the next year.
            end_year = year + 1 if month == 'Dec' else year

            # URL.
            api_call = (
                'http://oasis.caiso.com/oasisapi/SingleZip?queryname=PRC_LMP&resultformat=6&'
                f'startdatetime={year}{start_stamp}&'
                f'enddatetime={end_year}{end_stamp}&'
                f'version=1&market_run_id=DAM&node={node_entry}'
            )

            print(f'Downloading data for {month}...')

            # Request.
            driver.get(api_call)

            # Sleep so the download can complete before the next request is issued.
            time.sleep(wait_seconds)
    finally:
        print('Closing Chrome driver...')

        # quit() ends the whole WebDriver session (browser + driver process); close()
        # only closes the current window. Running it in `finally` avoids leaking the
        # browser when a request raises.
        driver.quit()

    return out_dir

In [272]:
def combine_hourly_data(dir):
    '''Unzip files, concatenate CSVs, clean the hourly index and write one aggregated CSV.

    Steps:
      1. Extract every `<dir>/ZIPs/*.zip` into `<dir>/CSVs`.
      2. Read and concatenate every `<dir>/CSVs/*.csv`.
      3. Shift `OPR_HR` from 1-based to 0-based, patch the DST days so each day has
         hours 0..23, and build a `Timestamp` column from `OPR_DT` and `OPR_HR`.
      4. De-duplicate and sort on (`Timestamp`, `NODE_ID`, `LMP_TYPE`), keep `usecols`,
         and write `<dir>/Aggregated_LMPs.csv`.

    Parameters
    ----------
    dir : str
        Directory returned by `extract_hourly_data` (contains the `ZIPs` folder).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to disk.

    Raises
    ------
    ValueError
        If no CSV files are found after extraction.
    '''

    print('Unzipping files...')

    for zip_filename in sorted(glob.glob(f'{dir}/ZIPs/*.zip')):

        # The context manager closes the archive handle even if extraction fails.
        with zp.ZipFile(zip_filename, 'r') as zip_handler:
            zip_handler.extractall(f'{dir}/CSVs')

    print('Concatenating CSVs...')

    csv_files = sorted(glob.glob(f'{dir}/CSVs/*.csv'))

    if not csv_files:
        raise ValueError(f'No CSV files found in {dir}/CSVs; nothing to combine.')

    df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_files], ignore_index=True)

    print('Cleaning data...')

    # Adjust hour column to [0, 23].
    df['OPR_HR'] = df['OPR_HR'].astype(int) - 1

    # Add the hour skipped on the spring-forward day by copying hour 1 into hour 2.
    # The date is derived per year present in the data rather than hard-coded.
    # If hour 2 already exists for that day, the copies are removed by the
    # drop_duplicates call below (original rows come first and are the ones kept).
    opr_years = pd.to_datetime(df['OPR_DT']).dt.year.dropna().astype(int).unique()
    spring_forward_days = [_spring_forward_date(opr_year) for opr_year in opr_years]
    extra_hour = df.loc[df['OPR_DT'].isin(spring_forward_days) & (df['OPR_HR'] == 1)].copy()
    extra_hour['OPR_HR'] = 2
    df = pd.concat([df, extra_hour], ignore_index=True)

    # Skip the extra (25th) hour on the fall-back day. copy() makes the result an
    # independent frame so the column assignment below cannot trigger
    # SettingWithCopyWarning.
    df = df.loc[df['OPR_HR'] < 24].copy()

    # Convert to datetime.
    df['Timestamp'] = pd.to_datetime(df['OPR_DT'] + ' ' + df['OPR_HR'].astype(str) + ':00:00')

    # Drop duplicates, if necessary (but shouldn't be...), then sort by interval start time.
    key_cols = ['Timestamp', 'NODE_ID', 'LMP_TYPE']
    df = (
        df.drop_duplicates(subset=key_cols, ignore_index=True)
          .sort_values(by=key_cols, ignore_index=True)
    )

    # Keep subset of columns.
    df = df[usecols]

    print('Writing to CSV...')

    # Write dataframe to CSV.
    df.to_csv(f'{dir}/Aggregated_LMPs.csv', index=False)

    return df


def _spring_forward_date(year):
    '''Return the second Sunday of March of `year` as a 'YYYY-MM-DD' string.

    That is the US DST start date under the rule in effect since 2007; earlier years
    used a different rule and are not handled.
    '''
    # The second Sunday is the first Sunday on or after March 8 (Monday=0 ... Sunday=6).
    march_8 = pd.Timestamp(year=int(year), month=3, day=8)
    days_to_sunday = (6 - march_8.weekday()) % 7
    return (march_8 + pd.Timedelta(days=days_to_sunday)).strftime('%Y-%m-%d')

In [254]:
# Extract data: download the 2022 monthly archives for three of the default LAP nodes.
# The returned folder is kept in `dir` because the processing cell reads it from there.
requested_nodes = ['DLAP_PGAE-APND', 'DLAP_SCE-APND', 'DLAP_SDGE-APND']
dir = extract_hourly_data(2022, requested_nodes)

Accessing Chrome driver...
Downloading data for Jan...
Downloading data for Feb...
Downloading data for Mar...
Downloading data for Apr...
Downloading data for May...
Downloading data for Jun...
Downloading data for Jul...
Downloading data for Aug...
Downloading data for Sep...
Downloading data for Oct...
Downloading data for Nov...
Downloading data for Dec...
Closing Chrome driver...


In [275]:
# Process data: unzip, concatenate and clean the files downloaded into `dir`.
df = combine_hourly_data(dir=dir)

Unzipping files...
Concatenating CSVs...
Cleaning data...
Writing to CSV...


In [276]:
# Confirm the year includes 8760 hours (count of distinct hourly timestamps).
df['Timestamp'].nunique(dropna=False)

8760

In [277]:
# Identify problematic days, if necessary: print every operating date that does not
# have exactly 24 distinct hours.
operating_days = df['OPR_DT']
for dt in operating_days.unique():
    hours_on_day = df.loc[operating_days == dt, 'OPR_HR']
    if hours_on_day.nunique(dropna=False) != 24:
        print(dt)