# About this notebook

The EPA's power sector data crosswalk is an important input for this data pipeline. However, it is incomplete and in some cases contains incorrect matches. Ideally, we would contribute fixes directly to the GitHub repository, but the code is written in R rather than Python. As a temporary measure, we want to explore the output crosswalk and post-process it to fix some of these issues.

In [None]:
import pandas as pd
import sqlalchemy as sa 
import pudl.output

In [None]:
# reporting year used for the CEMS partition path and the EIA-860 date range
year = 2020

In [None]:
# Raw CEMS data for the selected year, read from the PUDL parquet output.
cems_path = f"../data/pudl/pudl_data/parquet/epacems/year={year}"
cems = pd.read_parquet(cems_path)
# Relabel the plant ID and heat-content columns to the names used below.
# NOTE(review): presumably the CEMS plant ID is really the EPA/CAMD one — confirm.
cems = cems.rename(
    columns={"plant_id_eia": "plant_id_epa", "heat_content_mmbtu": "fuel_consumed_mmbtu"}
)

# EPA-EIA crosswalk table as exposed through PUDL.
crosswalk = pudl.output.epacems.epa_crosswalk()

# EIA-860 generator table for the same year, pulled from the PUDL SQLite database.
pudl_db = "sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite"
pudl_engine = sa.create_engine(pudl_db)
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,
    freq="MS",
    start_date=f"{year}-01-01",
    end_date=f"{year}-12-31",
)
gens_860 = pudl_out.gens_eia860()

In [None]:
# one row per distinct (plant, unit) pair that appears in the CEMS data
ids = cems.loc[:, ["plant_id_epa", "unitid"]].drop_duplicates()

# Complete list of non-grid-connected plants

In [None]:
# Read the static table of plants not connected to the grid and keep only
# its "Plant ID" column as a plain Python list.
ngc_plants = list(
    pd.read_csv(
        "../data/egrid/egrid_static_tables/table_4-2_plants_not_connected_to_grid.csv"
    ).loc[:, "Plant ID"]
)


In [None]:
# copy the CEMS plant/unit pairs that belong to non-grid-connected plants
# to the system clipboard (e.g. for pasting into a spreadsheet)
ids.loc[ids["plant_id_epa"].isin(ngc_plants)].to_clipboard()

# Identify where there is no EIA plant that matches the EPA plant

In [None]:
# crosswalk rows that have a CAMD plant ID but no EIA plant ID
missing_eia = crosswalk.loc[
    crosswalk["CAMD_PLANT_ID"].notna() & crosswalk["EIA_PLANT_ID"].isna()
]
missing_eia

In [None]:
# distinct CAMD plant / unit / generator combinations that lack an EIA plant match
missing_ids = missing_eia[
    ["CAMD_PLANT_ID", "CAMD_UNIT_ID", "CAMD_GENERATOR_ID"]
].drop_duplicates()

# attempt 1: look up the CAMD unit ID as if it were an EIA-860 generator ID
missing_ids = missing_ids.merge(
    gens_860[["plant_id_eia", "generator_id"]],
    how="left",
    left_on=["CAMD_PLANT_ID", "CAMD_UNIT_ID"],
    right_on=["plant_id_eia", "generator_id"],
)
# attempt 2: look up the CAMD generator ID as an EIA-860 generator ID;
# the clashing column names get "_u" (unit-based) and "_g" (generator-based) suffixes
missing_ids = missing_ids.merge(
    gens_860[["plant_id_eia", "generator_id"]],
    how="left",
    left_on=["CAMD_PLANT_ID", "CAMD_GENERATOR_ID"],
    right_on=["plant_id_eia", "generator_id"],
    suffixes=("_u", "_g"),
)

# label which attempt produced a match; when both did, the generator-based
# label wins because it is assigned last
missing_ids["source"] = ""
missing_ids.loc[
    missing_ids["plant_id_eia_u"].notna(), "source"
] = "CAMD_UNIT_ID matches EIA_GENERATOR_ID"
missing_ids.loc[
    missing_ids["plant_id_eia_g"].notna(), "source"
] = "CAMD_GENERATOR_ID matches EIA_GENERATOR_ID"

# cross-fill the two attempts so that a row matched by only one method ends up
# with identical "_u" and "_g" values, which lets later cells compare them
missing_ids["generator_id_u"] = missing_ids["generator_id_u"].fillna(
    missing_ids["generator_id_g"]
)
missing_ids["generator_id_g"] = missing_ids["generator_id_g"].fillna(
    missing_ids["generator_id_u"]
)

missing_ids["plant_id_eia_u"] = missing_ids["plant_id_eia_u"].fillna(
    missing_ids["plant_id_eia_g"]
)
missing_ids["plant_id_eia_g"] = missing_ids["plant_id_eia_g"].fillna(
    missing_ids["plant_id_eia_u"]
)


In [None]:
# identify where we identified a consistent generator match
unit_manual_match = (missing_ids[missing_ids['generator_id_u'] == missing_ids['generator_id_g']]).drop(columns=['plant_id_eia_g','generator_id_g']).rename(columns={'plant_id_eia_u':'plant_id_eia','generator_id_u':'generator_id'})
unit_manual_match

In [None]:
# save the consistent matches; the DataFrame index is written as the first column
unit_manual_match.to_csv(
    path_or_buf='../data/output/crosswalk_unit_manual_matches.csv',
    index=True,
)

In [None]:
# identify where the two matching methods returned different generator matches
multi_match = missing_ids[missing_ids['generator_id_u'] != missing_ids['generator_id_g']]
multi_match

In [None]:
# save the conflicting matches for manual review; the index is written as the first column
multi_match.to_csv(
    path_or_buf='../data/output/crosswalk_unit_manual_matches_multi.csv',
    index=True,
)

In [None]:
# spot-check: all EIA-860 generator rows for plant 3443
gens_860.loc[gens_860["plant_id_eia"] == 3443]

# Identify units missing from crosswalk

In [None]:
# Left-join the CEMS plant/unit pairs onto the crosswalk's CAMD keys and keep
# only the pairs that found no crosswalk row (CAMD_UNIT_ID is null after the join).
missing_from_cw = ids.merge(
    crosswalk[["CAMD_PLANT_ID", "CAMD_UNIT_ID"]],
    how="left",
    left_on=["plant_id_epa", "unitid"],
    right_on=["CAMD_PLANT_ID", "CAMD_UNIT_ID"],
).loc[lambda merged: merged["CAMD_UNIT_ID"].isna()]

In [None]:
# try to match each missing unit to an EIA-860 generator with the same plant ID
# and an identical ID string
missing_from_cw = missing_from_cw.merge(
    gens_860[["plant_id_eia", "generator_id"]],
    how="left",
    left_on=["plant_id_epa", "unitid"],
    right_on=["plant_id_eia", "generator_id"],
)

# label the rows where that lookup found a generator; the rest keep an empty source
missing_from_cw["source"] = ""
missing_from_cw.loc[
    missing_from_cw["generator_id"].notna(), "source"
] = "CAMD_UNIT_ID matches EIA_GENERATOR_ID"

missing_from_cw

In [None]:
# save the units absent from the crosswalk, without the DataFrame index
missing_from_cw.to_csv(
    path_or_buf='../data/output/missing_from_crosswalk.csv',
    index=False,
)

In [None]:
# spot-check: generator IDs and prime mover codes for plant 55641
gens_860.loc[
    gens_860["plant_id_eia"] == 55641,
    ["plant_id_eia", "generator_id", "prime_mover_code"],
]