# About this notebook

The EPA's power sector data crosswalk is an important input for this data pipeline. However, it is incomplete, and some of its crosswalk entries are incorrect. Ideally, we would contribute fixes directly to the crosswalk's GitHub repository, but that code is written in R rather than Python. As a temporary measure, we explore the crosswalk output here and post-process it to fix some of these issues.

In [17]:
import pandas as pd
import sqlalchemy as sa 
import pudl.output

In [2]:
# Data year used for the CEMS partition and the EIA-860 date range below.
year = 2020

In [3]:
# --- EPA CEMS ---------------------------------------------------------------
# Read the CEMS parquet partition for `year` and relabel two columns: the
# plant key becomes `plant_id_epa` and the heat content column becomes
# `fuel_consumed_mmbtu`.
cems_path = f"../data/pudl/pudl_data/parquet/epacems/year={year}"
cems_renames = {
    "plant_id_eia": "plant_id_epa",
    "heat_content_mmbtu": "fuel_consumed_mmbtu",
}
cems = pd.read_parquet(cems_path)
cems = cems.rename(columns=cems_renames)

# --- EPA/EIA crosswalk ------------------------------------------------------
crosswalk = pudl.output.epacems.epa_crosswalk()

# --- EIA-860 generators -----------------------------------------------------
# Build a PudlTabl output object limited to the selected calendar year and
# pull the generator table from it.
pudl_db = "sqlite:///../data/pudl/pudl_data/sqlite/pudl.sqlite"
pudl_engine = sa.create_engine(pudl_db)
year_start, year_end = f"{year}-01-01", f"{year}-12-31"
pudl_out = pudl.output.pudltabl.PudlTabl(
    pudl_engine,
    freq="MS",
    start_date=year_start,
    end_date=year_end,
)
gens_860 = pudl_out.gens_eia860()

In [5]:
# Unique (plant, unit) pairs that appear in the CEMS data.
cems_unit_keys = ["plant_id_epa", "unitid"]
ids = cems.loc[:, cems_unit_keys].drop_duplicates()

# Complete list of non-grid-connected plants

In [10]:
# Read the static table of plants not connected to the grid and collect its
# "Plant ID" column as a Python list.
ngc_table = pd.read_csv(
    "../data/egrid/egrid_static_tables/table_4-2_plants_not_connected_to_grid.csv"
)
ngc_plants = list(ngc_table["Plant ID"])


In [9]:
# Copy the CEMS plant/unit pairs whose plant is on the not-grid-connected list
# to the system clipboard.
is_ngc_plant = ids['plant_id_epa'].isin(ngc_plants)
ids.loc[is_ngc_plant].to_clipboard()

# Identify where there is no EIA plant that matches the EPA plant

In [16]:
# Crosswalk rows that carry a CAMD plant ID but no EIA plant ID.
has_camd_plant = crosswalk['CAMD_PLANT_ID'].notna()
lacks_eia_plant = crosswalk['EIA_PLANT_ID'].isna()
missing_eia = crosswalk[has_camd_plant & lacks_eia_plant]
missing_eia

Unnamed: 0,SEQUENCE_NUMBER,CAMD_STATE,CAMD_FACILITY_NAME,CAMD_PLANT_ID,CAMD_UNIT_ID,CAMD_GENERATOR_ID,CAMD_NAMEPLATE_CAPACITY,CAMD_FUEL_TYPE,CAMD_LATITUDE,CAMD_LONGITUDE,CAMD_STATUS,CAMD_STATUS_DATE,CAMD_RETIRE_YEAR,MOD_CAMD_UNIT_ID,MOD_CAMD_GENERATOR_ID,EIA_STATE,EIA_PLANT_NAME,EIA_PLANT_ID,EIA_GENERATOR_ID,EIA_NAMEPLATE_CAPACITY,EIA_BOILER_ID,EIA_UNIT_TYPE,EIA_FUEL_TYPE,EIA_LATITUDE,EIA_LONGITUDE,EIA_RETIRE_YEAR,PLANT_ID_CHANGE_FLAG,MOD_EIA_PLANT_ID,MOD_EIA_BOILER_ID,MOD_EIA_GENERATOR_ID_BOILER,MOD_EIA_GENERATOR_ID_GEN,MATCH_TYPE_GEN,MATCH_TYPE_BOILER
137,,CA,Contra Costa Generating Station,228,10,7,359.0,Pipeline Natural Gas,38.0184,-121.7610,RET,2013-05-02T00:00:00,2013,10,7,,,,,,,,,,,,,,,,,CAMD Unmatched,CAMD Unmatched
138,,CA,Contra Costa Generating Station,228,9,6,359.0,Pipeline Natural Gas,38.0184,-121.7610,RET,2013-05-01T00:00:00,2013,9,6,,,,,,,,,,,,,,,,,CAMD Unmatched,CAMD Unmatched
141,,CA,"Morro Bay Power Plant, LLC",259,1,1,169.1,Pipeline Natural Gas,35.3747,-120.8578,RET,2014-02-05T00:00:00,2014,1,1,,,,,,,,,,,,,,,,,CAMD Unmatched,CAMD Unmatched
142,,CA,"Morro Bay Power Plant, LLC",259,2,2,169.1,Pipeline Natural Gas,35.3747,-120.8578,RET,2014-02-05T00:00:00,2014,2,2,,,,,,,,,,,,,,,,,CAMD Unmatched,CAMD Unmatched
143,,CA,"Morro Bay Power Plant, LLC",259,3,3,359.0,Pipeline Natural Gas,35.3747,-120.8578,RET,2014-02-05T00:00:00,2014,3,3,,,,,,,,,,,,,,,,,CAMD Unmatched,CAMD Unmatched
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6925,6523.0,PA,SPMT Marcus Hook Industrial Complex,880107,AB02,,,Other Gas,39.8076,-75.4239,RET,2016-12-19T00:00:00,2016,AB02,,,,,,,,,,,,,,,,,,Manual CAMD Excluded,Manual CAMD Excluded
6926,6524.0,PA,SPMT Marcus Hook Industrial Complex,880107,AB03,,,Other Gas,39.8076,-75.4239,OPR,2004-03-12T00:00:00,0,AB03,,,,,,,,,,,,,,,,,,Manual CAMD Excluded,Manual CAMD Excluded
6927,6525.0,PA,SPMT Marcus Hook Industrial Complex,880107,AB04,,,Other Gas,39.8076,-75.4239,OPR,2004-03-26T00:00:00,0,AB04,,,,,,,,,,,,,,,,,,Manual CAMD Excluded,Manual CAMD Excluded
6928,6526.0,IN,Grain Processing Corporation,880108,BLR1,,,Natural Gas,38.6552,-87.1814,OPR,1999-08-15T00:00:00,0,BLR1,,,,,,,,,,,,,,,,,,Manual CAMD Excluded,Manual CAMD Excluded


In [42]:
# Unique CAMD plant/unit/generator combinations that have no EIA plant in the crosswalk.
camd_id_cols = ['CAMD_PLANT_ID', 'CAMD_UNIT_ID', 'CAMD_GENERATOR_ID']
eia_gen_keys = gens_860[['plant_id_eia', 'generator_id']]
missing_ids = missing_eia[camd_id_cols].drop_duplicates()

# Lookup 1: treat the CAMD unit ID as an EIA generator ID at the same plant.
missing_ids = missing_ids.merge(
    eia_gen_keys,
    how='left',
    left_on=['CAMD_PLANT_ID', 'CAMD_UNIT_ID'],
    right_on=['plant_id_eia', 'generator_id'],
)
# Lookup 2: treat the CAMD generator ID as an EIA generator ID at the same plant.
# Columns from lookup 1 get the `_u` suffix, columns from lookup 2 get `_g`.
missing_ids = missing_ids.merge(
    eia_gen_keys,
    how='left',
    left_on=['CAMD_PLANT_ID', 'CAMD_GENERATOR_ID'],
    right_on=['plant_id_eia', 'generator_id'],
    suffixes=('_u', '_g'),
)

# Record which lookup produced a hit. The generator-based label is assigned
# last, so it wins when both lookups hit.
missing_ids['source'] = ''
unit_hit = missing_ids['plant_id_eia_u'].notna()
gen_hit = missing_ids['plant_id_eia_g'].notna()
missing_ids.loc[unit_hit, 'source'] = "CAMD_UNIT_ID matches EIA_GENERATOR_ID"
missing_ids.loc[gen_hit, 'source'] = "CAMD_GENERATOR_ID matches EIA_GENERATOR_ID"

# Back-fill each lookup's columns from the other one, so that a row matched by
# only one lookup ends up with equal `_u` and `_g` values.
for eia_col in ('generator_id', 'plant_id_eia'):
    from_unit, from_gen = f'{eia_col}_u', f'{eia_col}_g'
    missing_ids[from_unit] = missing_ids[from_unit].fillna(missing_ids[from_gen])
    missing_ids[from_gen] = missing_ids[from_gen].fillna(missing_ids[from_unit])


In [56]:
# identify where we identified a consistent generator match
unit_manual_match = (missing_ids[missing_ids['generator_id_u'] == missing_ids['generator_id_g']]).drop(columns=['plant_id_eia_g','generator_id_g']).rename(columns={'plant_id_eia_u':'plant_id_eia','generator_id_u':'generator_id'})
unit_manual_match

In [62]:
# Export the consistent matches. The frame's index is only a leftover row label
# from filtering, so it is not written (same convention as the
# missing_from_crosswalk.csv export).
unit_manual_match.to_csv('../data/output/crosswalk_unit_manual_matches.csv', index=False)

In [64]:
# identify where the two matching methods returned different generator matches
multi_match = missing_ids[missing_ids['generator_id_u'] != missing_ids['generator_id_g']]
multi_match

In [None]:
# Export the conflicting matches for manual review. The frame's index is only a
# leftover row label from filtering, so it is not written (same convention as
# the missing_from_crosswalk.csv export).
multi_match.to_csv('../data/output/crosswalk_unit_manual_matches_multi.csv', index=False)

In [66]:
# Ad hoc look at every EIA-860 generator record for plant 3443.
gens_860.loc[gens_860['plant_id_eia'].eq(3443)]

Unnamed: 0,report_date,plant_id_eia,plant_id_pudl,plant_name_eia,utility_id_eia,utility_id_pudl,utility_name_eia,generator_id,associated_combined_heat_power,balancing_authority_code_eia,balancing_authority_name_eia,bga_source,bypass_heat_recovery,capacity_mw,carbon_capture,city,cofire_fuels,county,current_planned_operating_date,data_source,deliver_power_transgrid,distributed_generation,duct_burners,energy_source_1_transport_1,energy_source_1_transport_2,energy_source_1_transport_3,energy_source_2_transport_1,energy_source_2_transport_2,energy_source_2_transport_3,energy_source_code_1,energy_source_code_2,energy_source_code_3,energy_source_code_4,energy_source_code_5,energy_source_code_6,ferc_cogen_status,ferc_exempt_wholesale_generator,ferc_small_power_producer,fluidized_bed_tech,fuel_type_code_pudl,fuel_type_count,grid_voltage_2_kv,grid_voltage_3_kv,grid_voltage_kv,iso_rto_code,latitude,longitude,minimum_load_mw,multiple_fuels,nameplate_power_factor,operating_date,operating_switch,operational_status,operational_status_code,original_planned_operating_date,other_combustion_tech,other_modifications_date,other_planned_modifications,owned_by_non_utility,ownership_code,planned_derate_date,planned_energy_source_code_1,planned_modifications,planned_net_summer_capacity_derate_mw,planned_net_summer_capacity_uprate_mw,planned_net_winter_capacity_derate_mw,planned_net_winter_capacity_uprate_mw,planned_new_capacity_mw,planned_new_prime_mover_code,planned_repower_date,planned_retirement_date,planned_uprate_date,previously_canceled,primary_purpose_id_naics,prime_mover_code,pulverized_coal_tech,reactive_power_output_mvar,retirement_date,rto_iso_lmp_node_id,rto_iso_location_wholesale_reporting_id,sector_id_eia,sector_name_eia,solid_fuel_gasification,startup_source_code_1,startup_source_code_2,startup_source_code_3,startup_source_code_4,state,stoker_tech,street_address,subcritical_tech,summer_capacity_estimate,summer_capacity_mw,summer_estimated_capability_mw,supercritical_tech,swit
ch_oil_gas,syncronized_transmission_grid,technology_description,time_cold_shutdown_full_load_code,timezone,topping_bottoming_code,turbines_inverters_hydrokinetics,turbines_num,ultrasupercritical_tech,unit_id_pudl,uprate_derate_completed_date,uprate_derate_during_year,winter_capacity_estimate,winter_capacity_mw,winter_estimated_capability_mw,zip_code
22770,2020-01-01,3443,2336,Victoria,60638,5527,"Victoria WLE, LP",4,False,ERCO,"Electric Reliability Council of Texas, Inc.",,False,75.0,False,Victoria,False,Victoria,NaT,eia860,,,False,,,,,,,NG,,,,,,False,True,False,,gas,1,,,138.0,,28.7883,-97.01,,False,,1955-03-01,,retired,RE,NaT,,NaT,,,S,NaT,,,,,,,,,NaT,NaT,NaT,,22,ST,,,2009-09-01,,,2,IPP Non-CHP,False,,,,,TX,,1205 S Bottom St,,,69.0,,,,False,Natural Gas Steam Turbine,12H,America/Chicago,X,,,,,NaT,False,,69.0,,77901
22769,2020-01-01,3443,2336,Victoria,60638,5527,"Victoria WLE, LP",5,False,ERCO,"Electric Reliability Council of Texas, Inc.",eia860_org,False,180.0,False,Victoria,False,Victoria,NaT,eia860,,,True,,,,,,,NG,,,,,,False,True,False,,gas,1,,,138.0,,28.7883,-97.01,55.0,False,0.85,1963-03-01,,existing,OP,NaT,,NaT,,,S,NaT,,,,,,,,,NaT,NaT,NaT,,22,CA,,,NaT,vic_victorg5,,2,IPP Non-CHP,False,,,,,TX,,1205 S Bottom St,,,96.0,,,,False,Natural Gas Fired Combined Cycle,12H,America/Chicago,X,,,,1.0,NaT,False,,107.0,,77901
22768,2020-01-01,3443,2336,Victoria,60638,5527,"Victoria WLE, LP",6,False,ERCO,"Electric Reliability Council of Texas, Inc.",,False,261.0,False,Victoria,False,Victoria,NaT,eia860,,,False,,,,,,,NG,,,,,,False,True,False,,gas,1,,,138.0,,28.7883,-97.01,50.0,False,,1968-05-01,,retired,RE,NaT,,NaT,,,S,NaT,,,,,,,,,NaT,NaT,NaT,,22,ST,,,2008-12-01,,,2,IPP Non-CHP,False,,,,,TX,,1205 S Bottom St,,,250.0,,,,False,Natural Gas Steam Turbine,12H,America/Chicago,X,,,,,NaT,False,,250.0,,77901
22767,2020-01-01,3443,2336,Victoria,60638,5527,"Victoria WLE, LP",7,False,ERCO,"Electric Reliability Council of Texas, Inc.",string_assn,False,196.9,False,Victoria,False,Victoria,NaT,eia860,,,False,,,,,,,NG,,,,,,False,True,False,,gas,1,,,138.0,,28.7883,-97.01,115.0,False,0.9,2009-05-01,,existing,OP,2008-04-01,,NaT,,,S,NaT,,,,,,,,,NaT,NaT,NaT,,22,CT,,,NaT,vic_victorg6,,2,IPP Non-CHP,False,,,,,TX,,1205 S Bottom St,,,170.0,,,,False,Natural Gas Fired Combined Cycle,12H,America/Chicago,X,,,,1.0,NaT,False,,159.0,,77901


# Identify units missing from crosswalk

In [80]:
# Left-join the CEMS plant/unit pairs onto the crosswalk's CAMD plant/unit
# pairs, then keep only the CEMS pairs that found no crosswalk row.
cw_unit_keys = crosswalk[['CAMD_PLANT_ID', 'CAMD_UNIT_ID']]
missing_from_cw = ids.merge(
    cw_unit_keys,
    how='left',
    left_on=['plant_id_epa', 'unitid'],
    right_on=['CAMD_PLANT_ID', 'CAMD_UNIT_ID'],
)
not_in_crosswalk = missing_from_cw['CAMD_UNIT_ID'].isna()
missing_from_cw = missing_from_cw.loc[not_in_crosswalk]

In [81]:
# Look for an EIA-860 generator whose ID equals the CEMS unit ID at the same plant.
# This cell overwrites its own input, so first drop any columns a previous run
# added. Without this, re-running the cell would merge onto the already-merged
# frame, producing suffixed `generator_id_x` / `generator_id_y` columns and a
# KeyError on `generator_id` below.
missing_from_cw = missing_from_cw.drop(
    columns=['plant_id_eia', 'generator_id', 'source'], errors='ignore'
).merge(
    gens_860[['plant_id_eia', 'generator_id']],
    how='left',
    left_on=['plant_id_epa', 'unitid'],
    right_on=['plant_id_eia', 'generator_id'],
)

# Label the rows where that lookup found a generator; the rest keep an empty label.
missing_from_cw['source'] = ''
missing_from_cw.loc[missing_from_cw['generator_id'].notna(), 'source'] = "CAMD_UNIT_ID matches EIA_GENERATOR_ID"

missing_from_cw

Unnamed: 0,plant_id_epa,unitid,CAMD_PLANT_ID,CAMD_UNIT_ID,plant_id_eia,generator_id,source
0,116,GT3,,,116.0,GT3,CAMD_UNIT_ID matches EIA_GENERATOR_ID
1,116,GT4,,,116.0,GT4,CAMD_UNIT_ID matches EIA_GENERATOR_ID
2,116,GT5,,,116.0,GT5,CAMD_UNIT_ID matches EIA_GENERATOR_ID
3,116,GT6,,,116.0,GT6,CAMD_UNIT_ID matches EIA_GENERATOR_ID
4,116,GT7,,,116.0,GT7,CAMD_UNIT_ID matches EIA_GENERATOR_ID
5,315,CT1,,,,,
6,315,CT2,,,,,
7,335,CT1,,,,,
8,335,CT2,,,,,
9,60698,D1,,,,,


In [82]:
# Export the units that are absent from the crosswalk, without the row index.
missing_from_cw.to_csv(
    path_or_buf='../data/output/missing_from_crosswalk.csv',
    index=False,
)

In [98]:
# Ad hoc look at the generator IDs and prime movers reported for plant 55641.
is_plant_55641 = gens_860['plant_id_eia'].eq(55641)
gens_860.loc[is_plant_55641, ['plant_id_eia', 'generator_id', 'prime_mover_code']]

Unnamed: 0,plant_id_eia,generator_id,prime_mover_code
13026,55641,CTG1,CT
13025,55641,CTG2,CT
13024,55641,CTG3,CT
13023,55641,CTG4,CT
13022,55641,STG1,CA
13021,55641,STG2,CA
