# About this notebook

This notebook is for test-running the data pipeline and serves as a sandbox for testing new functions that we are adding to the data pipeline.

In [1]:
# import packages
import numpy as np
import pandas as pd
import argparse
import os

# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the local `src` modules are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules.
# NOTE(review): this is a relative path, so it is resolved against the kernel's
# current working directory -- confirm the notebook is launched from the folder
# this path assumes.
import sys

sys.path.append("../../hourly-egrid/")

# import local modules
import src.data_cleaning as data_cleaning
import src.load_data as load_data
import src.impute_hourly_profiles as impute_hourly_profiles
import src.eia930 as eia930
import src.output_data as output_data

from src.column_checks import get_dtypes, apply_dtypes

# Data year passed to the cleaning functions in the cells below.
year = 2020

# Test new partial plant assignment

In [2]:
import src.emissions as emissions

In [3]:
# Steps 3-5 of the data pipeline, run by hand so the intermediate tables can be
# inspected in the cells that follow.
# NOTE(review): the second positional argument (False) to the clean_* calls is
# not explained here -- confirm its meaning against src/data_cleaning.

# --- Step 3: clean the monthly EIA-923 generation and fuel data ---------------
print("3. Cleaning EIA-923 data")
eia923_allocated, primary_fuel_table = data_cleaning.clean_eia923(year, False)
# Attach the primary fuel columns to every generator record. validate="m:1"
# makes pandas raise if primary_fuel_table has duplicate
# (plant_id_eia, generator_id) keys.
eia923_allocated = pd.merge(
    eia923_allocated,
    primary_fuel_table,
    on=["plant_id_eia", "generator_id"],
    how="left",
    validate="m:1",
)

# --- Step 4: clean the hourly CEMS data ---------------------------------------
print("4. Cleaning CEMS data")
# The biomass adjustment is applied right away, while the CEMS data is still at
# the unit level.
cems = emissions.adjust_emissions_for_biomass(data_cleaning.clean_cems(year, False))

# --- Step 5: static plant attributes used to aid later aggregation ------------
print("5. Loading plant static attributes")
plant_attributes = data_cleaning.create_plant_attributes_table(
    cems,
    eia923_allocated,
    year,
    primary_fuel_table,
)


3. Cleaning EIA-923 data




    Checking that there are no missing energy source codes associated with non-zero fuel consumption...  OK
    Checking that fuel and emissions values are positive...  OK
 
      prime_mover_code energy_source_code boiler_bottom_type boiler_firing_type
20403               FC                LFG                NaN                NaN
13248               FC                 NG                NaN                NaN
19938               FC                OBG                NaN                NaN
 
 
      prime_mover_code energy_source_code boiler_firing_type
20403               FC                LFG                NaN
13248               FC                 NG                NaN
19938               FC                OBG                NaN
 
    Checking that total fuel consumed >= fuel consumed for electricity...  OK
    Checking that adjusted emission values are less than total emissions...  OK
    Removing 0 plants that are not grid-connected
    Removing 0 plants located in the following s

In [25]:
# Run the hourly-data-source identification step on the allocated EIA-923 data.
# NOTE(review): this rebinds eia923_allocated, so re-running the cell feeds the
# already-processed frame back into the function.
eia923_allocated = data_cleaning.identify_hourly_data_source(eia923_allocated, cems, year)

In [26]:
# Spot-check: first 20 allocated records for plant_id_eia 3.
eia923_allocated.loc[eia923_allocated["plant_id_eia"] == 3].head(20)

Unnamed: 0,report_date,plant_id_eia,generator_id,net_generation_mwh,fuel_consumed_mmbtu,fuel_consumed_for_electricity_mmbtu,co2_mass_lb,ch4_mass_lb,n2o_mass_lb,co2e_mass_lb,nox_mass_lb,so2_mass_lb,co2_mass_lb_for_electricity,ch4_mass_lb_for_electricity,n2o_mass_lb_for_electricity,co2e_mass_lb_for_electricity,nox_mass_lb_for_electricity,so2_mass_lb_for_electricity,co2_mass_lb_adjusted,ch4_mass_lb_adjusted,n2o_mass_lb_adjusted,co2e_mass_lb_adjusted,nox_mass_lb_adjusted,so2_mass_lb_adjusted,co2_mass_lb_for_electricity_adjusted,ch4_mass_lb_for_electricity_adjusted,n2o_mass_lb_for_electricity_adjusted,co2e_mass_lb_for_electricity_adjusted,nox_mass_lb_for_electricity_adjusted,so2_mass_lb_for_electricity_adjusted,subplant_id,prime_mover_code,energy_source_code,plant_primary_fuel,hourly_data_source
3,2020-01-01,3,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,ST,NG,NG,cems
4,2020-01-01,3,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,ST,NG,NG,cems
5,2020-01-01,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,ST,BIT,NG,cems
6,2020-01-01,3,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,ST,BIT,NG,cems
7,2020-01-01,3,A1CT,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,CT,NG,NG,cems
8,2020-01-01,3,A1CT2,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,CT,NG,NG,cems
9,2020-01-01,3,A1ST,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,CA,NG,NG,cems
10,2020-01-01,3,A2C1,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,CT,NG,NG,cems
11,2020-01-01,3,A2C2,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,CT,NG,NG,cems
12,2020-01-01,3,A2ST,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,CA,NG,NG,cems


# Run the Pipeline

In [None]:
# Switch the kernel's working directory to the sibling src folder, then run the
# data_pipeline script there for data year 2020.
# NOTE: %cd persists for the rest of the session, so relative paths used in
# later cells are resolved from the new working directory.
%cd ../src
%run data_pipeline --year 2020

In [None]:
# Same as the full run, but passing `--small SMALL` to data_pipeline.
# NOTE(review): presumably this runs the pipeline on a reduced subset of the
# data -- confirm against data_pipeline's argument parser.
# NOTE: %cd persists for the rest of the session, so relative paths used in
# later cells are resolved from the new working directory.
%cd ../src
%run data_pipeline --small SMALL --year 2020

# Functions for loading intermediate outputs

In [None]:

# Read the pipeline's intermediate CSV outputs for `year` back into memory.
# `path_prefix` is inserted directly in front of the year folder name; leave it
# empty to read from ../data/outputs/<year>/.
year = 2020
path_prefix = ""

# Tables read with the project's column dtypes and parsed date columns.
cems = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/cems_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["datetime_utc", "report_date"],
)
partial_cems_scaled = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/partial_cems_scaled_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["datetime_utc", "report_date"],
)
eia923_allocated = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/eia923_allocated_{year}.csv",
    dtype=get_dtypes(),
    parse_dates=["report_date"],
)

# Tables read with pandas' default dtype inference.
plant_attributes = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/plant_static_attributes_{year}.csv"
)
# Rebuild the primary fuel lookup: one row per plant_id_eia, keeping only the
# plant id and its primary fuel.
primary_fuel_table = (
    plant_attributes
    .drop_duplicates(subset="plant_id_eia")
    [["plant_id_eia", "plant_primary_fuel"]]
)
residual_profiles = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/residual_profiles_{year}.csv"
)
shaped_eia_data = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/shaped_eia923_data_{year}.csv"
)

In [None]:
# Read only the shaped EIA-923 output for `year`, without loading the other
# intermediate tables.
year = 2020
path_prefix = ""

shaped_eia_data = pd.read_csv(
    f"../data/outputs/{path_prefix}{year}/shaped_eia923_data_{year}.csv"
)

In [None]:
# Show the shaped EIA-923 rows whose ba_code is "CISO".
shaped_eia_data.loc[shaped_eia_data["ba_code"] == "CISO"]