In [None]:
# import packages
import pandas as pd
import os
import zipfile

%reload_ext autoreload
%autoreload 2

# # Tell python where to look for modules.
import sys
sys.path.append('../../../open-grid-emissions/src/')

import load_data
from column_checks import get_dtypes
from filepaths import *


# Data year to compare; it is interpolated into every archive, results, and outputs path below.
year = 2021
# Year-specific path prefix. NOTE(review): not referenced by any cell visible in this notebook — confirm before removing.
path_prefix = f"{year}/"

## About this notebook
This notebook can be used to identify differences between one version of OGE data and another. 
This is useful if you want to identify how much a code update affects the output results.

This notebook compares files in the `outputs` and `results` directories against archived data in the `zenodo` or `s3_upload` directories.
This assumes that the previous, stable version of the data outputs is archived on your computer.

## Compare plant data

In [None]:
# select which dataset to compare; these two values are used to build the
# archived zip path and the new results path in the cells below
data_type = "plant_data"
resolution = "annual"

In [None]:
# unzip archived data
# The zip file and the folder it is extracted into share the same base name.
archive_name = f"{year}_{data_type}_{resolution}_us_units"

# makedirs(exist_ok=True) creates the scratch folder (and any missing parents)
# and is a no-op if the folder is already there, so no separate existence check is needed.
os.makedirs(data_folder("diff"), exist_ok=True)

with zipfile.ZipFile(data_folder(f"s3_upload/{archive_name}.zip"), "r") as zip_to_unzip:
    zip_to_unzip.extractall(data_folder(f"diff/{archive_name}"))

In [None]:
# load archived data
# Both versions are rounded to whole numbers before comparing (presumably so that
# sub-unit numerical noise is not reported as a change).
prev_data = pd.read_csv(
    data_folder(f"diff/{year}_{data_type}_{resolution}_us_units/plant_data.csv"),
    dtype=get_dtypes(),
).round(0)

# load new data
new_data = pd.read_csv(
    results_folder(f"{year}/{data_type}/{resolution}/us_units/plant_data.csv"),
    dtype=get_dtypes(),
).round(0)

# load plant attributes
plant_attributes = pd.read_csv(
    outputs_folder(f"{year}/plant_static_attributes_{year}.csv"), dtype=get_dtypes()
)

# attach BA and fuel category to every plant so they can be part of the comparison index
key_cols = ["plant_id_eia", "ba_code", "fuel_category"]
prev_data = prev_data.merge(plant_attributes[key_cols], how="left", on="plant_id_eia")
new_data = new_data.merge(plant_attributes[key_cols], how="left", on="plant_id_eia")

# keep only the rows/columns where the two versions differ; the result has
# two-level columns: (original column name, "previous" | "new")
comparison = prev_data.set_index(key_cols).compare(
    new_data.set_index(key_cols), result_names=("previous", "new")
)

# get difference
# `pct_diff` is the fractional change (new - previous) / previous, rounded to 3 decimals
# (so 0.01 means 1%). It is computed with label-aligned arithmetic instead of
# `groupby(axis=1)`, which is deprecated in recent pandas versions.
if not comparison.empty:  # nothing to compute when the two versions are identical
    previous = comparison.xs("previous", axis=1, level=1)
    new = comparison.xs("new", axis=1, level=1)
    diff = ((new - previous) / previous).round(3)
    diff.columns = pd.MultiIndex.from_product([diff.columns, ["pct_diff"]])
    # sorting on the first level only groups each column's previous / new / pct_diff together
    comparison = pd.concat([comparison, diff], axis=1).sort_index(
        axis=1, level=0, ascending=True, sort_remaining=False
    )

comparison


In [None]:
# column to compare
#col = "consumed_co2_rate_lb_per_mwh_for_electricity"
col = "co2_mass_lb_for_electricity"

# show only the rows where the selected column has a computed fractional change
# whose magnitude is greater than 0.01
pct_diff = comparison.loc[:, (col, "pct_diff")]
has_value = pct_diff.notna()
exceeds_threshold = pct_diff.abs() > 0.01

comparison[has_value & exceeds_threshold]


## Compare BA data

In [None]:
# Option 1: compare the power sector data.
# NOTE: the next cell assigns the same three variables for carbon accounting data,
# so if both cells are run in order, the next cell's settings are the ones used below.
data_type = "power_sector_data"
resolution = "annual"

# columns used as the comparison index for this dataset
key_cols = ["ba_code", "fuel_category"]

In [None]:
# Option 2: compare the carbon accounting data.
# NOTE: this overwrites the data_type / resolution / key_cols set by the previous cell;
# run only the cell for the dataset you want to compare.
data_type = "carbon_accounting"
resolution = "annual"

# column used as the comparison index for this dataset
key_cols = ["ba_code"]

In [None]:
# unzip archived data
# The zip file and the folder it is extracted into share the same base name.
archive_name = f"{year}_{data_type}_{resolution}_us_units"

# makedirs(exist_ok=True) creates the scratch folder (and any missing parents)
# and is a no-op if the folder is already there, so no separate existence check is needed.
os.makedirs(data_folder("diff"), exist_ok=True)

with zipfile.ZipFile(data_folder(f"s3_upload/{archive_name}.zip"), "r") as zip_to_unzip:
    zip_to_unzip.extractall(data_folder(f"diff/{archive_name}"))

In [None]:
def _load_ba_files(path_fn, rel_dir):
    """Read every file in a folder into one DataFrame tagged with a `ba_code` column.

    Args:
        path_fn: path builder (e.g. `data_folder` or `results_folder`) that turns a
            relative path into a full path.
        rel_dir: folder, relative to `path_fn`'s root, containing the files to read.
            Each file is assumed to be a CSV named `<ba_code>.<extension>`.

    Returns:
        The files concatenated row-wise with a fresh 0..n-1 index. `ba_code` is the
        part of each file name before the first ".".
    """
    frames = []
    # os.listdir returns entries in arbitrary order. Sorting makes the archived and
    # new data load in the same order, which DataFrame.compare needs because it
    # only accepts identically-labeled (and identically-ordered) frames.
    for ba in sorted(os.listdir(path_fn(rel_dir))):
        df = pd.read_csv(path_fn(f"{rel_dir}/{ba}"), dtype=get_dtypes())
        df["ba_code"] = ba.split(".")[0]
        frames.append(df)
    return pd.concat(frames, axis=0).reset_index(drop=True)


# load archived data
prev_data = _load_ba_files(data_folder, f"diff/{year}_{data_type}_{resolution}_us_units")

# load data
new_data = _load_ba_files(results_folder, f"{year}/{data_type}/{resolution}/us_units")

# keep only the rows/columns where the two versions differ; the result has
# two-level columns: (original column name, "previous" | "new")
comparison = prev_data.set_index(key_cols).compare(
    new_data.set_index(key_cols), result_names=("previous", "new")
)

# get difference
# `pct_diff` is the fractional change (new - previous) / previous, rounded to 3 decimals
# (so 0.01 means 1%). It is computed with label-aligned arithmetic instead of
# `groupby(axis=1)`, which is deprecated in recent pandas versions.
if not comparison.empty:  # nothing to compute when the two versions are identical
    previous = comparison.xs("previous", axis=1, level=1)
    new = comparison.xs("new", axis=1, level=1)
    diff = ((new - previous) / previous).round(3)
    diff.columns = pd.MultiIndex.from_product([diff.columns, ["pct_diff"]])
    # sorting on the first level only groups each column's previous / new / pct_diff together
    comparison = pd.concat([comparison, diff], axis=1).sort_index(
        axis=1, level=0, ascending=True, sort_remaining=False
    )


comparison


In [None]:
# column to compare
#col = "consumed_co2_rate_lb_per_mwh_for_electricity"
col = "generated_co2_rate_lb_per_mwh_for_electricity"

# show only the rows where the selected column has a computed, non-zero fractional change
pct_diff = comparison.loc[:, (col, "pct_diff")]
has_value = pct_diff.notna()
is_nonzero = pct_diff.ne(0)

comparison[has_value & is_nonzero]


## Compare intermediate outputs

In [None]:
# select the intermediate output file to compare and the columns used as the comparison index
file = "cems_cleaned"
key_cols = ["plant_id_eia","emissions_unit_id_epa","datetime_utc"]

# unzip archived data
# Extraction is skipped only when the file we actually need is already present.
# Checking for the folder instead would skip extraction forever after a failed or
# partial unzip, because the folder would already exist.
if not os.path.exists(data_folder(f"diff/outputs_{year}/{file}_{year}.csv")):
    with zipfile.ZipFile(data_folder(f"zenodo/outputs_{year}.zip"), "r") as zip_to_unzip:
        # create the target folder only after the archive opened successfully;
        # makedirs also creates the parent `diff` folder if no earlier cell has made it
        os.makedirs(data_folder(f"diff/outputs_{year}"), exist_ok=True)
        zip_to_unzip.extractall(data_folder(f"diff/outputs_{year}"))

# load archived data
prev_data = pd.read_csv(data_folder(f"diff/outputs_{year}/{file}_{year}.csv"), dtype=get_dtypes())

# load new data
new_data = pd.read_csv(outputs_folder(f"{year}/{file}_{year}.csv"), dtype=get_dtypes())

# keep only the rows/columns where the two versions differ
comparison = prev_data.set_index(key_cols).compare(
    new_data.set_index(key_cols), result_names=("previous", "new")
)
comparison
