## Temporary notebook for developing year-over-year comparison checks

TODO: these same functions could easily be used to compare data after code changes as well. 

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import os


In [None]:
%reload_ext autoreload
%autoreload 2

import sys
sys.path.append("../../src")

from column_checks import get_dtypes
from filepaths import results_folder

# Annual data

This is the highest priority for triage, and also the easiest.

In [None]:
# The two years being compared; each year's annual plant-level results
# (US units) are read from the local results directory.
y1, y2 = 2020, 2021
dat_y1, dat_y2 = (
    pd.read_csv(f"../../data/results/{year}/plant_data/annual/us_units/plant_data.csv")
    for year in (y1, y2)
)


In [None]:
# Lookup from results file type to the column(s) used as that file's index.
file_type_to_index_cols = dict(plant_data=["plant_id_eia"])

In [None]:
# Index both years by the plant_data key column(s) so that rows can be
# matched across years by index label.
plant_index_cols = file_type_to_index_cols["plant_data"]
dat_y1, dat_y2 = (frame.set_index(plant_index_cols) for frame in (dat_y1, dat_y2))

In [None]:
# Emission columns are the ones whose name contains the literal "mass_lb".
is_mass_col = dat_y1.columns.str.contains("mass_lb", regex=False)
emission_cols = dat_y1.columns[is_mass_col].tolist()

In [None]:
# Keep only the index labels present in both years, so the two frames
# cover the same plants in the same order.
in_both = dat_y1.index.intersection(dat_y2.index)
dat_y1, dat_y2 = dat_y1.loc[in_both], dat_y2.loc[in_both]


In [None]:
# Keep only plants with strictly positive net generation in BOTH years;
# the per-MWh rates computed later divide by this column.
positive_gen_y1 = dat_y1.loc[dat_y1["net_generation_mwh"] > 0].index
positive_gen_y2 = dat_y2.loc[dat_y2["net_generation_mwh"] > 0].index
non_zero_gen = positive_gen_y1.intersection(positive_gen_y2)
print(len(non_zero_gen))
dat_y1, dat_y2 = dat_y1.loc[non_zero_gen], dat_y2.loc[non_zero_gen]

In [None]:
# Per-MWh emission rates for each year: every emission mass column divided
# row-wise by that plant's net generation, with "_per_mwh" appended to the
# column names.
rates_y1 = (
    dat_y1[emission_cols]
    .div(dat_y1["net_generation_mwh"], axis=0)
    .add_suffix("_per_mwh")
)
rates_y2 = (
    dat_y2[emission_cols]
    .div(dat_y2["net_generation_mwh"], axis=0)
    .add_suffix("_per_mwh")
)

In [None]:
# Drop plants with a zero emission rate: a plant is kept only when every
# rate column is non-zero in both years.
# TODO: ID rates zero in one year and not in other
all_nonzero_y1 = rates_y1.ne(0).all(axis=1)
all_nonzero_y2 = rates_y2.ne(0).all(axis=1)
non_zero_rates = rates_y1.loc[all_nonzero_y1].index.intersection(
    rates_y2.loc[all_nonzero_y2].index
)
rates_y1, rates_y2 = rates_y1.loc[non_zero_rates], rates_y2.loc[non_zero_rates]

In [None]:
# Year-over-year change in each rate as a fraction of the y1 rate.
# Only the numerator is made absolute; the denominator keeps the sign of
# the y1 rate.
frac_diff = (rates_y2 - rates_y1).abs() / rates_y1


In [None]:
# Plot the distribution of fractional differences: one line per rate column,
# evaluated at quantile levels 0.00, 0.01, ... up to (but excluding) 1.
quantile_levels = np.arange(0, 1, .01)
px.line(frac_diff.quantile(quantile_levels))

In [None]:
# Top 5% of differences: the 95th-percentile fractional difference of each
# rate column, expressed as a percentage.
frac_diff.quantile(0.95).mul(100)

In [None]:
# What percentage of differences are above 10%, per rate column?
# frac_diff holds fractions (0.10 == 10%), so the cutoff must be 0.10;
# comparing against 1 would only count changes larger than 100%.
frac_diff_threshold = 0.10
# The mean of a boolean column is the share of rows (plants) over the cutoff.
(frac_diff > frac_diff_threshold).mean().mul(100).sort_values(ascending=False)

# BA-level, annual 

(to come: BA level, monthly)

In [None]:
# Years compared at the BA level, and the folder holding each year's
# annual power-sector results (US units).
y1, y2 = 2019, 2020
path1, path2 = (
    results_folder(f"{year}/power_sector_data/annual/us_units/") for year in (y1, y2)
)

In [None]:
# BA codes available in each year, taken from the CSV file names in each
# results folder. Only "*.csv" entries are considered, so stray entries
# (hidden files, checkpoint folders, ...) are not mistaken for BAs, and only
# the trailing extension is stripped from the name.
ba_list = {
    os.path.splitext(entry)[0] for entry in os.listdir(path1) if entry.endswith(".csv")
}
ba_list2 = {
    os.path.splitext(entry)[0] for entry in os.listdir(path2) if entry.endswith(".csv")
}

# BAs present in both years can be compared ...
bas_to_check = ba_list & ba_list2
# ... BAs present in exactly one year cannot.
bas_cant_check = ba_list ^ ba_list2

In [None]:
# Warning: can't check these BAs since they only exist in one or the other year.
# Displayed here so the skipped BAs are visible in the notebook output.
bas_cant_check

In [None]:
# Per-BA results, one long-format frame per BA:
#   all_ba_changes -- fractional and absolute change of every rate column
#   all_bad_rates  -- (fuel, rate) pairs whose rate is missing in either year
all_ba_changes = []
all_bad_rates = []

# The dtype specification does not depend on the BA, so look it up once.
ba_dtypes = get_dtypes()


def _melt_for_ba(wide, ba, value_name):
    """Reshape a (fuel_category x rate column) table to long format for one BA.

    Returns a frame with the single column `value_name`, indexed by
    (BA, fuel_category, variable), where `variable` is the original column name.
    """
    long = wide.melt(ignore_index=False, value_name=value_name).reset_index()
    long["BA"] = ba
    return long.set_index(["BA", "fuel_category", "variable"])


for ba in bas_to_check:
    ba_y1 = pd.read_csv(f"{path1}/{ba}.csv", dtype=ba_dtypes, index_col="fuel_category")
    ba_y2 = pd.read_csv(f"{path2}/{ba}.csv", dtype=ba_dtypes, index_col="fuel_category")

    # Identify any differences in fuel types between years
    any_year_fuels = ba_y1.index.union(ba_y2.index)
    both_year_fuels = ba_y1.index.intersection(ba_y2.index)
    one_year_fuels = any_year_fuels.difference(both_year_fuels)
    if len(one_year_fuels) > 0:
        print(f"{ba} Warning: fuels {one_year_fuels.values} in only one of comparison years")

    # Is this BA all zero/NaN in either year? (checked per year)
    if ((ba_y1 == 0) | ba_y1.isna()).all().all():
        print(f"Warning: ba {ba} {y1} is all zero/NaN.")

    if ((ba_y2 == 0) | ba_y2.isna()).all().all():
        print(f"Warning: ba {ba} {y2} is all zero/NaN.")

    rate_cols = [c for c in ba_y1.columns if "rate" in c]
    # Sign convention: y1 minus y2, so a positive value means the rate
    # decreased from y1 to y2.
    diff = ba_y1[rate_cols] - ba_y2[rate_cols]
    frac_change = diff / ba_y1[rate_cols]

    # Renewables are often zero in both years -- 0/0 would give NaN, but
    # "no change" is the meaningful answer there.
    frac_change[diff == 0] = 0

    # A rate that is zero in y1 and non-zero in y2 divides to +/-inf (not NaN),
    # so that is the condition to warn on. NaN only appears where a rate is
    # missing in one of the years; those entries are reported through
    # all_bad_rates and excluded from the comparison below.
    if frac_change.isin([np.inf, -np.inf]).any().any():
        print(f"{ba} Warning! some non-zero in y2 but zero in y1")

    ba_changes = _melt_for_ba(frac_change, ba, "frac_change")
    diff_size = _melt_for_ba(diff, ba, "diff")
    # Both frames share the same (BA, fuel_category, variable) index, so this
    # assignment aligns row by row.
    ba_changes["difference"] = diff_size["diff"]

    # Find and exclude differences from rates which are NaN in one or other year
    bad_rates = _melt_for_ba(diff.isna(), ba, "is_na")
    is_bad = bad_rates["is_na"].astype(bool)

    all_bad_rates.append(bad_rates[is_bad])
    ba_changes = ba_changes.loc[bad_rates[~is_bad].index]

    all_ba_changes.append(ba_changes)


In [None]:
# Combine every BA's changes and order them by the magnitude of the absolute
# (not relative) difference, largest first. The helper column used for
# sorting is dropped again so it does not appear in the export.
to_export = (
    pd.concat(all_ba_changes)
    .assign(abs_diff=lambda changes: changes["difference"].abs())
    .sort_values("abs_diff", ascending=False)
    .drop(columns="abs_diff")
)

In [None]:
# Write the sorted comparison table into the "validation" results folder,
# creating the folder first if it does not exist yet.
# NOTE(review): the output file name has no ".csv" extension even though it
# is written with to_csv -- confirm whether that is intentional.
validation_dir = results_folder("validation")
os.makedirs(validation_dir, exist_ok=True)
to_export.to_csv(f"{validation_dir}/yoy_compare_{y1}_{y2}")