# About this notebook

This notebook is for test-running the data pipeline and serves as a sandbox for testing new functions that we are adding to the data pipeline.

In [None]:
# import packages
import numpy as np
import pandas as pd
import argparse
import os

# Enable the autoreload extension; mode 2 reloads all modules before each
# cell executes, so edits to the local modules are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Tell python where to look for modules.
import sys

# NOTE(review): this is a relative path, so it depends on the notebook's
# working directory; presumably it points at the repository root that
# contains the `src` package — confirm.
sys.path.append("../../hourly-egrid/")

# import local modules
import src.data_cleaning as data_cleaning
import src.gross_to_net_generation as gross_to_net_generation
import src.load_data as load_data
import src.residual as residual

# Run the pipeline

In [None]:
# Change the working directory to ../src (presumably where data_pipeline lives).
# The change persists for the rest of the kernel session, so relative paths in
# later cells are resolved from there.
%cd ../src
# Execute data_pipeline with the `--year 2020` argument.
%run data_pipeline --year 2020

In [None]:
# Change the working directory to ../src, relative to the current working directory.
%cd ../src
# Execute data_pipeline for 2020 with the extra `--small SMALL` argument.
# NOTE(review): presumably this selects a reduced-size run — confirm against
# the argument parser in data_pipeline.
%run data_pipeline --small SMALL --year 2020

# Test new functions

In [None]:
# Which pipeline outputs to load: the data year and an optional filename prefix
year = 2020
path_prefix = ''

# CEMS output; both timestamp columns are parsed as dates on read
cems = pd.read_csv(
    f'../data/outputs/{path_prefix}cems_{year}.csv',
    parse_dates=['operating_datetime_utc', 'report_date'],
)

# Allocated EIA-923 output; report_date is parsed as a date on read
eia923_allocated = pd.read_csv(
    f'../data/outputs/{path_prefix}eia923_allocated_{year}.csv',
    parse_dates=['report_date'],
)

In [None]:
# Plant attribute table that is joined onto the EIA-923 records by plant_id_eia
plant_frame = pd.read_csv(f"../data/outputs/{path_prefix}plant_static_attributes.csv")

# Attach the plant attributes to the EIA-923 records.
# Only columns that are not already present on eia923_allocated are merged, so
# re-running this cell leaves the frame unchanged instead of adding duplicated
# `*_x` / `*_y` columns (which would break later lookups by column name).
new_plant_columns = [
    column
    for column in plant_frame.columns
    if column not in eia923_allocated.columns
]
if new_plant_columns:
    eia923_allocated = eia923_allocated.merge(
        plant_frame[["plant_id_eia", *new_plant_columns]],
        how="left",
        on="plant_id_eia",
    )

In [None]:
# 11. Assign hourly profile to monthly data
print('Assigning hourly profile to monthly EIA-923 data')

# Build a separate dataframe holding only the generators for which we do not
# have CEMS data: rows whose hourly data source is "eia" and that report a
# (non-missing) fuel consumption value
uses_eia_source = eia923_allocated["hourly_data_source"] == "eia"
has_fuel_consumed = eia923_allocated["fuel_consumed_mmbtu"].notna()
monthly_eia_data_to_distribute = eia923_allocated[uses_eia_source & has_fuel_consumed]

# load profile data and format for use in the pipeline
# TODO: once this is in the pipeline (step 10), may not need to read file
hourly_profiles = pd.read_csv(
    "../data/outputs/residual_profiles.csv",
    parse_dates=["report_date"],
)


In [None]:
# A profile is identified by its balancing authority and fuel category
profile_keys = ['ba_code', 'fuel_category']

# BA/fuel combinations for which an hourly profile exists
available_profiles = hourly_profiles[profile_keys].drop_duplicates()

# BA/fuel combinations present in the monthly data that must be distributed
ba_fuel_to_distribute = (
    monthly_eia_data_to_distribute[profile_keys]
    .drop_duplicates()
    .dropna()
)

# The merge indicator marks combinations found only in the monthly data
# ('left_only'), i.e. those that have no matching profile
missing_profiles = ba_fuel_to_distribute.merge(
    available_profiles,
    how='outer',
    on=profile_keys,
    indicator='source',
)
missing_profiles = missing_profiles.loc[missing_profiles['source'] == 'left_only']
missing_profiles.sort_values(by=['fuel_category', 'ba_code'])

In [None]:
# Build the hourly profiles with residual.load_hourly_profiles, passing the
# monthly EIA data to distribute and the data year.
# NOTE: this rebinds `hourly_profiles`, replacing whatever value the name held
# before this cell ran.
hourly_profiles = residual.load_hourly_profiles(monthly_eia_data_to_distribute, year)

In [None]:
# Table of counts per fuel category (rows) and profile method (columns),
# computed over the distinct (ba_code, fuel_category, profile_method) rows;
# combinations that never occur are shown as 0
profile_method_counts = (
    hourly_profiles[['ba_code', 'fuel_category', 'profile_method']]
    .drop_duplicates()
    .pivot_table(index='fuel_category', columns='profile_method', aggfunc='count')
    .fillna(0)
    .astype(int)
)
print(profile_method_counts)

# Investigate profile shapes

In [None]:
import plotly.express as px
import src.eia930 as eia930

In [None]:
# Columns of the raw EIA-930 'BALANCE' data needed for the hydro analysis
hydro_demand_columns = [
    "Balancing Authority",
    "operating_datetime_utc",
    "Demand (MW)",
    "Net Generation (MW)",
    "Net Generation (MW) from Hydropower and Pumped Storage",
]

# Load the raw data for the year, then keep only those columns
hydro_demand = load_data.load_raw_eia930_data(year, 'BALANCE')
hydro_demand = hydro_demand[hydro_demand_columns]

In [None]:
# Total hydro net generation per balancing authority.
# Only the hydro column is summed rather than every column of the frame: the
# frame also carries operating_datetime_utc, which is not meaningful to sum and
# (if it is datetime-typed) makes DataFrameGroupBy.sum raise on pandas >= 2.0,
# where numeric_only defaults to False.
hydro_column = "Net Generation (MW) from Hydropower and Pumped Storage"
hydro_total_by_ba = hydro_demand.groupby("Balancing Authority")[hydro_column].sum()

# Names of the balancing authorities whose hydro generation sums to exactly zero
bas_with_no_hydro = list(hydro_total_by_ba[hydro_total_by_ba == 0].index)

In [None]:
# Keep only the rows belonging to balancing authorities that are not in the
# no-hydro list
ba_has_no_hydro = hydro_demand['Balancing Authority'].isin(bas_with_no_hydro)
hydro_demand = hydro_demand.loc[~ba_has_no_hydro]

In [None]:
# Hydro net generation over time, one line per balancing authority
px.line(
    hydro_demand,
    x='operating_datetime_utc',
    y='Net Generation (MW) from Hydropower and Pumped Storage',
    color='Balancing Authority',
)

In [None]:
# Per-BA correlation matrix between demand and hydro net generation
correlated_columns = [
    "Demand (MW)",
    "Net Generation (MW) from Hydropower and Pumped Storage",
]
hydro_corr = (
    hydro_demand.groupby("Balancing Authority")[correlated_columns]
    .corr()
    .reset_index()
)

# From each BA's matrix keep only the 'Demand (MW)' row and drop its
# self-correlation column, leaving the demand-vs-hydro correlation per BA
is_demand_row = hydro_corr['level_1'] == 'Demand (MW)'
hydro_corr = hydro_corr.loc[is_demand_row].drop(columns=["Demand (MW)", "level_1"])
hydro_corr

In [None]:
# Load the EBA_adjusted_elec.csv output for `year` through
# eia930.load_chalendar_for_pipeline.
# NOTE(review): presumably this is the cleaned EIA-930 data in the format the
# pipeline expects — confirm against src/eia930.
cleaned_930 = eia930.load_chalendar_for_pipeline(
    "../data/outputs/EBA_adjusted_elec.csv", year=year
)

In [None]:
# Display the loaded data for a quick visual inspection
cleaned_930

In [None]:
# Distinct fuel categories present in the loaded data
cleaned_930["fuel_category"].unique()

In [None]:
# Fuel category to inspect
fuel = 'wind'

# Rows of the loaded data that belong to the selected fuel category
is_selected_fuel = cleaned_930['fuel_category'] == fuel
data_to_plot = cleaned_930.loc[is_selected_fuel]

# Net generation over local time, one line per balancing authority
px.line(
    data_to_plot,
    x='datetime_local',
    y='net_generation_mwh_930',
    color='ba_code',
)

In [None]:
# Display the data selected for plotting for a quick visual inspection
data_to_plot

In [None]:
# Reshape to one column per balancing authority, indexed by local datetime ...
generation_by_ba = data_to_plot.pivot(
    index='datetime_local',
    columns='ba_code',
    values='net_generation_mwh_930',
)
# ... then show the pairwise correlation of net generation between BAs
generation_by_ba.corr()