# Working with the EIA Extract / Transform
This notebook walks through PUDL's extract and transform steps for the EIA-860 and EIA-923 datasets. Running the steps individually makes it easier to test and add new years of data, or to integrate tables from the EIA spreadsheets that haven't been integrated yet.

In [None]:
%load_ext autoreload
%autoreload 2
import pudl
from pudl import constants as pc
import logging
import sys
from pathlib import Path
import pandas as pd
# Remove the column-count limit so wide DataFrames are displayed in full
# rather than being truncated.
pd.options.display.max_columns = None

In [None]:
# Send log output to stdout as bare messages (no timestamp / level prefix).
formatter = logging.Formatter(fmt='%(message)s')
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(formatter)

# Configure the root logger. The handler list is replaced outright (rather
# than appended to) so only this one handler is installed by this cell.
logger = logging.getLogger()
logger.handlers = [handler]
logger.setLevel(logging.INFO)

In [None]:
# Fetch the default PUDL workspace settings. The "data_dir" entry is used
# later in this notebook as the datastore's local cache path.
pudl_settings = pudl.workspace.setup.get_defaults()

## Set the scope for the Extract-Transform:

In [None]:
# Tables to process for each dataset, taken from PUDL's constants.
eia860_tables = pc.PUDL_TABLES['eia860']
eia923_tables = pc.PUDL_TABLES['eia923']

# Years to process. Only one year is selected here; to run over more years
# substitute a longer list, e.g. list(range(2001, 2021)) for EIA-860 or
# list(range(2001, 2020)) for EIA-923.
eia860_years = [2020]
eia923_years = [2020]

# When True, the extract cell also pulls EIA-860m and appends it to the
# raw EIA-860 dataframes.
eia860m = True

## Create a locally cached datastore

In [None]:
# Datastore whose local cache lives in the workspace's "data_dir".
ds = pudl.workspace.datastore.Datastore(
    local_cache_path=Path(pudl_settings["data_dir"]),
)

# EIA-860

## Extract just the EIA-860 / EIA-860m

In [None]:
%%time
# Extract the raw EIA-860 dataframes for the selected years.
eia860_extractor = pudl.extract.eia860.Extractor(ds)
eia860_raw_dfs = eia860_extractor.extract(year=eia860_years)

# Optionally extract EIA-860m for the working year_month partition and
# append it to the raw EIA-860 dataframes.
if eia860m:
    eia860m_extractor = pudl.extract.eia860m.Extractor(ds)
    eia860m_year_month = pc.WORKING_PARTITIONS['eia860m']['year_month']
    eia860m_raw_dfs = eia860m_extractor.extract(year_month=eia860m_year_month)
    eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(
        eia860_raw_dfs=eia860_raw_dfs,
        eia860m_raw_dfs=eia860m_raw_dfs,
    )

## Transform EIA-860 / EIA-860m

In [None]:
%%time
# Run the EIA-860 transform over the raw dataframes, passing the table
# selection chosen in the scope cell.
eia860_transformed_dfs = pudl.transform.eia860.transform(
    eia860_raw_dfs,
    eia860_tables=eia860_tables,
)

# EIA-923

## Extract just the EIA-923

In [None]:
%%time
# Build an EIA-923 extractor on top of the datastore, then extract the raw
# dataframes for the selected years.
eia923_extractor = pudl.extract.eia923.Extractor(ds)
eia923_raw_dfs = eia923_extractor.extract(year=eia923_years)

## Transform just the EIA-923

In [None]:
%%time
# Run the EIA-923 transform over the raw dataframes, passing the table
# selection chosen in the scope cell.
eia923_transformed_dfs = pudl.transform.eia923.transform(
    eia923_raw_dfs,
    eia923_tables=eia923_tables,
)

# Combined EIA Data

## Merge EIA-923/860, set dtypes, harvest entities

In [None]:
%%time
# Combine the per-dataset results into one mapping: start from a copy of the
# EIA-923 tables and layer the EIA-860 tables on top (an EIA-860 entry wins
# if a key appears in both). The originals are left untouched.
eia_transformed_dfs = eia923_transformed_dfs.copy()
eia_transformed_dfs.update(eia860_transformed_dfs)

# Apply the 'eia' dtype conversion before handing the tables to the
# combined EIA transform.
eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(
    eia_transformed_dfs, 'eia')

# The combined EIA transform returns the entity tables and the updated
# transformed tables.
entities_dfs, eia_transformed_dfs = pudl.transform.eia.transform(
    eia_transformed_dfs,
    eia860_years=eia860_years,
    eia923_years=eia923_years,
    eia860m=eia860m,
)

# Assemble the final output: static EIA tables first, then the dtype-converted
# entity tables, then the transformed tables, in that order.
out_dfs = pudl.etl._read_static_tables_eia()
entities_dfs = pudl.helpers.convert_dfs_dict_dtypes(entities_dfs, 'eia')
for table_group in (entities_dfs, eia_transformed_dfs):
    out_dfs.update(table_group)