# HEDNO Load Data and Factor IDs

In [None]:
import pandas as pd

from hedno.pipelines.preprocess import nodes as prp

In [None]:
# Make DataFrame.info() report "deep" memory usage and have displayed
# frames print their dimensions.
pd.set_option("display.memory_usage", "deep")
pd.set_option("display.show_dimensions", True)

## Categories

In [None]:
# Display PHASE_TYPE as defined in the preprocess nodes module.
prp.PHASE_TYPE

In [None]:
# Display REQUEST_TYPE as defined in the preprocess nodes module.
prp.REQUEST_TYPE

In [None]:
# Display VOLTAGE_TYPE as defined in the preprocess nodes module.
prp.VOLTAGE_TYPE

## Load utility datasets

### Rates

In [None]:
preprocessed_rates = (
    catalog
    .load("raw_rates")
    .pipe(prp.preprocess_rates)
)

preprocessed_rates

In [None]:
preprocessed_rates.info(show_counts=True)

### Powers

In [None]:
preprocessed_powers = (
    catalog
    .load("raw_powers")
    .pipe(prp.preprocess_powers)
)

preprocessed_powers

In [None]:
preprocessed_powers.info(show_counts=True)

### Suppliers

In [None]:
preprocessed_suppliers = (
    catalog
    .load("raw_suppliers")
    .pipe(prp.preprocess_suppliers)
)

preprocessed_suppliers

In [None]:
preprocessed_suppliers.info(show_counts=True)

### Uses

In [None]:
preprocessed_uses = (
    catalog
    .load("raw_uses")
    .pipe(prp.preprocess_uses)
)

preprocessed_uses

In [None]:
preprocessed_uses.info(show_counts=True)

## Load main datasets

### Records

#### Test

In [None]:
# Load the "raw_records_test" dataset from the catalog and display it.
records_test = catalog.load("raw_records_test")

records_test

In [None]:
# Column dtypes and non-null counts of the loaded test records.
records_test.info(show_counts=True)

#### Train

In [None]:
# Load the "raw_records_train" dataset from the catalog and display it.
records_train = catalog.load("raw_records_train")

records_train

In [None]:
# Column dtypes and non-null counts of the loaded train records.
records_train.info(show_counts=True)

#### Merge and preprocess

In [None]:
records = (
    prp
    .concat_annotated_train_and_test(
        test_df=records_test,
        train_df=records_train
    )
    .pipe(prp.preprocess_records)
)

records

In [None]:
records.info(show_counts=True)

#### Check Unique Key

In [None]:
# Candidate unique key of the merged records.
RECORDS_UNIQUE_KEY = ["ACCT_NBR", "successor"]

# Show every row whose key occurs more than once (keep=False marks all
# members of a duplicate group); an empty result means the key is unique.
records.loc[records.duplicated(subset=RECORDS_UNIQUE_KEY, keep=False)]

`(ACCT_NBR, successor)` is a UNIQUE KEY of the merged `Records` dataset. Use the accompanying `test` entry in `Records` as the source of truth for train vs. test labelling, and **ignore** any such train vs. test distinction in `Consumptions`, `Representations` and `Requests`.

### Representations

#### Test

In [None]:
# Load the "raw_representations_test" dataset from the catalog and display it.
representations_test = catalog.load("raw_representations_test")

representations_test

In [None]:
# Column dtypes and non-null counts of the loaded test representations.
representations_test.info(show_counts=True)

#### Train

In [None]:
representations_train = (
    catalog
    .load("raw_representations_train")
    .pipe(prp.hotfix_representations_train)
)

representations_train

In [None]:
representations_train.info(show_counts=True)

#### Merge and preprocess

In [None]:
representations = (
    prp
    .concat_train_and_test(
        test_df=representations_test,
        train_df=representations_train
    )
    .pipe(prp.preprocess_representations)
)

representations

In [None]:
representations.info(show_counts=True)

### Requests

#### Test

In [None]:
# Load the "raw_requests_test" dataset from the catalog and display it.
requests_test = catalog.load("raw_requests_test")

requests_test

In [None]:
# Column dtypes and non-null counts of the loaded test requests.
requests_test.info(show_counts=True)

#### Train

In [None]:
requests_train = (
    catalog
    .load("raw_requests_train")
    .pipe(prp.hotfix_requests_train)
)

requests_train

In [None]:
requests_train.info(show_counts=True)

#### Merge and preprocess

In [None]:
requests = (
    prp
    .concat_train_and_test(
        test_df=requests_test,
        train_df=requests_train
    )
    .pipe(prp.preprocess_requests)
)

requests

In [None]:
requests.info(show_counts=True)

### Consumptions

#### Test

In [None]:
constt = catalog.load("raw_consumptions_test")

constt

In [None]:
constt.info(show_counts=True)

#### Train

In [None]:
constr = catalog.load("raw_consumptions_train")

constr

In [None]:
constr.info(show_counts=True)

#### Merge and preprocess

In [None]:
consumptions = (
    prp
    .concat_train_and_test(
        test_df=constt,
        train_df=constr
    )
    .pipe(prp.preprocess_consumptions)
)

consumptions

In [None]:
consumptions.info(show_counts=True)

### PowerThefts

In [None]:
powerthefts = (
    catalog
    .load("raw_powerthefts")
    .pipe(prp.hotfix_powerthefts)
    .pipe(prp.preprocess_powerthefts)
)

powerthefts

In [None]:
powerthefts.info(show_counts=True)

### Tests

In [None]:
preprocessed_tests = (
    catalog
    .load("raw_tests")
    .pipe(prp.preprocess_tests)
)

preprocessed_tests

In [None]:
preprocessed_tests.info(show_counts=True)

## Construct ID spaces

### Accounts

In [None]:
%%time

# Build the accounts ID space from all six datasets passed below.
# NOTE(review): how accounts are gathered from each input is decided inside
# prp.collect_accounts — not visible from this notebook.
accounts = prp.collect_accounts(
    consumptions=consumptions,
    powerthefts=powerthefts,
    records=records,
    representations=representations,
    requests=requests,
    tests=preprocessed_tests
)

accounts

In [None]:
# Column dtypes and non-null counts of the accounts ID space.
accounts.info(show_counts=True)

### Meters

In [None]:
%%time

# Build the meters ID space; consumptions is the only input dataset.
meters = prp.collect_meters(consumptions)

meters

In [None]:
# Column dtypes and non-null counts of the meters ID space.
meters.info(show_counts=True)

### Methods

In [None]:
# Build the methods ID space; power thefts is the only input dataset.
methods = prp.collect_methods(powerthefts=powerthefts)

methods

In [None]:
# Column dtypes and non-null counts of the methods ID space.
methods.info(show_counts=True)

### Places

In [None]:
# Build the places ID space; power thefts is the only input dataset.
places = prp.collect_places(powerthefts=powerthefts)

places

In [None]:
# Column dtypes and non-null counts of the places ID space.
places.info(show_counts=True)

### Powers

In [None]:
# Build the powers ID space from the preprocessed powers utility table and
# the merged records.
powers = prp.collect_powers(powers=preprocessed_powers, records=records)

powers

In [None]:
# Column dtypes and non-null counts of the powers ID space.
powers.info(show_counts=True)

### Suppliers

In [None]:
# Build the suppliers ID space from the merged representations and the
# preprocessed suppliers utility table.
suppliers = prp.collect_suppliers(
    representations=representations,
    suppliers=preprocessed_suppliers
)

suppliers

In [None]:
# Column dtypes and non-null counts of the suppliers ID space.
suppliers.info(show_counts=True)

#### Missing suppliers

The following supplier(s) appear in `Representations` but not in `Suppliers`, thus their `supplier_name` is missing.

Note: Supplier `Π` also appears in `Representations` only, but has been hotfixed to (known supplier) `π`, as there was only 1 such entry.

In [None]:
# Show the suppliers whose supplier_name is null.
suppliers.query("supplier_name.isna()")

### Rates

In [None]:
# Build the rates ID space from the merged consumptions and the
# preprocessed rates utility table.
rates = prp.collect_rates(consumptions=consumptions, rates=preprocessed_rates)

rates

In [None]:
# Column dtypes and non-null counts of the rates ID space.
rates.info(show_counts=True)

#### Missing rates
The following rates appear in `Consumptions` but not in `Rates`, and thus their `rate_name` is missing.

In [None]:
# Show the rates whose rate_name is null.
rates.query("rate_name.isna()")

### Suppliers

In [None]:
# NOTE(review): this cell rebuilds `suppliers` with the same call and the
# same inputs as the Suppliers cell earlier in "Construct ID spaces"; it
# looks redundant — confirm whether this repeated section can be dropped.
suppliers = prp.collect_suppliers(
    representations=representations,
    suppliers=preprocessed_suppliers
)

suppliers

In [None]:
# Column dtypes and non-null counts of the suppliers ID space.
suppliers.info(show_counts=True)

#### Missing suppliers

The following supplier(s) appear in `Representations` but not in `Suppliers`, thus their `supplier_name` is missing.

Note: Supplier `Π` also appears in `Representations` only, but has been hotfixed to (known supplier) `π`, as there was only 1 such entry.

In [None]:
# Show the suppliers whose supplier_name is null.
suppliers.query("supplier_name.isna()")

### Uses

In [None]:
# Build the uses ID space from the merged records and the preprocessed
# uses utility table.
uses = prp.collect_uses(records=records, uses=preprocessed_uses)

uses

In [None]:
# Column dtypes and non-null counts of the uses ID space.
uses.info(show_counts=True)

## Factor datasets with respect to ID spaces

### Consumptions

In [None]:
%%time

# Factor the consumptions with respect to the accounts, meters and rates
# ID spaces.
# NOTE(review): presumably this swaps raw keys for IDs from those spaces —
# confirm in prp.factor_consumptions.
factored_consumptions = prp.factor_consumptions(
    consumptions=consumptions,
    accounts=accounts,
    meters=meters,
    rates=rates
)

factored_consumptions

In [None]:
# Column dtypes and non-null counts of the factored consumptions.
factored_consumptions.info(show_counts=True)

### PowerThefts

In [None]:
# Factor the power thefts with respect to the accounts, methods and places
# ID spaces.
factored_powerthefts = prp.factor_powerthefts(
    powerthefts=powerthefts,
    accounts=accounts,
    methods=methods,
    places=places
)

factored_powerthefts

In [None]:
# Column dtypes and non-null counts of the factored power thefts.
factored_powerthefts.info(show_counts=True)

### Locations and Records

In [None]:
# Factor the merged records with respect to the accounts, powers and uses
# ID spaces. The combined result is the input to both prp.factor_locations
# and prp.factor_records in the cells that follow.
factored_records_and_locations = prp.factor_records_and_locations(
    records=records,
    accounts=accounts,
    powers=powers,
    uses=uses
)

factored_records_and_locations

In [None]:
# Column dtypes and non-null counts of the combined records/locations frame.
factored_records_and_locations.info(show_counts=True)

#### Locations
A location is supposed to be determined solely by `account_id`.

In [None]:
# Derive the locations dataset from the combined records/locations frame.
factored_locations = prp.factor_locations(
    factored_records_and_locations=factored_records_and_locations
)

factored_locations

In [None]:
# Column dtypes and non-null counts of the factored locations.
factored_locations.info(show_counts=True)

##### Check Unique Keys

In [None]:
# Candidate unique key of the factored locations.
LOCATION_UNIQUE_KEY = ["account_id"]

# Show every row whose key occurs more than once (keep=False marks all
# members of a duplicate group); an empty result means the key is unique.
factored_locations.loc[
    factored_locations.duplicated(subset=LOCATION_UNIQUE_KEY, keep=False)
]

`account_id` is indeed a UNIQUE KEY for `x` and `y`.

#### Records

In [None]:
# Derive the records dataset from the combined records/locations frame.
factored_records = prp.factor_records(
    factored_records_and_locations=factored_records_and_locations
)

factored_records

In [None]:
# Column dtypes and non-null counts of the factored records.
factored_records.info(show_counts=True)

### Representations

In [None]:
# Factor the merged representations with respect to the accounts and
# suppliers ID spaces.
factored_representations = prp.factor_representations(
    representations=representations,
    accounts=accounts,
    suppliers=suppliers
)

factored_representations

In [None]:
# Column dtypes and non-null counts of the factored representations.
factored_representations.info(show_counts=True)

### Requests

In [None]:
# Factor the merged requests with respect to the accounts ID space.
factored_requests = prp.factor_requests(requests=requests, accounts=accounts)

factored_requests

In [None]:
# Column dtypes and non-null counts of the factored requests.
factored_requests.info(show_counts=True)

### Tests

In [None]:
# Factor the preprocessed tests with respect to the accounts ID space.
# NOTE(review): unlike the other factored datasets this result is named
# `tests`, not `factored_tests` — confirm the naming is intentional.
tests = prp.factor_tests(tests=preprocessed_tests, accounts=accounts)

tests

In [None]:
# Column dtypes and non-null counts of the factored tests.
tests.info(show_counts=True)

## Save factored datasets

In [None]:
%%time

# Persist each factored dataset under the catalog entry of the same name.
# NOTE(review): the output of prp.factor_tests (`tests`) is not saved here;
# it is saved under "tests" in the primary-datasets cell — confirm intended.
catalog.save("factored_consumptions", factored_consumptions)
catalog.save("factored_locations", factored_locations)
catalog.save("factored_powerthefts", factored_powerthefts)
catalog.save("factored_records", factored_records)
catalog.save("factored_representations", factored_representations)
catalog.save("factored_requests", factored_requests)

## Save primary datasets

In [None]:
%%time

# Persist each ID space under the catalog entry of the same name.
catalog.save("accounts", accounts)
catalog.save("meters", meters)
catalog.save("methods", methods)
catalog.save("places", places)
catalog.save("powers", powers)
catalog.save("rates", rates)
catalog.save("suppliers", suppliers)
# NOTE(review): `tests` is produced by prp.factor_tests, yet it is saved
# with the primary datasets rather than the factored ones — confirm.
catalog.save("tests", tests)
catalog.save("uses", uses)