# About
This notebook will produce all the electricity data for step 3.1B of the data processing for Module 1.

**This requires the outputs of the previous steps, 1.1B and 1.1C, so run those notebooks first.**

---

**Required user input**

Set the year in the cell below, then run the entire notebook. Repeat this once for each of the full years of data: 2019, 2020 and 2021.


In [None]:
year = '2021' # Update year - this is the year of data you are working on. It is inserted into the input and output file names below.
# Some electricity meters sometimes report hh_sums in units of 10Wh (the daily read is then around 10x the hh_sum).
# Set below how those cases are treated:
#   'by_day'   - on a day by day basis, use the daily read instead if valid (or else NaN);
#   'by_PUPRN' - remove the affected PUPRNs' data entirely wherever this occurs.
# Any other value leaves the affected rows as they are; the processing cell only reports how many there were.
treat_hh_sum_10Wh = 'by_PUPRN' # Set as by_day or by_PUPRN.

In [None]:
# Don't change these.
# Both inputs (from step 1.1B and step 1.1C) are read from the same folder.
source_directory_from_daily='Step_1_1_Outputs'
source_directory_from_hh=source_directory_from_daily

# File names are built with f-strings so that `year` works whether it was entered as '2021' or as 2021
# (plain '+' concatenation raises a TypeError for a non-string year).
source_filename_from_daily = f'Step_1_1B_Elec_{year}_daily_from_daily.csv'
source_filename_from_hh = f'Step_1_1C_Elec_{year}_daily_from_hh.csv'
source_exporter_puprn_list_filename = f'Step_1_1C_Elec_{year}_list_of_exporter_puprns.csv'

# Where this notebook writes its result.
output_directory='Step_3_1_Outputs'
output_filename=f'Step_3_1B_Elec_{year}_daily.csv'

# Code

## Setup

In [None]:
import os
import pandas as pd
import numpy as np

## Process the data

In [None]:
# Load the list of exporting homes (a headerless csv; the first column holds the PUPRNs).
exporter_list_path = os.path.join(source_directory_from_hh,source_exporter_puprn_list_filename)
try:
    exporter_puprn_list = pd.read_csv(exporter_list_path,header=None)[0].tolist()
except pd.errors.EmptyDataError:
    # read_csv cannot parse a completely empty file. Presumably that means step 1.1C found
    # no exporting homes for this year (TODO confirm), so carry on with an empty list.
    exporter_puprn_list = []

In [None]:
# Read the two daily electricity tables produced by 1.1B and 1.1C.
# Both are indexed on the same (PUPRN, read date) pair so they can be merged on the index.
key_columns = ['PUPRN','Read_date_effective_local']
energy_daily_data_from_daily = pd.read_csv(
    os.path.join(source_directory_from_daily,source_filename_from_daily),
    index_col=key_columns,
)
energy_daily_data_from_hh = pd.read_csv(
    os.path.join(source_directory_from_hh,source_filename_from_hh),
    index_col=key_columns,
)

# Outer merge on the shared index: rows that appear in only one of the two sources are kept.
energy_daily_data = energy_daily_data_from_daily.merge(
    energy_daily_data_from_hh,
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the first merged rows that have a half-hourly import sum.
has_hh_import_sum = energy_daily_data['Elec_act_imp_hh_sum_Wh'].notna()
energy_daily_data.loc[has_hh_import_sum].head()

In [None]:
# Sanity check on the merged index: it must be sorted (PUPRN, then date) and contain no repeated (PUPRN, date) pair.
index_is_sorted = energy_daily_data.index.is_monotonic_increasing
index_has_no_duplicates = energy_daily_data.index.is_unique
all_rows_as_expected = index_is_sorted and index_has_no_duplicates

if all_rows_as_expected:
    print('Data is sorted by PUPRN then date and has no duplicates, as it should be.')
else:
    print('WARNING! Your data has duplicates of PUPRN and date, or is out of order - check and fix before continuing.')

In [None]:
# Add an electricity flag column. It stays NaN unless one of the steps below assigns a source:
# 1 = value taken from the half-hourly (hh) sum, 0 = value taken from the daily read.
energy_daily_data['Hh_sum_flag_elec']=np.nan

## Then create the best estimate of the actual usage, with appropriate flag values.

# If there is a reading based on hh data, use that.  (We can just rename Elec_act_net_hh_sum_Wh - we'll convert to kWh later)
energy_daily_data.rename(columns={'Elec_act_net_hh_sum_Wh':'Clean_elec_net_d_Wh'},inplace=True)
energy_daily_data.loc[energy_daily_data.Clean_elec_net_d_Wh.notnull(), 'Hh_sum_flag_elec']=1 # Value 1 for source data being a hh sum

# If there is no reading based on hh data, then if the Elec_act_imp_flag==1 (a valid read from the daily data)
# AND the PUPRN never exports electricity (it's not in exporter_puprn_list), use the Elec_act_imp_d_Wh.
# The mask is built once, before either column is written, so both assignments hit exactly the same rows.
use_daily_read = (energy_daily_data.Clean_elec_net_d_Wh.isnull()&
                  (energy_daily_data.Elec_act_imp_flag==1)&
                  (~energy_daily_data.index.isin(exporter_puprn_list,level='PUPRN')))
energy_daily_data.loc[use_daily_read,'Hh_sum_flag_elec']=0 # Value 0 for source data being a daily read
energy_daily_data.loc[use_daily_read,'Clean_elec_net_d_Wh']=energy_daily_data.Elec_act_imp_d_Wh

# Final data cleaning - deal with occasional cases where hh readings are apparently stored in units of 10s of Wh
# For all rows where the daily sum is around 10x the import hh_sum (between 8 and 12 times), we treat the daily sum as being the correct import value.
# The hh_sum must be strictly positive: when the daily read and the hh_sum are both 0 the 8x-12x test is
# trivially true (0>=0 and 0<=0), which would wrongly mark zero-import days as being in the wrong units.
is_10Wh_row = ((energy_daily_data.Elec_act_imp_hh_sum_Wh>0)&
               (energy_daily_data.Elec_act_imp_d_Wh>=8*energy_daily_data.Elec_act_imp_hh_sum_Wh)&
               (energy_daily_data.Elec_act_imp_d_Wh<=12*energy_daily_data.Elec_act_imp_hh_sum_Wh))

# First, check how common this issue is
rowsof10Wh = int(is_10Wh_row.sum())
all_data_length = energy_daily_data.shape[0]
affected_PUPRNs = energy_daily_data.loc[is_10Wh_row].index.get_level_values('PUPRN').unique().tolist()
affected_PUPRN_count = len(affected_PUPRNs)
all_PUPRN_count = len(energy_daily_data.index.get_level_values('PUPRN').unique())

if treat_hh_sum_10Wh == 'by_day':
    # Decide what happens to every affected row BEFORE changing anything. The test for "no export that day"
    # compares the import hh_sum with Clean_elec_net_d_Wh, so it must be evaluated while that column still
    # holds the pre-fix value; otherwise rows that have just been given the daily read would look like
    # export days and be blanked out again.
    no_export_that_day = energy_daily_data.Elec_act_imp_hh_sum_Wh==energy_daily_data.Clean_elec_net_d_Wh
    replace_with_daily_read = is_10Wh_row & no_export_that_day
    set_to_missing = is_10Wh_row & ~no_export_that_day

    # If the home does not export data for that day (the net and import hh_sums are equal), then we can use the daily sum, assuming it to be accurate.
    energy_daily_data.loc[replace_with_daily_read,'Hh_sum_flag_elec']=0
    energy_daily_data.loc[replace_with_daily_read,'Clean_elec_net_d_Wh']=energy_daily_data.Elec_act_imp_d_Wh
    # If the home does export data (the net and import hh_sums are not equal), then we have to treat the data as missing - neither the hh_sum nor the daily read are accurate measures of net electricity use.
    energy_daily_data.loc[set_to_missing,['Hh_sum_flag_elec','Clean_elec_net_d_Wh']]=np.nan
    report_10Wh = 'These were fixed on a day by day basis for the affected rows by using the daily read where valid to do so, or else assigning a value of NaN.'
elif treat_hh_sum_10Wh == 'by_PUPRN':
    # Remove every row of every affected PUPRN, then check the PUPRN counts add up.
    energy_daily_data = energy_daily_data.drop(affected_PUPRNs,level='PUPRN')
    final_PUPRN_count = len(energy_daily_data.index.get_level_values('PUPRN').unique())
    report_10Wh = "These were fixed on a PURPN by PUPRN basis by removing those PUPRNs' data from this year's output. Final count of PUPRNs saved is "+str(final_PUPRN_count)
    if all_PUPRN_count == (final_PUPRN_count + affected_PUPRN_count):
        report_10Wh=report_10Wh+". This is the expected number."
    else:
        report_10Wh=report_10Wh+"\nWARNING: This is NOT the expected number of PUPRNs - check for errors."
else:
    # Any other setting leaves the affected rows untouched and only reports them.
    report_10Wh = "No action was taken with these rows - the output contains these hh_sum values even if they are likely to be in the incorrect units (10Whs)"

# Convert to kWh
energy_daily_data['Clean_elec_net_d_kWh'] = energy_daily_data.Clean_elec_net_d_Wh.div(1000)

# Report back
print(f"There were {rowsof10Wh} rows where the daily import value is around 10x (8x-12x) the hh sum, out of a total of {all_data_length}.\n {affected_PUPRN_count} PUPRNs are affected, out of a total of {all_PUPRN_count}.\n{report_10Wh}")

In [None]:
# Preview the first rows belonging to homes that are on the exporter list.
is_exporter_row = energy_daily_data.index.isin(exporter_puprn_list,level='PUPRN')
energy_daily_data.loc[is_exporter_row].head()

In [None]:
# Save the relevant columns to merge with the gas readings later.
# exist_ok=True creates the output folder only if it is missing, with no separate existence check.
os.makedirs(output_directory,exist_ok=True)
energy_daily_data[['Clean_elec_net_d_kWh','Hh_sum_flag_elec']].to_csv(os.path.join(output_directory,output_filename),index=True)
print("Job done. Everything saved.")