# About
This notebook will produce the remaining electricity daily data for step 1.1C of the data processing for Module 1.

**This requires output from the previous step - 1_1B - so run that notebook first**

---

**Required user input**

Set `year` in the cell below to each of the full years 2019, 2020 and 2021 in turn, and run the entire notebook once for each year.


In [None]:
# Year of data being processed. Update this and re-run the whole notebook for each year.
year = "2021"

In [None]:
# Don't change these.
# `os` is imported here as well as in the Setup cell because this configuration
# cell runs before the notebook's main import cell.
import os

# Folder that holds the outputs of every Step 1.1 notebook.
output_directory = 'Step_1_1_Outputs'

# Path the pre-calculated clean hh data files are saved in.
# Built with os.path.join rather than a hard-coded backslash: '\S' is an
# invalid escape sequence in a normal string literal, and a backslash is only
# treated as a path separator on Windows.
source_directory = os.path.join(output_directory, 'Step_1_1B_Elec_' + year + '_hh')

# Files written by this notebook (both are saved inside output_directory).
output_filename = 'Step_1_1C_Elec_' + year + '_daily_from_hh.csv'
exporter_list_filename = 'Step_1_1C_Elec_' + year + '_list_of_exporter_puprns.csv'

# Code
## Setup

In [None]:
import os
import pandas as pd
import locations
# Clock change dates: only the date column and n_hh are loaded. n_hh is later
# compared against each day's count of half-hourly reads.
clock_changes = pd.read_csv(
    os.path.join(locations.serl_data_path, locations.bst_dates),
    index_col=False,
    usecols=['Read_date_effective_local', 'n_hh'],
)

In [None]:
# Get list of PUPRNs to work with, from the source directory.
# Only regular files ending in .csv are kept: each entry is later parsed with
# pd.read_csv and has its '.csv' suffix stripped to recover the PUPRN, so any
# other stray file in the folder would break the run.
puprn_filelist = sorted(
    (
        f for f in os.listdir(source_directory)
        if f.lower().endswith('.csv')
        and os.path.isfile(os.path.join(source_directory, f))
    ),
    key=str.lower,  # case-insensitive ordering of the file names
)
print('Check this is how many PUPRN hh files you were expecting to find:\n',
      len(puprn_filelist))

## Loop through the list of PUPRNs, calculating net electricity use per day for each

At the same time, we build a definitive list of the PUPRNs that exported any electricity at all during this year.

In [None]:
def _daily_totals_from_hh(hh_data, clock_changes):
    """Aggregate one home's half-hourly reads into daily totals for complete days.

    Parameters
    ----------
    hh_data : pandas.DataFrame
        Half-hourly data containing at least the columns 'PUPRN',
        'Read_date_effective_local', 'Elec_act_net_hh_Wh' and 'Elec_act_imp_hh_Wh'.
    clock_changes : pandas.DataFrame
        Table with columns 'Read_date_effective_local' and 'n_hh', where n_hh
        is the number of reads expected on a clock change day.

    Returns
    -------
    pandas.DataFrame
        One row per PUPRN and date for which the number of non-missing net
        reads equals the expected number (48, or n_hh on dates listed in
        clock_changes). Includes 'Elec_act_net_flag' (always 1 here),
        'Elec_act_net_hh_sum_Wh' and 'Elec_act_imp_hh_sum_Wh'.
    """
    hh_per_standard_day = 48

    daily = (
        hh_data[['PUPRN', 'Read_date_effective_local',
                 'Elec_act_net_hh_Wh', 'Elec_act_imp_hh_Wh']]
        .groupby(['PUPRN', 'Read_date_effective_local'])
        .agg(['count', 'sum'])
        .reset_index()
    )
    # Flatten the two-level (column, statistic) header into single names.
    daily.columns = ['_'.join(col).strip() for col in daily.columns.values]
    daily = daily.rename(columns={
        'PUPRN_': 'PUPRN',
        'Read_date_effective_local_': 'Read_date_effective_local',
        'Elec_act_net_hh_Wh_sum': 'Elec_act_net_hh_sum_Wh',
        'Elec_act_imp_hh_Wh_sum': 'Elec_act_imp_hh_sum_Wh',
    })

    # Flag days with valid hh net totals (i.e. where all 48, or 46/50, reads are present).
    # After the left merge n_hh is NaN on non-clock change days, so filling it
    # with 48 gives the expected number of reads for every day.
    daily = pd.merge(daily, clock_changes, on='Read_date_effective_local', how='left')
    expected_reads = daily['n_hh'].fillna(hh_per_standard_day)
    daily['Elec_act_net_flag'] = (daily['Elec_act_net_hh_Wh_count'] == expected_reads).astype(int)

    # Keep rows only where the flag equals 1
    return daily.loc[daily['Elec_act_net_flag'] == 1]


puprns_saved = 0
exporter_puprn_list = []

# NB. Elec_act_imp_hh_sum_Wh is calculated and saved too for a final step of data cleaning in 3.1B.
output_columns = ['PUPRN', 'Read_date_effective_local',
                  'Elec_act_net_flag', 'Elec_act_net_hh_sum_Wh',
                  'Elec_act_imp_hh_sum_Wh']
output_path = os.path.join(output_directory, output_filename)

# We'll be appending each PUPRN's data to the csv of results. Create a blank csv with
# just headers first, otherwise we either get no headers or duplicate headers below.
pd.DataFrame(columns=output_columns).to_csv(output_path, index=False)

for hh_filename in puprn_filelist:
    temp_data = pd.read_csv(os.path.join(source_directory, hh_filename),
                            usecols=['PUPRN', 'Read_date_effective_local',
                                     'Elec_act_net_hh_Wh', 'Elec_act_exp_hh_Wh', 'Elec_act_imp_hh_Wh'])
    daily_energy = _daily_totals_from_hh(temp_data, clock_changes)

    # Save the relevant output, appended to the rest (it should already be sorted by
    # Read_date_effective_local). header=False because the header row was written above.
    daily_energy[output_columns].to_csv(output_path, mode='a', header=False, index=False)
    puprns_saved += 1

    # Make a note if the home ever exports electricity this year.
    # The sum ignores NaNs, and is 0 (so the test is False) if all rows are NaN.
    if temp_data.Elec_act_exp_hh_Wh.sum() > 0:
        # Adds the PUPRN, i.e. the file name without its extension
        exporter_puprn_list.append(os.path.splitext(hh_filename)[0])

    # Note progress occasionally (every 250 homes):
    if puprns_saved % 250 == 0:
        print(puprns_saved,"PUPRNs of data have been processed. Continuing...")

# Save exporter list (pandas is actually the neatest way to save a list to csv!)
pd.Series(exporter_puprn_list).to_csv(os.path.join(output_directory, exporter_list_filename), index=False)

print('Job done, total PUPRNs gone through =',puprns_saved,'\nOf which, this many exported electricity at some point:',len(exporter_puprn_list))