# About
This notebook produces the final clean daily energy data (step 3.1C of the data processing for Module 1) by combining the daily gas, electricity and temperature outputs of the earlier steps.

**This requires output from the previous steps - 2, 3.1A and 3.1B - so run those notebooks first**

---

**Required user input**

Set `year` in the cell below to each of the full years 2019, 2020 and 2021 in turn, and run the entire notebook once for each year.


In [None]:
year = "2021"  # Year of data to process - change this before each run of the notebook.

In [None]:
# Don't change these.

# Every name below that depends on the year wraps it in str(), so the cell
# behaves the same whether `year` was entered as '2021' or as 2021.

# Source data files from steps 2, 3.1A and 3.1B
source_directory_gas = 'Step_3_1_Outputs'
source_directory_elec = 'Step_3_1_Outputs'
source_filename_gas = 'Step_3_1A_Gas_' + str(year) + '_daily.csv'
source_filename_elec = 'Step_3_1B_Elec_' + str(year) + '_daily.csv'
source_directory_temperature = 'Step_2_Outputs'
source_filename_temperature = 'Step_2_Temp_' + str(year) + '_daily.csv'

index_start_date = str(year) + '-01-01'  # Start date for the output's index to include.
index_end_date = str(year) + '-12-31'  # End date for the output's index to include.

# Where the final daily output of this notebook is written.
output_directory = 'Module_1_final_outputs'
output_filename = 'annual_report_sm_daily_' + str(year) + '.csv'

# Code

## Setup

In [None]:
import os
import pandas as pd
import numpy as np
import locations

In [None]:
# Read the daily gas and electricity source csvs.
# Both are indexed by (PUPRN, Read_date_effective_local), with the date column parsed as datetimes.
gas_daily_path = os.path.join(source_directory_gas, source_filename_gas)
gas_daily_data = pd.read_csv(
    gas_daily_path,
    parse_dates=['Read_date_effective_local'],
    index_col=['PUPRN', 'Read_date_effective_local'],
)

elec_daily_path = os.path.join(source_directory_elec, source_filename_elec)
elec_daily_data = pd.read_csv(
    elec_daily_path,
    parse_dates=['Read_date_effective_local'],
    index_col=['PUPRN', 'Read_date_effective_local'],
)

# PUPRN -> grid_cell lookup, taken from the participant data file (only those two columns are read).
participant_data_path = os.path.join(locations.serl_data_path, locations.participant_data_file)
puprn_to_grid_cell = pd.read_csv(participant_data_path, usecols=['PUPRN', 'grid_cell'])

# Daily temperature data; index_col=False keeps every column as a regular column rather than an index.
temperature_daily_path = os.path.join(source_directory_temperature, source_filename_temperature)
temperature_daily_data = pd.read_csv(
    temperature_daily_path,
    index_col=False,
    parse_dates=['Read_date_effective_local'],
)

## Process the data

In [None]:
# Combine electricity and gas on their shared (PUPRN, Read_date_effective_local) index.
# The outer join keeps every row that appears in either source.
energy_daily_data = elec_daily_data.merge(
    gas_daily_data,
    how='outer',
    left_index=True,
    right_index=True,
)
# Count of distinct PUPRNs in the combined data.
num_PUPRNS = energy_daily_data.index.get_level_values('PUPRN').unique().size

In [None]:
# Per the notes carried over from the earlier steps: the combined frame now holds all the clean
# electricity (net) and gas readings, based on hh data wherever possible, otherwise daily, otherwise missing.
# A PUPRN still has no row at all for a date on which it had neither a clean gas nor a clean electricity reading.
# The remaining work is to pad each PUPRN out to one row per day of the year, filled with NaNs where needed.

# Template dates: one entry per calendar day from index_start_date to index_end_date.
date_index_new = pd.date_range(start=index_start_date, end=index_end_date, freq='D')

# Template index: every PUPRN present in the energy data crossed with every template date.
full_year_index = pd.MultiIndex.from_product(
    [energy_daily_data.index.get_level_values('PUPRN').unique(), date_index_new],
    names=['PUPRN', 'Read_date_effective_local'],
)
# An empty (column-less) frame carrying only that index.
energy_daily_data_final = pd.DataFrame(index=full_year_index)

# Outer-join the readings onto the template, so rows present in either frame are kept.
energy_daily_data_final = energy_daily_data_final.merge(
    energy_daily_data,
    how='outer',
    left_index=True,
    right_index=True,
)

In [None]:
# Attach the temperature columns to the energy data.
# The temperature data is keyed by grid_cell, so first expand it to PUPRN level
# using the PUPRN -> grid_cell lookup.
temperature_daily_data_per_puprn = temperature_daily_data.merge(
    puprn_to_grid_cell,
    how='outer',
    on='grid_cell',
)
# Re-index to match the energy frame's (PUPRN, Read_date_effective_local) index.
temperature_daily_data_per_puprn = temperature_daily_data_per_puprn.set_index(
    ['PUPRN', 'Read_date_effective_local']
)
# Left join: keep every energy row and pull in mean_temp_C and hdd where the index matches.
energy_daily_data_final = energy_daily_data_final.merge(
    temperature_daily_data_per_puprn[['mean_temp_C', 'hdd']],
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Basic data quality checks on the assembled frame:
#   - no missing mean_temp_C or hdd values
#   - index is monotonically increasing
#   - no duplicate (PUPRN, Read_date_effective_local) index entries
#   - row count equals number of PUPRNs times number of template dates
nancount = energy_daily_data_final[['mean_temp_C', 'hdd']].isna().sum().sum()
in_sequence = energy_daily_data_final.index.is_monotonic_increasing
no_duplicate_rows = energy_daily_data_final.index.is_unique
# Positive means too many rows, negative means too few.
wrong_length = len(energy_daily_data_final) - num_PUPRNS * len(date_index_new)

all_checks_pass = nancount == 0 and in_sequence and no_duplicate_rows and wrong_length == 0

if not all_checks_pass:
    # Report every check, not just the failing one, so the user sees the full picture.
    print("WARNING! Your data has one or more issues:\n- This many mean_temp_C or hdd missing values (should be zero):",
          nancount,
          "\n- Index out of sequence:",
          (not in_sequence),
          "\n- Duplicate rows (duplicate PUPRN and Read_date_effective_local combination):",
          (not no_duplicate_rows),
          "\n- This many rows too long (or too short, if negative):",
          wrong_length,
          "\nCheck and fix before continuing.")
else:
    print('Data is sorted by PUPRN then Read_date_effective_local, has no duplicates (no duplicate PUPRN and Read_date_effective_local combinations), and has no missing rows of mean_temp_C or hdd data.')

print("\nHere's the tail of your new dataframe:\n")
# Last expression in the cell, so the notebook displays it.
energy_daily_data_final.tail()

In [None]:
# Save: This is the output of final daily data from Module 1.
# exist_ok=True creates the output directory (and any missing parents) if needed and
# does nothing if it is already there, avoiding a separate check-then-create step.
os.makedirs(output_directory, exist_ok=True)
# index=True writes the (PUPRN, Read_date_effective_local) index out as columns of the csv.
energy_daily_data_final.to_csv(os.path.join(output_directory, output_filename), index=True)
print("Job done. Everything saved.")