# About
This notebook produces the daily gas data for step 3.1A of the Module 1 data processing.

**This notebook requires the output of the previous step (1.1A), so run that notebook first.**

---

**Required user input**

Set `year` in the cell below to each of the full years 2019, 2020 and 2021 in turn, and run the entire notebook once for each year.


In [None]:
# Year of data being processed, as a string. Update this before each run of the notebook.
year = "2021"

In [None]:
# Fixed input/output locations derived from `year` - leave these as they are.
source_directory = 'Step_1_1_Outputs'
output_directory = 'Step_3_1_Outputs'

# The input file is the daily gas csv from step 1.1A; the output file is this step's result.
source_filename = ''.join(('Step_1_1A_Gas_', year, '_daily.csv'))
output_filename = ''.join(('Step_3_1A_Gas_', year, '_daily.csv'))

# Code

## Setup

In [None]:
import os
import pandas as pd
import numpy as np

## Process the data

In [None]:
# Read the csv calculated in 1.1A, keeping only the columns this step uses and
# indexing every row by (PUPRN, Read_date_effective_local).
source_path = os.path.join(source_directory, source_filename)
key_columns = ['PUPRN', 'Read_date_effective_local']
value_columns = ['Gas_flag', 'Gas_d_kWh', 'Gas_hh_sum_kWh']

energy_daily_data = pd.read_csv(
    source_path,
    usecols=key_columns + value_columns,
    index_col=key_columns,
)

In [None]:
# Verify that the (PUPRN, date) index is in ascending order and has no repeated keys.
# This only reports the result; it does not stop the notebook.
row_index = energy_daily_data.index
all_rows_as_expected = row_index.is_monotonic_increasing and row_index.is_unique
if not all_rows_as_expected:
    print('WARNING! Your data has duplicates of PUPRN and date, or is out of order - check and fix before continuing.')
else:
    print('Data is sorted by PUPRN then date and has no duplicates, as it should be.')

In [None]:
# Create the best estimate of the actual daily gas usage (Clean_gas_d_kWh) and a
# flag column (Hh_sum_flag_gas) recording which source column each value came from.
#
# Both columns are derived from the untouched source columns, so this cell gives the
# same result however many times it is run. (Renaming Gas_hh_sum_kWh in place and then
# filling its gaps would, on a second run, flag the filled-in daily values as hh sums.)

# Rows that have a reading based on hh data.
hh_sum_available = energy_daily_data['Gas_hh_sum_kWh'].notnull()
# Rows with no hh-based reading where Gas_flag == 1, so Gas_d_kWh is used instead.
daily_read_usable = ~hh_sum_available & (energy_daily_data['Gas_flag'] == 1)

# Prefer the hh sum; otherwise use Gas_d_kWh where allowed; otherwise leave missing.
# np.where works positionally, so no index alignment is involved in the assignment.
energy_daily_data['Clean_gas_d_kWh'] = np.where(
    hh_sum_available,
    energy_daily_data['Gas_hh_sum_kWh'],
    np.where(daily_read_usable, energy_daily_data['Gas_d_kWh'], np.nan),
)

# Flag values: 1 = value is a hh sum, 0 = value is Gas_d_kWh, NaN = no value taken.
energy_daily_data['Hh_sum_flag_gas'] = np.where(
    hh_sum_available,
    1.0,
    np.where(daily_read_usable, 0.0, np.nan),
)


In [None]:
# Preview the first five rows as a quick visual check.
energy_daily_data.head(n=5)

In [None]:
# Save the relevant columns to merge with the electricity readings later.
# exist_ok=True creates the output folder only if it is missing, without a separate
# existence check beforehand.
os.makedirs(output_directory, exist_ok=True)

# The index (PUPRN, Read_date_effective_local) is written alongside the two columns.
output_columns = ['Clean_gas_d_kWh', 'Hh_sum_flag_gas']
energy_daily_data[output_columns].to_csv(os.path.join(output_directory, output_filename), index=True)
print("Job done. Everything saved.")