# About
This notebook will produce the temperature data for step 2 of the data processing for Module 1.

---

**Required user input**

Set `year` in the cell below to each full year in turn (2019, 2020 and 2021), and run the entire notebook once for each year.


In [None]:
year = 2021 # Update year - the full calendar year of data you are working on (run the whole notebook once per year).

In [None]:
# Derived settings - don't change these.

# Bounds of the output index, in UTC. The index must begin at 00:00:00 on 1 Jan of `year`
# and finish at 00:00:00 on 1 Jan of the following year.
index_start_date = f'{year}-01-01 00:00:00'    # Start date for the output's index to include.
index_end_date = f'{year + 1}-01-01 00:00:00'  # End date for the output's index to include.

# Folder the outputs are written to, and the per-year file names.
output_directory = 'Step_2_Outputs'
output_filename_daily = f'Step_2_Temp_{year}_daily.csv'
output_filename_hh = f'Step_2_Temp_{year}_hh.csv'

# Code
## Setup

In [None]:
import os
import pandas as pd
import locations
from datetime import datetime

In [None]:
# Work out which grid_cells to process: every distinct grid_cell in the participant data file.
participant_data_path = os.path.join(locations.serl_data_path, locations.participant_data_file)
participant_data = pd.read_csv(participant_data_path, usecols=['PUPRN', 'grid_cell'])

# Order the distinct grid_cells alphabetically, ignoring case.
grid_cell_list = participant_data['grid_cell'].unique().tolist()
grid_cell_list.sort(key=str.lower)
print(f'{len(grid_cell_list)} grid cells found.')

In [None]:
# Build a single hourly dataframe from the monthly climate files.
# Each month is read and filtered into its own dataframe, and the pieces are combined with one
# pd.concat call. (DataFrame.append was removed in pandas 2.0, and growing a dataframe inside a
# loop copies all of the accumulated data on every iteration.)

# Create a list from 01 of the year being processed to 01 of the subsequent year (we need one data point for each grid_cell from 1st Jan of the following year)
months = [str(month).zfill(2) for month in range(1, 13)]
year_months = [str(year)+'_'+i for i in months]
year_months.append(str(year+1)+'_01') # Comment this out if this file is not available in the data release.

# Number of whole days between the index bounds; the daily quality checks expect this many rows per grid_cell.
# NOTE(review): the original line ended with a commented-out '+31' - presumably an allowance for the
# extra January of data loaded above; confirm whether it is still needed.
timestamp_format = '%Y-%m-%d %H:%M:%S'
days_count = (datetime.strptime(index_end_date, timestamp_format)
              - datetime.strptime(index_start_date, timestamp_format)).days

monthly_frames = []
for i in year_months:
    temp_temperature = pd.read_csv(os.path.join(locations.serl_data_path,
                                                locations.climate_directory,
                                                'serl_climate_data_'+i+'_edition04.csv'),
                                   index_col=False,
                                   usecols=['grid_cell','date_time_utc','analysis_date','2m_temperature_K'],
                                   parse_dates=['date_time_utc'])
    # Drop grid_cells we don't need
    monthly_frames.append(temp_temperature[temp_temperature.grid_cell.isin(grid_cell_list)])

temperature_data_hourly = pd.concat(monthly_frames)

# Calculate temp_C (Kelvin to Celsius). Item assignment is required here: attribute assignment
# (df.temp_C = ...) cannot create a new column.
temperature_data_hourly['temp_C'] = temperature_data_hourly['2m_temperature_K'] - 273.15
# Set index
temperature_data_hourly = temperature_data_hourly.set_index(['grid_cell','date_time_utc'])

## Prepare half-hourly data

In [None]:
# Build the half-hourly frame: every grid_cell crossed with every half-hour between the index
# bounds (both ends included).
# '30min' is used rather than the '30T' alias, which is deprecated in recent pandas versions.
date_time_index_new = pd.date_range(index_start_date, index_end_date, freq='30min')
temperature_data_hh = pd.DataFrame(index=pd.MultiIndex.from_product([grid_cell_list, date_time_index_new],
                                                                    names=['grid_cell', 'date_time_utc']))
# Join the hourly temperatures onto it. Rows with no matching hourly reading (the half-past
# timestamps, assuming the readings are on the hour) are left as NaN.
temperature_data_hh = pd.merge(temperature_data_hh, temperature_data_hourly[['temp_C']],
                               left_index=True, right_index=True, how='left')
# Ffill the gaps: each missing value takes the previous reading, at most one step forward.
# The fill is done per grid_cell so that the last value of one grid_cell can never be carried into
# the first row of the next; a genuinely missing reading stays NaN and is caught by the checks below.
temperature_data_hh = temperature_data_hh.groupby(level='grid_cell').ffill(limit=1)

In [None]:
# Sanity-check the half-hourly frame before it is saved:
# no NaNs in temp_C; index monotonically increasing and unique; expected number of rows.
nancount = temperature_data_hh.temp_C.isnull().sum()
in_sequence = temperature_data_hh.index.is_monotonic_increasing
no_duplicate_rows = temperature_data_hh.index.is_unique
expected_row_count = len(grid_cell_list)*len(date_time_index_new)
wrong_length = temperature_data_hh.shape[0] - expected_row_count

hh_has_issues = (nancount != 0) or (not in_sequence) or (not no_duplicate_rows) or (wrong_length != 0)
if hh_has_issues:
    # Report every check, so the user can see which one(s) failed.
    hh_report = [
        "WARNING! Your data has one or more issues:\n- This many missing values (should be zero):",
        nancount,
        "\n- Index out of sequence:",
        (not in_sequence),
        "\n- Duplicate rows (duplicate grid_cell and date_time_utc combination):",
        (not no_duplicate_rows),
        "\n- This many rows too long (or too short, if negative):",
        wrong_length,
        "\nCheck and fix before continuing.",
    ]
    print(*hh_report)
else:
    print('Data is sorted by grid_cell then datetime, has no duplicates (no duplicate grid_cell and date_time_utc combinations), and has no missing rows of temp_C data.')

In [None]:
# Save hh output
# exist_ok=True creates the output directory when it is missing and does nothing when it already
# exists, avoiding the separate "check, then create" step.
os.makedirs(output_directory, exist_ok=True)
temperature_data_hh['temp_C'].to_csv(os.path.join(output_directory,output_filename_hh),header=True,index=True)
print("Job done. Everything saved.")

## Prepare daily data
We'll calculate:
* daily mean C
* heating degree days, following the Spinoni et al 2015 method

In [None]:
# Flatten the hourly data and label each reading with its local (Europe/London) calendar date.
temperature_data_hourly_b = temperature_data_hourly.reset_index()
local_time = (temperature_data_hourly_b.date_time_utc
              .dt.tz_localize(tz='UTC')
              .dt.tz_convert(tz='Europe/London'))
temperature_data_hourly_b['Read_date_effective_local'] = local_time.dt.date

# Keep only readings whose local time falls inside [index_start_date, index_end_date).
# The hourly data also holds the whole of the following January (loaded to supply one boundary data
# point for the half-hourly output). Without this filter those extra days would appear in the daily
# output and the row-count check (len(grid_cell_list)*days_count) would always fail.
# The bounds are 1 Jan, when Europe/London is on GMT, so local and UTC bounds coincide.
local_wall_time = local_time.dt.tz_localize(None)
in_output_year = ((local_wall_time >= pd.Timestamp(index_start_date))
                  & (local_wall_time < pd.Timestamp(index_end_date)))
temperature_data_hourly_b = temperature_data_hourly_b[in_output_year]

In [None]:
# Daily statistics per grid_cell and local date, plus heating degree days (hdd).
# This is adapted from the method described in McKenna et al (2022) DOI: 10.1016/j.enbuild.2022.111845.
# Create the daily df: mean, maximum (T_X) and minimum (T_N) of the hourly temp_C values.
temperature_data_daily = (
    temperature_data_hourly_b
    .groupby(by=['grid_cell', 'Read_date_effective_local'])['temp_C']
    .agg(['mean', 'max', 'min'])
    .rename(columns={'mean': 'mean_temp_C', 'max': 'T_X', 'min': 'T_N'})
)
# T_M is the midpoint of the daily maximum and minimum (not the mean of the hourly values).
temperature_data_daily['T_M'] = (temperature_data_daily.T_X + temperature_data_daily.T_N)/2

T_b = 15.5  # Base temperature, compared against the daily temp_C statistics.
# Start from 0.0 (float, not int) so the float values assigned below do not force a dtype change.
# Days where no case below applies (T_N above T_b) keep hdd = 0.
temperature_data_daily['hdd'] = 0.0

# Case 1: the whole day is at or below the base temperature.
logic = temperature_data_daily.T_X <= T_b
temperature_data_daily.loc[logic,'hdd'] = T_b - temperature_data_daily.loc[logic,'T_M']
# Case 2: the maximum is above the base temperature but the midpoint is not.
logic = (temperature_data_daily.T_X > T_b) & (temperature_data_daily.T_M <= T_b)
temperature_data_daily.loc[logic,'hdd'] = (T_b - temperature_data_daily.loc[logic,'T_N']) / 2 - (temperature_data_daily.loc[logic,'T_X'] - T_b) /4
# Case 3: the midpoint is above the base temperature but the minimum is not.
logic = (temperature_data_daily.T_M > T_b) & (temperature_data_daily.T_N <= T_b)
temperature_data_daily.loc[logic,'hdd'] = (T_b - temperature_data_daily.loc[logic,'T_N']) / 4

# Round outputs to 2d.p.
temperature_data_daily = temperature_data_daily.round(decimals=2)

In [None]:
# Some basic data quality checks on the daily frame before it is saved.
# No Nans in the saved columns; correct length; monotonically increasing; no duplicate rows.
nancount = temperature_data_daily[['mean_temp_C','hdd']].isnull().sum().sum()
in_sequence = temperature_data_daily.index.is_monotonic_increasing
no_duplicate_rows = temperature_data_daily.index.is_unique
# Expect exactly one row per grid_cell per day.
wrong_length = temperature_data_daily.shape[0] - len(grid_cell_list)*days_count
if nancount == 0 and in_sequence and no_duplicate_rows and wrong_length == 0:
    # The duplicate check is on the index, i.e. grid_cell and Read_date_effective_local
    # (the message previously named analysis_date, which is not part of the index).
    print('Data is sorted by grid_cell then Read_date_effective_local, has no duplicates (no duplicate grid_cell and Read_date_effective_local combinations), and has no missing rows of mean_temp_C or hdd data.')
else:
    print("WARNING! Your data has one or more issues:\n- This many mean_temp_C or hdd missing values (should be zero):",
          nancount,
          "\n- Index out of sequence:",
          (not in_sequence),
          "\n- Duplicate rows (duplicate grid_cell and Read_date_effective_local combination):",
          (not no_duplicate_rows),
          "\n- This many rows too long (or too short, if negative):",
          wrong_length,
          "\nCheck and fix before continuing.")

In [None]:
# Save daily output (only the mean temperature and heating degree day columns are written).
# exist_ok=True creates the output directory when it is missing and does nothing when it already
# exists, avoiding the separate "check, then create" step.
os.makedirs(output_directory, exist_ok=True)
temperature_data_daily[['mean_temp_C','hdd']].to_csv(os.path.join(output_directory,output_filename_daily),header=True,index=True)
print("Job done. Everything saved.")