In [1]:
import numpy as np
import os
import pandas as pd
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm
import statsmodels.api as sm
import warnings
import random
from IPython.display import display, HTML
from statsmodels.tsa.seasonal import seasonal_decompose
from matplotlib import pyplot as plt
# Seed NumPy's global RNG and the stdlib `random` module so that any code
# drawing from them gives the same results on every run.
np.random.seed(42)
random.seed(42)
# Enable tqdm's pandas integration.
tqdm.pandas()

## 03 - Creating Train, Test and Eval Splits

In this notebook, we create training, testing and evaluation splits of the data for building models.

We take 50 households from each of the three Acorn groups for now, in order to reduce the burden on our hardware (performing forecasts with large numbers of series may be memory intensive). Note that this means we're ignoring households with an unknown Acorn group.

In [2]:
# Load the mapping from household id (LCLid) to its Acorn household type
lclid_acorn_map = pd.read_pickle('../data/preprocessed/london_smart_meters_lclid_acorn_map.pkl')

# One frame per Acorn group, keeping only the household id and its source file
affluent_households = lclid_acorn_map.loc[
    lclid_acorn_map['Acorn_grouped'].eq('Affluent'), ['LCLid', 'file']
]
adversity_households = lclid_acorn_map.loc[
    lclid_acorn_map['Acorn_grouped'].eq('Adversity'), ['LCLid', 'file']
]
comfortable_households = lclid_acorn_map.loc[
    lclid_acorn_map['Acorn_grouped'].eq('Comfortable'), ['LCLid', 'file']
]

In [3]:
households_per_acorn = 50
random_state = 76 #  Set for reproducibility

# Draw the same number of households from every Acorn group (same seed for each
# draw) and stack the samples in the order affluent, comfortable, adversity.
selected_households = pd.concat(
    [
        group.sample(households_per_acorn, random_state=random_state)
        for group in (affluent_households, comfortable_households, adversity_households)
    ]
)
# The block number is the integer after the first underscore of `file` (e.g. 'block_1' -> 1)
selected_households['block'] = (
    selected_households['file'].str.split('_', expand=True).iloc[:, 1].astype(int)
)

In [4]:
# Collect the merged block files as (path, first_block, last_block) tuples.
# The block range is parsed from the 6th underscore-separated piece of the file
# name, which is expected to look like '<start>-<end>.<extension>'.
# Glob results come back in no guaranteed order, so the tuples are sorted by
# block range to make the row order of everything built from them reproducible.
path_blocks = sorted(
    (
        (p, *map(int, p.name.split('_')[5].split('.')[0].split('-')))
        for p in Path('../data/preprocessed').glob(
            'london_smart_meters_merged_block*'
        )
    ),
    key=lambda path_block: path_block[1:],
)

In [5]:
# For every merged block file, keep only the rows of the sampled households.
household_df_l = []
for path, start_b, end_b in tqdm(path_blocks):
    block_df = pd.read_parquet(path)
    # Sampled households whose block number lies in this file's start_b..end_b range
    mask = selected_households['block'].between(start_b, end_b)
    lclids = selected_households.loc[mask, "LCLid"]
    household_df_l.append(block_df.loc[block_df.LCLid.isin(lclids)])

100%|██████████| 14/14 [01:59<00:00,  8.54s/it]


In [6]:
# Stack the per-file selections into a single compact-form frame, then drop the
# list name so it no longer keeps the individual pieces referenced.
block_df = pd.concat(objs=household_df_l)
del household_df_l
block_df.head(5)

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
57,MAC000768,2012-04-21,30min,"[0.8440000000000001, 0.265, 0.262, 0.233999999...",32544,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[251, 251, 251, 251, 246, 246, 242, 242, 244, ...","[6.42, 6.42, 6.2, 6.2, 5.68, 5.68, 5.16, 5.16,...","[3.54, 3.54, 3.61, 3.61, 3.52, 3.52, 3.11, 3.1...","[994.96, 994.96, 994.98, 994.98, 994.82, 994.8...","[3.79, 3.79, 3.67, 3.67, 3.15, 3.15, 2.61, 2.6...","[3.64, 3.64, 3.42, 3.42, 3.25, 3.25, 3.13, 3.1...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.82, 0.82, 0.83, 0.83, 0.86, 0.86, 0.87, 0.8...","[Partly Cloudy, Partly Cloudy, Partly Cloudy, ..."
63,MAC000948,2012-05-02,30min,"[0.008, 0.009, 0.008, 0.008, 0.008, 0.009, 0.0...",32016,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[351, 351, 0, 0, 0, 0, 351, 351, 348, 348, 3, ...","[11.81, 11.81, 11.12, 11.12, 11.2, 11.2, 11.18...","[10.47, 10.47, 10.15, 10.15, 9.89, 9.89, 9.29,...","[1021.42, 1021.42, 1021.44, 1021.44, 1021.33, ...","[11.81, 11.81, 11.12, 11.12, 11.2, 11.2, 11.18...","[2.53, 2.53, 2.41, 2.41, 2.06, 2.06, 2.98, 2.9...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.91, 0.91, 0.94, 0.94, 0.92, 0.92, 0.88, 0.8...","[Mostly Cloudy, Mostly Cloudy, Mostly Cloudy, ..."
2827,MAC003299,2012-09-25,30min,"[0.254, 0.201, 0.183, 0.2189999999999999, 0.18...",25008,Std,ACORN-C,Affluent,block_5,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[220, 220, 218, 218, 215, 215, 211, 211, 207, ...","[10.93, 10.93, 10.81, 10.81, 10.27, 10.27, 10....","[7.76, 7.76, 8.07, 8.07, 8.04, 8.04, 7.62, 7.6...","[989.26, 989.26, 989.27, 989.27, 989.0, 989.0,...","[10.93, 10.93, 10.81, 10.81, 10.27, 10.27, 10....","[4.9, 4.9, 4.98, 4.98, 4.45, 4.45, 4.51, 4.51,...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, clear-night, clear-...","[0.81, 0.81, 0.83, 0.83, 0.86, 0.86, 0.84, 0.8...","[Clear, Clear, Clear, Clear, Clear, Clear, Cle..."
3389,MAC003157,2012-07-15,30min,"[0.181, 0.126, 0.13, 0.134, 0.18, 0.179, 0.118...",28464,ToU,ACORN-C,Affluent,block_6,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[250, 250, 259, 259, 267, 267, 284, 284, 286, ...","[12.72, 12.72, 12.72, 12.72, 12.98, 12.98, 12....","[12.01, 12.01, 12.0, 12.0, 12.32, 12.32, 11.83...","[1011.2, 1011.2, 1011.17, 1011.17, 1011.23, 10...","[12.72, 12.72, 12.72, 12.72, 12.98, 12.98, 12....","[1.73, 1.73, 2.15, 2.15, 2.31, 2.31, 2.28, 2.2...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, partly-cloudy-night...","[0.95, 0.95, 0.95, 0.95, 0.96, 0.96, 0.95, 0.9...","[Clear, Clear, Partly Cloudy, Partly Cloudy, P..."
3916,MAC000193,2012-01-01,30min,"[0.368, 0.386, 0.17, 0.021, 0.038, 0.038, 0.02...",37872,ToU,ACORN-D,Affluent,block_7,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[229, 229, 238, 238, 229, 229, 231, 231, 227, ...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[10.97, 10.97, 11.02, 11.02, 11.04, 11.04, 10....","[1008.1, 1008.1, 1007.88, 1007.88, 1007.95, 10...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[5.9, 5.9, 6.06, 6.06, 5.31, 5.31, 4.68, 4.68,...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, clo...","[0.93, 0.93, 0.9, 0.9, 0.91, 0.91, 0.93, 0.93,...","[Mostly Cloudy, Mostly Cloudy, Overcast, Overc..."


We need to convert the compact form of the time series columns into the expanded form to make them easier to work with.

First, define a function for converting each time series from its compact form to its expanded form:

In [7]:
def compact_to_expanded(df, timeseries_col, static_cols, time_varying_cols, ts_identifier):
    """Convert a compact-form time series DataFrame into expanded form.

    In compact form every row of ``df`` describes one whole series. In the
    returned expanded form there is one row per timestamp of each series.

    Args:
        df: Compact-form DataFrame. Each row must provide 'start_timestamp'
            and 'frequency', which are used to build the timestamps.
        timeseries_col: Name of the column holding the series values; its
            length sets the number of timestamps generated for the row.
        static_cols: Columns copied unchanged onto the rows of a series
            (expected to hold one value per series).
        time_varying_cols: Columns expected to hold one value per timestamp,
            i.e. sequences as long as ``timeseries_col``.
        ts_identifier: Name of the column identifying the series.

    Returns:
        A DataFrame with a 'timestamp' column followed by ``ts_identifier``,
        ``timeseries_col``, the static columns and the time-varying columns.
        The per-series frames are concatenated without renumbering, so each
        series keeps its own 0..n-1 index labels and the labels repeat across
        series.
    """

    def preprocess_expanded(x):
        # Size the date range from the requested series column rather than a
        # hard-coded column name, so the function honours `timeseries_col`.
        timestamps = pd.date_range(
            start=x['start_timestamp'],
            periods=len(x[timeseries_col]),
            freq=x['frequency'],
        )
        # Column name -> values for this one series; insertion order fixes the
        # column order of the resulting frame.
        columns = {'timestamp': timestamps}
        for col in [ts_identifier, timeseries_col] + static_cols + time_varying_cols:
            columns[col] = x[col]
        return pd.DataFrame(columns)

    # Expand each compact row, then join the expansions into a single DataFrame
    all_series = [preprocess_expanded(df.iloc[i]) for i in tqdm(range(len(df)))]
    return pd.concat(all_series)

In [8]:
# Expand the compact-form frame so that every timestamp gets its own row
exp_block_df = compact_to_expanded(
    block_df,
    timeseries_col='energy_consumption',
    static_cols=[
        "frequency",
        "series_length",
        "stdorToU",
        "Acorn",
        "Acorn_grouped",
        "file",
    ],
    time_varying_cols=[
        'holidays',
        'visibility',
        'windBearing',
        'temperature',
        'dewPoint',
        'pressure',
        'apparentTemperature',
        'windSpeed',
        'precipType',
        'icon',
        'humidity',
        'summary',
    ],
    ts_identifier="LCLid",
)

exp_block_df.head()

  0%|          | 0/150 [00:00<?, ?it/s]

100%|██████████| 150/150 [00:02<00:00, 59.66it/s]


Unnamed: 0,timestamp,LCLid,energy_consumption,frequency,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,2012-04-21 00:00:00,MAC000768,0.844,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy
1,2012-04-21 00:30:00,MAC000768,0.265,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy
2,2012-04-21 01:00:00,MAC000768,0.262,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.2,3.61,994.98,3.67,3.42,rain,partly-cloudy-night,0.83,Partly Cloudy
3,2012-04-21 01:30:00,MAC000768,0.234,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251,6.2,3.61,994.98,3.67,3.42,rain,partly-cloudy-night,0.83,Partly Cloudy
4,2012-04-21 02:00:00,MAC000768,0.046,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,246,5.68,3.52,994.82,3.15,3.25,rain,clear-night,0.86,Clear


In [9]:
# Summary of the expanded frame, with the true ("deep") memory usage
exp_block_df.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 33263
Columns: 21 entries, timestamp to summary
dtypes: datetime64[ns](1), float64(8), int64(2), object(10)
memory usage: 3.3 GB


We can reduce the memory footprint of the expanded data by converting the datatypes of particular columns:

In [10]:
def reduce_memory_footprint(df):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    int64 columns become int32, object columns become category and float64
    columns become float32. Columns of any other dtype are left untouched.
    The frame passed in is modified; the returned object is that same frame.

    NOTE(review): the casts are unchecked - confirm integer columns fit in
    int32 and that float32 precision is sufficient for the data.
    """
    # Source dtype -> target dtype, applied in this order
    conversions = (
        ("int64", "int32"),
        ("object", "category"),
        ("float64", "float32"),
    )
    # Snapshot the dtypes once so every cast selects on the original dtypes
    original_dtypes = df.dtypes
    for source_dtype, target_dtype in conversions:
        matching = original_dtypes[original_dtypes == source_dtype].index.tolist()
        df[matching] = df[matching].astype(target_dtype)
    return df

In [11]:
# Downcast the column dtypes; the helper modifies the frame it is given and
# returns that same frame, so the name is simply rebound to it.
exp_block_df = reduce_memory_footprint(exp_block_df)

In [12]:
# Same summary as before the downcast, to compare the memory usage
exp_block_df.info(verbose=False, memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 33263
Columns: 21 entries, timestamp to summary
dtypes: category(10), datetime64[ns](1), float32(8), int32(2)
memory usage: 301.1 MB


We have data from the beginning of November 2011 until the end of February 2014. 

We will set aside data from 2014 as our validation and testing data, with January as validation and February as test. 

In [13]:
# Hold out 2014: January rows form the validation set, February rows the test set
val_mask = exp_block_df['timestamp'].dt.year.eq(2014) & exp_block_df['timestamp'].dt.month.eq(1)
test_mask = exp_block_df['timestamp'].dt.year.eq(2014) & exp_block_df['timestamp'].dt.month.eq(2)

# Every row outside the two hold-out months is training data
train = exp_block_df[~val_mask & ~test_mask]
val = exp_block_df[val_mask]
test = exp_block_df[test_mask]
print(f'# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}')
print(f'Max Date in Train: {train.timestamp.max()} | Min Date in Validation: {val.timestamp.min()} | Min Date in Test: {test.timestamp.min()}')

# of Training samples: 4293840 | # of Validation samples: 223200 | # of Test samples: 194400
Max Date in Train: 2013-12-31 23:30:00 | Min Date in Validation: 2014-01-01 00:00:00 | Min Date in Test: 2014-02-01 00:00:00


### Filling missing values with seasonal interpolation

Before filling in any missing values in the energy consumption series using imputation, we save the raw (untouched) version of our train, val and test sets to parquet:

In [14]:
# Persist the splits as they are at this point, i.e. before any imputation of
# the energy consumption values.
train.to_parquet('../data/preprocessed/selected_blocks_train.parquet')
val.to_parquet('../data/preprocessed/selected_blocks_val.parquet')
test.to_parquet('../data/preprocessed/selected_blocks_test.parquet')

In [15]:
def seasonal_interpolation(data, period=48*7):
    """Fill the missing values of a time series using seasonal interpolation.

    The series is decomposed into trend, seasonal and residual parts. The gaps
    are interpolated on the de-seasonalised series (trend + residual) and the
    seasonal component is then added back, so imputed values follow the
    seasonal pattern. Observed values are never modified.

    Args:
        data: The time series (a pandas Series) to impute.
        period: Number of observations per seasonal cycle passed to
            ``seasonal_decompose``. Defaults to 48*7, i.e. one week at the
            30-minute frequency used in this notebook.

    Returns:
        A copy of ``data`` in which only the originally missing positions have
        been replaced by imputed values.
    """
    res = data.copy()
    na_index = pd.isna(data)
    # Nothing to impute: skip the spline fit and the decomposition entirely.
    if not na_index.any():
        return res

    # seasonal_decompose doesn't work where the series has missing values,
    # so first fill them provisionally (using limit_direction = 'both')
    temp = data.interpolate(method='spline', order=3, limit_direction='both')
    dr = seasonal_decompose(temp, period=period)
    # join trend and irregular component (time series without seasonality):
    data_no_seasonality = dr.trend + dr.resid
    # blank out the provisional fills again so they are re-estimated below:
    data_no_seasonality[na_index] = np.nan
    # interpolate the data without seasonality:
    data_no_seasonality_imputed = data_no_seasonality.interpolate(
        method='spline', order=3, limit_direction='both'
    )
    # add back seasonality:
    data_imputed = data_no_seasonality_imputed + dr.seasonal
    # merge the imputed values into the copy of the original series:
    res[na_index] = data_imputed[na_index]

    return res

In [16]:
# apply seasonal interpolation to each household
# NOTE(review): each household's full history is passed in, so values imputed in
# the training period can be informed by validation/test-period observations.
for lclid in tqdm(selected_households.LCLid):
    # Rows of this household; computed once and reused for the write-back
    household_mask = exp_block_df['LCLid'] == lclid
    ts_df = exp_block_df.loc[household_mask].set_index('timestamp')
    ts_interpolated = seasonal_interpolation(ts_df.energy_consumption)
    # The imputed series has exactly the masked rows, in the same order, so it
    # is written back by position. This avoids relying on the index labels of
    # the two objects happening to line up.
    exp_block_df.loc[household_mask, 'energy_consumption'] = ts_interpolated.to_numpy()

100%|██████████| 150/150 [00:24<00:00,  6.08it/s]


Here, we create train, val and test sets after applying seasonal interpolation to fill in missing values.

In [17]:
# Re-create the three splits from the imputed frame, reusing the earlier masks
val = exp_block_df[val_mask]
test = exp_block_df[test_mask]
train = exp_block_df[~val_mask & ~test_mask]
print(f'# of training samples: {len(train)} | # of validation samples: {len(val)} | # of Test samples: {len(test)}')
print(f'Max Date in train: {train.timestamp.max()} | Min Date in validation: {val.timestamp.min()} | Min Date in test: {test.timestamp.min()}')

# of training samples: 4293840 | # of validation samples: 223200 | # of Test samples: 194400
Max Date in train: 2013-12-31 23:30:00 | Min Date in validation: 2014-01-01 00:00:00 | Min Date in test: 2014-02-01 00:00:00


In [18]:
# Persist the splits built from the imputed frame under separate file names
# (suffix `_missing_imputed`).
train.to_parquet('../data/preprocessed/selected_blocks_train_missing_imputed.parquet')
val.to_parquet('../data/preprocessed/selected_blocks_val_missing_imputed.parquet')
test.to_parquet('../data/preprocessed/selected_blocks_test_missing_imputed.parquet')