## 02 - Preprocessing London Smart Meter Dataset

- https://github.com/PacktPublishing/Modern-Time-Series-Forecasting-with-Python/blob/main/notebooks/Chapter02/02%20-%20Preprocessing%20London%20Smart%20Meter%20Dataset.ipynb

In [1]:

import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')
plt.style.use("seaborn-v0_8-whitegrid")
%matplotlib inline

pd.options.display.max_columns = 999

In [2]:
#!pip install git+https://github.com/TimeSynth/TimeSynth.git

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
import plotly.io as pio
pio.templates.default = "plotly_white"
import pandas as pd
from pathlib import Path
from tqdm.autonotebook import tqdm
# %load_ext autoreload
# %autoreload 2
# Fix the global NumPy seed so that random draws made later in the notebook
# (e.g. DataFrame.sample, which falls back to NumPy's global state) are
# reproducible after a kernel restart. Calling seed() without an argument
# re-seeds from OS entropy and therefore guarantees nothing.
np.random.seed(42)
tqdm.pandas()

## 데이터 집합 정보

무료 개방형 데이터 공유 포털인 런던 데이터 스토어에서 이 데이터 세트를 제공했으며, Jean-Michel D.가 수집하고 보강하여 Kaggle(https://www.kaggle.com/jeanmidev/smart-meters-in-london)에 업로드했습니다. 
이 데이터 세트에는 2011년 11월부터 2014년 2월까지 영국 전력 네트워크가 주도한 저탄소 런던 프로젝트에 참여한 5,567개의 런던 가구 샘플에 대한 에너지 소비량 측정값이 포함되어 있습니다. 판독값은 30분 간격으로 측정되었습니다. 가구에 대한 일부 메타데이터도 데이터 세트의 일부로 제공됩니다.

## 데이터 랭글링
Kaggle 데이터 집합에는 일 단위로 사전 처리되고 모든 개별 파일 등을 결합한 시계열 데이터도 있습니다. 하지만 이러한 파일은 무시하고 hhblock_dataset 폴더에 있는 원시 파일부터 시작하겠습니다.

### Converting the half hourly block level dataset into a time series data 

Let's pick one block and see how we can transform the data.

In [4]:
source_data = Path("../data/london_smart_meters/")
block_data_path = source_data/"hhblock_dataset"/"hhblock_dataset"

In [5]:
block_1 = pd.read_csv(block_data_path/"block_0.csv", parse_dates=False)
block_1['day'] = pd.to_datetime(block_1['day'], yearfirst=True)
block_1.head()

Unnamed: 0,LCLid,day,hh_0,hh_1,hh_2,hh_3,hh_4,hh_5,hh_6,hh_7,hh_8,hh_9,hh_10,hh_11,hh_12,hh_13,hh_14,hh_15,hh_16,hh_17,hh_18,hh_19,hh_20,hh_21,hh_22,hh_23,hh_24,hh_25,hh_26,hh_27,hh_28,hh_29,hh_30,hh_31,hh_32,hh_33,hh_34,hh_35,hh_36,hh_37,hh_38,hh_39,hh_40,hh_41,hh_42,hh_43,hh_44,hh_45,hh_46,hh_47
0,MAC000002,2012-10-13,0.263,0.269,0.275,0.256,0.211,0.136,0.161,0.119,0.167,0.109,0.168,0.107,0.166,0.117,0.157,0.126,0.146,0.106,0.135,0.191,0.915,0.933,0.122,0.138,0.076,0.133,0.076,0.133,0.085,0.263,0.134,0.235,0.124,0.184,0.23,0.176,0.388,0.26,0.918,0.278,0.267,0.239,0.23,0.233,0.235,0.188,0.259,0.25
1,MAC000002,2012-10-14,0.262,0.166,0.226,0.088,0.126,0.082,0.123,0.083,0.12,0.079,0.121,0.075,0.124,0.073,0.125,0.07,0.13,0.108,0.196,0.346,0.524,0.076,0.129,0.667,0.23,0.22,0.163,0.091,0.17,0.11,0.11,0.121,0.099,0.157,0.093,0.371,0.386,1.085,1.075,0.956,0.821,0.745,0.712,0.511,0.231,0.21,0.278,0.159
2,MAC000002,2012-10-15,0.192,0.097,0.141,0.083,0.132,0.07,0.13,0.074,0.124,0.078,0.118,0.082,0.112,0.087,0.106,0.14,0.12,1.075,0.146,0.123,0.082,0.127,0.077,0.551,0.149,0.129,0.075,0.13,0.075,0.129,0.075,0.128,0.166,0.194,0.695,0.26,0.227,0.255,1.164,0.249,0.225,0.258,0.26,0.334,0.299,0.236,0.241,0.237
3,MAC000002,2012-10-16,0.237,0.237,0.193,0.118,0.098,0.107,0.094,0.109,0.091,0.105,0.091,0.104,0.092,0.103,0.093,0.101,0.144,0.1,0.408,0.102,0.1,0.116,0.354,0.146,0.19,0.991,0.31,0.121,0.113,0.094,0.119,0.087,0.13,0.238,0.204,0.284,0.447,0.266,0.966,0.172,0.192,0.228,0.203,0.211,0.188,0.213,0.157,0.202
4,MAC000002,2012-10-17,0.157,0.211,0.155,0.169,0.101,0.117,0.084,0.118,0.08,0.119,0.075,0.123,0.071,0.126,0.067,0.124,0.118,0.132,0.358,0.628,0.784,0.681,0.749,0.593,0.502,0.115,0.113,0.092,0.124,0.084,0.125,0.078,0.136,0.227,0.207,0.141,0.258,0.217,0.223,0.075,0.23,0.208,0.265,0.377,0.327,0.277,0.288,0.256


Find the Global End Date

In [6]:
block_1.groupby("LCLid")['day'].max().sample(10)# 종료일이 같은지 확인

LCLid
MAC003844   2014-02-27
MAC003422   2014-02-27
MAC003388   2014-02-27
MAC000246   2014-02-27
MAC003856   2014-02-27
MAC003423   2014-02-27
MAC004034   2013-09-18
MAC003680   2014-02-27
MAC003737   2014-02-27
MAC003874   2014-02-27
Name: day, dtype: datetime64[ns]

In [7]:
# Scan every block file and keep the latest reading date found in any of them.
block_files = sorted(block_data_path.glob("*.csv"))
if not block_files:
    # Fail early with a clear message instead of a NameError further down
    raise FileNotFoundError(f"No block CSV files found in {block_data_path}")

max_date = None
for f in tqdm(block_files):
    # Only the `day` column is needed here, so skip parsing the half-hour columns
    days = pd.to_datetime(pd.read_csv(f, usecols=['day'])['day'], yearfirst=True)
    block_max = days.max()
    # Ignore files without a usable date (max() of an empty/all-null column is NaT)
    if pd.notna(block_max) and (max_date is None or block_max > max_date):
        max_date = block_max
print(f"Max Date across all blocks: {max_date}")

0it [00:00, ?it/s]

Max Date across all blocks: 2014-02-27 00:00:00


Basic Preprocessing: 시간대를 세로 방향으로 reshaping

In [8]:
block_1.set_index(["LCLid", "day"])

Unnamed: 0_level_0,Unnamed: 1_level_0,hh_0,hh_1,hh_2,hh_3,hh_4,hh_5,hh_6,hh_7,hh_8,hh_9,hh_10,hh_11,hh_12,hh_13,hh_14,hh_15,hh_16,hh_17,hh_18,hh_19,hh_20,hh_21,hh_22,hh_23,hh_24,hh_25,hh_26,hh_27,hh_28,hh_29,hh_30,hh_31,hh_32,hh_33,hh_34,hh_35,hh_36,hh_37,hh_38,hh_39,hh_40,hh_41,hh_42,hh_43,hh_44,hh_45,hh_46,hh_47
LCLid,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
MAC000002,2012-10-13,0.263,0.269,0.275,0.256,0.211,0.136,0.161,0.119,0.167,0.109,0.168,0.107,0.166,0.117,0.157,0.126,0.146,0.106,0.135,0.191,0.915,0.933,0.122,0.138,0.076,0.133,0.076,0.133,0.085,0.263,0.134,0.235,0.124,0.184,0.230,0.176,0.388,0.260,0.918,0.278,0.267,0.239,0.230,0.233,0.235,0.188,0.259,0.250
MAC000002,2012-10-14,0.262,0.166,0.226,0.088,0.126,0.082,0.123,0.083,0.120,0.079,0.121,0.075,0.124,0.073,0.125,0.070,0.130,0.108,0.196,0.346,0.524,0.076,0.129,0.667,0.230,0.220,0.163,0.091,0.170,0.110,0.110,0.121,0.099,0.157,0.093,0.371,0.386,1.085,1.075,0.956,0.821,0.745,0.712,0.511,0.231,0.210,0.278,0.159
MAC000002,2012-10-15,0.192,0.097,0.141,0.083,0.132,0.070,0.130,0.074,0.124,0.078,0.118,0.082,0.112,0.087,0.106,0.140,0.120,1.075,0.146,0.123,0.082,0.127,0.077,0.551,0.149,0.129,0.075,0.130,0.075,0.129,0.075,0.128,0.166,0.194,0.695,0.260,0.227,0.255,1.164,0.249,0.225,0.258,0.260,0.334,0.299,0.236,0.241,0.237
MAC000002,2012-10-16,0.237,0.237,0.193,0.118,0.098,0.107,0.094,0.109,0.091,0.105,0.091,0.104,0.092,0.103,0.093,0.101,0.144,0.100,0.408,0.102,0.100,0.116,0.354,0.146,0.190,0.991,0.310,0.121,0.113,0.094,0.119,0.087,0.130,0.238,0.204,0.284,0.447,0.266,0.966,0.172,0.192,0.228,0.203,0.211,0.188,0.213,0.157,0.202
MAC000002,2012-10-17,0.157,0.211,0.155,0.169,0.101,0.117,0.084,0.118,0.080,0.119,0.075,0.123,0.071,0.126,0.067,0.124,0.118,0.132,0.358,0.628,0.784,0.681,0.749,0.593,0.502,0.115,0.113,0.092,0.124,0.084,0.125,0.078,0.136,0.227,0.207,0.141,0.258,0.217,0.223,0.075,0.230,0.208,0.265,0.377,0.327,0.277,0.288,0.256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MAC005492,2014-02-23,0.165,0.206,0.149,0.101,0.115,0.120,0.102,0.105,0.125,0.106,0.103,0.131,0.133,0.102,0.101,0.130,0.281,0.211,0.149,0.224,0.216,0.379,0.179,0.215,0.208,0.194,0.203,0.265,0.187,0.196,0.168,0.122,0.131,0.128,0.135,0.163,0.494,0.290,0.296,0.750,0.315,0.326,0.293,0.287,0.213,0.287,0.245,0.222
MAC005492,2014-02-24,0.219,0.236,0.288,0.219,0.165,0.086,0.111,0.095,0.084,0.081,0.105,0.129,0.087,0.079,0.089,0.111,0.133,0.133,0.188,0.208,0.168,0.238,0.160,0.250,0.175,0.080,0.082,0.116,0.088,0.206,0.161,0.203,0.150,0.096,0.263,0.234,0.254,0.273,0.303,0.378,0.253,0.193,0.212,0.193,0.192,0.297,0.168,0.170
MAC005492,2014-02-25,0.194,0.185,0.150,0.095,0.081,0.082,0.081,0.112,0.094,0.082,0.079,0.124,0.125,0.082,0.080,0.084,0.080,0.120,0.082,0.348,0.195,0.198,0.210,0.185,0.174,0.204,0.179,0.156,0.137,0.137,0.132,0.109,0.170,0.119,0.155,0.155,0.111,0.272,0.545,0.233,0.245,0.194,0.215,0.201,0.241,0.162,0.157,0.167
MAC005492,2014-02-26,0.205,0.183,0.186,0.165,0.194,0.114,0.080,0.090,0.106,0.082,0.080,0.125,0.117,0.081,0.079,0.083,0.142,0.186,0.153,0.653,0.235,0.144,0.162,0.176,0.145,0.118,0.168,0.104,0.081,0.129,0.083,0.082,0.098,0.123,0.193,0.141,0.141,0.687,0.298,0.224,0.239,0.242,0.237,0.246,0.333,0.202,0.232,0.203


In [9]:
block_1.set_index(["LCLid", "day"]).stack()

LCLid      day              
MAC000002  2012-10-13  hh_0     0.263
                       hh_1     0.269
                       hh_2     0.275
                       hh_3     0.256
                       hh_4     0.211
                                ...  
MAC005492  2014-02-27  hh_43    0.250
                       hh_44    0.182
                       hh_45    0.122
                       hh_46    0.140
                       hh_47    0.192
Length: 1213678, dtype: float64

In [10]:
block_1.set_index(["LCLid", "day"]).stack().reset_index()

Unnamed: 0,LCLid,day,level_2,0
0,MAC000002,2012-10-13,hh_0,0.263
1,MAC000002,2012-10-13,hh_1,0.269
2,MAC000002,2012-10-13,hh_2,0.275
3,MAC000002,2012-10-13,hh_3,0.256
4,MAC000002,2012-10-13,hh_4,0.211
...,...,...,...,...
1213673,MAC005492,2014-02-27,hh_43,0.250
1213674,MAC005492,2014-02-27,hh_44,0.182
1213675,MAC005492,2014-02-27,hh_45,0.122
1213676,MAC005492,2014-02-27,hh_46,0.140


정리하기

In [11]:
# Turn the wide table (one row per household-day, one column per half-hour block)
# into long form: one row per household, day and half-hour block.
block_1 = (
    block_1
    .set_index(['LCLid', "day"])
    .stack()
    .reset_index()
    .rename(columns={"level_2": "hour_block", 0: "energy_consumption"})
)
# Numeric position of the half-hour block, parsed from its "hh_<n>" label
block_1['offset'] = block_1['hour_block'].str.replace("hh_", "").astype(int)

block_1.head()

Unnamed: 0,LCLid,day,hour_block,energy_consumption,offset
0,MAC000002,2012-10-13,hh_0,0.263,0
1,MAC000002,2012-10-13,hh_1,0.269,1
2,MAC000002,2012-10-13,hh_2,0.275,2
3,MAC000002,2012-10-13,hh_3,0.256,3
4,MAC000002,2012-10-13,hh_4,0.211,4


각 데이터 집합을 컴팩트 또는 확장된 형식으로 변환하기 위해 수행해야 하는 단계는 다를 수 있습니다.  
원본 데이터가 어떻게 구조화되어 있는지에 따라 다릅니다.  
여기서는 런던 스마트 미터 데이터 집합을 어떻게 변환하여 이러한 학습 내용을 다른 데이터 집합으로 옮길 수 있는지 살펴보겠습니다. 

데이터를 compact 또는 expanded 형태로 처리하기 전에 수행해야 할 두 가지 단계가 있습니다:

1. 글로벌 종료 날짜를 찾습니다: 시계열의 글로벌 종료 날짜를 알 수 있도록 모든 블록 파일에서 최대 날짜를 찾아야 합니다.

2. 기본 전처리: hhblock_dataset의 구조를 기억하신다면, 각 행에 날짜가 있고 열을 따라 30분마다 블록이 있다는 것을 기억하실 것입니다. 이를 각 행에 날짜와 30분 단위의 단일 블록이 있는 긴 형태로 재구성해야 합니다. 이렇게 하면 처리하기가 더 쉽습니다.

Expanded form

1. 시작 날짜를 찾습니다.

2. 시작 날짜와 종료 날짜를 사용하여 표준 데이터 프레임을 만듭니다. (아래 구현에서는 전역 종료 날짜가 아니라 해당 가구의 마지막 측정일을 종료 날짜로 사용합니다.)

3. 누락된 데이터는 np.nan으로 남겨두고 LCLid용 데이터 프레임을 표준 데이터 프레임에 왼쪽으로 병합합니다.

4. 병합된 데이터 프레임을 반환합니다.

In [12]:
def preprocess_expanded(x):
    """Expand one household's long-form readings onto a gap-free half-hourly calendar.

    Parameters
    ----------
    x : pandas.DataFrame
        Long-form readings of a single household. Must contain the columns
        ``day`` and ``hour_block`` (``"hh_0"`` .. ``"hh_47"``); any other
        columns (e.g. ``LCLid``, ``energy_consumption``, ``offset``) are
        carried through the merge.

    Returns
    -------
    pandas.DataFrame
        One row per (day, hour_block) combination between the household's
        first and last ``day`` (inclusive). Combinations absent from ``x``
        have NaN in the carried-through columns, except ``LCLid`` which is
        filled in so the gap rows stay attached to their household. A
        ``series_length`` column holds the total number of rows.
    """
    ### Fill missing dates with NaN ###
    # Daily calendar from this household's first day to its own last day
    calendar_days = pd.date_range(start=x['day'].min(), end=x['day'].max(), freq="1D")
    # An empty frame with hh_0..hh_47 as columns, unstacked, yields every
    # (hour_block, day) combination exactly once
    dr = pd.DataFrame(columns=[f"hh_{i}" for i in range(48)], index=calendar_days).unstack().reset_index()
    dr.columns = ["hour_block", "day", "_"]
    # Left merge onto the standard calendar: slots without a reading stay NaN
    dr = dr.merge(x, on=['hour_block','day'], how='left')
    # The merge also leaves the identifier NaN on the gap rows; restore it so those
    # rows are not orphaned when the caller sorts/groups by LCLid
    if 'LCLid' in x.columns:
        dr['LCLid'] = dr['LCLid'].fillna(x['LCLid'].iloc[0])
    dr['series_length'] = len(dr)
    return dr

In [13]:
def load_process_block_expanded(block_df, freq="30min"):
    """Build the expanded (one row per timestamp) representation of a whole block.

    Parameters
    ----------
    block_df : pandas.DataFrame
        Long-form readings of one block with at least the columns ``LCLid``,
        ``day``, ``hour_block`` and ``energy_consumption``.
    freq : str, default "30min"
        Frequency label stored verbatim in the ``frequency`` column. It does
        not influence the timestamp arithmetic, which always assumes that one
        ``hh_<n>`` block is 30 minutes long.

    Returns
    -------
    pandas.DataFrame
        All households concatenated and sorted by ``LCLid`` and ``timestamp``,
        with the helper columns ``_``, ``hour_block``, ``offset`` and ``day``
        removed. The index is whatever ``pd.concat`` produces, i.e. the
        per-household row labels are kept and therefore repeat.
    """
    all_series = [
        preprocess_expanded(household_df)
        for _, household_df in tqdm(block_df.groupby('LCLid'), leave=False)
    ]

    block_df = pd.concat(all_series)
    # Recreate offset because the rows added for missing readings have none
    block_df['offset'] = block_df['hour_block'].str.replace("hh_", "").astype(int)
    # Vectorised day + n*30min; multiplying a Series by a DateOffset object is
    # evaluated element by element and is far slower on millions of rows
    block_df['timestamp'] = block_df['day'] + pd.to_timedelta(block_df['offset'] * 30, unit="min")
    block_df['frequency'] = freq
    block_df = (
        block_df
        .sort_values(["LCLid","timestamp"])
        .drop(columns=["_", "hour_block", "offset", "day"])
    )
    return block_df

In [14]:
block_1_expanded = load_process_block_expanded(
    block_1,
    freq='30min'
    )

  0%|          | 0/50 [00:00<?, ?it/s]

In [15]:
block_1_expanded.head()

Unnamed: 0,LCLid,energy_consumption,series_length,timestamp,frequency
0,MAC000002,0.263,24144,2012-10-13 00:00:00,30min
503,MAC000002,0.269,24144,2012-10-13 00:30:00,30min
1006,MAC000002,0.275,24144,2012-10-13 01:00:00,30min
1509,MAC000002,0.256,24144,2012-10-13 01:30:00,30min
2012,MAC000002,0.211,24144,2012-10-13 02:00:00,30min


In [16]:
display(block_1_expanded.memory_usage())
print(f"Total: {block_1_expanded.memory_usage().sum()/1024**2} MB")

Index                 9834240
LCLid                 9834240
energy_consumption    9834240
series_length         9834240
timestamp             9834240
frequency             9834240
dtype: int64

Total: 56.27197265625 MB


- 메모리 많이 소요

Compact form    

1. 시작 날짜와 시계열 식별자를 찾습니다.  
2. 시작 날짜와 전역 종료 날짜를 사용하여 표준 데이터 프레임을 만듭니다.  
3. LCLid 데이터 프레임을 left merge로 표준 데이터 프레임에 병합하고 누락된 데이터는 np.nan으로 남깁니다.  
4. 날짜를 기준으로 값을 정렬합니다.
5. 시계열 배열을 시계열 식별자, 시작 날짜 및 시계열 길이와 함께 반환합니다.

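각 LCLid에 대한 이 정보를 확보하면 이를 데이터프레임으로 컴파일하고 빈도로 30분을 추가할 수 있습니다. 한 블록의 경우, 아래 출력 기준으로 이 표현은 약 0.013MB의 메모리만 차지하는 것으로 보고됩니다. 다만 아래 출력에서 energy_consumption 열은 50개 시계열에 대해 6,000바이트로만 집계되므로, 이 수치에는 각 행에 담긴 배열의 실제 데이터 크기가 포함되어 있지 않으며 실제 사용량은 이보다 큽니다. 작업하기 쉽고 리소스를 훨씬 덜 소모하는 컴팩트한 형태를 사용할 것입니다.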

In [17]:
def preprocess_compact(x, end_date=None):
    """Turn one household's long-form readings into a single gap-free value array.

    Parameters
    ----------
    x : pandas.DataFrame
        Long-form readings of a single household with the columns ``LCLid``,
        ``day``, ``hour_block`` (``"hh_0"`` .. ``"hh_47"``) and
        ``energy_consumption``.
    end_date : datetime-like, optional
        Last day of the calendar the series is aligned to. When omitted, the
        notebook-level global ``max_date`` is used, as before.

    Returns
    -------
    tuple
        ``(start_date, name, ts, len_ts)``: the household's first day, its
        ``LCLid``, the chronologically ordered ``energy_consumption`` values
        (NaN where no reading exists) and the length of that array.
    """
    if end_date is None:
        # Fall back to the global end date computed over all block files
        end_date = max_date
    start_date = x['day'].min()
    name = x['LCLid'].unique()[0]
    ### Fill missing dates with NaN ###
    calendar_days = pd.date_range(start=start_date, end=end_date, freq="1D")
    # An empty frame with hh_0..hh_47 as columns, unstacked, yields every
    # (hour_block, day) combination exactly once
    dr = pd.DataFrame(columns=[f"hh_{i}" for i in range(48)], index=calendar_days).unstack().reset_index()
    dr.columns = ["hour_block", "day", "_"]
    # Left merge onto the standard calendar: slots without a reading stay NaN
    dr = dr.merge(x, on=['hour_block','day'], how='left')
    # The merged-in offset is NaN on exactly those gap rows, and sort_values puts
    # NaN keys last, which would move every missing half-hour to the end of its
    # day and shift the following readings. Rebuild the key from hour_block,
    # which is present on every row.
    dr['offset'] = dr['hour_block'].str.replace("hh_", "", regex=False).astype(int)
    dr = dr.sort_values(['day', 'offset'])
    # extracting the timeseries array
    ts = dr['energy_consumption'].values
    len_ts = len(ts)
    return start_date, name, ts, len_ts

In [18]:
def load_process_block_compact(block_df, freq="30min", ts_identifier="series_name", value_name="series_value"):
    """Build the compact (one row per household) representation of a block.

    Each ``LCLid`` group of ``block_df`` is passed to ``preprocess_compact``;
    the results are collected into a frame with the columns ``ts_identifier``
    (household id), ``start_timestamp``, ``frequency`` (the ``freq`` label,
    repeated on every row), ``value_name`` (the value array) and
    ``series_length``.
    """
    names, start_dates, series, lengths = [], [], [], []
    for _, household_df in tqdm(block_df.groupby('LCLid'), leave=False):
        start_date, name, ts, len_ts = preprocess_compact(household_df)
        series.append(ts)
        start_dates.append(start_date)
        names.append(name)
        lengths.append(len_ts)

    # Key order fixes the column order of the resulting frame
    return pd.DataFrame({
        ts_identifier: names,
        'start_timestamp': start_dates,
        'frequency': freq,
        value_name: series,
        'series_length': lengths,
    })

In [19]:
block1_compact = load_process_block_compact(
    block_1, 
    freq="30min", 
    ts_identifier="LCLid", 
    value_name="energy_consumption"
    )

  0%|          | 0/50 [00:00<?, ?it/s]

In [20]:
block1_compact.head()

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length
0,MAC000002,2012-10-13,30min,"[0.263, 0.2689999999999999, 0.275, 0.256, 0.21...",24144
1,MAC000246,2011-12-04,30min,"[0.175, 0.098, 0.144, 0.065, 0.071, 0.037, 0.0...",39216
2,MAC000450,2012-03-23,30min,"[1.337, 1.426, 0.996, 0.971, 0.994, 0.952, 0.8...",33936
3,MAC001074,2012-05-09,30min,"[0.18, 0.086, 0.106, 0.173, 0.146, 0.223, 0.21...",31680
4,MAC003223,2012-09-18,30min,"[0.076, 0.079, 0.123, 0.109, 0.051, 0.069, 0.0...",25344


In [21]:
display(block1_compact.memory_usage(deep=True))
print(f"Total: {block1_compact.memory_usage(deep=True).sum()/1024**2} MB")

Index                  128
LCLid                 3300
start_timestamp        400
frequency             3100
energy_consumption    6000
series_length          400
dtype: int64

Total: 0.0127105712890625 MB


In [22]:
del block_1_expanded, block_1, block1_compact

#### Reading and combining all the block data into a single dataframe

In [23]:
# Convert every block file to the compact form; the per-block frames are
# collected here and concatenated in the next cell.
block_df_l = []
block_files = sorted(block_data_path.glob("*.csv"))
for file in tqdm(block_files, desc="Processing Blocks.."):
    block_df = pd.read_csv(file, parse_dates=False)
    block_df['day'] = pd.to_datetime(block_df['day'], yearfirst=True)
    # Keep readings from 2012-01-01 onwards only
    block_df = block_df.loc[block_df['day']>="2012-01-01"]
    # Wide -> long: one row per household, day and half-hour block
    block_df = (
        block_df
        .set_index(['LCLid', "day"])
        .stack()
        .reset_index()
        .rename(columns={"level_2": "hour_block", 0: "energy_consumption"})
    )
    # Numeric position of the half-hour block, parsed from its "hh_<n>" label
    block_df['offset'] = block_df['hour_block'].str.replace("hh_", "").astype(int)
    compact_block = load_process_block_compact(block_df, freq="30min", ts_identifier="LCLid", value_name="energy_consumption")
    block_df_l.append(compact_block)

Processing Blocks..:   0%|          | 0/112 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [24]:
hhblock_df = pd.concat(block_df_l)
del block_df_l
display(hhblock_df.memory_usage(deep=True))
print(f"Total: {hhblock_df.memory_usage(deep=True).sum()/1024**2} MB")

Index                  44480
LCLid                 366960
start_timestamp        44480
frequency             344720
energy_consumption    667200
series_length          44480
dtype: int64

Total: 1.4422607421875 MB


In [25]:
hhblock_df.head()

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length
0,MAC000002,2012-10-13,30min,"[0.263, 0.2689999999999999, 0.275, 0.256, 0.21...",24144
1,MAC000246,2012-01-01,30min,"[0.509, 0.317, 0.253, 0.249, 0.93, 0.607, 0.10...",37872
2,MAC000450,2012-03-23,30min,"[1.337, 1.426, 0.996, 0.971, 0.994, 0.952, 0.8...",33936
3,MAC001074,2012-05-09,30min,"[0.18, 0.086, 0.106, 0.173, 0.146, 0.223, 0.21...",31680
4,MAC003223,2012-09-18,30min,"[0.076, 0.079, 0.123, 0.109, 0.051, 0.069, 0.0...",25344


### Mapping additional information

앞서 준비한 데이터 모델에서 매핑해야 하는 세 가지 주요 파일이 있습니다:  
가구 정보, 날씨, 공휴일입니다.  
informations_households.csv 파일에는 가구에 대한 메타 데이터가 포함되어 있습니다.  
이 파일에는 시간에 의존하지 않는 정적 피처(static feature)가 들어 있습니다.  
이를 위해 시계열 식별자인 LCLid를 기준으로 informations_households.csv를 컴팩트 형태의 데이터프레임에 병합하기만 하면 됩니다.

best practice  

판다스 병합을 수행하는 동안 가장 일반적이고 예상치 못한 결과 중 하나는 작업 전후의 행 수가 동일하지 않다는 것입니다(왼쪽 병합을 수행하는 경우에도).  
이는 일반적으로 병합하는 키에 중복이 있기 때문에 발생합니다.  
모범 사례로, 병합하는 동안 이 검사가 수행되고 가정이 충족되지 않으면 오류가 발생하도록 one_to_one 및 many_to_one과 같은 입력을 받는 pandas 병합에서 validate 매개 변수를 사용할 수 있습니다. 자세한 내용은 https://pandas.pydata.org/docs/reference/api/pandas.merge.html 에서 확인하세요.

반면에 공휴일과 날씨는 시간에 따라 변하는 기능이므로 그에 따라 처리해야 합니다. 명심해야 할 가장 중요한 측면은 이 정보를 매핑할 때 이미 배열로 저장한 시계열과 완벽하게 일치해야 한다는 것입니다.    

uk_bank_holidays.csv는 휴일 날짜와 휴일 종류가 포함된 파일로, 공휴일에는 가족 구성원들이 집에서 함께 시간을 보내거나 TV를 시청하는 등 에너지 소비 패턴이 달라질 수 있으므로 휴일 정보가 매우 중요합니다.  
이 파일을 처리하려면 다음 단계를 따르세요:

1. 날짜 열을 날짜/시간 형식으로 변환하고 이를 데이터프레임의 인덱스로 설정합니다.   

2. 앞서 살펴본 리샘플링 함수를 사용하여 시계열의 빈도인 30분마다 인덱스가 리샘플링되도록 해야 합니다.  

3. 하루 이내의 공휴일을 포워드 채우고 나머지 NaN 값은 NO_HOLIDAY로 채웁니다.  

이제 휴일 파일을 30분 간격마다 행이 있는 데이터 프레임으로 변환했습니다.  
각 행에는 해당 날짜가 휴일인지 아닌지를 지정하는 열이 있습니다.  
weather_hourly_darksky.csv는 시간별(1시간) 빈도로 된 파일입니다.  
여기에 매핑해야 하는 데이터는 30분 단위의 빈도로 되어 있으므로 30분 빈도로 업샘플링해야 합니다.  
이렇게 하지 않으면 날씨가 시간별 타임스탬프에만 매핑되고 30분별 타임스탬프는 비어 있게 됩니다.  
이 파일을 처리하기 위해 따라야 하는 단계도 휴일을 처리하는 방식과 유사합니다:

1. 날짜 열을 날짜/시간 형식으로 변환하고 이를 데이터프레임의 인덱스로 설정합니다. 

2. 리샘플링 함수를 사용하여 시계열의 빈도인 30분마다 인덱스가 리샘플링되도록 해야 합니다.

3. 리샘플링하는 동안 생성된 누락된 값을 채우기 위해 날씨 기능을 앞으로 채웁니다.

이제 시계열과 시간 변화 기능 간의 정렬이 보장되었는지 확인했으므로 각 시계열을 반복하여 날씨 및 공휴일 배열을 추출한 다음 데이터 프레임의 해당 행에 저장할 수 있습니다.

#### Household information

In [26]:
household_info = pd.read_csv(source_data/"informations_households.csv")
household_info.head()

Unnamed: 0,LCLid,stdorToU,Acorn,Acorn_grouped,file
0,MAC005492,ToU,ACORN-,ACORN-,block_0
1,MAC001074,ToU,ACORN-,ACORN-,block_0
2,MAC000002,Std,ACORN-A,Affluent,block_0
3,MAC003613,Std,ACORN-A,Affluent,block_0
4,MAC003597,Std,ACORN-A,Affluent,block_0


In [27]:
hhblock_df = hhblock_df.merge(household_info, on='LCLid', validate="one_to_one")
hhblock_df.head()

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file
0,MAC000002,2012-10-13,30min,"[0.263, 0.2689999999999999, 0.275, 0.256, 0.21...",24144,Std,ACORN-A,Affluent,block_0
1,MAC000246,2012-01-01,30min,"[0.509, 0.317, 0.253, 0.249, 0.93, 0.607, 0.10...",37872,Std,ACORN-A,Affluent,block_0
2,MAC000450,2012-03-23,30min,"[1.337, 1.426, 0.996, 0.971, 0.994, 0.952, 0.8...",33936,Std,ACORN-A,Affluent,block_0
3,MAC001074,2012-05-09,30min,"[0.18, 0.086, 0.106, 0.173, 0.146, 0.223, 0.21...",31680,ToU,ACORN-,ACORN-,block_0
4,MAC003223,2012-09-18,30min,"[0.076, 0.079, 0.123, 0.109, 0.051, 0.069, 0.0...",25344,Std,ACORN-A,Affluent,block_0


#### Weather and Bank Holidays

In [28]:
# Load the UK bank-holiday calendar, parse the date column and use it as the index.
# NOTE(review): the preview below lists 2012-05-06 for the Queen's Diamond Jubilee and
# 2012-04-06 for the Spring bank holiday, which in 2012 fell on 5 June and 4 June.
# The file therefore appears to store some dates as year-day-month; TODO confirm
# against the raw CSV and swap day/month where needed before relying on this feature.
bank_holidays = pd.read_csv(source_data/"uk_bank_holidays.csv", parse_dates=False)
bank_holidays['Bank holidays'] = pd.to_datetime(bank_holidays['Bank holidays'], yearfirst=True)
bank_holidays.set_index("Bank holidays", inplace=True)
bank_holidays.head()

Unnamed: 0_level_0,Type
Bank holidays,Unnamed: 1_level_1
2012-12-26,Boxing Day
2012-12-25,Christmas Day
2012-08-27,Summer bank holiday
2012-05-06,Queen?s Diamond Jubilee (extra bank holiday)
2012-04-06,Spring bank holiday (substitute day)


In [29]:
# Reindex on a complete half-hourly grid running from 00:00 of the first holiday
# to 23:30 of the last one. resample("30min").asfreq() would stop at 00:00 of the
# last holiday, so the remaining 47 half-hours of that day would never receive
# its label and would later be filled with NO_HOLIDAY.
first_day = bank_holidays.index.min().normalize()
last_day = bank_holidays.index.max().normalize()
half_hourly_index = pd.date_range(
    start=first_day,
    end=last_day + pd.Timedelta(hours=23, minutes=30),
    freq="30min",
)
bank_holidays = bank_holidays.reindex(half_hourly_index)
# Spread each holiday label over its own calendar day only; every other slot is NO_HOLIDAY
bank_holidays = bank_holidays.groupby(bank_holidays.index.date).ffill().fillna("NO_HOLIDAY")
bank_holidays.index.name="datetime"
bank_holidays.head()

Unnamed: 0_level_0,Type
datetime,Unnamed: 1_level_1
2012-02-01 00:00:00,New Year?s Day (substitute day)
2012-02-01 00:30:00,New Year?s Day (substitute day)
2012-02-01 01:00:00,New Year?s Day (substitute day)
2012-02-01 01:30:00,New Year?s Day (substitute day)
2012-02-01 02:00:00,New Year?s Day (substitute day)


In [30]:
weather_hourly = pd.read_csv(source_data/"weather_hourly_darksky.csv", parse_dates=False)
weather_hourly['time'] = pd.to_datetime(weather_hourly['time'], yearfirst=True)
weather_hourly.set_index("time", inplace=True)
weather_hourly.head()

Unnamed: 0_level_0,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-11-11 00:00:00,5.97,104,10.24,8.86,1016.76,10.24,2.77,rain,partly-cloudy-night,0.91,Partly Cloudy
2011-11-11 01:00:00,4.88,99,9.76,8.83,1016.63,8.24,2.95,rain,partly-cloudy-night,0.94,Partly Cloudy
2011-11-11 02:00:00,3.7,98,9.46,8.79,1016.36,7.76,3.17,rain,partly-cloudy-night,0.96,Partly Cloudy
2011-11-11 03:00:00,3.12,99,9.23,8.63,1016.28,7.44,3.25,rain,fog,0.96,Foggy
2011-11-11 04:00:00,1.85,111,9.26,9.21,1015.98,7.24,3.7,rain,fog,1.0,Foggy


In [31]:
# Upsample the hourly weather readings to a 30-minute grid; each slot created by
# the resampling is forward-filled with the most recent earlier reading.
weather_hourly = weather_hourly.resample("30min").ffill()
weather_hourly.head()

Unnamed: 0_level_0,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-11-01 00:00:00,13.63,160,13.49,11.48,1008.14,13.49,3.11,rain,clear-night,0.88,Clear
2011-11-01 00:30:00,13.63,160,13.49,11.48,1008.14,13.49,3.11,rain,clear-night,0.88,Clear
2011-11-01 01:00:00,13.26,154,12.73,11.58,1007.88,12.73,3.08,rain,partly-cloudy-night,0.93,Partly Cloudy
2011-11-01 01:30:00,13.26,154,12.73,11.58,1007.88,12.73,3.08,rain,partly-cloudy-night,0.93,Partly Cloudy
2011-11-01 02:00:00,12.94,161,13.65,12.14,1007.09,13.65,3.71,rain,clear-night,0.91,Clear


In [32]:
def map_weather_holidays(row):
    """Attach holiday and weather arrays, aligned to the row's time series, to ``row``.

    A timestamp index is rebuilt from the row's ``start_timestamp``,
    ``series_length`` and ``frequency``; the notebook-level ``bank_holidays``
    and ``weather_hourly`` frames are left-joined onto it. The holiday labels
    are stored under ``holidays`` and every weather column under its own name.
    Returns the modified row.
    """
    n_obs = row['series_length']
    timeline = pd.DataFrame(
        index=pd.date_range(row['start_timestamp'], periods=n_obs, freq=row['frequency'])
    )
    # Timestamps outside the holiday calendar come back as NaN from the join,
    # so they are labelled NO_HOLIDAY as well
    holiday_frame = timeline.join(bank_holidays, how="left").fillna("NO_HOLIDAY")
    weather_frame = timeline.join(weather_hourly, how='left')
    # A left join must not change the number of rows; a mismatch means misalignment
    assert len(holiday_frame)==n_obs, "Length of holidays should be same as series length"
    assert len(weather_frame)==n_obs, "Length of weather should be same as series length"
    row['holidays'] = holiday_frame['Type'].values
    for feature in weather_frame.columns:
        row[feature] = weather_frame[feature].values
    return row

In [33]:
hhblock_df = hhblock_df.progress_apply(map_weather_holidays, axis=1)

  0%|          | 0/5560 [00:00<?, ?it/s]

In [34]:
hhblock_df.head()

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,visibility,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,MAC000002,2012-10-13,30min,"[0.263, 0.2689999999999999, 0.275, 0.256, 0.21...",24144,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...","[13.08, 13.08, 13.42, 13.42, 13.94, 13.94, 13....","[186, 186, 188, 188, 190, 190, 203, 203, 206, ...","[8.78, 8.78, 8.27, 8.27, 7.87, 7.87, 7.89, 7.8...","[6.28, 6.28, 6.21, 6.21, 6.22, 6.22, 6.76, 6.7...","[1007.7, 1007.7, 1007.36, 1007.36, 1006.73, 10...","[7.55, 7.55, 7.34, 7.34, 6.75, 6.75, 6.89, 6.8...","[2.28, 2.28, 1.81, 1.81, 1.95, 1.95, 1.83, 1.8...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, clear-night, clear-...","[0.84, 0.84, 0.87, 0.87, 0.89, 0.89, 0.93, 0.9...","[Clear, Clear, Clear, Clear, Partly Cloudy, Pa..."
1,MAC000246,2012-01-01,30min,"[0.509, 0.317, 0.253, 0.249, 0.93, 0.607, 0.10...",37872,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...","[12.99, 12.99, 12.89, 12.89, 11.54, 11.54, 13....","[229, 229, 238, 238, 229, 229, 231, 231, 227, ...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[10.97, 10.97, 11.02, 11.02, 11.04, 11.04, 10....","[1008.1, 1008.1, 1007.88, 1007.88, 1007.95, 10...","[12.12, 12.12, 12.59, 12.59, 12.45, 12.45, 12....","[5.9, 5.9, 6.06, 6.06, 5.31, 5.31, 4.68, 4.68,...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, clo...","[0.93, 0.93, 0.9, 0.9, 0.91, 0.91, 0.93, 0.93,...","[Mostly Cloudy, Mostly Cloudy, Overcast, Overc..."
2,MAC000450,2012-03-23,30min,"[1.337, 1.426, 0.996, 0.971, 0.994, 0.952, 0.8...",33936,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...","[3.19, 3.19, 2.48, 2.48, 2.41, 2.41, 1.82, 1.8...","[78, 78, 73, 73, 81, 81, 80, 80, 75, 75, 71, 7...","[8.76, 8.76, 8.54, 8.54, 8.09, 8.09, 7.34, 7.3...","[7.25, 7.25, 7.12, 7.12, 7.17, 7.17, 6.68, 6.6...","[1027.41, 1027.41, 1026.91, 1026.91, 1026.54, ...","[7.59, 7.59, 7.43, 7.43, 7.24, 7.24, 7.34, 7.3...","[2.18, 2.18, 2.07, 2.07, 1.72, 1.72, 1.34, 1.3...","[rain, rain, rain, rain, rain, rain, rain, rai...","[fog, fog, fog, fog, fog, fog, fog, fog, fog, ...","[0.9, 0.9, 0.91, 0.91, 0.94, 0.94, 0.96, 0.96,...","[Foggy, Foggy, Foggy, Foggy, Foggy, Foggy, Fog..."
3,MAC001074,2012-05-09,30min,"[0.18, 0.086, 0.106, 0.173, 0.146, 0.223, 0.21...",31680,ToU,ACORN-,ACORN-,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...","[10.51, 10.51, 9.43, 9.43, 6.69, 6.69, 5.29, 5...","[215, 215, 207, 207, 215, 215, 216, 216, 126, ...","[11.46, 11.46, 11.38, 11.38, 11.38, 11.38, 10....","[10.23, 10.23, 10.17, 10.17, 10.24, 10.24, 10....","[1007.39, 1007.39, 1007.21, 1007.21, 1007.06, ...","[11.46, 11.46, 11.38, 11.38, 11.38, 11.38, 10....","[2.35, 2.35, 2.15, 2.15, 1.84, 1.84, 1.22, 1.2...","[rain, rain, rain, rain, rain, rain, rain, rai...","[partly-cloudy-night, partly-cloudy-night, par...","[0.92, 0.92, 0.92, 0.92, 0.93, 0.93, 0.95, 0.9...","[Partly Cloudy, Partly Cloudy, Mostly Cloudy, ..."
4,MAC003223,2012-09-18,30min,"[0.076, 0.079, 0.123, 0.109, 0.051, 0.069, 0.0...",25344,Std,ACORN-A,Affluent,block_0,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...","[13.44, 13.44, 13.36, 13.36, 13.29, 13.29, 13....","[236, 236, 240, 240, 242, 242, 244, 244, 248, ...","[14.06, 14.06, 13.12, 13.12, 12.53, 12.53, 12....","[10.82, 10.82, 10.29, 10.29, 9.86, 9.86, 9.83,...","[1011.09, 1011.09, 1010.82, 1010.82, 1010.65, ...","[14.06, 14.06, 13.12, 13.12, 12.53, 12.53, 12....","[3.86, 3.86, 3.81, 3.81, 4.27, 4.27, 4.12, 4.1...","[rain, rain, rain, rain, rain, rain, rain, rai...","[clear-night, clear-night, clear-night, clear-...","[0.81, 0.81, 0.83, 0.83, 0.84, 0.84, 0.86, 0.8...","[Clear, Clear, Clear, Clear, Clear, Clear, Par..."


In [35]:
del block_df, weather_hourly, bank_holidays, household_info

In [36]:
display(hhblock_df.memory_usage(deep=True))
print(f"Total: {hhblock_df.memory_usage(deep=True).sum()/1024**2} MB")

Index                     128
LCLid                  366960
start_timestamp         44480
frequency              344720
energy_consumption     667200
series_length           44480
stdorToU               333600
Acorn                  355838
Acorn_grouped          367675
file                   361465
holidays               667200
visibility             667200
windBearing            667200
temperature            667200
dewPoint               667200
pressure               667200
apparentTemperature    667200
windSpeed              667200
precipType             667200
icon                   667200
humidity               667200
summary                667200
dtype: int64

Total: 10.388322830200195 MB


In [37]:
os.listdir()

['01 - pandas.ipynb', '02 - preprocessing london smard meter dataset.ipynb']

In [38]:
os.makedirs("../data/london_smart_meters/preprocessed", exist_ok=True)

In [39]:
# Saving the LCLid - Acorn map as a pickle to be used later
hhblock_df[['LCLid',"file", "Acorn_grouped"]].to_pickle(f"../data/london_smart_meters/preprocessed/london_smart_meters_lclid_acorn_map.pkl")

Saving blocks in chunks of 8 as parquet

In [40]:
# Split the block names into consecutive chunks of at most 8 blocks each.
# 112 block files were processed above (block_0 .. block_111); range(111)
# stopped at block_110, so block_111 was never written to any parquet file.
N_BLOCKS = 112
blocks = [f"block_{i}" for i in range(N_BLOCKS)]

# NOTE: despite its name, n_chunks is the number of blocks per chunk, not the number of chunks
n_chunks= 8
split_blocks = [blocks[i:i + n_chunks] for i in range(0, len(blocks), n_chunks)]

In [42]:
# Persist every chunk as its own parquet file, named after the block-number range it covers
for blk in tqdm(split_blocks):
    # Households whose source file belongs to the current chunk
    in_chunk = hhblock_df.file.isin(blk)
    df = hhblock_df.loc[in_chunk]
    # Numeric block ids, used only to label the output file
    blk = [int(block_name.replace("block_","")) for block_name in blk]
    first_block, last_block = min(blk), max(blk)
    block_str = f"block_{first_block}-{last_block}"
    df.to_parquet(f"../data/london_smart_meters/preprocessed/london_smart_meters_merged_{block_str}.parquet")

  0%|          | 0/14 [00:00<?, ?it/s]