## 02 - Preprocessing London Smart Meter Dataset

- https://github.com/PacktPublishing/Modern-Time-Series-Forecasting-with-Python/blob/main/notebooks/Chapter02/02%20-%20Preprocessing%20London%20Smart%20Meter%20Dataset.ipynb

In [1]:

import os
import sys
import time
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# NOTE(review): this silences every warning for the whole session, including
# pandas/numpy deprecation warnings — consider narrowing the filter.
import warnings; warnings.filterwarnings('ignore')
# Select the seaborn "whitegrid" look for matplotlib figures.
# NOTE(review): the "seaborn-v0_8-*" style names presumably need a recent
# matplotlib (3.6+) — confirm against the installed version.
plt.style.use("seaborn-v0_8-whitegrid")
%matplotlib inline

# Raise the display limit so wide frames (e.g. the 48 hh_* columns below) are
# shown in full instead of being truncated with "...".
pd.options.display.max_columns = 999

In [2]:
#!pip install git+https://github.com/TimeSynth/TimeSynth.git

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
import plotly.io as pio
pio.templates.default = "plotly_white"
import pandas as pd
from pathlib import Path
from tqdm.autonotebook import tqdm
# %load_ext autoreload
# %autoreload 2
# NOTE(review): seed() is called without an argument, so no fixed seed is set
# and random results (e.g. the .sample(10) call further down) will differ
# between runs — pass a fixed integer if reproducibility is intended.
np.random.seed()
# Enable tqdm's pandas integration (progress-bar variants of apply & co.).
tqdm.pandas()

## 데이터 집합 정보

무료 개방형 데이터 공유 포털인 런던 데이터 스토어에서 이 데이터 세트를 제공했으며, Jean-Michel D.가 수집하고 보강하여 Kaggle(https://www.kaggle.com/jeanmidev/smart-meters-in-london)에 업로드했습니다. 
이 데이터 세트에는 2011년 11월부터 2014년 2월까지 영국의 배전 회사 UK Power Networks가 주도한 Low Carbon London(저탄소 런던) 프로젝트에 참여한 5,567개의 런던 가구 샘플에 대한 에너지 소비량 측정값이 포함되어 있습니다. 판독값은 30분 간격으로 측정되었습니다. 가구에 대한 일부 메타데이터도 데이터 세트의 일부로 제공됩니다.

## 데이터 랭글링
Kaggle 데이터 집합에는 일 단위로 사전 처리되고 모든 개별 파일 등을 결합한 시계열 데이터도 있습니다. 하지만 이러한 파일은 무시하고 hhblock_dataset 폴더에 있는 원시 파일부터 시작하겠습니다.

### Converting the half hourly block level dataset into a time series data 

Let's pick one block and see how we can transform the data.

In [4]:
# Root folder of the downloaded London smart meter dataset.
source_data = Path("../data/london_smart_meters/")
# The raw half-hourly block files sit in a doubly nested "hhblock_dataset" folder.
block_data_path = source_data.joinpath("hhblock_dataset", "hhblock_dataset")

In [5]:
# Load a single block file and convert its 'day' column from text to datetime
# in one chained expression, then preview the first rows.
block_1 = pd.read_csv(
    block_data_path / "block_0.csv",
    parse_dates=False,
).assign(day=lambda frame: pd.to_datetime(frame["day"], yearfirst=True))
block_1.head()

Unnamed: 0,LCLid,day,hh_0,hh_1,hh_2,hh_3,hh_4,hh_5,hh_6,hh_7,hh_8,hh_9,hh_10,hh_11,hh_12,hh_13,hh_14,hh_15,hh_16,hh_17,hh_18,hh_19,hh_20,hh_21,hh_22,hh_23,hh_24,hh_25,hh_26,hh_27,hh_28,hh_29,hh_30,hh_31,hh_32,hh_33,hh_34,hh_35,hh_36,hh_37,hh_38,hh_39,hh_40,hh_41,hh_42,hh_43,hh_44,hh_45,hh_46,hh_47
0,MAC000002,2012-10-13,0.263,0.269,0.275,0.256,0.211,0.136,0.161,0.119,0.167,0.109,0.168,0.107,0.166,0.117,0.157,0.126,0.146,0.106,0.135,0.191,0.915,0.933,0.122,0.138,0.076,0.133,0.076,0.133,0.085,0.263,0.134,0.235,0.124,0.184,0.23,0.176,0.388,0.26,0.918,0.278,0.267,0.239,0.23,0.233,0.235,0.188,0.259,0.25
1,MAC000002,2012-10-14,0.262,0.166,0.226,0.088,0.126,0.082,0.123,0.083,0.12,0.079,0.121,0.075,0.124,0.073,0.125,0.07,0.13,0.108,0.196,0.346,0.524,0.076,0.129,0.667,0.23,0.22,0.163,0.091,0.17,0.11,0.11,0.121,0.099,0.157,0.093,0.371,0.386,1.085,1.075,0.956,0.821,0.745,0.712,0.511,0.231,0.21,0.278,0.159
2,MAC000002,2012-10-15,0.192,0.097,0.141,0.083,0.132,0.07,0.13,0.074,0.124,0.078,0.118,0.082,0.112,0.087,0.106,0.14,0.12,1.075,0.146,0.123,0.082,0.127,0.077,0.551,0.149,0.129,0.075,0.13,0.075,0.129,0.075,0.128,0.166,0.194,0.695,0.26,0.227,0.255,1.164,0.249,0.225,0.258,0.26,0.334,0.299,0.236,0.241,0.237
3,MAC000002,2012-10-16,0.237,0.237,0.193,0.118,0.098,0.107,0.094,0.109,0.091,0.105,0.091,0.104,0.092,0.103,0.093,0.101,0.144,0.1,0.408,0.102,0.1,0.116,0.354,0.146,0.19,0.991,0.31,0.121,0.113,0.094,0.119,0.087,0.13,0.238,0.204,0.284,0.447,0.266,0.966,0.172,0.192,0.228,0.203,0.211,0.188,0.213,0.157,0.202
4,MAC000002,2012-10-17,0.157,0.211,0.155,0.169,0.101,0.117,0.084,0.118,0.08,0.119,0.075,0.123,0.071,0.126,0.067,0.124,0.118,0.132,0.358,0.628,0.784,0.681,0.749,0.593,0.502,0.115,0.113,0.092,0.124,0.084,0.125,0.078,0.136,0.227,0.207,0.141,0.258,0.217,0.223,0.075,0.23,0.208,0.265,0.377,0.327,0.277,0.288,0.256


Find the Global End Date

In [6]:
# Latest recorded day for each meter (LCLid); a random sample of 10 is shown
# to see whether all meters end on the same date.
block_1.groupby("LCLid")['day'].max().sample(10)# check whether the end dates are the same

LCLid
MAC000450   2013-05-14
MAC003428   2014-02-27
MAC003281   2014-02-27
MAC003613   2014-02-27
MAC003463   2013-08-11
MAC004247   2014-02-27
MAC004034   2013-09-18
MAC003737   2014-02-27
MAC003740   2014-02-27
MAC003686   2014-02-27
Name: day, dtype: datetime64[ns]

In [7]:
# Scan every block file and keep the latest 'day' value seen across all of
# them; this becomes the common (global) end date for every time series.
max_date = None
for f in tqdm(block_data_path.glob("*.csv")):
    # Only the 'day' column is needed for this scan, so skip parsing the
    # half-hourly reading columns.
    day_col = pd.read_csv(f, usecols=["day"], parse_dates=False)["day"]
    file_max = pd.to_datetime(day_col, yearfirst=True).max()
    # A file without rows yields NaT; ignore it so it cannot mask real dates.
    if pd.isna(file_max):
        continue
    if max_date is None or file_max > max_date:
        max_date = file_max
# max_date stays None when no CSV file was found (e.g. a wrong data path).
print(f"Max Date across all blocks: {max_date}")

0it [00:00, ?it/s]

Max Date across all blocks: 2014-02-27 00:00:00


Basic Preprocessing: 시간대를 세로 방향으로 reshaping

In [8]:
# Step 1: move the meter id and the day into the index so that only the
# hh_0 ... hh_47 reading columns remain as data columns.
block_1.set_index(["LCLid", "day"])

Unnamed: 0_level_0,Unnamed: 1_level_0,hh_0,hh_1,hh_2,hh_3,hh_4,hh_5,hh_6,hh_7,hh_8,hh_9,hh_10,hh_11,hh_12,hh_13,hh_14,hh_15,hh_16,hh_17,hh_18,hh_19,hh_20,hh_21,hh_22,hh_23,hh_24,hh_25,hh_26,hh_27,hh_28,hh_29,hh_30,hh_31,hh_32,hh_33,hh_34,hh_35,hh_36,hh_37,hh_38,hh_39,hh_40,hh_41,hh_42,hh_43,hh_44,hh_45,hh_46,hh_47
LCLid,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
MAC000002,2012-10-13,0.263,0.269,0.275,0.256,0.211,0.136,0.161,0.119,0.167,0.109,0.168,0.107,0.166,0.117,0.157,0.126,0.146,0.106,0.135,0.191,0.915,0.933,0.122,0.138,0.076,0.133,0.076,0.133,0.085,0.263,0.134,0.235,0.124,0.184,0.230,0.176,0.388,0.260,0.918,0.278,0.267,0.239,0.230,0.233,0.235,0.188,0.259,0.250
MAC000002,2012-10-14,0.262,0.166,0.226,0.088,0.126,0.082,0.123,0.083,0.120,0.079,0.121,0.075,0.124,0.073,0.125,0.070,0.130,0.108,0.196,0.346,0.524,0.076,0.129,0.667,0.230,0.220,0.163,0.091,0.170,0.110,0.110,0.121,0.099,0.157,0.093,0.371,0.386,1.085,1.075,0.956,0.821,0.745,0.712,0.511,0.231,0.210,0.278,0.159
MAC000002,2012-10-15,0.192,0.097,0.141,0.083,0.132,0.070,0.130,0.074,0.124,0.078,0.118,0.082,0.112,0.087,0.106,0.140,0.120,1.075,0.146,0.123,0.082,0.127,0.077,0.551,0.149,0.129,0.075,0.130,0.075,0.129,0.075,0.128,0.166,0.194,0.695,0.260,0.227,0.255,1.164,0.249,0.225,0.258,0.260,0.334,0.299,0.236,0.241,0.237
MAC000002,2012-10-16,0.237,0.237,0.193,0.118,0.098,0.107,0.094,0.109,0.091,0.105,0.091,0.104,0.092,0.103,0.093,0.101,0.144,0.100,0.408,0.102,0.100,0.116,0.354,0.146,0.190,0.991,0.310,0.121,0.113,0.094,0.119,0.087,0.130,0.238,0.204,0.284,0.447,0.266,0.966,0.172,0.192,0.228,0.203,0.211,0.188,0.213,0.157,0.202
MAC000002,2012-10-17,0.157,0.211,0.155,0.169,0.101,0.117,0.084,0.118,0.080,0.119,0.075,0.123,0.071,0.126,0.067,0.124,0.118,0.132,0.358,0.628,0.784,0.681,0.749,0.593,0.502,0.115,0.113,0.092,0.124,0.084,0.125,0.078,0.136,0.227,0.207,0.141,0.258,0.217,0.223,0.075,0.230,0.208,0.265,0.377,0.327,0.277,0.288,0.256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MAC005492,2014-02-23,0.165,0.206,0.149,0.101,0.115,0.120,0.102,0.105,0.125,0.106,0.103,0.131,0.133,0.102,0.101,0.130,0.281,0.211,0.149,0.224,0.216,0.379,0.179,0.215,0.208,0.194,0.203,0.265,0.187,0.196,0.168,0.122,0.131,0.128,0.135,0.163,0.494,0.290,0.296,0.750,0.315,0.326,0.293,0.287,0.213,0.287,0.245,0.222
MAC005492,2014-02-24,0.219,0.236,0.288,0.219,0.165,0.086,0.111,0.095,0.084,0.081,0.105,0.129,0.087,0.079,0.089,0.111,0.133,0.133,0.188,0.208,0.168,0.238,0.160,0.250,0.175,0.080,0.082,0.116,0.088,0.206,0.161,0.203,0.150,0.096,0.263,0.234,0.254,0.273,0.303,0.378,0.253,0.193,0.212,0.193,0.192,0.297,0.168,0.170
MAC005492,2014-02-25,0.194,0.185,0.150,0.095,0.081,0.082,0.081,0.112,0.094,0.082,0.079,0.124,0.125,0.082,0.080,0.084,0.080,0.120,0.082,0.348,0.195,0.198,0.210,0.185,0.174,0.204,0.179,0.156,0.137,0.137,0.132,0.109,0.170,0.119,0.155,0.155,0.111,0.272,0.545,0.233,0.245,0.194,0.215,0.201,0.241,0.162,0.157,0.167
MAC005492,2014-02-26,0.205,0.183,0.186,0.165,0.194,0.114,0.080,0.090,0.106,0.082,0.080,0.125,0.117,0.081,0.079,0.083,0.142,0.186,0.153,0.653,0.235,0.144,0.162,0.176,0.145,0.118,0.168,0.104,0.081,0.129,0.083,0.082,0.098,0.123,0.193,0.141,0.141,0.687,0.298,0.224,0.239,0.242,0.237,0.246,0.333,0.202,0.232,0.203


In [9]:
# Step 2: stack the hh_* columns into an extra (innermost) index level,
# giving one value per (LCLid, day, half-hour block).
block_1.set_index(["LCLid", "day"]).stack()

LCLid      day              
MAC000002  2012-10-13  hh_0     0.263
                       hh_1     0.269
                       hh_2     0.275
                       hh_3     0.256
                       hh_4     0.211
                                ...  
MAC005492  2014-02-27  hh_43    0.250
                       hh_44    0.182
                       hh_45    0.122
                       hh_46    0.140
                       hh_47    0.192
Length: 1213678, dtype: float64

In [10]:
# Step 3: turn the index levels back into ordinary columns; the unnamed
# stacked level shows up as "level_2" and the values as column 0.
block_1.set_index(["LCLid", "day"]).stack().reset_index()

Unnamed: 0,LCLid,day,level_2,0
0,MAC000002,2012-10-13,hh_0,0.263
1,MAC000002,2012-10-13,hh_1,0.269
2,MAC000002,2012-10-13,hh_2,0.275
3,MAC000002,2012-10-13,hh_3,0.256
4,MAC000002,2012-10-13,hh_4,0.211
...,...,...,...,...
1213673,MAC005492,2014-02-27,hh_43,0.250
1213674,MAC005492,2014-02-27,hh_44,0.182
1213675,MAC005492,2014-02-27,hh_45,0.122
1213676,MAC005492,2014-02-27,hh_46,0.140


정리하기

In [11]:
# Convert the wide block (one hh_* column per half hour) into long form:
# one row per (LCLid, day, hour_block) holding the energy consumption.
block_1 = (
    block_1.set_index(["LCLid", "day"])
    .stack()
    # Name the values and the stacked level up front so reset_index()
    # produces the final column names directly.
    .rename("energy_consumption")
    .rename_axis(["LCLid", "day", "hour_block"])
    .reset_index()
    # Numeric position of the half-hour block within the day ("hh_7" -> 7).
    .assign(
        offset=lambda frame: frame["hour_block"].str.replace("hh_", "").astype(int)
    )
)

block_1.head()

Unnamed: 0,LCLid,day,hour_block,energy_consumption,offset
0,MAC000002,2012-10-13,hh_0,0.263,0
1,MAC000002,2012-10-13,hh_1,0.269,1
2,MAC000002,2012-10-13,hh_2,0.275,2
3,MAC000002,2012-10-13,hh_3,0.256,3
4,MAC000002,2012-10-13,hh_4,0.211,4


Compact form    

1. 시작 날짜와 시계열 식별자를 찾습니다.  
2. 시작 날짜와 전역 종료 날짜를 사용하여 표준 데이터 프레임을 만듭니다.  
3. LCLid의 데이터 프레임을 표준 데이터 프레임에 왼쪽 조인(left merge)으로 병합하고 누락된 데이터는 np.nan으로 남깁니다.  
4. 날짜를 기준으로 값을 정렬합니다.
5. 시계열 배열을 시계열 식별자, 시작 날짜 및 시계열 길이와 함께 반환합니다.