### 라이브러리 로드

In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot  as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data.dataset import TensorDataset
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


# for GPU
# Use the first CUDA device when PyTorch can see one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# for colab
# Mount Google Drive at /content/drive; the data paths used further down
# all start with this mount point.
from google.colab import drive
drive.mount('/content/drive')

### 데이터 로드 (Household electric power consumption dataset)

Attribute
*   **DateTime**: Date and Time in format dd/mm/yyyy hh:mm:ss
*   **Global_active_power**: household global minute-averaged active power (in kilowatt)
*   **Global_reactive_power**: household global minute-averaged reactive power (in kilowatt)
*   **Voltage**: minute-averaged voltage (in volt)
*   **Global_intensity**: household global minute-averaged current intensity (in ampere)
*   **Sub_metering_1**: energy sub-metering No. 1 (in watt-hour of active energy). It corresponds to the kitchen, containing mainly a dishwasher, an oven and a microwave (hot plates are not electric but gas powered).
*   **Sub_metering_2**: energy sub-metering No. 2 (in watt-hour of active energy). It corresponds to the laundry room, containing a washing-machine, a tumble-drier, a refrigerator and a light.
*   **Sub_metering_3**: energy sub-metering No. 3 (in watt-hour of active energy). It corresponds to an electric water-heater and an air-conditioner.

In [None]:
# Load the raw measurements, indexed by the 'DateTime' column; both 'nan' and
# '?' are read as missing values.
# `infer_datetime_format` was removed from the call: it only influences
# `parse_dates` parsing (not requested here) and is deprecated in pandas 2.x,
# where it just emits a warning.
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/power_consumption.csv",
    sep=',',
    na_values=['nan', '?'],
    index_col='DateTime',
)
# NOTE(review): the attribute description states dd/mm/yyyy hh:mm:ss. If the
# CSV really stores day-first strings, pass dayfirst=True (or an explicit
# format=) here to avoid month/day swaps - confirm against the file.
df.index = pd.to_datetime(df.index)
df

### 데이터 전처리 및 특성 분석

- 결측값(NaN) 제거: 평균값을 채우는 방식 등으로 대체 가능
- 여기에서는 2008년 데이터만 사용할 예정
- 진행할 TASK: 전날의 24h 시계열로 다음날 24h 예측
- TASK 관련하여 데이터 특성 분석

In [11]:
# Drop every row that contains a missing value, then keep only the rows whose
# timestamp falls in 2008.
df = df.dropna(axis='index').loc['2008']

In [None]:
# Visual sanity check: one subplot per column for 1 January 2008.
first_day = df.loc['2008-1-1']
first_day.plot(figsize=(20, 10), subplots=True)
plt.show()

In [None]:
# Summary statistics, transposed so that each column of df becomes a row.
df.describe().T

In [None]:
# Histogram of every column to inspect the value distributions.
df.hist(figsize=(15,10))
plt.show()

#### 분 단위의 데이터를 시간 단위의 데이터로 변경

In [None]:
# Aggregate the minute-level rows into hourly means.
# NOTE(review): rows with NaN were dropped earlier, so an hour with no
# remaining rows would come out as NaN here - confirm none exist in df2.
hourly_bins = df.resample('H')
df2 = hourly_bins.mean()
df2.shape

#### Trainset과 Validationset 분리

In [16]:
# Chronological split: shuffle=False keeps the first 80% of the hourly rows
# for training and the remaining 20% for validation.
dataset = df2.to_numpy()
dataset_train, dataset_val = train_test_split(
    dataset,
    shuffle=False,
    test_size=0.2,
)

In [None]:
# (rows, columns) of the training and validation splits.
tuple(split.shape for split in (dataset_train, dataset_val))

#### 데이터에 대해 Normalize 처리 (0~1 사이의 값)

In [None]:
# Fit the min-max scaler on the training split only, so that no statistics
# from the validation split leak into the scaling.
scaler = MinMaxScaler()
scaler.fit(X=dataset_train)

In [19]:
# Apply the train-fitted scaling to both splits (training first, then validation).
dataset_train, dataset_val = (
    scaler.transform(split) for split in (dataset_train, dataset_val)
)

#### 슬라이딩 윈도우 방식으로 하나의 긴 시퀀스를 윈도우 사이즈로 잘게 분리

In [20]:
# 24 hours
window_size = 24


def split_windows(series, window):
    """Cut one long sequence into consecutive (input, target) window pairs.

    Args:
        series: Sliceable sequence of time steps (first axis is time).
        window: Number of time steps in each input and each target window.

    Returns:
        A pair of lists ``(inputs, targets)``. ``inputs[k]`` is
        ``series[k:k + window]`` and ``targets[k]`` is the window that
        immediately follows it, ``series[k + window:k + 2 * window]``.
        Both lists are empty when ``series`` is shorter than ``2 * window``.
    """
    # A pair starting at `offset` consumes rows offset .. offset + 2*window - 1,
    # so the last valid offset is len(series) - 2*window; the "+ 1" keeps that
    # final pair, which a plain range(len(series) - 2*window) would drop.
    n_pairs = max(len(series) - 2 * window + 1, 0)
    inputs = [series[offset:offset + window] for offset in range(n_pairs)]
    targets = [
        series[offset + window:offset + 2 * window] for offset in range(n_pairs)
    ]
    return inputs, targets


x_train_split, y_train_split = split_windows(dataset_train, window_size)
x_val_split, y_val_split = split_windows(dataset_val, window_size)

In [21]:
# Stack each list of windows along a new leading axis (np.stack defaults to axis 0).
x_train, y_train = np.stack(x_train_split), np.stack(y_train_split)
x_val, y_val = np.stack(x_val_split), np.stack(y_val_split)

In [None]:
# (data instances, length, dim)
tuple(arr.shape for arr in (x_train, y_train, x_val, y_val))

### Testset 로드

In [23]:
# NOTE(review): `dir` shadows the builtin of the same name; it is kept because
# the result-export cell at the end of the notebook reuses it.
dir = '/content/drive/My Drive/Colab Notebooks/'
# NOTE(review): pickle.load can execute arbitrary code - only load this file
# if it comes from a trusted source.
with open(dir + 'x_test.pickle', mode='rb') as test_file:
    x_test = pickle.load(test_file)

In [24]:
# The scaler works on 2-D (rows, 7) data: flatten the test windows to rows of
# 7 values, scale them with the train-fitted scaler, then restore the
# (-1, 24, 7) layout.
flat_test = x_test.reshape(-1, 7)
x_test = scaler.transform(flat_test).reshape(-1, 24, 7)

In [None]:
# Shape of the scaled test windows after the reshape to (-1, 24, 7).
x_test.shape

### Pytorch 데이터셋 구성

In [26]:
# Wrap the arrays as float32 tensors. Train/validation datasets yield (x, y)
# pairs; the test dataset yields 1-tuples (x,) because it has no targets.
train_data = TensorDataset(
    torch.tensor(x_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)
val_data = TensorDataset(
    torch.tensor(x_val, dtype=torch.float32),
    torch.tensor(y_val, dtype=torch.float32),
)
test_data = TensorDataset(torch.tensor(x_test, dtype=torch.float32))

In [27]:
batch_size = 100

# Only the training loader shuffles; validation and test keep their order.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

### Baseline 설정
#### 첫번째 방법: 미래의 값이 바뀌지 않을 것이라 가정 (마지막 값을 반복)

In [34]:
def Baseline1(x, output_steps=None):
    """Persistence baseline: repeat the last observed time step.

    Args:
        x: 3-D tensor indexed as (batch, time, feature); the slicing below
            requires exactly this layout.
        output_steps: Number of future steps to emit. Defaults to the input
            length ``x.shape[1]``, which matches the notebook's setup where
            the input and target windows have the same length.

    Returns:
        Tensor of shape (batch, output_steps, feature) in which every step
        equals ``x[:, -1, :]``.
    """
    if output_steps is None:
        output_steps = x.shape[1]

    # Slice with -1: (not -1) to keep the time axis so it can be tiled.
    last_step = x[:, -1:, :]
    return last_step.repeat(1, output_steps, 1)

In [None]:
# Validation MSE of Baseline1, averaged over every predicted element.
with torch.no_grad():
    mse_total = 0
    cnt = 0
    for x, y in val_loader:
        x = x.to(device)
        y = y.to(device)
        outputs = Baseline1(x)
        # Sum the squared errors per batch; the division happens once at the end.
        mse_total += F.mse_loss(outputs, y, reduction='sum').item()
        # Count the target elements directly instead of assuming a fixed
        # 24 steps x 7 features per sample.
        cnt += y.numel()

    print ('Epoch [{}/{}], Validation MSE: {:.4f}'.format(1, 1, mse_total/cnt))

In [36]:
# x, outputs and y still hold the last validation batch; take its first sample
# and map input, prediction and target back to the original units.
x_data, pred_data, true_data = (
    scaler.inverse_transform(batch[0].detach().cpu().numpy())
    for batch in (x, outputs, y)
)

In [None]:
# One panel per feature: the input window on hours 0-23, then the prediction
# and the ground truth on hours 24-47.
fig, ax = plt.subplots(7, 1, figsize=(10, 10))

past_hours = list(range(24))
future_hours = list(range(24, 48))
for col, panel in enumerate(ax):
    panel.plot(past_hours, x_data[:, col], label='input')
    panel.plot(future_hours, pred_data[:, col], label='pred')
    panel.plot(future_hours, true_data[:, col], label='true')
    panel.set_title(df.columns.values[col])
    panel.legend(loc='upper right')
fig.tight_layout()
plt.show()

#### 두번째 방법: 미래의 값이 이전 24hr과 동일한 패턴일 것이라 가정 (지난 24hr 반복)


In [38]:
def Baseline2(x):
    """Repeat-window baseline: forecast the next window as a copy of the input.

    The input tensor is returned as-is (same object, no copy), so the
    prediction has exactly the input's shape.
    """
    return x

In [None]:
# Validation MSE of Baseline2, averaged over every predicted element.
with torch.no_grad():
    mse_total = 0
    cnt = 0
    for x, y in val_loader:
        x = x.to(device)
        y = y.to(device)
        outputs = Baseline2(x)
        # Sum the squared errors per batch; the division happens once at the end.
        mse_total += F.mse_loss(outputs, y, reduction='sum').item()
        # Count the target elements directly instead of assuming a fixed
        # 24 steps x 7 features per sample.
        cnt += y.numel()

    print ('Epoch [{}/{}], Validation MSE: {:.4f}'.format(1, 1, mse_total/cnt))

In [40]:
# x, outputs and y still hold the last validation batch; take its first sample
# and map input, prediction and target back to the original units.
x_data, pred_data, true_data = (
    scaler.inverse_transform(batch[0].detach().cpu().numpy())
    for batch in (x, outputs, y)
)

In [None]:
# One panel per feature: the input window on hours 0-23, then the prediction
# and the ground truth on hours 24-47.
fig, ax = plt.subplots(7, 1, figsize=(10, 10))

past_hours = list(range(24))
future_hours = list(range(24, 48))
for col, panel in enumerate(ax):
    panel.plot(past_hours, x_data[:, col], label='input')
    panel.plot(future_hours, pred_data[:, col], label='pred')
    panel.plot(future_hours, true_data[:, col], label='true')
    panel.set_title(df.columns.values[col])
    panel.legend(loc='upper right')
fig.tight_layout()
plt.show()

### 모델 클래스 정의

In [42]:
# MLP (Fully connected)
class FC(nn.Module):
    """Three-layer MLP that maps a flattened input window to a forecast window.

    Args:
        input_dim: Number of features per input time step.
        hidden_dim: Width of the two hidden layers.
        output_dim: Number of features per predicted time step.
        input_length: Number of time steps in the input window.
        output_steps: Number of time steps to predict.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, input_length, output_steps):
        super().__init__()

        self.fc1 = nn.Linear(input_dim * input_length, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim * output_steps)
        self.output_steps = output_steps

    def forward(self, x):
        """Return predictions of shape (batch, output_steps, output_dim).

        Args:
            x: Tensor whose first axis is the batch; the remaining axes are
                flattened and must hold input_dim * input_length values.
        """
        # reshape (unlike view) also accepts non-contiguous inputs, e.g. a
        # transposed or sliced tensor, copying only when it has to.
        flat = x.reshape(x.shape[0], -1)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        out = self.fc3(hidden)

        # Split the flat output back into (steps, features per step).
        return out.reshape(out.shape[0], self.output_steps, -1)

### 하이퍼파라미터, 모델, Loss, Optimizer 설정

In [43]:
# Optimisation settings.
n_epochs = 100
learning_rate = 0.01

# Model geometry: 7 values per time step, 24 steps in and 24 steps out.
input_dim = output_dim = 7
input_length = output_steps = 24
hidden_dim = 64
num_layers = 2  # not passed to the FC model constructed below

# Build the model first, then the loss and the optimizer over its parameters.
model = FC(input_dim, hidden_dim, output_dim, input_length, output_steps).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

### Training & Validation 성능 측정

In [None]:
# Train for n_epochs; after every epoch print the epoch-average training MSE
# and the validation MSE, both averaged over every predicted element.
for epoch in range(n_epochs):
    model.train()
    train_sq_err = 0.0
    train_cnt = 0
    for x, y in train_loader:
        x = x.to(device)
        y = y.to(device)
        outputs = model(x)
        loss = criterion(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion is a mean-reduced MSE, so weight it by the batch's element
        # count to recover the summed squared error. This reports the mean
        # over the whole epoch instead of only the last mini-batch's loss.
        train_sq_err += loss.item() * y.numel()
        train_cnt += y.numel()
    print ('Epoch [{}/{}], Training MSE: {:.4f}'.format(epoch+1, n_epochs, train_sq_err/train_cnt))

    model.eval()
    with torch.no_grad():
        mse_total = 0
        cnt = 0
        for x, y in val_loader:
            x = x.to(device)
            y = y.to(device)
            outputs = model(x)
            mse_total += F.mse_loss(outputs, y, reduction='sum').item()
            # Count the target elements directly instead of assuming a fixed
            # 24 steps x 7 features per sample.
            cnt += y.numel()

        print ('Epoch [{}/{}], Validation MSE: {:.4f}'.format(epoch+1, n_epochs, mse_total/cnt))
       

### Regression Plot

In [45]:
# x, outputs and y still hold the last validation batch of the final epoch;
# take its first sample and map input, prediction and target back to the
# original units.
x_data, pred_data, true_data = (
    scaler.inverse_transform(batch[0].detach().cpu().numpy())
    for batch in (x, outputs, y)
)

In [None]:
# One panel per feature: the input window on hours 0-23, then the prediction
# and the ground truth on hours 24-47.
fig, ax = plt.subplots(7, 1, figsize=(10, 10))

past_hours = list(range(24))
future_hours = list(range(24, 48))
for col, panel in enumerate(ax):
    panel.plot(past_hours, x_data[:, col], label='input')
    panel.plot(future_hours, pred_data[:, col], label='pred')
    panel.plot(future_hours, true_data[:, col], label='true')
    panel.set_title(df.columns.values[col])
    panel.legend(loc='upper right')
fig.tight_layout()
plt.show()

### Test

In [47]:
# Run the trained model over the test windows and collect the per-batch outputs.
model.eval()
res = []
with torch.no_grad():
    # Each test batch is a 1-tuple because the test dataset has no targets.
    for (x,) in test_loader:
        x = x.to(device).float()

        # Forward pass
        outputs = model(x)
        res.append(outputs)

In [48]:
# Join all batches and flatten them to rows of 7 values so the scaler can map
# them back to the original units.
flat_outputs = torch.cat(res, dim=0).reshape(-1, 7).squeeze()
pred_data = scaler.inverse_transform(flat_outputs.detach().cpu().numpy())

# Flatten to a single column; the DataFrame's row index is written as 'Id'.
pred_data = pred_data.reshape(-1).squeeze()
y = pred_data
submission = pd.DataFrame(y, columns=['Predicted'])
submission.to_csv(dir + "result.csv", index_label='Id')