In [None]:
!pip install numpy==1.26.4 
!pip install -U matplotlib
!pip install pandas
!pip install pyarrow
!pip install seaborn
!pip install tensorflow
!pip install scikit-learn

Import libraries

In [None]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더

# Notebook-wide matplotlib defaults: 8x6 figures, grid switched off.
mpl.rcParams.update({
    'figure.figsize': (8, 6),
    'axes.grid': False,
})

데이터 다운로드

데이터 전처리

In [None]:
# Read the raw CSV and keep every 6th row, beginning with the row at position 5
# (positional slice [start:stop:step]).
df = pd.read_csv("dataset/jena_climate_2009_2016.csv").iloc[5::6]

In [None]:
# Preview the first rows of the subsampled frame.
df.head()

In [None]:
# Parse the timestamp column; it is kept separately as `date_time` and used
# later as the index of the plotted frames.
date_time = pd.to_datetime(df['Date Time'], format='%d.%m.%Y %H:%M:%S')

# Keep exactly these four columns, in this order. The last column ('T (degC)')
# is the one the output scaler is fitted on further down. Selecting by list
# already discards every other column, so the former explicit drop() of the
# remaining columns was redundant. copy() makes df an independent frame that
# later cells can modify in place.
df = df[['p (mbar)', 'rh (%)', 'wv (m/s)', 'T (degC)']].copy()

In [None]:
# Preview df after the column selection above.
df.head()

In [None]:
# Summary statistics per column (run before the wind-speed fix-up cell below).
df.describe()

In [None]:
# Replace the -9999.0 sentinel in the wind-speed column with 0.0.
# The assignment goes through df.loc so that df itself is updated: writing into
# a Series pulled out of the frame (wv[bad_wv] = ...) is chained assignment and
# leaves df untouched when pandas Copy-on-Write is active.
bad_wv = df['wv (m/s)'] == -9999.0
df.loc[bad_wv, 'wv (m/s)'] = 0.0

# Kept for any later cell that reads the cleaned column by this name.
wv = df['wv (m/s)']

In [None]:
# Re-check the summary statistics after the wind-speed fix-up cell above.
df.describe()

In [None]:
# Plot pressure and temperature against time: first the whole series,
# then only the first 480 rows.
plot_cols = ['p (mbar)','T (degC)']
for n_rows in (None, 480):
    plot_features = df[plot_cols][:n_rows]
    plot_features.index = date_time[:n_rows]
    _ = plot_features.plot(subplots=True)

Train, test 데이터 분할

In [None]:
# Original note: seven days of data form one input window; the batch size is an arbitrary choice.
# NOTE(review): df was subsampled with [5::6] when it was loaded. If the raw file is
# 10-minutely (TODO confirm), each row is one hour and seq_length = 7 spans 7 hours, not 7 days.
seq_length = 7
batch = 100
# Exercise blanks. The cells below call .head(), .iloc and column selection on
# train_set / test_set and assign train_date_time / test_date_time as their index,
# so the splits are expected to be DataFrames with matching timestamp sequences.
train_size = #### fill in the blank ####
train_set = #### fill in the blank ####
train_date_time=#### fill in the blank ####
test_set = #### fill in the blank ####
test_date_time=#### fill in the blank ####

In [None]:
# Preview the first rows of the training split.
train_set.head()

In [None]:
# Plot pressure and temperature for the training split, then for the test split,
# each indexed by its own timestamps.
plot_cols = ['p (mbar)','T (degC)']
for subset, timestamps in ((train_set, train_date_time), (test_set, test_date_time)):
    plot_features = subset[plot_cols]
    plot_features.index = timestamps
    _ = plot_features.plot(subplots=True)

데이터 스케일링

In [None]:
# Input scaling: the scaler is fitted on the training split only,
# using every column except the last one.
scaler_x = MinMaxScaler()
scaler_x.fit(train_set.iloc[:, :-1])

# Exercise blanks: presumably the same column selection of each split that
# scaler_x was fitted on - TODO confirm against the course material.
train_set.iloc[:, :-1] = scaler_x.transform(#### fill in the blank ####)
test_set.iloc[:, :-1] = scaler_x.transform(#### fill in the blank ####)

# Output scaling: a separate scaler for the last column only
# ([-1] keeps the selection two-dimensional for fit()).
scaler_y = MinMaxScaler()
scaler_y.fit(train_set.iloc[:, [-1]])

# Exercise blanks: presumably the last column of each split, selected the same
# way as in the fit() call above - TODO confirm.
train_set.iloc[:, -1] = scaler_y.transform(#### fill in the blank ####)
test_set.iloc[:, -1] = scaler_y.transform(#### fill in the blank ####)

In [None]:
# Show the first five rows of the scaled test split. Leaving the frame as the
# cell's last expression gives the rich table rendering instead of print()'s plain text.
test_set.head()

In [None]:
# Preview the training split after scaling.
train_set.head()

데이터셋 생성

In [None]:
device = torch.device('cpu')
# Dataset construction helper
def build_dataset(time_series, seq_length):
    """Collect one (_x, _y) pair per start offset and return them as two arrays.

    Args:
        time_series: indexable sequence supporting len(); the calls in this
            notebook pass np.array(train_set) / np.array(test_set).
        seq_length: window length; len(time_series) - seq_length pairs are produced.

    Returns:
        Tuple (np.array(dataX), np.array(dataY)) with all collected inputs and targets.
    """
    dataX = []
    dataY = []
    # i takes the values 0 .. len(time_series) - seq_length - 1
    for i in range(0, len(time_series)-seq_length):
        # Exercise blanks. Presumably _x is the seq_length-row window starting at i
        # and _y the target value that follows it - TODO confirm against the course material.
        _x = time_series[#### fill in the blank ####]
        _y = time_series[#### fill in the blank ####]
        # print(_x, "-->",_y)
        dataX.append(_x)
        dataY.append(_y)

    return np.array(dataX), np.array(dataY)

# Build the windowed samples for both splits.
trainX, trainY = build_dataset(np.asarray(train_set), seq_length)
testX, testY = build_dataset(np.asarray(test_set), seq_length)

# Convert to float32 tensors; only the test tensors are moved to `device` here.
trainX_tensor, trainY_tensor = (
    torch.tensor(arr, dtype=torch.float32) for arr in (trainX, trainY)
)
testX_tensor, testY_tensor = (
    torch.tensor(arr, dtype=torch.float32).to(device) for arr in (testX, testY)
)

# Wrap the training tensors in a dataset and a shuffling loader.
# drop_last=True discards a final batch that has fewer than `batch` samples.
dataset = TensorDataset(trainX_tensor, trainY_tensor)
dataloader = DataLoader(
    dataset,
    batch_size=batch,
    shuffle=True,
    drop_last=True,
)

In [None]:
# Shape of the test input tensor.
testX_tensor.shape

In [None]:
# Pull one batch from the loader to inspect it.
next(iter(dataloader))

In [None]:
# Print the first two test input samples.
print(testX_tensor[0:2])

LSTM

In [None]:
import torch.nn as nn

# Settings
# LSTM input width = number of values per time step, read from the windows built
# above. It was previously hard-coded to 5, although df keeps only four columns
# after preprocessing, which would not match the data fed to the network.
data_dim = trainX.shape[-1]
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
nb_epochs = 100

class Net(nn.Module):
    """LSTM stack followed by a linear head applied to the last time step.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        seq_len: window length; stored on the instance but not used by forward().
        output_dim: number of values predicted per sample.
        layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, seq_len, output_dim, layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.seq_len = seq_len
        self.output_dim = output_dim
        self.layers = layers
        # No explicit initial state: forward() lets nn.LSTM start from zeros.
        self.hidden = None

        # batch_first=True -> inputs are (batch, seq, feature)
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim, bias = True)

    def reset_hidden_state(self):
        """Clear any stored hidden state.

        Kept so existing training/prediction loops can keep calling it. The
        previous version allocated zero tensors shaped (layers, seq_len, hidden_dim),
        i.e. with the sequence length where nn.LSTM expects the batch size, and
        forward() never read them. forward() always starts from nn.LSTM's default
        zero state, so None describes the actual behaviour without the
        per-batch allocation.
        """
        self.hidden = None

    def forward(self, x):
        """Return the linear head's output for the last time step of each sequence."""
        x, _status = self.lstm(x)
        # x[:, -1] selects the final time step's hidden output for every sample
        return self.fc(x[:, -1])

Training

In [None]:
def train_model(model, train_df, num_epochs = None, lr = None, verbose = 10, patience = 10):
    """Train `model` with Adam on an MSE loss and record the mean loss per epoch.

    Args:
        model: network exposing reset_hidden_state() and callable on a batch.
        train_df: iterable of (inputs, targets) batches that supports len(),
            e.g. a DataLoader.
        num_epochs: number of epochs; when None the notebook-level `nb_epochs`
            setting is used.
        lr: Adam learning rate; when None the notebook-level `learning_rate`
            setting is used. (The argument used to be ignored entirely.)
        verbose: print the loss every `verbose` epochs; 0/None disables printing.
        patience: every `patience` epochs compare the loss with the value
            `patience` epochs earlier and stop if it went up; 0/None disables
            early stopping.

    Returns:
        (model in eval mode, np.ndarray of length num_epochs with the mean
        training loss per epoch). Entries for epochs skipped by early stopping
        stay at 0.
    """
    n_epochs = nb_epochs if num_epochs is None else num_epochs
    step_size = learning_rate if lr is None else lr

    # Feed batches on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = step_size)

    # mean loss of every epoch
    train_hist = np.zeros(n_epochs)
    total_batch = len(train_df)

    model.train()
    for epoch in range(n_epochs):
        avg_cost = 0.0

        for x_train, y_train in train_df:
            x_train = x_train.to(model_device)
            y_train = y_train.to(model_device)

            # reset the hidden state for every batch of sequences
            model.reset_hidden_state()

            # forward pass and loss
            outputs = model(x_train)
            loss = criterion(outputs, y_train)

            # backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # .item() yields a plain float, so the running mean does not keep
            # every batch's autograd history alive for the whole epoch
            avg_cost += loss.item() / total_batch

        train_hist[epoch] = avg_cost

        if verbose and epoch % verbose == 0:
            print('Epoch:', '%04d' % (epoch), 'train loss :', '{:.4f}'.format(avg_cost))

        # check for early stopping on every `patience`-th epoch
        if patience and epoch != 0 and epoch % patience == 0:

            # stop if the loss is higher than it was `patience` epochs ago
            if train_hist[epoch-patience] < train_hist[epoch]:
                print('\n Early Stopping : %04d epoch' %(epoch))

                break

    return model.eval(), train_hist

학습 시작

In [None]:
# 모델 학습
# 설정값
data_dim = 5
output_dim = 1
hidden_dim = ####빈칸을 채우세요####
learning_rate = ####빈칸을 채우세요####
nb_epochs = ####빈칸을 채우세요####
net = Net(data_dim, hidden_dim, seq_length, output_dim, 1)
model, train_hist = train_model(net, dataloader, num_epochs = nb_epochs, lr = learning_rate, verbose = 20, patience = 10)

In [None]:
# Loss per epoch
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(train_hist, label="Training loss")
ax.legend()
plt.show()

In [None]:
# Save the model
# Exercise blank: replace the placeholder with the checkpoint file path.
PATH ="####빈칸을 채우세요####"

# Saving stays opt-in (it was disabled before) so an existing checkpoint at PATH
# is never overwritten by accident; set SAVE_MODEL = True to persist the model
# trained above.
SAVE_MODEL = False
if SAVE_MODEL:
    torch.save(model.state_dict(), PATH)

# Reload the weights into a freshly constructed network
model = Net(data_dim, hidden_dim, seq_length, output_dim, 1)
# Load strictly (the default): with strict=False, missing or unexpected keys were
# silently ignored, so a partly uninitialised model could be evaluated unnoticed.
# map_location puts the loaded tensors on `device`.
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()

모델 테스트

In [None]:
# Prediction test
# Exercise blanks: `a` and the two slices below are left for the reader. The `_100`
# suffix suggests a 100-sample window of the test tensors - TODO confirm.
a=#### fill in the blank ####
testX_tensor_100=testX_tensor[#### fill in the blank ####]
testY_tensor_100=testY_tensor[#### fill in the blank ####]
with torch.no_grad():  # inference only: no gradients are tracked
    pred = []
    for pr in range(len(#### fill in the blank ####)):

        model.reset_hidden_state()

        # unsqueeze(..., 0) adds a leading dimension of size 1 before the forward pass
        predicted = model(torch.unsqueeze(#### fill in the blank ####[pr], 0))
        # flatten + item() turns the single-element output into a Python float
        predicted = torch.flatten(predicted).item()
        pred.append(predicted)

    # Inverse-transform predictions and targets with the output scaler
    pred_inverse = scaler_y.inverse_transform(np.array(pred).reshape(-1, 1))
    testY_inverse = scaler_y.inverse_transform(#### fill in the blank ####)


In [None]:
# Overlay the predicted and the true series on one axis.
fig, ax = plt.subplots(figsize=(10,3))
for series, name in ((pred_inverse, 'pred'), (testY_inverse, 'true')):
    ax.plot(np.arange(len(series)), series, label = name)
ax.set_title("Test plot")
ax.legend()
plt.show()