실제 데이터와 비슷한 형식의 dummy data를 생성해서 LSTM 모델로 실험하는 과정을 코딩해본다.   

기본적인 LSTM 기반 seq2seq 이기 때문에, 이 모델 이후 seq2seq + attention 그리고 이후에 트랜스포머 기반 architecture들을 살펴보면 좋을 것 같다. 

In [69]:
import pandas as pd 
import numpy as np 
import time 
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler, IterableDataset 
from sklearn.preprocessing import MinMaxScaler
import random 
import pickle
from transformers import *
from sklearn.metrics import mean_absolute_error

In [45]:
# dummy data 생성 

df = pd.DataFrame(np.random.randint(0, 100, size=(1000, 10)), columns=["id","crawling_date","sales_3days","prd_rank","prd_name", "price", "ctg_2_name", "brand", "score", "delivery_fee"])

# 임의의 datetime으로 
datetimes = [] 
dtobj = datetime(2020,1,1)
for i in range(1000): 
    datetimes.append(dtobj) 
    dtobj = dtobj + timedelta(days=1) 

df["crawling_date"] = datetimes 

#예시를 위해서 id 통일 
df["id"] = 1 

df

Unnamed: 0,id,crawling_date,sales_3days,prd_rank,prd_name,price,ctg_2_name,brand,score,delivery_fee
0,1,2020-01-01,77,53,42,74,99,87,65,5
1,1,2020-01-02,48,52,94,92,99,56,49,31
2,1,2020-01-03,25,78,67,63,42,48,27,44
3,1,2020-01-04,87,0,21,14,18,37,61,97
4,1,2020-01-05,73,76,38,26,51,93,73,55
...,...,...,...,...,...,...,...,...,...,...
995,1,2022-09-22,81,36,86,42,57,85,51,16
996,1,2022-09-23,68,28,44,28,7,41,6,30
997,1,2022-09-24,0,99,39,64,22,90,66,13
998,1,2022-09-25,47,83,80,39,82,27,77,19


In [46]:
# clip sales values 
df[["sales_3days"]] = df[["sales_3days"]].clip(0,)
df[["sales_3days"]] = df[["sales_3days"]].apply(lambda x: np.log1p(x))

In [47]:
df

Unnamed: 0,id,crawling_date,sales_3days,prd_rank,prd_name,price,ctg_2_name,brand,score,delivery_fee
0,1,2020-01-01,4.356709,53,42,74,99,87,65,5
1,1,2020-01-02,3.891820,52,94,92,99,56,49,31
2,1,2020-01-03,3.258097,78,67,63,42,48,27,44
3,1,2020-01-04,4.477337,0,21,14,18,37,61,97
4,1,2020-01-05,4.304065,76,38,26,51,93,73,55
...,...,...,...,...,...,...,...,...,...,...
995,1,2022-09-22,4.406719,36,86,42,57,85,51,16
996,1,2022-09-23,4.234107,28,44,28,7,41,6,30
997,1,2022-09-24,0.000000,99,39,64,22,90,66,13
998,1,2022-09-25,3.871201,83,80,39,82,27,77,19


In [48]:
# 시계열 예측을 위해서 shift   
lookback_window, lookahead_window = 28, 28 

for i in range(1, lookback_window+1):
    df[f"shift_{i}"] = df.groupby("id")["sales_3days"].shift(i)

In [49]:
train_df = df[df['crawling_date'] < datetime(2021,1,1)]
val_df = df[(df['crawling_date'] >= datetime(2021,1,1)) & (df['crawling_date'] < datetime(2022,1,1))]
test_df = df[df['crawling_date'] >= datetime(2022,1,1)]

train_df.shape, val_df.shape, test_df.shape   

((366, 38), (365, 38), (269, 38))

In [35]:
# 나중에 실제 데이터를 이용할때 이 부분은 원래 하시던대로 실행하시면 됩니다. Validation data도 추가했으니 비슷하게 하시면됩니다. 
'''
test_df = test_df.query("id in @id_list")
le = LabelEncoder()
le.fit(train_df["id"])
train_df["id"] = le.transform(train_df["id"])
test_df["id"] = le.transform(test_df["id"])
''' 

'\ntest_df = test_df.query("id in @id_list")\nle = LabelEncoder()\nle.fit(train_df["id"])\ntrain_df["id"] = le.transform(train_df["id"])\ntest_df["id"] = le.transform(test_df["id"])\n'

In [53]:
train_df = train_df.iloc[:,10:]
val_df = val_df.iloc[:, 10:] 
test_df = test_df.iloc[:,10:]

train_df.shape, val_df.shape, test_df.shape

((366, 28), (365, 28), (269, 28))

In [54]:
train_df.dropna(axis=0, inplace=True) 
val_df.dropna(axis=0, inplace=True) 
test_df.dropna(axis=0, inplace=True)  

train_df.shape, val_df.shape, test_df.shape

((338, 28), (365, 28), (269, 28))

In [55]:
# normalize data 
norm = np.max(train_df) 
train_df = train_df / norm  
val_df = val_df / norm 
test_df = test_df / norm 

train_df = train_df.values  
val_df = val_df.values 
test_df = test_df.values

In [65]:
norm

shift_1     4.60517
shift_2     4.60517
shift_3     4.60517
shift_4     4.60517
shift_5     4.60517
shift_6     4.60517
shift_7     4.60517
shift_8     4.60517
shift_9     4.60517
shift_10    4.60517
shift_11    4.60517
shift_12    4.60517
shift_13    4.60517
shift_14    4.60517
shift_15    4.60517
shift_16    4.60517
shift_17    4.60517
shift_18    4.60517
shift_19    4.60517
shift_20    4.60517
shift_21    4.60517
shift_22    4.60517
shift_23    4.60517
shift_24    4.60517
shift_25    4.60517
shift_26    4.60517
shift_27    4.60517
shift_28    4.60517
dtype: float64

In [56]:
X_train, Y_train = [], [] 
X_val, Y_val = [], [] 
X_test, Y_test = [], []  

for i in tqdm(range(28, train_df.shape[0]-lookback_window), position=0, leave=True): 
    X_train.append(train_df[i].reshape((lookback_window, 1))) 
    Y_train.append(train_df[i+28].reshape((lookahead_window, 1))) 
    
X_train = np.array(X_train, dtype=np.float32)
Y_train = np.array(Y_train, dtype=np.float32)

# convert to torch tensor 
X_train = torch.from_numpy(X_train) 
Y_train = torch.from_numpy(Y_train) 

X_train.shape, Y_train.shape

  0%|          | 0/282 [00:00<?, ?it/s]

(torch.Size([282, 28, 1]), torch.Size([282, 28, 1]))

In [57]:
for i in tqdm(range(28, val_df.shape[0]-lookback_window), position=0, leave=True): 
    X_val.append(val_df[i].reshape((lookback_window, 1))) 
    Y_val.append(val_df[i+28].reshape((lookahead_window, 1)))  
    
X_val = np.array(X_val, dtype=np.float32)
Y_val = np.array(Y_val, dtype=np.float32)

# convert to torch tensor 
X_val = torch.from_numpy(X_val) 
Y_val = torch.from_numpy(Y_val) 

X_val.shape, Y_val.shape

  0%|          | 0/309 [00:00<?, ?it/s]

(torch.Size([309, 28, 1]), torch.Size([309, 28, 1]))

In [58]:
for i in tqdm(range(28, test_df.shape[0]-lookback_window), position=0, leave=True): 
    X_test.append(test_df[i][::-1].reshape((lookback_window, 1))) 
    Y_test.append(test_df[i+28][::-1].reshape((lookahead_window, 1))) 

X_test = np.array(X_test, dtype=np.float32)
Y_test = np.array(Y_test, dtype=np.float32)

# convert to torch tensor 
X_test = torch.from_numpy(X_test) 
Y_test = torch.from_numpy(Y_test) 

X_test.shape, Y_test.shape

  0%|          | 0/213 [00:00<?, ?it/s]

(torch.Size([213, 28, 1]), torch.Size([213, 28, 1]))

In [59]:
batch_size = 32 

train_data = TensorDataset(X_train, Y_train) 
train_sampler = RandomSampler(train_data) 
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) 

val_data = TensorDataset(X_val, Y_val) 
val_sampler = SequentialSampler(val_data) 
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size) 

test_data = TensorDataset(X_test, Y_test) 
test_sampler = SequentialSampler(test_data) 
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size) 

In [60]:
class lstm_encoder(nn.Module): 
    def __init__(self, input_size, hidden_size, num_layers=1): 
        super(lstm_encoder, self).__init__() 
        self.input_size = input_size 
        self.hidden_size = hidden_size 
        self.num_layers = num_layers 
        self.lstm = nn.LSTM(input_size=self.input_size, 
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            batch_first=True) 
    
    def forward(self, x): 
        lstm_out, lstm_hidden = self.lstm(x) 
        return lstm_out, lstm_hidden 

In [61]:
class lstm_decoder(nn.Module): 
    def __init__(self, input_size, hidden_size, num_layers=1): 
        super(lstm_decoder, self).__init__() 
        self.input_size = input_size 
        self.hidden_size = hidden_size 
        self.num_layers = num_layers 
        self.lstm = nn.LSTM(input_size=self.input_size, 
                            hidden_size=self.hidden_size, 
                            num_layers=self.num_layers,
                            batch_first=True) 
        self.linear = nn.Linear(hidden_size, input_size) 
    
    def forward(self, x, encoder_hidden_states): 
        lstm_out, self.hidden = self.lstm(x.unsqueeze(-1), encoder_hidden_states) 
        output = self.linear(lstm_out) 
        return output, self.hidden 

In [62]:
class lstm_encoder_decoder(nn.Module): 
    def __init__(self, input_size, hidden_size): 
        super(lstm_encoder_decoder, self).__init__() 
        self.input_size = input_size 
        self.hidden_size = hidden_size 
        self.encoder = lstm_encoder(input_size=self.input_size, hidden_size=self.hidden_size) 
        self.decoder = lstm_decoder(input_size=self.input_size, hidden_size=self.hidden_size) 
        
    def forward(self, inputs, targets, target_len, teacher_forcing_ratio):  
        batch_size = inputs.shape[0] 
        input_size = inputs.shape[2] 
        outputs = torch.zeros(batch_size, target_len, input_size) 
        _, hidden = self.encoder(inputs) 
        decoder_input = inputs[:, -1, :] 
        for t in range(target_len): 
            out, hidden = self.decoder(decoder_input, hidden) 
            out = out.squeeze(1) 
            # apply teacher forcing 
            if random.random() < teacher_forcing_ratio: 
                decoder_input = targets[:, t, :] 
            else: 
                decoder_input = out 
            outputs[:, t, :] = out 
        return outputs 
    
    def predict(self, inputs, target_len): 
        self.eval() 
        batch_size = inputs.shape[0] 
        input_size = inputs.shape[2] 
        outputs = torch.zeros(batch_size, target_len, input_size) 
        _, hidden = self.encoder(inputs) 
        decoder_input = inputs[:, -1, :] 
        for t in range(target_len): 
            out, hidden = self.decoder(decoder_input, hidden) 
            out = out.squeeze(1) 
            decoder_input = out 
            outputs[:, t, :] = out 
        return outputs.detach() 

In [63]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

model = lstm_encoder_decoder(input_size=1, hidden_size=32).to(device) 

In [78]:
train_losses, val_losses = [], [] 

learning_rate = 0.01 
epochs = 20 
optimizer = optim.Adam(model.parameters(), lr=learning_rate) 
loss_func = nn.L1Loss() # MAE Loss 
total_steps = len(train_dataloader) * epochs 
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=int(0.1*total_steps), 
                                            num_training_steps=total_steps)  

best_val_loss = 9999999999


model.train() 
for epoch_i in tqdm(range(epochs), desc="Epochs", position=0, leave=True, total=epochs): 
    train_loss = 0 
    model.train() 
    with tqdm(train_dataloader, unit="batch") as tepoch: 
        for step, batch in enumerate(tepoch): 
            batch = tuple(t.to(device).float() for t in batch) 
            x, y = batch 
            outputs = model(x, y, lookahead_window, 0.6).to(device) 
            loss = loss_func(outputs, y)  
            train_loss += loss.item() 
            loss.backward() 
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) 
            optimizer.step() 
            scheduler.step() 
            model.zero_grad() 
            tepoch.set_postfix(loss=train_loss / (step+1)) 
            time.sleep(0.1) 
        avg_train_loss = train_loss / len(train_dataloader) 
        train_losses.append(avg_train_loss) 
    val_loss = 0 
    for x, y in tqdm(val_dataloader): 
        x = x.to(device).float() 
        predict = model.predict(x, lookahead_window) 
        predict = predict * norm[0]  
        y = y.squeeze(2).detach().cpu().numpy()
        predict = predict.squeeze(2).detach().cpu().numpy() 
        val_loss += mean_absolute_error(y, predict) 
    avg_val_loss = val_loss / len(val_dataloader)  
    val_losses.append(avg_val_loss) 
    if best_val_loss > avg_val_loss: 
        best_val_loss = avg_val_loss 
        torch.save(model.state_dict(), "seq2seq.pt") 


Epochs:   0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?batch/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [79]:
from sklearn.metrics import mean_absolute_error

best_model = lstm_encoder_decoder(input_size=1, hidden_size=32).to(device) 
checkpoint = torch.load("seq2seq.pt")
best_model.load_state_dict(checkpoint) 

test_loss = 0

for x, y in tqdm(test_dataloader): 
    x = x.to(device).float() 
    predict = model.predict(x, lookahead_window) 
    predict = predict * norm[0]  
    y = y.squeeze(2).detach().cpu().numpy()
    predict = predict.squeeze(2).detach().cpu().numpy() 
    test_loss += mean_absolute_error(y, predict) 

    
avg_test_loss = test_loss / len(test_dataloader) 

print(f"Test MAE = {avg_test_loss}") 

  0%|          | 0/7 [00:00<?, ?it/s]

Test MAE = 3.189077172960554
