simple moving average를 이용해서 예측 

$x_{t-L+1}, \dots, x_{t-2}, x_{t-1}, x_{t}$가 주어지면 다음 시점의 예측값은 $\hat{x}_{t+1} = \frac{1}{L}\sum_{i=t-L+1}^{t} x_{i}$ 이다.

학습이 필요 없다.

In [66]:
import pandas as pd 
import numpy as np 
import time 
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler, IterableDataset 
from sklearn.preprocessing import MinMaxScaler
import random 
import pickle
from transformers import *
from sklearn.metrics import mean_absolute_error 

import warnings
warnings.filterwarnings('ignore')

In [67]:
# Build a dummy dataset: 1000 rows x 10 columns of random integers in [0, 100).
column_names = [
    "id", "crawling_date", "sales_3days", "prd_rank", "prd_name",
    "price", "ctg_2_name", "brand", "score", "delivery_fee",
]
random_values = np.random.randint(0, 100, size=(1000, 10))
df = pd.DataFrame(random_values, columns=column_names)

# Replace the date column with consecutive daily timestamps starting 2020-01-01.
first_day = datetime(2020, 1, 1)
datetimes = [first_day + timedelta(days=offset) for offset in range(1000)]
df["crawling_date"] = datetimes

# Use one id for every row to keep the example simple.
df["id"] = 1

df

Unnamed: 0,id,crawling_date,sales_3days,prd_rank,prd_name,price,ctg_2_name,brand,score,delivery_fee
0,1,2020-01-01,99,90,15,29,24,31,95,45
1,1,2020-01-02,23,42,94,47,20,8,91,67
2,1,2020-01-03,87,22,4,20,92,76,55,69
3,1,2020-01-04,8,69,15,37,38,85,19,75
4,1,2020-01-05,57,57,61,12,64,97,96,47
...,...,...,...,...,...,...,...,...,...,...
995,1,2022-09-22,15,49,1,23,41,12,67,91
996,1,2022-09-23,2,0,10,46,14,24,76,63
997,1,2022-09-24,42,2,73,61,15,94,56,95
998,1,2022-09-25,53,67,68,11,80,0,45,54


In [68]:
# Floor sales at zero, then compress the scale with log(1 + x).
non_negative_sales = df["sales_3days"].clip(lower=0)
df["sales_3days"] = np.log1p(non_negative_sales)

In [69]:
# Add lagged copies of the target (shifted within each id) for time-series forecasting.
lookback_window = 28
lookahead_window = 28

sales_by_id = df.groupby("id")["sales_3days"]
for lag in range(1, lookback_window + 1):
    # shift_<lag> holds the sales value observed `lag` rows earlier for the same id.
    df[f"shift_{lag}"] = sales_by_id.shift(lag)

In [70]:
# Chronological split: rows before 2021 -> train, 2021 -> validation, 2022 onward -> test.
val_start = datetime(2021, 1, 1)
test_start = datetime(2022, 1, 1)
dates = df['crawling_date']

train_df = df[dates < val_start]
val_df = df[(dates >= val_start) & (dates < test_start)]
test_df = df[dates >= test_start]

train_df.shape, val_df.shape, test_df.shape

((366, 38), (365, 38), (269, 38))

In [71]:
# When switching to real data later, run this part the way you originally did.
# Validation data has been added as well, so handle it in the same way.
# NOTE: the snippet is kept inside a string literal, so it is not executed.
'''
test_df = test_df.query("id in @id_list")
le = LabelEncoder()
le.fit(train_df["id"])
train_df["id"] = le.transform(train_df["id"])
test_df["id"] = le.transform(test_df["id"])
''' 

'\ntest_df = test_df.query("id in @id_list")\nle = LabelEncoder()\nle.fit(train_df["id"])\ntrain_df["id"] = le.transform(train_df["id"])\ntest_df["id"] = le.transform(test_df["id"])\n'

In [72]:
# Keep only the lag features (shift_1 .. shift_<lookback_window>).
# They are selected by name rather than by the positional offset 10, so the
# selection stays correct when the real data has a different number of raw columns.
lag_columns = [f"shift_{lag}" for lag in range(1, lookback_window + 1)]

train_df = train_df[lag_columns]
val_df = val_df[lag_columns]
test_df = test_df[lag_columns]

train_df.shape, val_df.shape, test_df.shape

((366, 28), (365, 28), (269, 28))

In [73]:
# Drop rows whose lookback history is incomplete (NaN lag values).
# The result is rebound instead of using inplace=True: these frames were derived
# by slicing another DataFrame, and mutating such a slice in place is the pattern
# pandas flags with SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_df = train_df.dropna(axis=0)
val_df = val_df.dropna(axis=0)
test_df = test_df.dropna(axis=0)

train_df.shape, val_df.shape, test_df.shape

((338, 28), (365, 28), (269, 28))

In [74]:
# Convert each split from a DataFrame to a plain NumPy array.
train_df, val_df, test_df = (
    frame.to_numpy() for frame in (train_df, val_df, test_df)
)

In [75]:
# Build (input, target) windows from the lag matrix.
#
# Each row r of test_df holds [x_{r-1}, ..., x_{r-lookback_window}] (shift_1 first),
# so reversing a row yields its values in chronological order.
# Assumes rows are consecutive time steps of a single series -- TODO confirm for real data.
if lookahead_window > lookback_window:
    # Targets are read from a lag row, which only carries lookback_window values.
    raise ValueError("lookahead_window must not exceed lookback_window")

X_test, Y_test = [], []
# Start at lookback_window (presumably so the whole input window lies inside the
# test split) and stop early enough that row i + lookahead_window exists.
first_origin = lookback_window
last_origin = test_df.shape[0] - lookahead_window
for i in tqdm(range(first_origin, last_origin), position=0, leave=True):
    # Input: the lookback_window values preceding position i, oldest first.
    X_test.append(test_df[i][::-1].reshape((lookback_window, 1)))
    # Target: x_i .. x_{i+lookahead_window-1}, i.e. the most recent
    # lookahead_window lags of row i + lookahead_window, oldest first.
    future_row = test_df[i + lookahead_window]
    Y_test.append(future_row[:lookahead_window][::-1].reshape((lookahead_window, 1)))

X_test = np.array(X_test, dtype=np.float32)
Y_test = np.array(Y_test, dtype=np.float32)

X_test.shape, Y_test.shape

  0%|          | 0/213 [00:00<?, ?it/s]

((213, 28, 1), (213, 28, 1))

In [76]:
def _sma_forecast(history, horizon):
    """Forecast ``horizon`` steps ahead with a recursive simple moving average.

    Each prediction is the mean of the most recent ``len(history)`` values,
    where earlier predictions are fed back in as if they were observations.

    Args:
        history: 1-D sequence of past values, oldest first.
        horizon: number of future steps to predict.

    Returns:
        A list of ``horizon`` Python floats.
    """
    window = len(history)
    preds = []
    for step in range(horizon):
        # Observed values still inside the window, followed by the predictions
        # that have replaced the ones that dropped out of it.
        recent = list(history[step:])
        recent.extend(preds[max(0, step - window):])
        preds.append(np.mean(recent).item())
    return preds


# Evaluate the moving-average baseline on every test window.
# The window length and horizon come from the array shapes rather than a
# hard-coded 28, so they follow lookback_window / lookahead_window.
# NOTE: sales_3days was log1p-transformed earlier in this file, so the MAE below
# is measured in log space, not in raw sales units.
test_loss = 0
for i in tqdm(range(X_test.shape[0])):
    preds = _sma_forecast(X_test[i].flatten(), Y_test.shape[1])
    y = Y_test[i].flatten()
    test_loss += mean_absolute_error(y, preds)

# Average the per-window MAE over all test windows.
test_loss /= X_test.shape[0]

print(f"Test MAE = {test_loss}")

  0%|          | 0/213 [00:00<?, ?it/s]

Test MAE = 0.7701713033680991
