In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings

import argparse
import logging
import pandas as pd
import random
import os
import numpy as np
from loggers import logger

from process import input_data, preprocess

def set_logger(log_name):
    log_obj = logger.AutoMLLog(log_name)
    log_obj.set_handler('automl_process')
    log_obj.set_formats()
    auto_logger = log_obj.addOn()
    
    auto_logger.info('logger 세팅')

In [2]:
import os
import warnings

warnings.filterwarnings("ignore")  # avoid printing out absolute paths

import copy
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

import tensorflow as tf
import tensorboard as tb
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

In [5]:
import statsmodels.api as sm
from datetime import timedelta
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima.arima import auto_arima
from statsmodels.tsa.stattools import adfuller

In [4]:
#!pip install pmdarima

In [3]:
path ='storage/stallion.csv'
target_var='volume'
date_var= 'date'
store_list=['sku','agency']
unit='month'
predict_n = 7

In [6]:
path ='storage/Walmart.csv'
target_var='Weekly_Sales'
date_var= 'Date'
store_list=['Store']
unit='week'
predict_n = 7

In [3]:
path ='storage/demand_forecast_dataset.csv'
target_var='sale_qty'
date_var= 'sale_dy'
store_list=['str_cd', 'prod_cd']
unit='day'
predict_n = 7

In [7]:
log_name = 'practice_tft'
set_logger(log_name)
data, var_list, num_var, obj_var = input_data.Data_load(path, log_name).read_data()
df = preprocess.Preprocessing(log_name, data, var_list, num_var, obj_var, target_var=target_var, date_var=date_var, store_list=store_list, unit=unit).df

logger 세팅
csv 데이터 불러오기
storage/Walmart.csv
변수 분리 시작
데이터 구성: 6435 행, 8열
Memory usage of dataframe is 0.39MB
Memory usage after optimization is: 0.10MB
Decreased by 73.7%
결측치 처리
결측치 처리 이후 데이터 구성: 6435 행, 8열
전처리를 위한 target, date, store 분리
정규화 진행
라벨 인코딩 진행
전처리 완료


   Temperature  Fuel_Price       CPI  Unemployment  Holiday_Flag  \
0    -0.995605   -1.712891  1.004883      0.058716     -0.274170   
1    -1.202148   -1.763672  1.007812      0.058716      3.646484   
2    -1.124023   -1.840820  1.007812      0.058716     -0.274170   
3    -0.761230   -1.738281  1.010742      0.058716     -0.274170   
4    -0.768066   -1.597656  1.010742      0.058716     -0.274170   

   Weekly_Sales        Date  Store  
0   1643690.875  05-02-2010      1  
1   1641957.500  12-02-2010      1  
2   1611968.125  19-02-2010      1  
3   1409727.625  26-02-2010      1  
4   1554806.625  05-03-2010      1  
시계열용 전처리 진행
  data[date_var] = pd.to_datetime(data[date_var],infer_datetime_format = True, utc = True).asty

In [12]:
if len(store_list) == 1:
    store_list = ['dummy'] + store_list
    df.loc[:,'dummy'] = 'dummy'
    
train_df = pd.DataFrame()
val_df = pd.DataFrame()
pred_df = pd.DataFrame()

for store_var_0, store_var_1 in df.drop_duplicates(store_list)[store_list].values:
    ari_df = df.loc[(df[store_list[0]]==store_var_0)&(df[store_list[1]]==store_var_1), :]               
    ari_df.loc[:, 'ds'] = ari_df[date_var]
    ari_df.loc[:, 'y'] = ari_df[target_var]        

    predict_size = predict_n
    ari_train = ari_df.iloc[:-predict_size, :]
    ari_var = ari_df.iloc[-predict_size:, :]
    
    #auto arima best parameter pick
    ari = auto_arima(y = ari_train['y'].values, d = 1, start_p = 0, max_p = 3, start_q = 0 
                      , max_q = 3, m = 1, seasonal = False , stepwise = True, trace=True)
    
    #arima fit 후 ari_var에 예측값 대입
    ari.fit(ari_train['y'].values)
    val_preds = ari.predict(n_pediods = predict_n)
    ari_var.loc[:, 'yhat'] = val_preds
    
    #train
    train= ari_train[['y','ds']]
    train.loc[:, store_list[0]] = store_var_0
    train.loc[:, store_list[1]] = store_var_1
    
    train_df = pd.concat([train_df, train], axis=0)

    #valid
    val_preds_df.loc[:, store_list[0]] = store_var_0
    val_preds_df.loc[:, store_list[1]] = store_var_1
    
    val_df = pd.concat([val_df, val_preds_df], axis=0)
    
    #predict_date 생성 후 예측 predict_df생성
    if unit =='day':
        predict_date = [last_date + timedelta(days=i) for i in range(1, predict_n+1)] #weeks, days 변경 가능
    elif unit == 'week':
        predict_date = [last_date + timedelta(days=7*i) for i in range(1, predict_n+1)] #weeks, days 변경 가능
    elif unit == 'month':
        predict_date = [last_date + timedelta(days=30*i) for i in range(1, predict_n+1)] #weeks, days 변경 가능
        
    test_df = pd.DataFrame({'ds':predict_date})
    ari.fit(ari_df['y'].values)
    preds = ari.predict(n_periods = predict_n)
    test_df.loc[:, 'yhat'] = preds
    test_df.loc[:, store_list[0]] = store_var_0
    test_df.loc[:, store_list[1]] = store_var_1
    
    pred_df = pd.concat([pred_df, test_df], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_df.loc[:, 'ds'] = fb_df[date_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_df.loc[:, 'y'] = fb_df[target_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_df.loc[:, 'ds'] = fb_df[date_var]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer

In [18]:
ari.fit(fb_train['y'].values)

In [19]:
val_preds = ari.predict(n_periods = predict_n)

In [20]:
val_preds

array([748116.58101569, 738202.8179056 , 746676.20645194, 737520.00270401,
       742787.01226244, 735829.68919233, 739147.95732119])

In [25]:
fb_var.loc[:,'yhat'] = val_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_var.loc[:,'yhat'] = val_preds


In [26]:
fb_var

Unnamed: 0,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Weekly_Sales,Date,Store,dummy,time_idx,week,ds,y,yhat
6428,0.228271,1.389648,0.519043,0.366699,-0.27417,713173.9375,2012-09-28,45,dummy,136,39,2012-09-28,748116.581016,748116.581016
6429,-1.283203,0.614258,0.46167,0.225342,3.646484,803657.125,2012-10-02,45,dummy,137,40,2012-10-02,738202.817906,738202.817906
6430,0.973633,0.792969,0.496826,0.366699,-0.27417,733037.3125,2012-10-08,45,dummy,138,41,2012-10-08,746676.206452,746676.206452
6431,-0.227417,1.330078,0.52832,0.354248,-0.27417,718125.5,2012-10-19,45,dummy,139,42,2012-10-19,737520.002704,737520.002704
6432,-0.098694,1.142578,0.525391,0.354248,-0.27417,760281.4375,2012-10-26,45,dummy,140,43,2012-10-26,742787.012262,742787.012262
6433,0.031769,1.155273,0.493408,0.304199,-0.27417,770487.375,2012-11-05,45,dummy,141,45,2012-11-05,735829.689192,735829.689192
6434,-0.335938,1.397461,0.52832,0.354248,-0.27417,734464.375,2012-12-10,45,dummy,142,50,2012-12-10,739147.957321,739147.957321
