# Amazon Forecast

Black Friday(매년 11월 말) 시점의 sales를 예측해 보자.

 - dept_id FOODS_3 (8230개) 중 200개 Sampling (Best200)
 - Target Time Series
     - From/To : 2013-11-16/2015-11-15 (365*2일)
     - timestamp (timestamp)
     - id (string)
     - sales (int)
 - Related Time Series
     - From/To : 2013-11-16/2015-12-15 (365*2일 + 30일)
     - timestamp (timestamp)
     - id (string)
     - sell_price (float)
     - snap_CA, snap_TX, snap_WI (int)
     - Easter, LaborDay, Purim_End, StPatricksDay, SuperBowl (int)
     - Black Friday (int)
 - Item meta data : id, item_id, dept_id, cat_id, store_id, state_id
 - BackTestWindows : 4
 - BackTestWindowOffset : Default (Same as ForecastHorizon)

<img src="../img/forecast-steps.png" align="left">

# Data Preparation

In [1]:
# Import required library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import boto3
from datetime import datetime, timedelta
from itertools import cycle
import json
import time
from time import sleep
import warnings

# Endless iterator over matplotlib's default colour cycle
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])
%matplotlib inline
# Default figure look: 12x5 figures, thicker red lines, grid on
plt.rcParams["figure.figsize"] = (12,5)
plt.rcParams['lines.linewidth'] = 2
plt.rcParams['lines.color'] = 'r'
plt.rcParams['axes.grid'] = True

# Maximum number of rows pandas displays
pd.set_option ('display.max_rows', 500)
# Maximum number of columns pandas displays
pd.set_option ('display.max_columns', 500)
# Display width in characters
pd.set_option ('display.width', 1000)

# Silence every warning for the rest of the notebook
# (NOTE: this also hides pandas warnings such as SettingWithCopyWarning)
warnings.filterwarnings(action='ignore')

In [2]:
%store -r

In [3]:
len(df_merged) # 15,743,990

15743990

In [4]:
len(df_sales_foods_3) #8,230

8230

In [5]:
df_sales_foods_3.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,d_35,d_36,d_37,d_38,d_39,d_40,d_41,d_42,d_43,d_44,d_45,d_46,d_47,d_48,d_49,d_50,d_51,d_52,d_53,d_54,d_55,d_56,d_57,d_58,d_59,d_60,d_61,d_62,d_63,d_64,d_65,d_66,d_67,d_68,d_69,d_70,d_71,d_72,d_73,d_74,d_75,d_76,d_77,d_78,d_79,d_80,d_81,d_82,d_83,d_84,d_85,d_86,d_87,d_88,d_89,d_90,d_91,d_92,d_93,d_94,d_95,d_96,d_97,d_98,d_99,d_100,d_101,d_102,d_103,d_104,d_105,d_106,d_107,d_108,d_109,d_110,d_111,d_112,d_113,d_114,d_115,d_116,d_117,d_118,d_119,d_120,d_121,d_122,d_123,d_124,d_125,d_126,d_127,d_128,d_129,d_130,d_131,d_132,d_133,d_134,d_135,d_136,d_137,d_138,d_139,d_140,d_141,d_142,d_143,d_144,d_145,d_146,d_147,d_148,d_149,d_150,d_151,d_152,d_153,d_154,d_155,d_156,d_157,d_158,d_159,d_160,d_161,d_162,d_163,d_164,d_165,d_166,d_167,d_168,d_169,d_170,d_171,d_172,d_173,d_174,d_175,d_176,d_177,d_178,d_179,d_180,d_181,d_182,d_183,d_184,d_185,d_186,d_187,d_188,d_189,d_190,d_191,d_192,d_193,d_194,d_195,d_196,d_197,d_198,d_199,d_200,d_201,d_202,d_203,d_204,d_205,d_206,d_207,d_208,d_209,d_210,d_211,d_212,d_213,d_214,d_215,d_216,d_217,d_218,d_219,d_220,d_221,d_222,d_223,d_224,d_225,d_226,d_227,d_228,d_229,d_230,d_231,d_232,d_233,d_234,d_235,d_236,d_237,d_238,d_239,d_240,d_241,d_242,d_243,d_244,...,d_1664,d_1665,d_1666,d_1667,d_1668,d_1669,d_1670,d_1671,d_1672,d_1673,d_1674,d_1675,d_1676,d_1677,d_1678,d_1679,d_1680,d_1681,d_1682,d_1683,d_1684,d_1685,d_1686,d_1687,d_1688,d_1689,d_1690,d_1691,d_1692,d_1693,d_1694,d_1695,d_1696,d_1697,d_1698,d_1699,d_1700,d_1701,d_1702,d_1703,d_1704,d_1705,d_1706,d_1707,d_1708,d_1709,d_1710,d_1711,d_1712,d_1713,d_1714,d_1715,d_1716,d_1717,d_1718,d_1719,d_1720,d_1721,d_1722,d_1723,d_1724,d_1725,d_1726,d_1727,d_1728,d_1729,d_1730,d_1731,d_1732,d_1733,d_1734,d_1735,d_1736,d_1737,d_1738,d_1739,d_1740,d_1741,d_1742,d_1743,d_1744,d_1745,d_1746,d_17
47,d_1748,d_1749,d_1750,d_1751,d_1752,d_1753,d_1754,d_1755,d_1756,d_1757,d_1758,d_1759,d_1760,d_1761,d_1762,d_1763,d_1764,d_1765,d_1766,d_1767,d_1768,d_1769,d_1770,d_1771,d_1772,d_1773,d_1774,d_1775,d_1776,d_1777,d_1778,d_1779,d_1780,d_1781,d_1782,d_1783,d_1784,d_1785,d_1786,d_1787,d_1788,d_1789,d_1790,d_1791,d_1792,d_1793,d_1794,d_1795,d_1796,d_1797,d_1798,d_1799,d_1800,d_1801,d_1802,d_1803,d_1804,d_1805,d_1806,d_1807,d_1808,d_1809,d_1810,d_1811,d_1812,d_1813,d_1814,d_1815,d_1816,d_1817,d_1818,d_1819,d_1820,d_1821,d_1822,d_1823,d_1824,d_1825,d_1826,d_1827,d_1828,d_1829,d_1830,d_1831,d_1832,d_1833,d_1834,d_1835,d_1836,d_1837,d_1838,d_1839,d_1840,d_1841,d_1842,d_1843,d_1844,d_1845,d_1846,d_1847,d_1848,d_1849,d_1850,d_1851,d_1852,d_1853,d_1854,d_1855,d_1856,d_1857,d_1858,d_1859,d_1860,d_1861,d_1862,d_1863,d_1864,d_1865,d_1866,d_1867,d_1868,d_1869,d_1870,d_1871,d_1872,d_1873,d_1874,d_1875,d_1876,d_1877,d_1878,d_1879,d_1880,d_1881,d_1882,d_1883,d_1884,d_1885,d_1886,d_1887,d_1888,d_1889,d_1890,d_1891,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
2226,FOODS_3_001_CA_1_validation,FOODS_3_001,FOODS_3,FOODS,CA_1,CA,1,1,1,1,1,0,1,2,1,1,1,0,0,0,1,3,0,0,1,1,0,1,2,0,2,1,1,1,2,0,1,0,0,0,0,0,0,5,1,0,3,1,0,1,0,0,0,0,2,1,1,1,0,2,0,0,1,0,1,0,0,0,2,1,1,0,1,4,0,0,1,0,1,1,1,0,0,1,2,1,3,1,1,1,1,3,2,2,0,0,0,1,5,5,0,0,0,0,0,1,0,0,1,1,0,1,1,0,2,0,0,1,1,1,2,1,0,1,0,0,0,0,0,0,1,0,0,1,1,0,1,2,1,3,1,0,0,1,0,1,1,0,3,0,2,3,1,0,1,0,0,1,0,1,0,0,2,1,0,3,1,4,2,0,2,0,0,0,0,1,1,0,1,0,1,0,1,0,2,0,0,1,1,3,1,0,0,1,2,0,0,0,0,0,5,1,0,1,0,0,0,0,1,0,0,1,1,2,0,1,1,0,2,0,0,0,0,1,0,2,1,3,0,0,0,1,0,1,0,0,0,0,1,2,2,0,1,0,0,0,0,2,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,3,0,2,1,2,2,1,4,2,0,1,2,0,0,0,1,0,0,1,1,1,2,2,0,9,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,2,4,1,0,0,1,0,5,2,2,0,1,0,0,0,0,0,0,0,3,2,0,0,0,1,0,2,0,1,0,2,2,0,0,0,0,0,0,2,0,1,2,3,0,0,1,0,0,2,0,0,1,1,0,0,0,0,0,0,3,0,0,1,1,1,0,2,0,0,2,0,2,0,0,0,0,2,0,2,1,0,2,0,0,0,0,1,0,0,0,0,0,0,2,1,0,2,0,0,2,0,2,1,0,0,1,0,0,0,0,2,0,1,2,0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,1,0,0,0,0,1,1,0,0,0,2,1,0,0,1,1,1,3,1,0,0,0,1,0,0,1,0,1,0,0,2,0,3,1,0,0,0,0,0,1,2,0,0,1,0,0,1
2227,FOODS_3_002_CA_1_validation,FOODS_3_002,FOODS_3,FOODS,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,1,1,3,4,1,0,0,0,3,5,0,5,0,1,3,4,2,0,1,1,2,0,2,2,5,0,1,3,0,1,0,2,1,0,1,1,3,1,1,2,0,0,0,1,2,3,1,3,4,2,3,4,1,0,1,3,1,2,1,3,2,4,1,1,4,2,3,0,1,0,1,1,0,0,3,0,1,0,1,4,3,1,1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,4,2,3,5,4,4,6,1,8,12,5,6,8,3,0,5,3,4,4,8,2,0,4,2,6,6,1,3,1,1,5,2,1,1,2,0,3,1,7,3,2,5,1,3,6,3,0,1,2,2,10,4,5,5,8,8,6,6,4,6,5,4,2,5,3,4,2,2,2,2,4,4,1,0,1,4,6,5,5,3,6,1,6,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2228,FOODS_3_003_CA_1_validation,FOODS_3_003,FOODS_3,FOODS,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,3,2,3,3,2,0,2,1,2,1,1,2,2,1,1,3,3,1,0,2,0,1,2,1,0,1,1,0,1,1,2,2,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,0,2,2,3,0,0,0,0,0,0,0,0,0,1,2,3,0,0,3,0,0,1,3,0,1,0,1,0,0,1,1,1,0,0,0,1,0,1,1,1,0,0,0,0,1,0
2229,FOODS_3_004_CA_1_validation,FOODS_3_004,FOODS_3,FOODS,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,2,1,0,5,1,0,1,0,0,0,4,0,0,2,3,0,0,3,2,1,3,0,1,0,2,3,3,2,0,0,3,2,0,2,0,1,0,0,1,2,1,1,0,1,2,4,0,0,1,2,3,0,2,0,0,0,0,0,2,4,1,0,0,3,1,1,0,0,0,0,3,1,0,1,0,0,2,2,2,2,1,0,0,0,0,0,0,1,1,0,0,1,0,0,2,0,1,2,0,0,0,2,0,0,0,0,1,1,2,1,1,2,0,4,0,0,0,0,0,0,0,2,1,0,2,0,1,4,0,1,0,0,1,0,0,0,0,1,2,0,0,0,2,2,1,1,1,0,2,2,2,0,0,0,1,4,0,2,3,0,1,1,1,0,1,0,0,4,6,0,2,2,1,2,0,0,0,0,4,0,1,0,3,2,1,3,1,0,0,0,0,3,2,1,0,0,0,0,0,0,0,0,0,0,1,0,7,0,0,1,0,0,0,1,0,0,2,0,1,1,2,0,1,0,0,0,0,1,0,0,1,0,1,3,5,0,2,4,0,1,1,2,0,0,0,0,0,2,0,1
2230,FOODS_3_005_CA_1_validation,FOODS_3_005,FOODS_3,FOODS,CA_1,CA,1,0,1,2,2,0,1,1,3,1,1,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,2,4,0,0,1,0,1,4,2,0,0,1,0,3,2,2,0,0,2,0,0,1,1,0,2,1,0,0,1,2,1,3,2,2,1,2,1,0,2,2,2,0,0,1,4,1,1,1,3,3,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,3,0,0,0,0,0,0,4,2,0,1,1,2,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,2,0,0,0,1,0,1,0,0,1,0,1,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,3,1,1,1,2,2,0,0,0,0,0,1,3,0,0,2,0,0,0,1,1,1,0,0,0,1,1,0,2,0,0,0,1,2,0,1,0,1,0,1,1,1,0,0,0,0,1,0,1,0,1,1,1,0,1,0,1,0,0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,2,1,0,0,1,3,0,2,0,0,1,3,1,0,1,2,1,1,1,2,1,2,1,0,0,0,1,1,1,0,0,0,1,0,1,0,1,1,2,0,3,0,0,1,0,0,0,0,0,2,2,0,0,0,1,0,1,0,0,0,0,0,1,2,2,1,0,0,0,1,1,0,0,0,0,2,0,0,2,0,0,0,0,0,1,0


## Best200 선택

In [6]:
# Collect the daily sales columns, i.e. the columns whose name starts with "d_".
# startswith() is used instead of a substring test so that a column that merely
# contains "d_" somewhere in its name can never be summed by accident.
d_cols = [c for c in df_sales_foods_3.columns if c.startswith('d_')]

# Sum the daily sales of each row into a new "sales_total" column.
df_sales_foods_3["sales_total"] = df_sales_foods_3.loc[:, d_cols].sum(axis=1)

In [7]:
# Select the Best200 list: the 200 rows with the highest total sales
best200  = df_sales_foods_3.sort_values(by="sales_total", ascending=False).head(200)
sampled = best200
len(sampled)

200

In [8]:
sampled[["id", "sales_total"]].head()

Unnamed: 0,id,sales_total
8412,FOODS_3_090_CA_3_validation,250502
18055,FOODS_3_586_TX_2_validation,192835
21104,FOODS_3_586_TX_3_validation,150122
8908,FOODS_3_586_CA_3_validation,134386
2314,FOODS_3_090_CA_1_validation,127203


In [9]:
sampled[["id", "sales_total"]].tail()

Unnamed: 0,id,sales_total
15101,FOODS_3_681_TX_1_validation,24273
8635,FOODS_3_313_CA_3_validation,24261
29751,FOODS_3_086_WI_3_validation,24133
29819,FOODS_3_154_WI_3_validation,24019
8699,FOODS_3_377_CA_3_validation,23976


In [10]:
# Extract Best200: keep only the rows of df_merged whose id is in the sampled list
df_merged_sampled = df_merged[df_merged["id"].isin(sampled.id)]

In [11]:
len(df_merged_sampled["id"].unique()) # 200

200

In [12]:
df_merged_sampled.head()

Unnamed: 0_level_0,id,item_id,dept_id,cat_id,store_id,state_id,day,sales,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2011-01-29,FOODS_3_064_CA_1_validation,FOODS_3_064,FOODS_3,FOODS,CA_1,CA,d_1,0,11101,Saturday,1,1,2011,d_1,,,,,0,0,0,
2011-01-29,FOODS_3_080_CA_1_validation,FOODS_3_080,FOODS_3,FOODS,CA_1,CA,d_1,33,11101,Saturday,1,1,2011,d_1,,,,,0,0,0,1.48
2011-01-29,FOODS_3_090_CA_1_validation,FOODS_3_090,FOODS_3,FOODS,CA_1,CA,d_1,107,11101,Saturday,1,1,2011,d_1,,,,,0,0,0,1.25
2011-01-29,FOODS_3_099_CA_1_validation,FOODS_3_099,FOODS_3,FOODS,CA_1,CA,d_1,0,11101,Saturday,1,1,2011,d_1,,,,,0,0,0,
2011-01-29,FOODS_3_120_CA_1_validation,FOODS_3_120,FOODS_3,FOODS,CA_1,CA,d_1,0,11101,Saturday,1,1,2011,d_1,,,,,0,0,0,


## Create Data Sets

### Target (df_target)

In [13]:
# Target time series: id and sales for the two years 2013-11-16 ~ 2015-11-15.
# Rows are selected by slicing the date index, columns by name, in one step.
target_columns = ["id", "sales"]
df_target = df_merged_sampled.loc["2013-11-16":"2015-11-15", target_columns]
df_target.head()

Unnamed: 0_level_0,id,sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-11-16,FOODS_3_064_CA_1_validation,11
2013-11-16,FOODS_3_080_CA_1_validation,27
2013-11-16,FOODS_3_090_CA_1_validation,0
2013-11-16,FOODS_3_099_CA_1_validation,29
2013-11-16,FOODS_3_120_CA_1_validation,0


In [14]:
len(df_target) #146,000 = 200*365*2

146000

In [15]:
df_target = df_target.sort_values(by=["id", "date"])

### Related (df_related)
- Black Friday 전일, 당일, 다음 날을 df_related 데이터에 추가한다.

In [17]:
# df_merged_sampled was produced by filtering df_merged; take an explicit copy
# so that adding a column is an unambiguous write to this frame
# (avoids pandas' SettingWithCopyWarning).
df_merged_sampled = df_merged_sampled.copy()

# Flag column: 1 on the days around Black Friday, 0 everywhere else.
df_merged_sampled['black_friday'] = 0

In [18]:
# Flag the day before, the day of, and the day after each Black Friday.
# (first day, last day) of each window; both ends are inclusive.
black_friday_windows = [
    ("2013-11-28", "2013-11-30"),  # Black Friday 2013: 2013-11-29
    ("2014-11-27", "2014-11-29"),  # Black Friday 2014: 2014-11-28
    ("2015-11-26", "2015-11-28"),  # Black Friday 2015: 2015-11-27
]

for first_day, last_day in black_friday_windows:
    # A single .loc[rows, column] assignment writes into the frame itself.
    # The previous chained form (df['black_friday'].loc[...] = 1) assigns to an
    # intermediate Series and is not guaranteed to update the DataFrame.
    df_merged_sampled.loc[first_day:last_day, 'black_friday'] = 1

In [19]:
# 2013-11-16 ~ 2015-12-15

df_related = df_merged_sampled[["id", "event_name_1", "snap_CA", "snap_TX", "snap_WI", "sell_price", "black_friday"]]

# The related time series must cover the target time series plus the forecast horizon,
# and it must not contain any missing values.
df_related = df_related.loc["2013-11-16":"2015-12-15"]

print(len(df_related)) # 152,000 = 200*(365*2+30)
df_related.isnull().sum()

152000


id                   0
event_name_1    140000
snap_CA              0
snap_TX              0
snap_WI              0
sell_price           0
black_friday         0
dtype: int64

In [20]:
# Fill NaN in event_name_1 with the string "None"
df_related["event_name_1"] = df_related["event_name_1"].fillna("None")

# If an item only went on sale after 2015-07-01, df_sales holds sell_price data for it only from 2015-07-01 onward.
# The dates in df_merged come from merging df_calendar, so every item spans 2011-01-29~2016-06-19,
# whereas that item's dates in df_sales start at 2015-07-01.
# Merging df_merged with df_sales therefore leaves the item's sell_price as NaN before 2015-07-01.
# Hence fill NaN in sell_price with 0
df_related["sell_price"] = df_related["sell_price"].fillna(0)
df_related.isnull().sum()

id              0
event_name_1    0
snap_CA         0
snap_TX         0
snap_WI         0
sell_price      0
black_friday    0
dtype: int64

In [21]:
print(len(df_related)) # 152,000 = 200*(365*2+30)
df_related.isnull().sum()

152000


id              0
event_name_1    0
snap_CA         0
snap_TX         0
snap_WI         0
sell_price      0
black_friday    0
dtype: int64

In [22]:
# One-hot encoding for event_name_1: one 0/1 column per distinct event label.
# The dtype is pinned to uint8 because newer pandas versions default to bool,
# which would be written to the CSV as True/False while the Forecast schema
# declares these attributes as integer.
event_dummies = pd.get_dummies(df_related['event_name_1'], dtype=np.uint8)
df_related = pd.concat([df_related, event_dummies], axis=1)

In [23]:
print(len(df_related)) # 152,000 = 200*(365*2+30)
df_related.isnull().sum()

152000


id                     0
event_name_1           0
snap_CA                0
snap_TX                0
snap_WI                0
sell_price             0
black_friday           0
Chanukah End           0
Christmas              0
Cinco De Mayo          0
ColumbusDay            0
Easter                 0
Eid al-Fitr            0
EidAlAdha              0
Father's day           0
Halloween              0
IndependenceDay        0
LaborDay               0
LentStart              0
LentWeek2              0
MartinLutherKingDay    0
MemorialDay            0
Mother's day           0
NBAFinalsEnd           0
NBAFinalsStart         0
NewYear                0
None                   0
OrthodoxChristmas      0
OrthodoxEaster         0
Pesach End             0
PresidentsDay          0
Purim End              0
Ramadan starts         0
StPatricksDay          0
SuperBowl              0
Thanksgiving           0
ValentinesDay          0
VeteransDay            0
dtype: int64

In [24]:
# Distinct values of event_name_1; each of them is also the name of a one-hot column.
all_events = df_related.event_name_1.unique()

# Only the five events with the highest sales stay in the related time series:
# SuperBowl, LaborDay, Purim End, Easter, StPatricksDay.
chosen_events = ['SuperBowl', 'LaborDay', 'Purim End', 'Easter', 'StPatricksDay']

# Remove every other event column, plus the original event_name_1 text column,
# in a single in-place drop.
columns_to_drop = [name for name in all_events if name not in chosen_events]
columns_to_drop.append("event_name_1")
df_related.drop(columns_to_drop, axis=1, inplace=True)

In [25]:
print(len(df_related)) # 152,000 = 200*(365*2+30)
df_related.isnull().sum()

152000


id               0
snap_CA          0
snap_TX          0
snap_WI          0
sell_price       0
black_friday     0
Easter           0
LaborDay         0
Purim End        0
StPatricksDay    0
SuperBowl        0
dtype: int64

In [26]:
df_related = df_related.sort_values(by=["id", "date"])

### Item_metadata (df_item)

In [27]:
df_item = df_merged_sampled[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]].drop_duplicates()

In [28]:
len(df_item) #200

200

In [29]:
len(df_item["id"].unique()) # 200

200

In [30]:
df_item.head()

Unnamed: 0_level_0,id,item_id,dept_id,cat_id,store_id,state_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-01-29,FOODS_3_064_CA_1_validation,FOODS_3_064,FOODS_3,FOODS,CA_1,CA
2011-01-29,FOODS_3_080_CA_1_validation,FOODS_3_080,FOODS_3,FOODS,CA_1,CA
2011-01-29,FOODS_3_090_CA_1_validation,FOODS_3_090,FOODS_3,FOODS,CA_1,CA
2011-01-29,FOODS_3_099_CA_1_validation,FOODS_3_099,FOODS_3,FOODS,CA_1,CA
2011-01-29,FOODS_3_120_CA_1_validation,FOODS_3_120,FOODS_3,FOODS,CA_1,CA


## Make CSV files

In [31]:
# Prepare csv files

local_path = "./train/"

# Create the output directory. Unlike `!mkdir`, this does not fail when the
# directory already exists (e.g. when the cell is re-run) and needs no shell.
os.makedirs(local_path, exist_ok=True)

target_file_name     = "df_target.csv"
related_file_name    = "df_related.csv"
item_file_name       = "df_item.csv"

local_target     = local_path + target_file_name
local_related    = local_path + related_file_name
local_item       = local_path + item_file_name

# The files are written without a header row. For the two time series the
# date index is written as the first column.
df_target.to_csv(local_target, header=False, index=True)
df_related.to_csv(local_related, header=False, index=True)
df_item.to_csv(local_item, header=False, index=False)  # exclude the index

# Forecast 시작

참고 : https://github.com/chrisking/ForecastPOC/blob/master/

In [33]:
DATASET_FREQUENCY = "D" # Day
TIMESTAMP_FORMAT = "yyyy-MM-dd"

project = 'walmart_m5'
datasetName= project+'_ds'
datasetGroupName= project +'_dsg'

In [34]:
# Extract the AWS region in which this Jupyter notebook is running.
# The region is the 4th colon-separated field of the ResourceArn stored in the
# notebook's resource metadata file.
with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
    data = json.load(notebook_info)
    resource_arn = data['ResourceArn']
    region = resource_arn.split(':')[3]
print(region)

us-east-1


In [35]:
session = boto3.Session(region_name=region)
forecast = session.client(service_name='forecast')
forecast_query = session.client(service_name='forecastquery')

In [36]:
# Fetch the execution role so that the Amazon Forecast API can be used from the SageMaker Jupyter notebook.

from sagemaker import get_execution_role

role_arn = get_execution_role()
print(role_arn)

arn:aws:iam::889750940888:role/service-role/AmazonSageMaker-ExecutionRole-20190121T165438


## 1. Dataset Group 생성

In [37]:
create_dataset_group_response = forecast.create_dataset_group(DatasetGroupName=datasetGroupName,
                                                              Domain="RETAIL",
                                                             )
datasetGroupArn = create_dataset_group_response['DatasetGroupArn']

In [38]:
forecast.describe_dataset_group(DatasetGroupArn=datasetGroupArn)

{'DatasetGroupName': 'walmart_m5_dsg',
 'DatasetGroupArn': 'arn:aws:forecast:us-east-1:889750940888:dataset-group/walmart_m5_dsg',
 'DatasetArns': [],
 'Domain': 'RETAIL',
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 4, 14, 1, 28, 53, 85000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 4, 14, 1, 28, 53, 85000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'bf8b62c9-3d07-4847-9370-c5e7e945e0e4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Tue, 14 Apr 2020 01:28:53 GMT',
   'x-amzn-requestid': 'bf8b62c9-3d07-4847-9370-c5e7e945e0e4',
   'content-length': '251',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

## 2a. Target Time Series Dataset 생성

In [44]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
schema ={
   "Attributes":[
      {
         "AttributeName":"timestamp",
         "AttributeType":"timestamp"
      },
      {
         "AttributeName":"item_id",
         "AttributeType":"string"
      },
      {
         "AttributeName":"demand",
         "AttributeType":"integer"
      }
   ]
}

In [45]:
target_DSN = datasetName + "_target"

response=forecast.create_dataset(
                    Domain="RETAIL",
                    DatasetType='TARGET_TIME_SERIES',
                    DatasetName=target_DSN,
                    DataFrequency=DATASET_FREQUENCY, 
                    Schema = schema
)

In [46]:
target_datasetArn = response['DatasetArn']
forecast.describe_dataset(DatasetArn=target_datasetArn)

{'DatasetArn': 'arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_target',
 'DatasetName': 'walmart_m5_ds_target',
 'Domain': 'RETAIL',
 'DatasetType': 'TARGET_TIME_SERIES',
 'DataFrequency': 'D',
 'Schema': {'Attributes': [{'AttributeName': 'timestamp',
    'AttributeType': 'timestamp'},
   {'AttributeName': 'item_id', 'AttributeType': 'string'},
   {'AttributeName': 'demand', 'AttributeType': 'integer'}]},
 'EncryptionConfig': {},
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 4, 14, 1, 29, 20, 242000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 4, 14, 1, 29, 20, 242000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': 'de5dc94e-805f-4b8b-8cf2-93a1f6a04c92',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Tue, 14 Apr 2020 01:29:20 GMT',
   'x-amzn-requestid': 'de5dc94e-805f-4b8b-8cf2-93a1f6a04c92',
   'content-length': '497',
   'connection': 'keep-alive'},
  'RetryAttempts': 

## 2b. Target Time Series Dataset Import

In [47]:
# Create S3 Bucket
# {Account Number}-forecastpoc

print(region)
s3 = boto3.client('s3')
account_id = boto3.client('sts').get_caller_identity().get('Account')
bucket_name = account_id + "-forecastpoc"
print(bucket_name)

# Create the bucket exactly once. us-east-1 is S3's default location and must
# NOT be passed as a LocationConstraint; every other region requires one.
create_bucket_args = {'Bucket': bucket_name}
if region != "us-east-1":
    create_bucket_args['CreateBucketConfiguration'] = {'LocationConstraint': region}

try:
    s3.create_bucket(**create_bucket_args)
except s3.exceptions.BucketAlreadyOwnedByYou:
    # The bucket was already created by this account (e.g. the cell was re-run).
    pass

us-east-1
889750940888-forecastpoc


In [48]:
# Upload Target File

# Key prefix ("folder") inside the bucket under which all three CSV files live.
s3_path = "walmart"

# s3:// URIs that are later handed to the Forecast dataset import jobs.
s3_prefix = "s3://{}/{}/".format(bucket_name, s3_path)
s3_target     = s3_prefix + target_file_name
s3_related    = s3_prefix + related_file_name
s3_item       = s3_prefix + item_file_name

# Upload the local target CSV to <bucket>/<s3_path>/<target_file_name>.
target_key = s3_path + "/" + target_file_name
target_object = boto3.Session().resource('s3').Bucket(bucket_name).Object(target_key)
target_object.upload_file(local_target)

In [49]:
# Start the import job that loads the uploaded target CSV into the target dataset.
datasetImportJobName = 'DSIMPORT_JOB_TARGET_POC'

# Location of the CSV in S3 and the IAM role (role_arn) passed along to read it.
target_data_source = {
    "S3Config": {
        "Path": s3_target,
        "RoleArn": role_arn,
    }
}

ds_import_job_response = forecast.create_dataset_import_job(
    DatasetImportJobName=datasetImportJobName,
    DatasetArn=target_datasetArn,
    DataSource=target_data_source,
    TimestampFormat=TIMESTAMP_FORMAT,
)

In [50]:
ds_import_job_arn=ds_import_job_response['DatasetImportJobArn']
print(ds_import_job_arn)

arn:aws:forecast:us-east-1:889750940888:dataset-import-job/walmart_m5_ds_target/DSIMPORT_JOB_TARGET_POC


In [51]:
#while True:
#    dataImportStatus = forecast.describe_dataset_import_job(DatasetImportJobArn=ds_import_job_arn)['Status']
#    print(dataImportStatus)
#    if dataImportStatus != 'ACTIVE' and dataImportStatus != 'CREATE_FAILED':
#        sleep(30)
#    else:
#        break

In [52]:
# Attach the dataset that was just created to the dataset group.
# Without attaching it, the dataset does not show up in the Forecast dataset group.
#response = forecast.update_dataset_group(
#    DatasetGroupArn=datasetGroupArn,
#    DatasetArns=[
#        target_datasetArn
#    ]
#)

## 2c. Related Time Series dataset 생성

Related Time Series 고려사항 : https://docs.aws.amazon.com/ko_kr/forecast/latest/dg/related-time-series-datasets.html

<img src="../img/related-ts.png" align="left">


In [67]:
# Upload Related File
boto3.Session().resource('s3').Bucket(bucket_name).Object(s3_path + "/" + related_file_name).upload_file(local_related)

In [68]:
df_related.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 152000 entries, 2013-11-16 to 2015-12-15
Data columns (total 11 columns):
id               152000 non-null object
snap_CA          152000 non-null int64
snap_TX          152000 non-null int64
snap_WI          152000 non-null int64
sell_price       152000 non-null float64
black_friday     152000 non-null int64
Easter           152000 non-null uint8
LaborDay         152000 non-null uint8
Purim End        152000 non-null uint8
StPatricksDay    152000 non-null uint8
SuperBowl        152000 non-null uint8
dtypes: float64(1), int64(4), object(1), uint8(5)
memory usage: 8.8+ MB


In [69]:
# Specify the schema of your dataset here. Make sure the order of columns matches the raw data files.
related_schema ={
   "Attributes":[
      {
         "AttributeName":"timestamp",
         "AttributeType":"timestamp"
      },
      {
         "AttributeName":"item_id",
         "AttributeType":"string"
      },
       {
         "AttributeName":"snap_CA",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"snap_TX",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"snap_WI",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"sell_price",
         "AttributeType":"float"
      },
       {
         "AttributeName":"black_friday",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"Easter",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"LaborDay",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"Purim_End",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"StPatricksDay",
         "AttributeType":"integer"
      },
       {
         "AttributeName":"SuperBowl",
         "AttributeType":"integer"
      }
   ]
}

In [70]:
related_DSN = datasetName + "_related"
response=forecast.create_dataset(
                    Domain="RETAIL",
                    DatasetType='RELATED_TIME_SERIES',
                    DatasetName=related_DSN,
                    DataFrequency=DATASET_FREQUENCY, 
                    Schema = related_schema
)

In [71]:
related_datasetArn = response['DatasetArn']
print(related_datasetArn)
forecast.describe_dataset(DatasetArn=related_datasetArn)

arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_related


{'DatasetArn': 'arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_related',
 'DatasetName': 'walmart_m5_ds_related',
 'Domain': 'RETAIL',
 'DatasetType': 'RELATED_TIME_SERIES',
 'DataFrequency': 'D',
 'Schema': {'Attributes': [{'AttributeName': 'timestamp',
    'AttributeType': 'timestamp'},
   {'AttributeName': 'item_id', 'AttributeType': 'string'},
   {'AttributeName': 'snap_CA', 'AttributeType': 'integer'},
   {'AttributeName': 'snap_TX', 'AttributeType': 'integer'},
   {'AttributeName': 'snap_WI', 'AttributeType': 'integer'},
   {'AttributeName': 'sell_price', 'AttributeType': 'float'},
   {'AttributeName': 'black_friday', 'AttributeType': 'integer'},
   {'AttributeName': 'Easter', 'AttributeType': 'integer'},
   {'AttributeName': 'LaborDay', 'AttributeType': 'integer'},
   {'AttributeName': 'Purim_End', 'AttributeType': 'integer'},
   {'AttributeName': 'StPatricksDay', 'AttributeType': 'integer'},
   {'AttributeName': 'SuperBowl', 'AttributeType': 'integer'}]},
 'Encry

## 2d. Related Time Series Dataset Import

In [72]:
datasetImportJobName = 'DSIMPORT_JOB_RELATEDPOC_2'
related_ds_import_job_response=forecast.create_dataset_import_job(DatasetImportJobName=datasetImportJobName,
                                                          DatasetArn=related_datasetArn,
                                                          DataSource= {
                                                              "S3Config" : {
                                                                 "Path":s3_related,
                                                                 "RoleArn": role_arn
                                                              } 
                                                          },
                                                          TimestampFormat=TIMESTAMP_FORMAT
                                                         )

In [73]:
rel_ds_import_job_arn=related_ds_import_job_response['DatasetImportJobArn']
print(rel_ds_import_job_arn)

arn:aws:forecast:us-east-1:889750940888:dataset-import-job/walmart_m5_ds_related/DSIMPORT_JOB_RELATEDPOC_2


In [74]:
#while True:
#    dataImportStatus = forecast.describe_dataset_import_job(DatasetImportJobArn=rel_ds_import_job_arn)['Status']
#    print(dataImportStatus)
#    if dataImportStatus != 'ACTIVE' and dataImportStatus != 'CREATE_FAILED':
#        sleep(30)
#    else:
#        break

## 2e. Item Metadata 생성

In [75]:
# Upload Item Metadata File
boto3.Session().resource('s3').Bucket(bucket_name).Object(s3_path + "/" + item_file_name).upload_file(local_item)

In [76]:
df_item.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 200 entries, 2011-01-29 to 2011-01-29
Data columns (total 6 columns):
id          200 non-null object
item_id     200 non-null object
dept_id     200 non-null object
cat_id      200 non-null object
store_id    200 non-null object
state_id    200 non-null object
dtypes: object(6)
memory usage: 10.9+ KB


In [77]:
item_schema ={
   "Attributes":[
      {
         "AttributeName":"item_id",
         "AttributeType":"string"
      },
       {
         "AttributeName":"item_id_not_combined",
         "AttributeType":"string"
      },
       {
         "AttributeName":"dept_id",
         "AttributeType":"string"
      },
       {
         "AttributeName":"cat_id",
         "AttributeType":"string"
      },
       {
         "AttributeName":"store_id",
         "AttributeType":"string"
      },
       {
         "AttributeName":"state_id",
         "AttributeType":"string"
      }
   ]
}

In [78]:
item_DSN = datasetName + "_item"
response=forecast.create_dataset(
                    Domain="RETAIL",
                    DatasetType='ITEM_METADATA',
                    DatasetName=item_DSN,
                    Schema = item_schema
)

In [79]:
item_datasetArn = response['DatasetArn']
print(item_datasetArn)
forecast.describe_dataset(DatasetArn=item_datasetArn)

arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_item


{'DatasetArn': 'arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_item',
 'DatasetName': 'walmart_m5_ds_item',
 'Domain': 'RETAIL',
 'DatasetType': 'ITEM_METADATA',
 'Schema': {'Attributes': [{'AttributeName': 'item_id',
    'AttributeType': 'string'},
   {'AttributeName': 'item_id_not_combined', 'AttributeType': 'string'},
   {'AttributeName': 'dept_id', 'AttributeType': 'string'},
   {'AttributeName': 'cat_id', 'AttributeType': 'string'},
   {'AttributeName': 'store_id', 'AttributeType': 'string'},
   {'AttributeName': 'state_id', 'AttributeType': 'string'}]},
 'EncryptionConfig': {},
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 4, 14, 1, 31, 47, 770000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 4, 14, 1, 31, 47, 770000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '78db3d90-8506-4a94-a5ac-03ff60741f9c',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Tue, 14 Apr 2020 

## 2f. Item Metadata Dataset Import

In [80]:
datasetImportJobName = 'DSIMPORT_JOB_ITEMPOC'
item_ds_import_job_response=forecast.create_dataset_import_job(DatasetImportJobName=datasetImportJobName,
                                                          DatasetArn=item_datasetArn,
                                                          DataSource= {
                                                              "S3Config" : {
                                                                 "Path":s3_item,
                                                                 "RoleArn": role_arn
                                                              } 
                                                          }
                                                         )

In [81]:
item_ds_import_job_arn=item_ds_import_job_response['DatasetImportJobArn']
print(item_ds_import_job_arn)

arn:aws:forecast:us-east-1:889750940888:dataset-import-job/walmart_m5_ds_item/DSIMPORT_JOB_ITEMPOC


## 2g. Check Dataset Import Status

In [82]:
import time

start_time = time.time()

# Poll the three import jobs (target, related, item metadata) every 30 seconds
# until all of them are ACTIVE.
while True:
    target_job = forecast.describe_dataset_import_job(DatasetImportJobArn=ds_import_job_arn)
    related_job = forecast.describe_dataset_import_job(DatasetImportJobArn=rel_ds_import_job_arn)
    item_job = forecast.describe_dataset_import_job(DatasetImportJobArn=item_ds_import_job_arn)

    TargetdataImportStatus = target_job['Status']
    RelateddataImportStatus = related_job['Status']
    ItemdataImportStatus = item_job['Status']

    print("Dataset {} status : {}".format(target_datasetArn, TargetdataImportStatus))
    print("Dataset {} status : {}".format(related_datasetArn, RelateddataImportStatus))
    print("Dataset {} status : {}".format(item_datasetArn, ItemdataImportStatus))
    print("--------------------------------------------------------------------")

    # A failed or stopped job never becomes ACTIVE, so waiting on it would
    # loop forever. Surface the service's error message instead.
    for dataset_arn, job in ((target_datasetArn, target_job),
                             (related_datasetArn, related_job),
                             (item_datasetArn, item_job)):
        if job['Status'].endswith(('_FAILED', '_STOPPED')):
            raise RuntimeError("Dataset {} import ended with status {} : {}".format(
                dataset_arn, job['Status'], job.get('Message')))

    if (TargetdataImportStatus == 'ACTIVE'
            and RelateddataImportStatus == 'ACTIVE'
            and ItemdataImportStatus == 'ACTIVE'):
        break
    sleep(30)
print('작업 수행된 시간 : %f 초' % (time.time() - start_time))

Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_target status : CREATE_IN_PROGRESS
Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_related status : CREATE_PENDING
Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_item status : CREATE_PENDING
--------------------------------------------------------------------
Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_target status : CREATE_IN_PROGRESS
Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_related status : CREATE_IN_PROGRESS
Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_item status : CREATE_IN_PROGRESS
--------------------------------------------------------------------
Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_target status : CREATE_IN_PROGRESS
Dataset arn:aws:forecast:us-east-1:889750940888:dataset/walmart_m5_ds_related status : CREATE_IN_PROGRESS
Dataset arn:aws:forecast:us-eas

In [83]:
# Attach the target, related and item-metadata datasets to the dataset group.
response = forecast.update_dataset_group(
    DatasetGroupArn=datasetGroupArn,
    DatasetArns=[target_datasetArn, related_datasetArn, item_datasetArn],
)

아래 스크린 캡쳐와 같이 3가지 Dataset이 모두 Import되었는지 확인한 후 "3. Create Predictor" 단계로 넘어간다.
Import 상태가 "Failed"인 경우 세부 오류 메시지를 확인한다.

<img src="../img/datasets.png" align="left">

# 3. Create Predictor (20~30분 소요)


In [84]:
# Settings shared by the Prophet and DeepAR+ create_predictor calls below.
forecastHorizon = 30 # 30 days
# Passed as EvaluationParameters["NumberOfBacktestWindows"].
NumberOfBacktestWindows = 4
# Passed as EvaluationParameters["BackTestWindowOffset"]; set equal to forecastHorizon.
BackTestWindowOffset = 30
# Passed as FeaturizationConfig["ForecastFrequency"]; "D" presumably means daily — confirm against the Forecast docs.
ForecastFrequency = "D"

In [85]:
# Algorithm ARNs passed as AlgorithmArn to the two create_predictor calls below.
prophet_algorithmArn = 'arn:aws:forecast:::algorithm/Prophet'
deepAR_Plus_algorithmArn = 'arn:aws:forecast:::algorithm/Deep_AR_Plus'

## 3a. Prophet

In [86]:
# Prophet specifics: the predictor name is the project prefix plus a fixed suffix.
prophet_predictorName= project+'_prophet_algo_1'

In [87]:
# Train the Prophet predictor on the dataset group (no AutoML, no HPO).
prophet_create_predictor_response = forecast.create_predictor(
    PredictorName=prophet_predictorName,
    AlgorithmArn=prophet_algorithmArn,
    ForecastHorizon=forecastHorizon,
    PerformAutoML=False,
    PerformHPO=False,
    EvaluationParameters={
        "NumberOfBacktestWindows": NumberOfBacktestWindows,
        "BackTestWindowOffset": BackTestWindowOffset,
    },
    InputDataConfig={
        "DatasetGroupArn": datasetGroupArn,
        # "holiday" supplementary feature with value "US".
        "SupplementaryFeatures": [{"Name": "holiday", "Value": "US"}],
    },
    FeaturizationConfig={
        "ForecastFrequency": ForecastFrequency,
        "Featurizations": [
            {
                "AttributeName": "demand",
                "FeaturizationPipeline": [
                    {
                        # Gap filling for the "demand" attribute.
                        "FeaturizationMethodName": "filling",
                        "FeaturizationMethodParameters": {
                            "frontfill": "none",
                            "middlefill": "zero",
                            "backfill": "zero",
                        },
                    }
                ],
            }
        ],
    },
)

## 3b. DeepAR Plus

In [88]:
# DeepAR+ specifics: the predictor name is the project prefix plus a fixed suffix.
deeparplus_predictorName= project+'_deeparplus_algo_1'

In [89]:
# Train the DeepAR+ predictor on the dataset group (no AutoML, no HPO).
# Same evaluation, input and featurization settings as the Prophet predictor,
# plus a "likelihood" training parameter.
deeparplus_create_predictor_response = forecast.create_predictor(
    PredictorName=deeparplus_predictorName,
    AlgorithmArn=deepAR_Plus_algorithmArn,
    ForecastHorizon=forecastHorizon,
    PerformAutoML=False,
    PerformHPO=False,
    EvaluationParameters={
        "NumberOfBacktestWindows": NumberOfBacktestWindows,
        "BackTestWindowOffset": BackTestWindowOffset,
    },
    InputDataConfig={
        "DatasetGroupArn": datasetGroupArn,
        # "holiday" supplementary feature with value "US".
        "SupplementaryFeatures": [{"Name": "holiday", "Value": "US"}],
    },
    FeaturizationConfig={
        "ForecastFrequency": ForecastFrequency,
        "Featurizations": [
            {
                "AttributeName": "demand",
                "FeaturizationPipeline": [
                    {
                        # Gap filling for the "demand" attribute.
                        "FeaturizationMethodName": "filling",
                        "FeaturizationMethodParameters": {
                            "frontfill": "none",
                            "middlefill": "zero",
                            "backfill": "zero",
                        },
                    }
                ],
            }
        ],
    },
    TrainingParameters={"likelihood": "negative-binomial"},
)

- 일반적으로 Prophet predictor 학습은 DeepAR+ 보다 빨리 끝난다.
- Prophet predictor 학습이 완료되어 predictor status가 `ACTIVE`인 경우 Prophet predictor를 이용하여 Forecast를 생성한다.

## 3c. Check Predictor Creation Status

In [None]:
import time

# The ARNs are fixed once create_predictor has returned, so resolve them once.
# If the response objects are no longer in memory (e.g. after a kernel
# restart), assign the predictor ARN strings to these two names directly.
ProphetArn = prophet_create_predictor_response['PredictorArn']
DeepARPlusArn = deeparplus_create_predictor_response['PredictorArn']

start_time = time.time()

# Poll both predictors every 30 seconds until both are ACTIVE.
while True:
    prophet_description = forecast.describe_predictor(PredictorArn=ProphetArn)
    deeparplus_description = forecast.describe_predictor(PredictorArn=DeepARPlusArn)

    ProphetStatus = prophet_description['Status']
    DeepARPlusStatus = deeparplus_description['Status']

    print("Predictor {} status : {}".format(ProphetArn, ProphetStatus))
    print("Predictor {} status : {}".format(DeepARPlusArn, DeepARPlusStatus))
    print("--------------------------------------------------------------------")

    # A failed or stopped predictor never becomes ACTIVE, so waiting on it
    # would loop forever. Surface the service's error message instead.
    for predictor_arn, description in ((ProphetArn, prophet_description),
                                       (DeepARPlusArn, deeparplus_description)):
        if description['Status'].endswith(('_FAILED', '_STOPPED')):
            raise RuntimeError("Predictor {} ended with status {} : {}".format(
                predictor_arn, description['Status'], description.get('Message')))

    if ProphetStatus == 'ACTIVE' and DeepARPlusStatus == 'ACTIVE':
        break
    sleep(30)
print('작업 수행된 시간 : %f 초' % (time.time() - start_time))

Predictor arn:aws:forecast:us-east-1:889750940888:predictor/walmart_m5_prophet_algo_1 status : CREATE_PENDING
Predictor arn:aws:forecast:us-east-1:889750940888:predictor/walmart_m5_deeparplus_algo_1 status : CREATE_PENDING
--------------------------------------------------------------------
Predictor arn:aws:forecast:us-east-1:889750940888:predictor/walmart_m5_prophet_algo_1 status : CREATE_IN_PROGRESS
Predictor arn:aws:forecast:us-east-1:889750940888:predictor/walmart_m5_deeparplus_algo_1 status : CREATE_IN_PROGRESS
--------------------------------------------------------------------
Predictor arn:aws:forecast:us-east-1:889750940888:predictor/walmart_m5_prophet_algo_1 status : CREATE_IN_PROGRESS
Predictor arn:aws:forecast:us-east-1:889750940888:predictor/walmart_m5_deeparplus_algo_1 status : CREATE_IN_PROGRESS
--------------------------------------------------------------------
Predictor arn:aws:forecast:us-east-1:889750940888:predictor/walmart_m5_prophet_algo_1 status : CREATE_IN_PRO

## 3d. Examining the Predictors
- AWS Forecast에서 생성된 Predictor별 Metric을 확인한다.
- 참고 : https://docs.aws.amazon.com/ko_kr/forecast/latest/dg/metrics.html

# 4. Create Forecast
- Predictor별 Forecast를 만든다.
- 5분 ~ 10분 소요
- 참고 : https://docs.aws.amazon.com/ko_kr/forecast/latest/dg/gs-console.html 의 "3단계 - 예상 생성"
- ForecastTypes : The quantiles at which probabilistic forecasts are generated. You can currently specify up to 5 quantiles per forecast. Accepted values include 0.01 to 0.99 (increments of .01 only) and mean. The mean forecast is different from the median (0.50) when the distribution is not symmetric (for example, Beta and Negative Binomial). The default value is ["0.1", "0.5", "0.9"].

## 4a. Create Prophet, DeepAR+ Forecast

In [None]:
# Forecast names (project prefix plus a fixed suffix) for the two predictors.
deeparplus_forecastName= project+'_deepAR_algo_forecast'
prophet_forecastname= project+'_prophet_algo_forecast'
# Quantiles (plus the mean) requested from create_forecast.
ForecastTypes=["0.1", "0.5", "0.9", "mean"]

**Prophet**

In [None]:
# Create the forecast from the Prophet predictor and keep its ARN.
create_forecast_response = forecast.create_forecast(
    ForecastName=prophet_forecastname,
    ForecastTypes=ForecastTypes,
    PredictorArn=ProphetArn,
)
prophet_forecastArn = create_forecast_response['ForecastArn']

In [None]:
# Display the name that the DeepAR+ forecast is created under.
deeparplus_forecastName

**DeepAR+**

In [None]:
# Create the forecast from the DeepAR+ predictor and keep its ARN.
create_forecast_response = forecast.create_forecast(
    ForecastName=deeparplus_forecastName,
    ForecastTypes=ForecastTypes,
    PredictorArn=DeepARPlusArn,
)
deeparplus_forecastArn = create_forecast_response['ForecastArn']

In [None]:
# Display the ARN of the Prophet forecast.
prophet_forecastArn

## 4b. Check Forecast Creation Status

In [None]:
import time

start_time = time.time()

# Poll both forecasts every 30 seconds until both are ACTIVE.
while True:
    deeparplus_description = forecast.describe_forecast(ForecastArn=deeparplus_forecastArn)
    prophet_description = forecast.describe_forecast(ForecastArn=prophet_forecastArn)

    deeparplus_forecast_status = deeparplus_description['Status']
    prophet_forecast_status = prophet_description['Status']

    # These are forecasts, not predictors; label them as such.
    print("Forecast {} status : {}".format(deeparplus_forecastArn, deeparplus_forecast_status))
    print("Forecast {} status : {}".format(prophet_forecastArn, prophet_forecast_status))
    print("--------------------------------------------------------------------")

    # A failed or stopped forecast never becomes ACTIVE, so waiting on it
    # would loop forever. Surface the service's error message instead.
    for forecast_arn, description in ((deeparplus_forecastArn, deeparplus_description),
                                      (prophet_forecastArn, prophet_description)):
        if description['Status'].endswith(('_FAILED', '_STOPPED')):
            raise RuntimeError("Forecast {} ended with status {} : {}".format(
                forecast_arn, description['Status'], description.get('Message')))

    if deeparplus_forecast_status == 'ACTIVE' and prophet_forecast_status == 'ACTIVE':
        break
    sleep(30)
print('작업 수행된 시간 : %f 초' % (time.time() - start_time))

## 4c. Get Forecast & Visualization
Predictor별 Forecast를 생성한 후 id별 p10, p50, p90, mean 값을 확인할 수 있다.

In [None]:
# Top 10 of the 200 sampled items
# (assumes `sampled` is sorted by sales_total in descending order — TODO confirm).
sampled[["id", "sales_total"]].head(10)

In [None]:
# Worst 10 of the 200 sampled items
# (assumes `sampled` is sorted by sales_total in descending order — TODO confirm).
sampled[["id", "sales_total"]].tail(10)

In [None]:
def _quantile_frame(predictions, key):
    """Return the prediction series stored under `key` as a time-indexed frame.

    The ISO-formatted 'Timestamp' strings are parsed and become the index,
    and the 'Value' column is renamed to `key`.
    """
    frame = pd.DataFrame(predictions[key])
    frame.Timestamp = frame.Timestamp.apply(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%S"))
    frame.set_index("Timestamp", inplace=True)
    frame.rename(columns={'Value': key}, inplace=True)
    return frame


def get_forecast(id, *, black_friday=datetime(2015, 11, 27)):
    """Plot observed sales against the DeepAR+ and Prophet forecasts for one item.

    One figure is drawn per forecast ARN. Each figure shows the p10/p50/p90
    and mean predictions, the observed sales from roughly half a month before
    the first predicted timestamp to half a month after the last one, and a
    dashed vertical marker at `black_friday`.

    Args:
        id: Item id, used both as the query filter and to select rows of
            `df_merged`.
        black_friday: Date at which the vertical marker is drawn.

    Returns:
        None. The figures are created as a side effect.
    """
    # Half a month of padding on each side of the forecast window.
    padding = timedelta(days=0.5 * 365 / 12)

    for forecastArn in [deeparplus_forecastArn, prophet_forecastArn]:
        forecastResponse = forecast_query.query_forecast(
            ForecastArn=forecastArn,
            Filters={"item_id": id}
        )
        predictions = forecastResponse['Forecast']['Predictions']
        series = {key: _quantile_frame(predictions, key)
                  for key in ('mean', 'p10', 'p50', 'p90')}

        mean = series['mean']
        plot_start_date = (mean.index.min() - padding).strftime('%Y-%m-%d')
        plot_end_date = (mean.index.max() + padding).strftime('%Y-%m-%d')

        # Date-string slicing assumes df_merged has a datetime index — TODO confirm.
        observations = df_merged[df_merged["id"] == id].loc[plot_start_date:plot_end_date].sales

        plt.figure(figsize=(20, 5))
        plt.title("Forecast for {}, Predictor : {}".format(id, forecastArn))
        plt.plot(observations, color='gray', linewidth=1, label="observation")
        # Draw order fixes the colours assigned to each line, so keep it stable.
        for key in ('p90', 'mean', 'p50', 'p10'):
            plt.plot(series[key], label=key)
        plt.axvline(x=black_friday, color='r', linestyle='--', linewidth=3)  # Black Friday marker
        plt.legend()

### Sample중 Top5

In [None]:
# First five rows of the sample with their total sales.
sampled[["id", "sales_total"]].head()

In [None]:
# Ids of the first five sampled items (the ones plotted in the next cell).
sampled.id.head(5)

In [None]:
# Plot both forecasts for each of the first five sampled items.
for item in sampled.id.head(5):
    get_forecast(item)

### Sample중 Worst5

In [None]:
# Plot both forecasts for each of the last five sampled items.
for item in sampled.id.tail(5):
    get_forecast(item)