In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
holiday = pd.read_csv("政府行政機關辦公日曆表_0006188957259157373306.csv")

In [5]:
holiday_data = holiday[['date', 'isHoliday']].values

In [21]:
holiday_list = [x[0] for x in holiday_data[holiday_data[:,1] == '是']]

In [30]:
isholiday = []
import calendar
for i in range(2017, 2019):
    for j in range(1, 13):
        num_of_day = calendar.monthrange(i,j)[1]
        temp = np.zeros(num_of_day, dtype=int)
        for day in range(1, num_of_day+1):
            if str(i)+'/'+str(j)+'/'+str(day) in holiday_list:
                temp[day - 1] = 1
        isholiday.extend(temp)

In [35]:
pickle.dump(isholiday, open("../data/is_holiday.pkl", "wb"))

In [2]:
import pickle
import torch
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import numpy as np


In [3]:
def get_formatted_time(idx):
    days_in_months = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    year = idx // 365

    days = idx % 365

    month = None
    for m, days_in_m in enumerate(days_in_months):
        if days < days_in_m:
            month = m + 1
            break

        days -= days_in_m

    date = days + 1

    return int('201{}{:02d}{:02d}'.format(year + 7, month, date))

In [4]:
def get_valid_commodity_codes(commodity_dataframe, sales_data):
    summed_data = sales_data.groupby('商品代號').agg('sum')

    sales_data_lt_0 = summed_data.loc[summed_data['銷售數量'] > 20]

    commodity_codes = sales_data_lt_0.index

    expirations = commodity_dataframe['有效期限'].str.split('', n=2, expand=True)

    condition = (
        ((expirations[1] == 'D') & (pd.to_numeric(expirations[2]) < 6))
        | ((expirations[1] == 'H') & (pd.to_numeric(expirations[2]) < 6 * 24))
    )
    expiration_st_6 = commodity_dataframe.loc[condition]

    return set(commodity_codes).intersection(expiration_st_6['商品代號'].unique())

In [5]:
data_dir = Path('../data')

commodity_dataframe = pd.read_csv(
    data_dir / '商品主檔.txt', sep='\t')

store_codes = (commodity_dataframe.loc[:, '原始店號']
               .sort_values().unique())
commodity_codes = (commodity_dataframe.loc[:, '商品代號']
                   .sort_values().unique())

group_data = commodity_dataframe[['商品代號', '品番', '群番']].values
group_data = np.unique(group_data, axis = 0)

commodity_codes2p = {}
commodity_codes2c = {}
for group in group_data:
    commodity_codes2p[group[0]] = group[1]
    commodity_codes2c[group[0]] = group[2]

In [7]:
sales_dataframes = []
for year in [2017, 2018]:
    sales_dataframes.append(
        pd.read_csv(
            data_dir / '銷售數量{}.txt'.format(year), sep='\t'
        ).loc[:, ['原始店號', '日期', '商品代號', '銷售數量']]
    )
sales_data = pd.concat(sales_dataframes, axis=0)
del sales_dataframes

In [9]:
commodity_codes = get_valid_commodity_codes(commodity_dataframe, sales_data)
print(len(commodity_codes))

759


In [11]:
sales_data = sales_data.groupby(['日期', '原始店號', '商品代號'])

In [14]:
processed_sales_data, processed_order_data = [], []
for day_i in tqdm(range(2 * 365)):
    time = get_formatted_time(day_i)
    # print(time, end=' ', flush=True)

    for sc in store_codes:
        for cc in commodity_codes:
            # print(cc)
            try:
                # print(sales_data.get_group((time, sc, cc))['銷售數量'].agg('sum'))
                processed_sales_data.append(
                    sales_data.get_group((time, sc, cc))['銷售數量'].agg('sum'))
            except KeyError:
                processed_sales_data.append(0)

100%|████████████████████████████████████████████████████████████████████████████| 730/730 [02:13<00:00,  6.04it/s]


In [20]:
processed_sales_data = torch.tensor(processed_sales_data, dtype=torch.float)
processed_sales_data = processed_sales_data.view(730, 5, -1)

In [23]:
processed_sales_data.shape

torch.Size([730, 5, 759])