In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from collections import defaultdict
import gc
import sys
import pickle

In [None]:
class Category:
    """Maps the distinct values of a categorical feature to compact integer ids.

    Attributes:
        items: the sequence of unique values; position ``i`` holds the value
            whose id is ``i``.
        item_to_id: dict from value to its integer id.
        type: numpy integer dtype used for encoded arrays (``np.int16``, or
            ``np.int32`` once there are more than 30000 items).
    """

    def __init__(self, unique_items, item_to_id=None):
        """Build the value -> id mapping.

        Args:
            unique_items: sized, ordered collection of distinct values; the
                enumeration order defines the ids.
            item_to_id: optional pre-existing mapping to extend. When given it
                is used as-is and updated in place; when omitted a fresh dict
                is created for this instance.
        """
        self.items = unique_items
        # A literal ``{}`` default is evaluated once at definition time and would
        # be shared (and cross-polluted) by every Category built without an
        # explicit mapping, so the default is None and a new dict is made here.
        self.item_to_id = {} if item_to_id is None else item_to_id
        for _id, item_name in enumerate(self.items):
            self.item_to_id[item_name] = _id
        # Use the narrow dtype unless the id range is too large for it.
        self.type = np.int32 if len(self.items) > 30000 else np.int16

    def encode_series(self, items):
        """Return the ids of ``items`` as a numpy array of dtype ``self.type``.

        Raises:
            KeyError: if a value is not present in ``item_to_id``.
        """
        return np.array([self.item_to_id[_item] for _item in items]).astype(self.type)

In [None]:
# Load the "validation" sales table. In the cells visible here it is only used
# to inspect its column names.
sales_train_val = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')

In [None]:
# Load the "evaluation" sales table; the encoders and the feature builder in the
# cells that follow read from this frame.
sales_train_eval = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv')

In [None]:
# Quick look at the available columns.
sales_train_val.columns

In [None]:
cat_feats = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

In [None]:
cat_feats_name2idx = {ft:i for i, ft in enumerate(cat_feats)}

In [None]:
cat_feats_objs = []
for col in cat_feats:
    cat_feats_objs.append(Category(sorted(sales_train_eval[col].unique())))

In [None]:
# Load the weekly price table and index it by (item, store, week) in a single
# expression. Doing the indexing in a separate in-place cell made that cell fail
# with a KeyError when re-run, because the key columns had already been moved
# into the index.
sell_prices = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv').set_index(
    ['item_id', 'store_id', 'wm_yr_wk']
)

In [None]:
calendar = pd.read_csv('../input/m5-forecasting-calendar-events-minfeats/calendar_events.csv', parse_dates=['date'])

In [None]:
calendar = calendar.iloc[:,1:]

In [None]:
calendar.set_index('d', inplace=True)

In [None]:
calendar

In [None]:
col_names = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day_id',
       'week_id', 'year', 'month', 'day_of_month', 'day_of_week', 'snap',
       'sell_price']

In [None]:
with open('../input/m5-forecasting-calendar-events-minfeats/events.pkl', 'rb') as f:
    events = pickle.load(f)

In [None]:
event_feats = []
for event in events:
    event_feats.append(event+"_lag")

In [None]:
for ef in event_feats:
    calendar[ef] = calendar[ef].astype(np.int16)

In [None]:
col_names = col_names + event_feats

In [None]:
len(col_names)

In [None]:
col_names

In [None]:
def show_progress(progress_str, n_cols=80):
    """Write ``progress_str`` as an in-place status line on stdout.

    The text is right-padded with spaces to ``n_cols`` characters (longer text
    is left untouched) and terminated with a carriage return instead of a
    newline, so the next call overwrites it. The stream is flushed immediately.
    """
    status_line = progress_str.ljust(n_cols)
    sys.stdout.write(status_line + "\r")
    sys.stdout.flush()

In [None]:
#d_start=1900
#d_end=1906

def _lookup_week_prices(row_items, row_stores, wk_id):
    """Return a float32 array with the price of each (item, store) pair in week ``wk_id``; -1 where none is listed."""
    prices = []
    for _item, _store in zip(row_items, row_stores):
        try:
            _sp = sell_prices.loc[(_item, _store, wk_id)].values[0]
        except (KeyError, IndexError):
            # No price row for this item/store/week: use the -1 sentinel.
            # Only lookup misses are caught, so interrupts and genuine errors
            # are no longer silently turned into -1.
            _sp = -1
        prices.append(_sp)
    return np.array(prices).astype(np.float32)


def build_lgbm_input_df(sales_df, d_start, d_end, col_names=col_names):
    """Expand a wide sales table into one feature row per (series, day).

    Args:
        sales_df: frame holding the columns listed in ``cat_feats`` plus one
            ``d_<n>`` sales column for every day in the requested range.
        d_start: first day number to include (inclusive).
        d_end: last day number to include (inclusive).
        col_names: columns, and their order, of the returned feature frame.

    Returns:
        A tuple ``(features, sales)``. ``features`` is a DataFrame with the
        columns in ``col_names``; ``sales`` is a float32 array of the matching
        sales values. Rows are day-major: all rows of ``sales_df`` for
        ``d_start``, then all rows for the next day, and so on.

    Reads the module-level ``cat_feats``, ``cat_feats_objs``,
    ``cat_feats_name2idx``, ``calendar``, ``sell_prices`` and ``event_feats``.
    A missing sell price is encoded as -1.
    """
    item_store_df = pd.DataFrame({
        feat: encoder.encode_series(sales_df[feat])
        for feat, encoder in zip(cat_feats, cat_feats_objs)
    })
    n_rows = len(item_store_df)

    # The decoded item/store/state name of every row does not depend on the
    # day, so it is resolved once here instead of once per row per day.
    item_names = cat_feats_objs[cat_feats_name2idx['item_id']].items
    store_names = cat_feats_objs[cat_feats_name2idx['store_id']].items
    state_names = cat_feats_objs[cat_feats_name2idx['state_id']].items
    row_items = [item_names[i] for i in item_store_df['item_id']]
    row_stores = [store_names[i] for i in item_store_df['store_id']]
    row_states = [state_names[i] for i in item_store_df['state_id']]
    unique_states = sorted(set(row_states))

    all_cols = defaultdict(list)
    sales = []
    # Prices are keyed by week, so days in the same week reuse one lookup pass.
    prices_by_week = {}
    for d in range(d_start, d_end + 1):
        show_progress(f"Processing day {d} of {d_end}")
        day_key = 'd_' + str(d)
        sales.append(sales_df[day_key].values)
        for feat in cat_feats:
            all_cols[feat].append(item_store_df[feat].values)

        # Calendar attributes are constant for the day: broadcast to every row.
        all_cols['day_id'].append(np.full(n_rows, d, dtype=np.int16))
        wk_id = calendar.loc[day_key, 'wm_yr_wk']
        all_cols['week_id'].append(np.full(n_rows, wk_id, dtype=np.int16))
        all_cols['year'].append(np.full(n_rows, calendar.loc[day_key, 'year'], dtype=np.int16))
        all_cols['month'].append(np.full(n_rows, calendar.loc[day_key, 'month'], dtype=np.int8))
        all_cols['day_of_month'].append(np.full(n_rows, calendar.loc[day_key, 'date'].day, dtype=np.int8))
        all_cols['day_of_week'].append(np.full(n_rows, calendar.loc[day_key, 'wday'], dtype=np.int8))

        # The SNAP flag depends only on (day, state): one calendar lookup per
        # state, then fan out to the rows.
        snap_by_state = {state: calendar.loc[day_key, 'snap_' + state] for state in unique_states}
        all_cols['snap'].append(np.array([snap_by_state[state] for state in row_states]).astype(np.int8))

        if wk_id not in prices_by_week:
            prices_by_week[wk_id] = _lookup_week_prices(row_items, row_stores, wk_id)
        all_cols['sell_price'].append(prices_by_week[wk_id])

        for ef in event_feats:
            # dtype follows the calendar column's scalar
            all_cols[ef].append(np.full(n_rows, calendar.loc[day_key, ef]))

    print("")
    # Collapse the per-day chunks into one array per column, then release the
    # chunk lists and the price cache before building the frame.
    all_cols = {name: np.concatenate(chunks) for name, chunks in all_cols.items()}
    del prices_by_week
    gc.collect()
    return pd.DataFrame(all_cols)[col_names], np.concatenate(sales).astype(np.float32)

In [None]:
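# Training split: the call below builds features for days d_1100 through d_1913,
# i.e. up to 28 days before the last available day (d_1941).
# NOTE(review): the values noted here earlier (~90 days, d_start: 1795, d_end: 1885)
# do not match the call below; they appear to describe a smaller window -- confirm or remove.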
# Build training features/targets for days 1100..1913; %time reports how long the call takes.
%time X_train_df, y_train = build_lgbm_input_df(sales_train_eval, 1100, 1913)

In [None]:
# Persist the training features (pickle) and targets (np.save) in the current directory.
X_train_df.to_pickle('X_train_df')
np.save('y_train', y_train)

In [None]:
# Reclaim memory before building the next split.
gc.collect()

In [None]:
# Build validation features/targets for days 1914..1941.
X_val_df, y_val = build_lgbm_input_df(sales_train_eval, 1914, 1941)

In [None]:
# Persist the validation features and targets in the current directory.
X_val_df.to_pickle('X_val_df')
np.save('y_val', y_val)

In [None]:
X_train_full_df = pd.concat((X_train_df, X_val_df))

In [None]:
y_train_full = np.concatenate((y_train, y_val))

In [None]:
X_train_full_df.to_pickle('X_train_full_df')
np.save('y_train_full', y_train_full)

In [None]:
gc.collect()

In [None]:
with open('cat_feats.pkl', 'wb') as f:
    pickle.dump(cat_feats, f)

In [None]:
with open('cat_feats_name2idx.pkl', 'wb') as f:
    pickle.dump(cat_feats_name2idx, f)

In [None]:
with open('cat_feats_objs.pkl', 'wb') as f:
    pickle.dump(cat_feats_objs, f)

In [None]:
! ls -alrh