In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/m5-forecasting-calendar-events-minfeats/events.pkl
/kaggle/input/m5-forecasting-calendar-events-minfeats/custom.css
/kaggle/input/m5-forecasting-calendar-events-minfeats/__notebook__.ipynb
/kaggle/input/m5-forecasting-calendar-events-minfeats/__results__.html
/kaggle/input/m5-forecasting-calendar-events-minfeats/__output__.json
/kaggle/input/m5-forecasting-calendar-events-minfeats/calendar_events.csv
/kaggle/input/m5-forecasting-accuracy/sell_prices.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv
/kaggle/input/m5-forecasting-accuracy/sample_submission.csv
/kaggle/input/m5-forecasting-accuracy/calendar.csv
/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv


In [2]:
from collections import defaultdict
import gc
import sys
import pickle

In [3]:
class Category:
    def __init__(self, unique_items, item_to_id={}):
        self.items = unique_items
        self.item_to_id = item_to_id
        for _id, item_name in enumerate(self.items):
            self.item_to_id[item_name] = _id
        self.type = np.int16
        if len(self.items) > 30000:
            self.type = np.int32
    
    def encode_series(self, items):
        return np.array([self.item_to_id[_item] for _item in items]).astype(self.type)

In [4]:
sales_train_val = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_validation.csv')

In [5]:
sales_train_eval = pd.read_csv('../input/m5-forecasting-accuracy/sales_train_evaluation.csv')

In [6]:
sales_train_val.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1904', 'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910',
       'd_1911', 'd_1912', 'd_1913'],
      dtype='object', length=1919)

In [7]:
cat_feats = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

In [8]:
cat_feats_name2idx = {ft:i for i, ft in enumerate(cat_feats)}

In [9]:
cat_feats_objs = []
for col in cat_feats:
    cat_feats_objs.append(Category(sorted(sales_train_eval[col].unique())))

In [10]:
sell_prices = pd.read_csv('../input/m5-forecasting-accuracy/sell_prices.csv')

In [11]:
sell_prices.set_index(['item_id', 'store_id', 'wm_yr_wk'], inplace=True)

In [12]:
calendar = pd.read_csv('../input/m5-forecasting-calendar-events-minfeats/calendar_events.csv', parse_dates=['date'])

In [13]:
calendar = calendar.iloc[:,1:]

In [14]:
calendar.set_index('d', inplace=True)

In [15]:
calendar

Unnamed: 0_level_0,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,...,LentStart_lag,Thanksgiving_lag,Purim End_lag,VeteransDay_lag,Ramadan starts_lag,Mother's day_lag,NBAFinalsStart_lag,Easter_lag,NBAFinalsEnd_lag,LaborDay_lag
d,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d_1,2011-01-29,11101,Saturday,1,1,2011,,,,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
d_2,2011-01-30,11101,Sunday,2,1,2011,,,,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
d_3,2011-01-31,11101,Monday,3,1,2011,,,,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
d_4,2011-02-01,11101,Tuesday,4,2,2011,,,,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
d_5,2011-02-02,11101,Wednesday,5,2,2011,,,,,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
d_1965,2016-06-15,11620,Wednesday,5,6,2016,,,,,...,126,202,83,217,8,38,13,80,365,282
d_1966,2016-06-16,11620,Thursday,6,6,2016,,,,,...,127,203,84,218,9,39,14,81,366,283
d_1967,2016-06-17,11620,Friday,7,6,2016,,,,,...,128,204,85,219,10,40,15,82,367,284
d_1968,2016-06-18,11621,Saturday,1,6,2016,,,,,...,129,205,86,220,11,41,16,83,368,285


In [16]:
col_names = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'day_id',
       'week_id', 'year', 'month', 'day_of_month', 'day_of_week', 'snap',
       'sell_price']

In [17]:
with open('../input/m5-forecasting-calendar-events-minfeats/events.pkl', 'rb') as f:
    events = pickle.load(f)

In [18]:
event_feats = []
for event in events:
    event_feats.append(event+"_lag")

In [19]:
for ef in event_feats:
    calendar[ef] = calendar[ef].astype(np.int16)

In [20]:
col_names = col_names + event_feats

In [21]:
len(col_names)

43

In [22]:
col_names

['item_id',
 'dept_id',
 'cat_id',
 'store_id',
 'state_id',
 'day_id',
 'week_id',
 'year',
 'month',
 'day_of_month',
 'day_of_week',
 'snap',
 'sell_price',
 'Chanukah End_lag',
 'SuperBowl_lag',
 'StPatricksDay_lag',
 'LaborDay_lag',
 'Pesach End_lag',
 'PresidentsDay_lag',
 'NewYear_lag',
 'ColumbusDay_lag',
 'Ramadan starts_lag',
 'Purim End_lag',
 'LentWeek2_lag',
 'Thanksgiving_lag',
 'IndependenceDay_lag',
 'Christmas_lag',
 'NBAFinalsStart_lag',
 "Father's day_lag",
 'LentStart_lag',
 'EidAlAdha_lag',
 'VeteransDay_lag',
 'MartinLutherKingDay_lag',
 'Cinco De Mayo_lag',
 'Halloween_lag',
 'Easter_lag',
 'NBAFinalsEnd_lag',
 'OrthodoxChristmas_lag',
 'ValentinesDay_lag',
 'MemorialDay_lag',
 'Eid al-Fitr_lag',
 'OrthodoxEaster_lag',
 "Mother's day_lag"]

In [23]:
def show_progress(progress_str, n_cols=80):
    sys.stdout.write(progress_str + (" " * max(0, n_cols - len(progress_str))) + "\r")
    sys.stdout.flush()

In [24]:
#d_start=1900
#d_end=1906

def build_lgbm_input_df(sales_df, d_start, d_end, col_names=col_names):
    item_store_df = pd.DataFrame({ cat_feats[i]:cat_feats_objs[i].encode_series(sales_df[cat_feats[i]]) for i in range(len(cat_feats)) })
    all_cols = defaultdict(list)
    sales = []
    for d in range(d_start, d_end+1):
        show_progress(f"Processing day {d} of {d_end}")
        sales.append(sales_df['d_'+str(d)].values)
        for feat in cat_feats:
            all_cols[feat].append(item_store_df[feat].values)
        all_cols['day_id'].append(np.array([d] * len(item_store_df)).astype(np.int16))
        wk_id = calendar.loc['d_'+str(d), 'wm_yr_wk']
        all_cols['week_id'].append(np.array([wk_id] * len(item_store_df)).astype(np.int16))
        year = calendar.loc['d_'+str(d),'year']
        all_cols['year'].append(np.array([year] * len(item_store_df)).astype(np.int16))
        month = calendar.loc['d_'+str(d),'month']
        all_cols['month'].append(np.array([month] * len(item_store_df)).astype(np.int8))
        day_of_month = calendar.loc['d_'+str(d),'date'].day
        all_cols['day_of_month'].append(np.array([day_of_month] * len(item_store_df)).astype(np.int8))
        day_of_week = calendar.loc['d_'+str(d), 'wday']
        all_cols['day_of_week'].append(np.array([day_of_week] * len(item_store_df)).astype(np.int8))
        state = [cat_feats_objs[cat_feats_name2idx['state_id']].items[s_id] for s_id in item_store_df.state_id]
        all_cols['snap'].append(np.array([calendar.loc['d_'+str(d), 'snap_'+state[i]] for i in range(len(item_store_df))]).astype(np.int8))
        sp = []
        for i in range(len(item_store_df)):
            try:
                _item = cat_feats_objs[cat_feats_name2idx['item_id']].items[item_store_df.iloc[i,0]]
                _store = cat_feats_objs[cat_feats_name2idx['store_id']].items[item_store_df.iloc[i,3]]
                _sp = sell_prices.loc[(_item,_store,wk_id)].values[0]
            except:
                _sp = -1
            sp.append(_sp)
        all_cols['sell_price'].append(np.array(sp).astype(np.float32))
        for ef in event_feats:
            all_cols[ef].append(np.array([calendar.loc['d_'+str(d), ef]] * len(item_store_df)))

    print("")
    all_cols_temp = {}
    for k,v in all_cols.items():
        all_cols_temp[k] = np.concatenate(v)
    all_cols = all_cols_temp
    del all_cols_temp
    gc.collect()
    return pd.DataFrame(all_cols)[col_names], np.concatenate(sales).astype(np.float32)

In [25]:
# train_train_split
# upto -28 days
# ~90 days
# d_end: 1885
# d_start: 1795
%time X_train_df, y_train = build_lgbm_input_df(sales_train_eval, 1100, 1913)

Processing day 1913 of 1913                                                     
CPU times: user 1h 57min 7s, sys: 9.02 s, total: 1h 57min 16s
Wall time: 1h 57min 17s


In [26]:
X_train_df.to_pickle('X_train_df')
np.save('y_train', y_train)

In [27]:
gc.collect()

20

In [28]:
X_val_df, y_val = build_lgbm_input_df(sales_train_eval, 1914, 1941)

Processing day 1941 of 1941                                                     


In [29]:
X_val_df.to_pickle('X_val_df')
np.save('y_val', y_val)

In [30]:
X_train_full_df = pd.concat((X_train_df, X_val_df))

In [31]:
y_train_full = np.concatenate((y_train, y_val))

In [32]:
X_train_full_df.to_pickle('X_train_full_df')
np.save('y_train_full', y_train_full)

In [33]:
gc.collect()

100

In [34]:
with open('cat_feats.pkl', 'wb') as f:
    pickle.dump(cat_feats, f)

In [35]:
with open('cat_feats_name2idx.pkl', 'wb') as f:
    pickle.dump(cat_feats_name2idx, f)

In [36]:
with open('cat_feats_objs.pkl', 'wb') as f:
    pickle.dump(cat_feats_objs, f)

In [37]:
! ls -alrh

total 4.4G
-rw-r--r-- 1 root root 3.3M Jun 25 20:22 y_val.npy
-rw-r--r-- 1 root root  98M Jun 25 20:22 y_train_full.npy
-rw-r--r-- 1 root root  95M Jun 25 20:18 y_train.npy
-rw-r--r-- 1 root root  91K Jun 25 20:22 cat_feats_objs.pkl
-rw-r--r-- 1 root root   89 Jun 25 20:22 cat_feats_name2idx.pkl
-rw-r--r-- 1 root root   79 Jun 25 20:22 cat_feats.pkl
---------- 1 root root  11K Jun 25 18:20 __notebook__.ipynb
-rw-r--r-- 1 root root  69M Jun 25 20:22 X_val_df
-rw-r--r-- 1 root root 2.2G Jun 25 20:22 X_train_full_df
-rw-r--r-- 1 root root 2.0G Jun 25 20:18 X_train_df
drwxr-xr-x 6 root root 4.0K Jun 25 18:20 ..
drwxr-xr-x 2 root root 4.0K Jun 25 20:22 .
