In [1]:
import numpy as np
import pandas as pd
import os
import random
import math
from decimal import Decimal as dec
import datetime
import time
import gc
import lightgbm as lgb
import pickle
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes, in place.

    Integer columns (int16/int32/int64) are cast to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max.
    Float columns (float16/float32/float64) are cast to float32 when their
    min and max fit the float32 range, otherwise to float64. Note that this
    means float16 columns are widened to float32.

    Columns of any other dtype are left untouched. Columns whose min/max are
    NaN (for example all-NaN columns) fail every range comparison: integer
    columns are then left as they are and float columns become float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the dtype limit still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid a division by zero if the frame reported no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
def load_data(data_dir=r"C:\Users\Lenovo\Downloads\m5-forecasting-accuracy"):
    """Read the four competition CSV files and shrink their dtypes.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``sales_train_evaluation.csv``,
        ``sell_prices.csv``, ``calendar.csv`` and ``sample_submission.csv``.
        The default is the location the notebook was originally written
        against, so calling ``load_data()`` with no argument still works.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, prices_df, calendar_df, submission_df)``, each passed
        through ``reduce_mem_usage`` (which prints a memory summary per file).
    """
    def _read(file_name):
        # os.path.join keeps the path valid regardless of the separator in data_dir.
        return reduce_mem_usage(pd.read_csv(os.path.join(data_dir, file_name)))

    train_df = _read("sales_train_evaluation.csv")
    prices_df = _read("sell_prices.csv")
    calendar_df = _read("calendar.csv")
    submission_df = _read("sample_submission.csv")
    return train_df, prices_df, calendar_df, submission_df

In [4]:
# Load the four M5 CSV files into module-level frames used by the cells below.
# Each file is passed through reduce_mem_usage, which prints one memory summary line per file.
train_df, prices_df, calendar_df, submission_df = load_data()

Mem. usage decreased to 96.13 Mb (78.8% reduction)
Mem. usage decreased to 143.53 Mb (31.2% reduction)
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Mem. usage decreased to  2.09 Mb (84.5% reduction)


In [5]:
def generate_base_grid(train_df, end_train_day_x, predict_horizon):
    """Build the long-format sales grid and write it, plus the hold-out slice, to feather.

    The wide ``train_df`` (one ``d_<n>`` column per day) is melted to one row
    per (series, day). Two files are written to the working directory:

    * ``holdout_df_<end>_to_<end+h>.feather`` - columns ``id``, ``d`` (integer
      day number) and ``sales`` for days in ``(end_train_day_x,
      end_train_day_x + predict_horizon]``.
    * ``grid_df_<end>_to_<end+h>.feather`` - every row with day number
      ``<= end_train_day_x`` followed by ``predict_horizon`` future days per
      series whose ``sales`` is NaN. Here ``d`` keeps its ``d_<n>`` string form
      and the id columns are stored as categoricals.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Wide sales frame holding the six id columns plus the day columns.
    end_train_day_x : int
        Last day number that belongs to the training period.
    predict_horizon : int
        Number of future days to append after ``end_train_day_x``.

    Returns
    -------
    None
    """
    index_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    last_day = end_train_day_x + predict_horizon

    grid_df = pd.melt(train_df, id_vars=index_columns, var_name='d', value_name='sales')
    grid_df = reduce_mem_usage(grid_df, verbose=False)

    # Numeric day index kept beside the frame, so 'd' itself never has to be
    # overwritten and then restored.
    day_number = grid_df['d'].str[2:].astype(np.int16)

    holdout_mask = (day_number > end_train_day_x) & (day_number <= last_day)
    holdout_df = (
        grid_df.loc[holdout_mask, ["id", "d", "sales"]]
        .assign(d=day_number[holdout_mask])
        .reset_index(drop=True)
    )
    holdout_df.to_feather(f"holdout_df_{end_train_day_x}_to_{last_day}.feather")
    del holdout_df
    gc.collect()

    grid_df = grid_df[day_number <= end_train_day_x]
    del day_number

    # One placeholder frame per future day, concatenated in a single call
    # instead of growing a frame inside the loop.
    series_keys = train_df[index_columns].drop_duplicates()
    future_frames = [
        series_keys.assign(d='d_' + str(end_train_day_x + offset), sales=np.nan)
        for offset in range(1, predict_horizon + 1)
    ]
    grid_df = pd.concat([grid_df] + future_frames).reset_index(drop=True)
    del future_frames

    for col in index_columns:
        grid_df[col] = grid_df[col].astype('category')

    grid_df = reduce_mem_usage(grid_df, verbose=False)
    grid_df.to_feather(f"grid_df_{end_train_day_x}_to_{last_day}.feather")
    del grid_df
    gc.collect()

In [6]:
def merge_by_concat(df1, df2, merge_on):
    """Left-join the columns of ``df2`` onto ``df1`` without changing ``df1``'s dtypes.

    Only the key columns of ``df1`` take part in the merge; the columns the
    merge brings in from ``df2`` are then attached to ``df1`` row by row. This
    keeps the existing columns of ``df1`` (for example categoricals) as they are.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Left frame. It is modified in place and also returned.
    df2 : pandas.DataFrame
        Right frame; it must contain the ``merge_on`` columns.
    merge_on : list of str
        Key column names present in both frames.

    Returns
    -------
    pandas.DataFrame
        ``df1`` with the non-key columns of ``df2`` added.

    Raises
    ------
    ValueError
        If the left join changes the number of rows (duplicate keys in
        ``df2``), because the new columns could not be lined up with ``df1``.
    """
    merged = df1[merge_on].merge(df2, on=merge_on, how='left')
    if len(merged) != len(df1):
        raise ValueError(
            "merge_by_concat: left join produced {} rows for {} input rows; "
            "df2 has duplicate keys for {}".format(len(merged), len(df1), merge_on)
        )
    # merge() returns a fresh 0..n-1 index; re-label it with df1's index so the
    # assignment below is positional even when df1 has a non-default index.
    merged.index = df1.index
    new_columns = [col for col in merged.columns if col not in merge_on]
    for col in new_columns:
        df1[col] = merged[col]
    return df1

In [7]:
def calc_release_week(prices_df, end_train_day_x, predict_horizon, calendar=None):
    """Add ``release`` and ``wm_yr_wk`` columns to the stored base grid.

    ``release`` is the smallest ``wm_yr_wk`` found in ``prices_df`` for each
    (store_id, item_id), shifted so that the smallest release in the grid is 0.
    ``wm_yr_wk`` is looked up from the calendar by the grid's ``d`` column.
    The grid feather file written by ``generate_base_grid`` is read and then
    overwritten with the extended frame.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Price table with ``store_id``, ``item_id`` and ``wm_yr_wk`` columns.
    end_train_day_x : int
        Last training day number (used only to locate the grid file).
    predict_horizon : int
        Prediction horizon in days (used only to locate the grid file).
    calendar : pandas.DataFrame, optional
        Calendar table with ``wm_yr_wk`` and ``d`` columns. When omitted, the
        module-level ``calendar_df`` is used, which is what this function
        always did before the argument existed.

    Returns
    -------
    None
    """
    if calendar is None:
        # Backward-compatible fallback to the notebook-level frame.
        calendar = calendar_df

    grid_path = f"grid_df_{end_train_day_x}_to_{end_train_day_x + predict_horizon}.feather"
    grid_df = pd.read_feather(grid_path)

    release_df = prices_df.groupby(['store_id', 'item_id'])['wm_yr_wk'].agg(['min']).reset_index()
    release_df.columns = ['store_id', 'item_id', 'release']

    grid_df = merge_by_concat(grid_df, release_df, ['store_id', 'item_id'])

    del release_df
    grid_df = reduce_mem_usage(grid_df, verbose=False)
    gc.collect()

    grid_df = merge_by_concat(grid_df, calendar[['wm_yr_wk', 'd']], ['d'])
    grid_df = grid_df.reset_index(drop=True)
    grid_df['release'] = grid_df['release'] - grid_df['release'].min()
    # NOTE(review): this cast raises if any (store, item) in the grid has no
    # row in prices_df, because 'release' would then contain NaN.
    grid_df['release'] = grid_df['release'].astype(np.int16)

    grid_df = reduce_mem_usage(grid_df, verbose=False)
    grid_df.to_feather(grid_path)
    del grid_df
    gc.collect()

In [8]:
def generate_grid_price(prices_df, calendar_df, end_train_day_x, predict_horizon):
    """Derive price features and write them, keyed by ``id``/``d``, to feather.

    Features computed per (store_id, item_id) unless stated otherwise:
    ``price_max``, ``price_min``, ``price_std``, ``price_mean``,
    ``price_norm`` (price / max price), ``price_nunique``, ``item_nunique``
    (distinct items sharing a price within a store), ``price_momentum``
    (price / previous row's price for the same item and store),
    ``price_momentum_m`` / ``price_momentum_y`` (price / mean price within the
    same calendar month number / year) and the fractional parts of the current,
    maximum and minimum price.

    The result is joined to the stored base grid on
    (store_id, item_id, wm_yr_wk) and saved as
    ``grid_price_<end>_to_<end+h>.feather`` containing ``id``, ``d`` and every
    column the join added.

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Price table with ``store_id``, ``item_id``, ``wm_yr_wk`` and
        ``sell_price``. It is not modified; all work happens on derived frames.
    calendar_df : pandas.DataFrame
        Calendar table with ``wm_yr_wk``, ``month`` and ``year``.
    end_train_day_x : int
        Last training day number (used to locate the grid file).
    predict_horizon : int
        Prediction horizon in days (used to locate the grid file).

    Returns
    -------
    None
    """
    suffix = f"{end_train_day_x}_to_{end_train_day_x + predict_horizon}"
    grid_df = pd.read_feather(f"grid_df_{suffix}.feather")

    # assign() returns a new frame, so the caller's prices_df keeps its
    # original columns. Every value below is computed from the input frame.
    item_prices = prices_df.groupby(['store_id', 'item_id'])['sell_price']
    price_max = item_prices.transform('max')
    prices_df = prices_df.assign(
        price_max=price_max,
        price_min=item_prices.transform('min'),
        price_std=item_prices.transform('std'),
        price_mean=item_prices.transform('mean'),
        price_norm=prices_df['sell_price'] / price_max,
        price_nunique=item_prices.transform('nunique'),
        item_nunique=prices_df.groupby(['store_id', 'sell_price'])['item_id'].transform('nunique'),
    )
    del item_prices, price_max

    # One (month, year) per week: the first calendar row of each wm_yr_wk wins.
    calendar_prices = calendar_df[['wm_yr_wk', 'month', 'year']]
    calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk'])
    prices_df = prices_df.merge(calendar_prices[['wm_yr_wk', 'month', 'year']], on=['wm_yr_wk'], how='left')

    del calendar_prices
    gc.collect()

    # Built-in grouped shift replaces the per-group Python lambda.
    prices_df['price_momentum'] = prices_df['sell_price'] / prices_df.groupby(
        ['store_id', 'item_id'])['sell_price'].shift(1)
    prices_df['price_momentum_m'] = prices_df['sell_price'] / prices_df.groupby(
        ['store_id', 'item_id', 'month'])['sell_price'].transform('mean')
    prices_df['price_momentum_y'] = prices_df['sell_price'] / prices_df.groupby(
        ['store_id', 'item_id', 'year'])['sell_price'].transform('mean')

    # Fractional part of each price, vectorised with numpy.modf.
    prices_df['sell_price_cent'] = np.modf(prices_df['sell_price'].to_numpy())[0]
    prices_df['price_max_cent'] = np.modf(prices_df['price_max'].to_numpy())[0]
    prices_df['price_min_cent'] = np.modf(prices_df['price_min'].to_numpy())[0]

    prices_df = prices_df.drop(columns=['month', 'year'])
    prices_df = reduce_mem_usage(prices_df, verbose=False)
    gc.collect()

    original_columns = list(grid_df)
    grid_df = grid_df.merge(prices_df, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
    del prices_df
    gc.collect()

    # Keep only what the price join contributed, plus the row keys.
    keep_columns = [col for col in list(grid_df) if col not in original_columns]
    grid_df = grid_df[['id', 'd'] + keep_columns]
    grid_df = reduce_mem_usage(grid_df, verbose=False)
    grid_df.to_feather(f"grid_price_{suffix}.feather")
    del grid_df
    gc.collect()

In [9]:
def get_moon_phase(d):
    """Return the moon-phase octant (0..7) for a ``YYYY-MM-DD`` date string.

    The date's offset in days from 2001-01-01 is converted to a lunation
    count with fixed decimal constants; the fractional part of that count is
    scaled to eight buckets and rounded to the nearest one. Per the original
    author's note, 0 is a new moon and 4 is a full moon.
    """
    elapsed = datetime.datetime.strptime(d, '%Y-%m-%d') - datetime.datetime(2001, 1, 1)
    elapsed_days = dec(elapsed.days) + dec(elapsed.seconds) / dec(86400)
    cycle_position = dec("0.20439731") + elapsed_days * dec("0.03386319269")
    cycle_fraction = cycle_position % dec(1)
    octant = math.floor(cycle_fraction * dec(8) + dec('0.5'))
    # Rounding can yield 8; the mask wraps it back to 0.
    return int(octant) & 7

In [12]:
def generate_grid_calendar(calendar_df, end_train_day_x, predict_horizon):
    """Derive calendar features for every grid row and write them to feather.

    The stored base grid is reduced to ``id``/``d`` and joined with the
    calendar on ``d``. The output file
    ``grid_calendar_<end>_to_<end+h>.feather`` contains, after ``id`` and ``d``:
    the event name/type and SNAP columns (as categoricals), ``moon`` (from
    ``get_moon_phase``), and the date parts ``tm_d`` (day of month), ``tm_w``
    (ISO week), ``tm_m`` (month), ``tm_y`` (year minus the smallest year in the
    grid), ``tm_wm`` (week of month, ``ceil(day / 7)``), ``tm_dw`` (day of week)
    and ``tm_w_end`` (1 when ``tm_dw >= 5``).

    Parameters
    ----------
    calendar_df : pandas.DataFrame
        Calendar table with ``date``, ``d``, the event columns and the SNAP
        columns. It is not modified.
    end_train_day_x : int
        Last training day number (used to locate the grid file).
    predict_horizon : int
        Prediction horizon in days (used to locate the grid file).

    Returns
    -------
    None
    """
    suffix = f"{end_train_day_x}_to_{end_train_day_x + predict_horizon}"
    grid_df = pd.read_feather(f"grid_df_{suffix}.feather")
    grid_df = grid_df[['id', 'd']]
    gc.collect()

    category_cols = ['event_name_1',
                     'event_type_1',
                     'event_name_2',
                     'event_type_2',
                     'snap_CA',
                     'snap_TX',
                     'snap_WI']

    # The moon phase goes onto a derived frame, not onto the caller's
    # calendar_df. 'moon' stays the last merged column.
    calendar_part = calendar_df[['date', 'd'] + category_cols].assign(
        moon=calendar_df['date'].apply(get_moon_phase))
    grid_df = grid_df.merge(calendar_part, on=['d'], how='left')
    del calendar_part

    for col in category_cols:
        grid_df[col] = grid_df[col].astype('category')

    grid_df['date'] = pd.to_datetime(grid_df['date'])
    grid_df['tm_d'] = grid_df['date'].dt.day.astype(np.int8)
    grid_df['tm_w'] = grid_df['date'].dt.isocalendar().week.astype(np.int8)
    grid_df['tm_m'] = grid_df['date'].dt.month.astype(np.int8)
    grid_df['tm_y'] = grid_df['date'].dt.year
    grid_df['tm_y'] = (grid_df['tm_y'] - grid_df['tm_y'].min()).astype(np.int8)
    # Vectorised ceil(day / 7) instead of a Python-level apply.
    grid_df['tm_wm'] = np.ceil(grid_df['tm_d'] / 7).astype(np.int8)
    grid_df['tm_dw'] = grid_df['date'].dt.dayofweek.astype(np.int8)
    grid_df['tm_w_end'] = (grid_df['tm_dw'] >= 5).astype(np.int8)

    del grid_df['date']
    grid_df = reduce_mem_usage(grid_df, verbose=False)
    grid_df.to_feather(f"grid_calendar_{suffix}.feather")

    del grid_df
    gc.collect()


In [13]:
def modify_grid_base(end_train_day_x, predict_horizon):
    """Rewrite the stored base grid with a numeric ``d`` and without ``wm_yr_wk``.

    The ``d`` labels lose their first two characters and are stored as int16;
    the ``wm_yr_wk`` column is removed. The grid feather file is overwritten.
    """
    grid_path = f"grid_df_{end_train_day_x}_to_{end_train_day_x + predict_horizon}.feather"

    base_grid = pd.read_feather(grid_path)
    base_grid['d'] = base_grid['d'].str[2:].astype(np.int16)
    base_grid = base_grid.drop(columns='wm_yr_wk')

    base_grid = reduce_mem_usage(base_grid, verbose=False)
    base_grid.to_feather(grid_path)

    del base_grid
    gc.collect()

In [None]:
def generate_lag_feature(end_train_day_x, predict_horizon):
    """Compute per-series lag and rolling sales features and write them to feather.

    For each ``id`` the following columns are derived from ``sales``:

    * 15 lag columns, for lags ``predict_horizon`` ..
      ``predict_horizon + 14`` days;
    * rolling mean and standard deviation over 7, 14, 30, 60 and 180 rows of
      the series shifted by ``predict_horizon``.

    Every feature is cast to float16 before being stored (``reduce_mem_usage``
    then widens the columns to float32). The output file
    ``lag_feature_<end>_to_<end+h>.feather`` holds ``id``, ``d``, ``sales``
    followed by the feature columns in the order listed above.

    Parameters
    ----------
    end_train_day_x : int
        Last training day number (used to locate the grid file).
    predict_horizon : int
        Prediction horizon in days; also the smallest lag that is used.

    Returns
    -------
    None
    """
    suffix = f"{end_train_day_x}_to_{end_train_day_x + predict_horizon}"
    grid_df = pd.read_feather(f"grid_df_{suffix}.feather")
    grid_df = grid_df[['id', 'd', 'sales']]

    num_lag_day = 15
    num_lag_day_list = list(range(predict_horizon, predict_horizon + num_lag_day))

    # NOTE(review): the column prefix used to come from a loop variable that
    # leaked out of the list-building loop, so it is the largest lag (e.g.
    # "21_lag_7" for a 7-day horizon), not a target name such as "sales". The
    # value is kept so the stored column names do not change; confirm with
    # downstream consumers before renaming.
    lag_prefix = num_lag_day_list[-1]

    sales_by_id = grid_df.groupby(['id'])['sales']

    # Built-in grouped shift instead of a Python lambda evaluated per series.
    grid_df = grid_df.assign(**{
        '{}_lag_{}'.format(lag_prefix, lag): sales_by_id.shift(lag).astype(np.float16)
        for lag in num_lag_day_list
    })

    num_rolling_day_list = [7, 14, 30, 60, 180]
    for num_rolling_day in num_rolling_day_list:
        # Shifting by the horizon first keeps each window on values that are
        # at least predict_horizon rows old.
        grid_df['rolling_mean_' + str(num_rolling_day)] = sales_by_id.transform(
            lambda x: x.shift(predict_horizon).rolling(num_rolling_day).mean()).astype(np.float16)
        grid_df['rolling_std_' + str(num_rolling_day)] = sales_by_id.transform(
            lambda x: x.shift(predict_horizon).rolling(num_rolling_day).std()).astype(np.float16)

    grid_df = reduce_mem_usage(grid_df, verbose=False)
    grid_df.to_feather(f"lag_feature_{suffix}.feather")

    del grid_df
    gc.collect()

In [None]:
def generate_target_encoding_feature(end_train_day_x, predict_horizon):
    """Write mean/std target encodings of ``sales`` for several id groupings.

    Sales on days later than ``end_train_day_x - predict_horizon`` are blanked
    to NaN first, so those rows do not contribute to the statistics. For every
    grouping below, the group mean and standard deviation of ``sales`` are
    broadcast back to each row as ``enc_<cols>_mean`` / ``enc_<cols>_std``
    (float16). The output ``target_encoding_<end>_to_<end+h>.feather`` holds
    ``id``, ``d`` and the encoding columns.
    """
    suffix = f"{end_train_day_x}_to_{end_train_day_x + predict_horizon}"
    frame = pd.read_feather(f"grid_df_{suffix}.feather")

    cutoff_day = end_train_day_x - predict_horizon
    frame.loc[frame['d'] > cutoff_day, 'sales'] = np.nan
    columns_before = list(frame)

    groupings = [
        ['state_id'],
        ['store_id'],
        ['cat_id'],
        ['dept_id'],
        ['state_id', 'cat_id'],
        ['state_id', 'dept_id'],
        ['store_id', 'cat_id'],
        ['store_id', 'dept_id'],
        ['item_id'],
        ['item_id', 'state_id'],
        ['item_id', 'store_id'],
    ]
    for keys in groupings:
        stem = 'enc_' + '_'.join(keys) + '_'
        for statistic in ('mean', 'std'):
            encoded = frame.groupby(keys)['sales'].transform(statistic)
            frame[stem + statistic] = encoded.astype(np.float16)

    encoding_columns = [name for name in list(frame) if name not in columns_before]
    frame = frame[['id', 'd'] + encoding_columns]
    frame = reduce_mem_usage(frame, verbose=False)
    frame.to_feather(f"target_encoding_{suffix}.feather")

    del frame
    gc.collect()

In [None]:
def assemble_grid_by_store(train_df, end_train_day_x, predict_horizon):
    """Combine all feature files and split the result into one feather file per store.

    The base grid, price features and calendar features are placed side by
    side (the ``id``/``d`` columns of the latter two are dropped by position),
    then split by ``store_id`` into ``grid_full_store_<store>_<end>_to_<end+h>.feather``.
    The row positions of each store are remembered and used to append, in two
    further passes over the store files, a subset of the target-encoding
    columns and then the lag features (without their first three columns).
    """
    suffix = f"{end_train_day_x}_to_{end_train_day_x + predict_horizon}"

    def _store_path(store):
        return f"grid_full_store_{store}_{suffix}.feather"

    base_part = pd.read_feather(f"grid_df_{suffix}.feather")
    price_part = pd.read_feather(f"grid_price_{suffix}.feather").iloc[:, 2:]
    calendar_part = pd.read_feather(f"grid_calendar_{suffix}.feather").iloc[:, 2:]
    full_grid = pd.concat([base_part, price_part, calendar_part], axis=1)
    del base_part, price_part, calendar_part
    gc.collect()

    stores = list(train_df['store_id'].unique())
    rows_by_store = {}
    for store in stores:
        store_rows = full_grid[full_grid['store_id'] == store]
        rows_by_store[store] = store_rows.index.to_numpy()
        store_rows.reset_index(drop=True).to_feather(_store_path(store))
    del full_grid
    gc.collect()

    def _append_to_store_files(extra_features):
        # Pick each store's rows by original grid position and attach them
        # column-wise to that store's file.
        for store in stores:
            store_df = pd.read_feather(_store_path(store))
            picked = extra_features[extra_features.index.isin(rows_by_store[store])]
            store_df = pd.concat([store_df, picked.reset_index(drop=True)], axis=1)
            store_df.to_feather(_store_path(store))

    mean_features = [
        'enc_cat_id_mean', 'enc_cat_id_std',
        'enc_dept_id_mean', 'enc_dept_id_std',
        'enc_item_id_mean', 'enc_item_id_std'
    ]
    encoding_part = pd.read_feather(f"target_encoding_{suffix}.feather")[mean_features]
    _append_to_store_files(encoding_part)
    del encoding_part
    gc.collect()

    lag_part = pd.read_feather(f"lag_feature_{suffix}.feather").iloc[:, 3:]
    _append_to_store_files(lag_part)
    del lag_part
    del stores
    gc.collect()

In [None]:
def load_grid_by_store(end_train_day_x, predict_horizon, store_id):
    """Load one store's assembled grid and list its model feature columns.

    Returns a tuple ``(df, enable_features)`` where ``enable_features`` is
    every column of the store file except the identifier/target columns
    excluded below, and ``df`` contains ``id``, ``d``, ``sales`` followed by
    those features, passed through ``reduce_mem_usage``.
    """
    store_file = f"grid_full_store_{store_id}_{end_train_day_x}_to_{end_train_day_x + predict_horizon}.feather"
    store_grid = pd.read_feather(store_file)

    excluded = ('id', 'state_id', 'store_id', 'date', 'wm_yr_wk', 'd', 'sales')
    enable_features = [name for name in store_grid.columns if name not in excluded]

    store_grid = store_grid[['id', 'd', 'sales'] + enable_features]
    store_grid = reduce_mem_usage(store_grid, verbose=False)
    gc.collect()

    return store_grid, enable_features

In [None]:
def train(train_df, seed, end_train_day_x, predict_horizon):
    """Train one LightGBM model per store and save models, test slices and importances.

    For every distinct ``store_id`` in ``train_df`` the assembled store grid is
    loaded and a model is fitted on all rows with ``d <= end_train_day_x``.
    The last ``predict_horizon`` days of that same range are passed as the
    validation set, so the logged metric is computed on rows the model was
    trained on.

    Files written to the working directory:

    * ``test_<store>_<h>.feather`` - rows with ``d > end_train_day_x - 100``;
    * ``lgb_model_<store>_<h>.bin`` - the pickled booster;
    * ``feature_importance_<store>_<h>.csv`` - per-store importances;
    * ``feature_importance_all_<h>.csv`` and ``feature_importance_agg_<h>.csv``
      - importances of all stores, and their mean/std per feature.

    NOTE(review): none of these names contain ``end_train_day_x``, so runs with
    different end days but the same horizon overwrite each other's files.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Only its ``store_id`` column is read, to enumerate the stores.
    seed : int
        Seed applied to ``random``, ``numpy.random``, ``PYTHONHASHSEED`` and
        the LightGBM ``seed`` parameter.
    end_train_day_x : int
        Last day number used for training.
    predict_horizon : int
        Prediction horizon in days.

    Returns
    -------
    None
    """
    lgb_params = {
        'boosting_type': 'goss',
        'objective': 'tweedie',
        'tweedie_variance_power': 1.1,
        'metric': 'rmse',
        #'subsample': 0.5,
        #'subsample_freq': 1,
        'learning_rate': 0.03,
        'num_leaves': 2 ** 11 - 1,
        'min_data_in_leaf': 2 ** 12 - 1,
        'feature_fraction': 0.5,
        'max_bin': 100,
        'boost_from_average': False,
        'num_boost_round': 1400,
        'verbose': -1,
        'num_threads': os.cpu_count(),
        'force_row_wise': True,
    }
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    lgb_params['seed'] = seed
    store_id_set_list = list(train_df['store_id'].unique())
    print(f"training stores: {store_id_set_list}")

    # Collected per store and concatenated once after the loop.
    feature_importance_frames = []
    for store_index, store_id in enumerate(store_id_set_list):
        print(f'now training {store_id} store')
        grid_df, enable_features = load_grid_by_store(end_train_day_x, predict_horizon, store_id)
        train_mask = grid_df['d'] <= end_train_day_x
        valid_mask = train_mask & (grid_df['d'] > (end_train_day_x - predict_horizon))
        preds_mask = grid_df['d'] > (end_train_day_x - 100)
        train_data = lgb.Dataset(grid_df[train_mask][enable_features],
                                 label=grid_df[train_mask]['sales'])
        valid_data = lgb.Dataset(grid_df[valid_mask][enable_features],
                                 label=grid_df[valid_mask]['sales'])
        # Keep the most recent rows (all columns) on disk for the later
        # prediction step.
        grid_df = grid_df[preds_mask].reset_index(drop=True)
        grid_df.to_feather(f'test_{store_id}_{predict_horizon}.feather')
        del grid_df
        gc.collect()

        estimator = lgb.train(lgb_params,
                              train_data,
                              valid_sets=[valid_data],
                              callbacks=[lgb.log_evaluation(period=100, show_stdv=False)],
                              )
        model_name = f'lgb_model_{store_id}_{predict_horizon}.bin'
        feature_importance_store_df = pd.DataFrame(sorted(zip(enable_features, estimator.feature_importance())),
                                                   columns=['feature_name', 'importance'])
        feature_importance_store_df = feature_importance_store_df.sort_values('importance', ascending=False)
        feature_importance_store_df['store_id'] = store_id
        feature_importance_store_df.to_csv(f'feature_importance_{store_id}_{predict_horizon}.csv', index=False)
        feature_importance_frames.append(feature_importance_store_df)
        # The context manager closes the file even if pickling fails.
        with open(model_name, 'wb') as model_file:
            pickle.dump(estimator, model_file)
        del train_data, valid_data, estimator
        gc.collect()

    if feature_importance_frames:
        feature_importance_all_df = pd.concat(feature_importance_frames)
    else:
        feature_importance_all_df = pd.DataFrame()
    feature_importance_all_df.to_csv(f'feature_importance_all_{predict_horizon}.csv', index=False)
    feature_importance_agg_df = feature_importance_all_df.groupby(
        'feature_name')['importance'].agg(['mean', 'std']).reset_index()
    feature_importance_agg_df.columns = ['feature_name', 'importance_mean', 'importance_std']
    feature_importance_agg_df = feature_importance_agg_df.sort_values('importance_mean', ascending=False)
    feature_importance_agg_df.to_csv(f'feature_importance_agg_{predict_horizon}.csv', index=False)

In [None]:
def train_pipeline(train_df, prices_df, calendar_df,
                   end_train_day_x_list, prediction_horizon_list,
                   random_seed=None):
    """Run feature generation and per-store training for every (end day, horizon) pair.

    For each combination the feature files are rebuilt from scratch in the
    working directory (base grid, release week, price, calendar, lag and
    target-encoding features, per-store assembly) and then ``train`` is called.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Wide sales frame.
    prices_df : pandas.DataFrame
        Price table.
    calendar_df : pandas.DataFrame
        Calendar table.
    end_train_day_x_list : iterable of int
        Last training day numbers to process.
    prediction_horizon_list : iterable of int
        Prediction horizons (in days) to process for each end day.
    random_seed : int, optional
        Seed handed to ``train``. When omitted, the module-level ``seed``
        variable is used, which is how this function obtained the seed before
        the argument existed; a ``NameError`` is raised if neither is set.

    Returns
    -------
    None
    """
    if random_seed is None:
        # Backward-compatible fallback to the notebook-level variable.
        random_seed = seed

    for end_train_day_x in end_train_day_x_list:

        for predict_horizon in prediction_horizon_list:

            print(f"end training point day: {end_train_day_x} - prediction horizon: {predict_horizon} days")
            # Data preparation: each step reads and writes feather files named
            # after the (end day, horizon) pair, so the order matters.
            generate_base_grid(train_df, end_train_day_x, predict_horizon)
            calc_release_week(prices_df, end_train_day_x, predict_horizon)
            generate_grid_price(prices_df, calendar_df, end_train_day_x, predict_horizon)
            generate_grid_calendar(calendar_df, end_train_day_x, predict_horizon)
            modify_grid_base(end_train_day_x, predict_horizon)
            generate_lag_feature(end_train_day_x, predict_horizon)
            generate_target_encoding_feature(end_train_day_x, predict_horizon)
            assemble_grid_by_store(train_df, end_train_day_x, predict_horizon)
            # Modelling
            train(train_df, random_seed, end_train_day_x, predict_horizon)

In [None]:
# 7-day horizon: rebuild the feature files and train the per-store models on data up to day 1941.
# `seed` is defined at module level because train_pipeline reads it as a global rather than as an argument.
end_train_day_x_list = [1941]
prediction_horizon_list = [7]
seed = 42
train_pipeline(train_df, prices_df, calendar_df, end_train_day_x_list, prediction_horizon_list)

In [None]:
# 14-day horizon: same pipeline as the previous cell, with only the horizon changed.
# `seed` is defined at module level because train_pipeline reads it as a global rather than as an argument.
end_train_day_x_list = [1941]
prediction_horizon_list = [14]
seed = 42
train_pipeline(train_df, prices_df, calendar_df, end_train_day_x_list, prediction_horizon_list)

In [None]:
# 21-day horizon: same pipeline as the previous cells, with only the horizon changed.
# `seed` is defined at module level because train_pipeline reads it as a global rather than as an argument.
end_train_day_x_list = [1941]
prediction_horizon_list = [21]
seed = 42
train_pipeline(train_df, prices_df, calendar_df, end_train_day_x_list, prediction_horizon_list)

In [None]:
# 28-day horizon: same pipeline as the previous cells, with only the horizon changed.
# `seed` is defined at module level because train_pipeline reads it as a global rather than as an argument.
end_train_day_x_list = [1941]
prediction_horizon_list = [28]
seed = 42
train_pipeline(train_df, prices_df, calendar_df, end_train_day_x_list, prediction_horizon_list)