### Import Dependencies

In [1]:
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random

# custom imports
from multiprocessing import Pool        # Multiprocess Runs

# Silence every Python warning for the rest of the notebook run.
warnings.filterwarnings('ignore')

# Run mode. It selects END_TRAIN in the config cell and is embedded in the
# submission file name. Values handled by the config cell:
# 'validmyself', 'validation', 'evaluation'.
# TYPE = 'validation'
TYPE = 'evaluation'


### Helpers

In [1]:
# seed
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Value passed unchanged to ``random.seed`` and then
            ``np.random.seed``.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    
# multiprocess runs
def df_parallelize_run(func, t_split):
    """Run ``func`` over ``t_split`` in a process pool and join the results.

    Args:
        func: Callable applied to each item of ``t_split`` via ``Pool.map``;
            each call is expected to return a pandas object that can be
            concatenated column-wise.
        t_split: Sequence of work items, one ``func`` call per item.

    Returns:
        ``pd.concat(results, axis=1)`` with the results in ``t_split`` order,
        or an empty DataFrame when ``t_split`` is empty.
    """
    if len(t_split) == 0:
        # Pool(0) raises ValueError and there would be nothing to concatenate.
        return pd.DataFrame()

    # Never start more workers than there are work items.
    num_cores = int(min(N_CORES, len(t_split)))
    pool = Pool(num_cores)
    try:
        results = pool.map(func, t_split)
    finally:
        # Release the worker processes even when one of the tasks raised.
        pool.close()
        pool.join()
    return pd.concat(results, axis=1)

### Read data

In [3]:
def get_data_by_store(store):
    """Assemble the feature grid for a single store.

    Reads the BASE, PRICE and CALENDAR pickles, keeps the rows whose
    ``store_id`` equals ``store`` and appends the mean-encoding and lag
    features for those same rows.

    Args:
        store: Value compared against the ``store_id`` column.

    Returns:
        Tuple ``(df, features)``. ``df`` holds ``id``, ``d``, TARGET and the
        feature columns for rows with ``d >= START_TRAIN`` (index reset).
        ``features`` lists every column not named in ``remove_features``.
    """
    # Read and concat the basic feature grids; the first two columns of the
    # PRICE and CALENDAR grids are skipped.
    store_grid = pd.concat(
        [
            pd.read_pickle(BASE),
            pd.read_pickle(PRICE).iloc[:, 2:],
            pd.read_pickle(CALENDAR).iloc[:, 2:],
        ],
        axis=1,
    )

    # Keep only the requested store
    store_grid = store_grid[store_grid['store_id'] == store]

    # Because of memory limits the mean-encoding and lag features are read
    # separately and cut down to the rows of this store right away.
    # The feature grids are aligned, so the index alone identifies the rows
    # to keep, and concat uses less memory than merge.
    mean_enc = pd.read_pickle(MEAN_ENC)[mean_features]
    mean_enc = mean_enc[mean_enc.index.isin(store_grid.index)]

    lags = pd.read_pickle(LAGS).iloc[:, 3:]
    lags = lags[lags.index.isin(store_grid.index)]

    store_grid = pd.concat([store_grid, mean_enc], axis=1)
    del mean_enc  # free memory before the next concat

    store_grid = pd.concat([store_grid, lags], axis=1)
    del lags  # free memory

    # Every column that is not explicitly excluded is a model feature
    features = [name for name in store_grid.columns
                if name not in remove_features]
    store_grid = store_grid[['id', 'd', TARGET] + features]

    # Skip the rows before START_TRAIN
    from_start = store_grid['d'] >= START_TRAIN
    store_grid = store_grid[from_start].reset_index(drop=True)

    return store_grid, features

# recombine Test set after training
def get_base_test():
    """Rebuild the combined test frame from the per-store pickles.

    Reads ``'test_<store_id>.pkl'`` for every id in ``STORES_IDS``, tags each
    frame with its ``store_id`` and stacks the frames in that order.

    Returns:
        DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        ``STORES_IDS`` is empty.
    """
    store_frames = []
    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        store_frames.append(temp_df)

    if not store_frames:
        return pd.DataFrame()

    # Concatenate once; growing the frame inside the loop re-copies all
    # previously stacked rows on every iteration.
    return pd.concat(store_frames).reset_index(drop=True)


# helper to make dynamic rolling lags
def make_lag(LAG_DAY):
    """Build the per-``id`` lag of TARGET, shifted by ``LAG_DAY`` rows.

    Reads the module-level ``base_test`` frame and leaves it unmodified.
    Rows are shifted in their existing order within each ``id`` group.

    Args:
        LAG_DAY: Number of rows to shift within each ``id`` group.

    Returns:
        Single-column float16 DataFrame named ``'sales_lag_<LAG_DAY>'`` on
        the index of ``base_test``.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # Built-in groupby shift: no Python-level lambda per group and no column
    # assignment into a slice of base_test.
    lagged = base_test.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return lagged.to_frame(name=col_name)


def make_lag_roll(LAG_DAY):
    """Build a shifted rolling mean of TARGET within each ``id`` group.

    Reads the module-level ``base_test`` frame and leaves it unmodified.

    Args:
        LAG_DAY: Indexable pair; ``LAG_DAY[0]`` is the number of rows to
            shift, ``LAG_DAY[1]`` is the rolling window length.

    Returns:
        Single-column DataFrame named
        ``'rolling_mean_tmp_<shift>_<window>'`` on the index of ``base_test``.
    """
    shift_day = LAG_DAY[0]
    roll_wind = LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)
    # Compute the series directly instead of assigning a new column into a
    # slice of base_test (which triggers SettingWithCopy behaviour).
    rolled = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).rolling(roll_wind).mean()
    )
    return rolled.to_frame(name=col_name)

### Model Parameters

In [4]:
import lightgbm as lgb
# Parameters handed to lgb.train for every store model.
# The 'seed' entry is added later, in the config cell.
lgb_params = dict(
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    metric='rmse',
    subsample=0.5,
    subsample_freq=1,
    learning_rate=0.03,
    num_leaves=2**11 - 1,          # 2047
    min_data_in_leaf=2**12 - 1,    # 4095
    feature_fraction=0.5,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbose=-1,
)



### Features

In [5]:
VER = 1                          # Our model version
SEED = 42                        # We want all things
seed_everything(SEED)            # to be as deterministic
lgb_params['seed'] = SEED        # as possible
N_CORES = psutil.cpu_count()     # Available CPU cores


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)

# Last day in train set for each supported run TYPE
END_TRAIN_BY_TYPE = {
    'validmyself': 1913-28,
    'validation':  1913,
    'evaluation':  1913+28,
}
if TYPE not in END_TRAIN_BY_TYPE:
    # Fail right here: merely printing would leave END_TRAIN undefined, or
    # silently reuse a stale value from an earlier run in the same kernel.
    raise ValueError('WRONG!!! Unknown TYPE %r, expected one of %s'
                     % (TYPE, sorted(END_TRAIN_BY_TYPE)))
END_TRAIN = END_TRAIN_BY_TYPE[TYPE]


P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = False              # Use or not pretrained models

#FEATURES to remove
## These features lead to overfit
## or values not present in test set
# Columns excluded when the model feature list is built
remove_features = [
    'id',
    'state_id',
    'store_id',
    'date',
    'wm_yr_wk',
    'd',
    TARGET,
]
# Columns selected from the MEAN_ENC pickle
mean_features = [
    'enc_cat_id_mean',
    'enc_cat_id_std',
    'enc_dept_id_mean',
    'enc_dept_id_std',
    'enc_item_id_mean',
    'enc_item_id_std',
]

#PATHS for Features
ORIGINAL = '../input/m5-forecasting-accuracy/'
BASE     = 'grid_part_1.pkl'
PRICE    = 'grid_part_2.pkl'
CALENDAR = 'grid_part_3.pkl'
LAGS     = 'lags_df_28.pkl'
MEAN_ENC = 'mean_encoding_df.pkl'


# AUX(pretrained) Models paths
AUX_MODELS = '../input/m5-aux-models/'


#STORES ids
# Only the store_id column is used, so the other CSV columns are not parsed.
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv',
                         usecols=['store_id'])['store_id']
STORES_IDS = list(STORES_IDS.unique())


#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
# Lag offsets SHIFT_DAY .. SHIFT_DAY+N_LAGS-1
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY+N_LAGS))
# [shift, window] pairs, one per rolling-mean feature built by make_lag_roll
ROLS_SPLIT = [[shift, window]
              for shift in [1,7,14]
              for window in [7,14,30,60]]

### Fit Models 

In [7]:
for store_id in STORES_IDS:
    print('Train', store_id)

    # Get grid for current store
    grid_df, features_columns = get_data_by_store(store_id)

    # Masks for
    # Train (all days up to and including END_TRAIN)
    # "Validation" (last P_HORIZON days of the train rows - not a real
    #               validation set, the model is trained on these rows too)
    # Predict (all days after END_TRAIN-100: the rows still to be predicted
    #          plus some history for the recursive features)
    train_mask = grid_df['d']<=END_TRAIN
    valid_mask = train_mask&(grid_df['d']>(END_TRAIN-P_HORIZON))
    preds_mask = grid_df['d']>(END_TRAIN-100)

    # Apply masks and save lgb dataset as bin
    # to reduce memory spikes during dtype conversions
    # https://github.com/Microsoft/LightGBM/issues/1032
    # "To avoid any conversions, you should always use np.float32"
    # or save to bin before start training
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/53773
    train_data = lgb.Dataset(grid_df[train_mask][features_columns],
                       label=grid_df[train_mask][TARGET])
    train_data.save_binary('train_data.bin')
    train_data = lgb.Dataset('train_data.bin')

    valid_data = lgb.Dataset(grid_df[valid_mask][features_columns],
                       label=grid_df[valid_mask][TARGET])

    # Saving part of the dataset for later predictions
    # Removing features that we need to calculate recursively
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('test_'+store_id+'.pkl')
    del grid_df

    # Launch seeder again to make lgb training 100% deterministic
    # with each "code line" np.random "evolves"
    # so we need (may want) to "reset" it
    seed_everything(SEED)
    # NOTE(review): `verbose_eval` is not accepted by lgb.train in
    # LightGBM >= 4.0 (callbacks=[lgb.log_evaluation(100)] replaces it);
    # confirm the installed version before upgrading.
    estimator = lgb.train(lgb_params,
                          train_data,
                          valid_sets = [valid_data],
                          verbose_eval = 100,
                          )

    # Save model - it's not real '.bin' but a pickle file
    # estimator = lgb.Booster(model_file='model.txt')
    # can only predict with the best iteration (or the saving iteration)
    # pickle.dump gives us more flexibility
    # like estimator.predict(TEST, num_iteration=100)
    # num_iteration - number of iteration want to predict with,
    # NULL or <= 0 means use best iteration
    model_name = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
    # Context manager so the file is flushed and closed before the next store
    with open(model_name, 'wb') as model_file:
        pickle.dump(estimator, model_file)

    # Remove temporary files and objects
    # to free some hdd space and ram memory
    # (pure Python instead of a shell escape, so it also runs outside IPython)
    os.remove('train_data.bin')
    del train_data, valid_data, estimator
    gc.collect()

    # "Keep" models features for predictions
    MODEL_FEATURES = features_columns

Train CA_1
[100]	valid_0's rmse: 2.01638
[200]	valid_0's rmse: 1.98408
[300]	valid_0's rmse: 1.97417
[400]	valid_0's rmse: 1.96781
[500]	valid_0's rmse: 1.96263
[600]	valid_0's rmse: 1.95721
[700]	valid_0's rmse: 1.95204
[800]	valid_0's rmse: 1.94672
[900]	valid_0's rmse: 1.94219
[1000]	valid_0's rmse: 1.93713
[1100]	valid_0's rmse: 1.93219
[1200]	valid_0's rmse: 1.92775
[1300]	valid_0's rmse: 1.92265
[1400]	valid_0's rmse: 1.91802
Train CA_2
[100]	valid_0's rmse: 1.94672
[200]	valid_0's rmse: 1.88961
[300]	valid_0's rmse: 1.87449
[400]	valid_0's rmse: 1.86686
[500]	valid_0's rmse: 1.86041
[600]	valid_0's rmse: 1.85396
[700]	valid_0's rmse: 1.84797
[800]	valid_0's rmse: 1.84255
[900]	valid_0's rmse: 1.83811
[1000]	valid_0's rmse: 1.83284
[1100]	valid_0's rmse: 1.82821
[1200]	valid_0's rmse: 1.82388
[1300]	valid_0's rmse: 1.81953
[1400]	valid_0's rmse: 1.8152
Train CA_3
[100]	valid_0's rmse: 2.38892
[200]	valid_0's rmse: 2.34715
[300]	valid_0's rmse: 2.33069
[400]	valid_0's rmse: 2.32
[

### Predict

In [8]:
# Create Dummy DataFrame to store predictions
all_preds = pd.DataFrame()

# Join back the Test dataset with
# a small part of the training data
# to make recursive features
base_test = get_base_test()

# Timer to measure predictions time
main_time = time.time()

# Loop over each prediction day
# As rolling lags are the most time-consuming
# we calculate them once per day for all stores
for PREDICT_DAY in range(1, P_HORIZON+1):
    print('Predict | Day:', PREDICT_DAY)
    start_time = time.time()

    # Make temporary grid to calculate rolling lags
    # (make_lag_roll reads base_test, which already holds the
    #  predictions written for the previous days)
    grid_df = base_test.copy()
    grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)

    # Rows of the day being predicted. The mask does not depend on the store,
    # and it is needed again after the store loop.
    day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)

    for store_id in STORES_IDS:

        # Read all our models and make predictions
        # for each day/store pairs
        model_path = 'lgb_model_'+store_id+'_v'+str(VER)+'.bin'
        if USE_AUX:
            model_path = AUX_MODELS + model_path

        # NOTE: unpickling executes code from the file - only load model
        # files that come from a trusted source.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)

        store_mask = base_test['store_id']==store_id

        mask = (day_mask)&(store_mask)
        # Single .loc assignment: the chained form base_test[TARGET][mask] = ...
        # may write to a temporary copy and is a no-op under copy-on-write.
        base_test.loc[mask, TARGET] = estimator.predict(grid_df[mask][MODEL_FEATURES])

    # Make good column naming and add
    # to all_preds DataFrame
    temp_df = base_test[day_mask][['id',TARGET]]
    temp_df.columns = ['id','F'+str(PREDICT_DAY)]
    if 'id' in list(all_preds):
        all_preds = all_preds.merge(temp_df, on=['id'], how='left')
    else:
        all_preds = temp_df.copy()

    print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                  ' %0.2f min total |' % ((time.time() - main_time) / 60),
                  ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    del temp_df

all_preds = all_preds.reset_index(drop=True)
all_preds

Predict | Day: 1
##########  0.46 min round |  0.46 min total |  39824.51 day sales |
Predict | Day: 2
##########  0.46 min round |  0.91 min total |  37020.80 day sales |
Predict | Day: 3
##########  0.46 min round |  1.37 min total |  37048.21 day sales |
Predict | Day: 4
##########  0.46 min round |  1.83 min total |  37113.79 day sales |
Predict | Day: 5
##########  0.46 min round |  2.28 min total |  42103.53 day sales |
Predict | Day: 6
##########  0.46 min round |  2.74 min total |  50288.65 day sales |
Predict | Day: 7
##########  0.46 min round |  3.20 min total |  51139.13 day sales |
Predict | Day: 8
##########  0.45 min round |  3.65 min total |  45090.29 day sales |
Predict | Day: 9
##########  0.46 min round |  4.11 min total |  39057.84 day sales |
Predict | Day: 10
##########  0.46 min round |  4.56 min total |  44161.55 day sales |
Predict | Day: 11
##########  0.45 min round |  5.02 min total |  45257.33 day sales |
Predict | Day: 12
##########  0.46 min round |  5.47

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,1.016019,0.820798,0.863089,0.860954,1.060584,1.325702,1.143586,1.153598,0.927167,...,1.081411,1.452636,1.212648,1.036319,0.892346,0.917212,0.983924,1.222466,1.336801,1.096542
1,HOBBIES_1_002_CA_1_evaluation,0.209665,0.202462,0.201996,0.195878,0.231340,0.288717,0.318061,0.217098,0.190154,...,0.236810,0.289340,0.336094,0.206175,0.210754,0.214495,0.224534,0.263945,0.346883,0.397126
2,HOBBIES_1_003_CA_1_evaluation,0.532586,0.495946,0.497307,0.508057,0.666772,0.794551,0.853137,0.478137,0.494207,...,0.637718,0.720339,0.682010,0.507345,0.437814,0.487484,0.490248,0.659896,0.680418,0.721003
3,HOBBIES_1_004_CA_1_evaluation,1.508991,1.329832,1.337885,1.436322,1.893035,2.634785,2.909381,1.992759,1.411556,...,1.895158,2.644562,2.972997,1.770339,1.413260,1.470978,1.483530,1.891046,2.566123,2.627022
4,HOBBIES_1_005_CA_1_evaluation,0.933289,0.902085,0.926273,0.984511,1.021761,1.299919,1.382790,1.104397,1.005090,...,1.174641,1.503533,1.440640,1.119657,0.962330,1.026712,1.135623,1.382640,1.620631,1.368177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.457488,0.422124,0.407579,0.456975,0.494427,0.519157,0.631172,0.484014,0.500971,...,0.555696,0.715410,0.850520,0.584696,0.611093,0.583971,0.463133,0.470809,0.567758,0.643348
30486,FOODS_3_824_WI_3_evaluation,0.269612,0.269652,0.226230,0.208012,0.224886,0.275140,0.283919,0.280258,0.254734,...,0.282436,0.361154,0.420467,0.337720,0.388915,0.420287,0.297426,0.257637,0.318984,0.346908
30487,FOODS_3_825_WI_3_evaluation,0.627418,0.522876,0.466312,0.437865,0.496810,0.599151,0.648246,0.585364,0.501346,...,0.843144,1.225290,1.355973,1.011092,1.054675,1.080341,0.675670,0.677950,0.818597,0.977020
30488,FOODS_3_826_WI_3_evaluation,1.076575,1.065661,1.060419,0.944252,1.032034,1.102889,1.050712,1.240361,1.011412,...,1.112941,1.422140,1.385522,1.288973,1.455521,1.347306,1.038740,1.137330,1.276893,1.357613


### Export

In [9]:
# Read the ids of the competition sample submission and left-join our
# predictions onto them. Ids that are missing from all_preds (presumably the
# ids of the phase not predicted in this TYPE run) get 0 in every F column.
submission = (
    pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
    .merge(all_preds, on=['id'], how='left')
    .fillna(0)
)
submission.to_csv('submission_v'+str(VER)+'_'+TYPE+'.csv', index=False)

In [10]:
# Show the merged submission frame as the cell output.
submission

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,1.016019,0.820798,0.863089,0.860954,1.060584,1.325702,1.143586,1.153598,0.927167,...,1.081411,1.452636,1.212648,1.036319,0.892346,0.917212,0.983924,1.222466,1.336801,1.096542
1,HOBBIES_1_002_CA_1_evaluation,0.209665,0.202462,0.201996,0.195878,0.231340,0.288717,0.318061,0.217098,0.190154,...,0.236810,0.289340,0.336094,0.206175,0.210754,0.214495,0.224534,0.263945,0.346883,0.397126
2,HOBBIES_1_003_CA_1_evaluation,0.532586,0.495946,0.497307,0.508057,0.666772,0.794551,0.853137,0.478137,0.494207,...,0.637718,0.720339,0.682010,0.507345,0.437814,0.487484,0.490248,0.659896,0.680418,0.721003
3,HOBBIES_1_004_CA_1_evaluation,1.508991,1.329832,1.337885,1.436322,1.893035,2.634785,2.909381,1.992759,1.411556,...,1.895158,2.644562,2.972997,1.770339,1.413260,1.470978,1.483530,1.891046,2.566123,2.627022
4,HOBBIES_1_005_CA_1_evaluation,0.933289,0.902085,0.926273,0.984511,1.021761,1.299919,1.382790,1.104397,1.005090,...,1.174641,1.503533,1.440640,1.119657,0.962330,1.026712,1.135623,1.382640,1.620631,1.368177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,0.457488,0.422124,0.407579,0.456975,0.494427,0.519157,0.631172,0.484014,0.500971,...,0.555696,0.715410,0.850520,0.584696,0.611093,0.583971,0.463133,0.470809,0.567758,0.643348
30486,FOODS_3_824_WI_3_evaluation,0.269612,0.269652,0.226230,0.208012,0.224886,0.275140,0.283919,0.280258,0.254734,...,0.282436,0.361154,0.420467,0.337720,0.388915,0.420287,0.297426,0.257637,0.318984,0.346908
30487,FOODS_3_825_WI_3_evaluation,0.627418,0.522876,0.466312,0.437865,0.496810,0.599151,0.648246,0.585364,0.501346,...,0.843144,1.225290,1.355973,1.011092,1.054675,1.080341,0.675670,0.677950,0.818597,0.977020
30488,FOODS_3_826_WI_3_evaluation,1.076575,1.065661,1.060419,0.944252,1.032034,1.102889,1.050712,1.240361,1.011412,...,1.112941,1.422140,1.385522,1.288973,1.455521,1.347306,1.038740,1.137330,1.276893,1.357613
