In [None]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
from sklearn.model_selection import GroupKFold
# custom imports
from multiprocessing import Pool        
from Custom_Metric import custom_metric

# Silence every warning category for the rest of the session. This also hides
# library diagnostics (e.g. pandas assignment warnings), so problems they would
# normally point at will not be reported.
warnings.filterwarnings('ignore')

In [None]:
########################### Helpers
#################################################################################
## Seeder
def seed_everything(seed=0):
    """Seed Python's ``random`` module and NumPy's global random generator.

    Args:
        seed: Value passed unchanged to ``random.seed`` and ``np.random.seed``
            (defaults to 0).
    """
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Run ``func`` over ``t_split`` in worker processes and join the results.

    Args:
        func: Callable applied to each element of ``t_split``. It is sent to
            worker processes, so it must be importable/picklable, and it must
            return a pandas object that can be concatenated column-wise.
        t_split: Sequence of work items; one task is created per item.

    Returns:
        DataFrame built by concatenating the per-item results along
        ``axis=1``, in the same order as ``t_split``.
    """
    # Never start more workers than there are tasks. Built-in min() keeps the
    # value a plain int instead of the NumPy scalar np.min() produced.
    num_cores = int(min(N_CORES, len(t_split)))

    # The context manager releases the worker processes even when ``func``
    # raises; previously an exception in map() skipped close()/join() and left
    # the pool alive.
    with Pool(num_cores) as pool:
        results = pool.map(func, t_split)

    return pd.concat(results, axis=1)

In [None]:
########################### Helper to load data by store ID
#################################################################################

## Features to remove
## State based features only
def removeFeatures(store):
    """List the columns that are excluded from the feature set of ``store``.

    The list always contains the identifier/date/bookkeeping columns and
    TARGET. In addition it contains the two ``nwd_*`` columns that belong to
    the states other than the one named in ``store``.

    Args:
        store: Store identifier; the state is detected by the substring
            ``'CA'`` or ``'TX'``, anything else falls into the last branch.

    Returns:
        List of column names, in a fixed order.
    """
    if 'CA' in store:
        other_state_cols = ['nwd_TX', 'nwd_WI']
    elif 'TX' in store:
        other_state_cols = ['nwd_CA', 'nwd_WI']
    else:
        other_state_cols = ['nwd_CA', 'nwd_TX']

    leading_cols = ['id', 'state_id', 'store_id', 'date', 'tm_w_end']
    trailing_cols = ['groups', 'wm_yr_wk', 'd', TARGET]
    return leading_cols + other_state_cols + trailing_cols

# MEAN FEATURES
# Column names selected from the MEAN_ENC pickle: a mean and a std column for
# each of the cat_id / dept_id / item_id encodings.
mean_features = ['enc_' + level + '_' + stat
                 for level in ('cat_id', 'dept_id', 'item_id')
                 for stat in ('mean', 'std')]

# Names of the 'rolling_mean_tmp_<shift>_<window>' columns that are appended to
# the model feature list at prediction time.
rolls = ['rolling_mean_tmp_' + str(shift) + '_' + str(window)
         for shift in (1, 7, 14)
         for window in (7, 14, 30)]
# Read data
def get_data_by_store(store):
    """Assemble the feature grid for one store.

    Loads the BASE, PRICE and CALENDAR pickles side by side, keeps the rows of
    ``store`` and then attaches the mean-encoding and lag feature columns.

    Args:
        store: Value compared against the ``store_id`` column.

    Returns:
        Tuple ``(df, features)``. ``df`` contains ``id``, ``d``, TARGET and the
        feature columns for the rows with ``d >= START_TRAIN`` (index reset to
        0..n-1). ``features`` is the list of feature column names, i.e. every
        column not returned by ``removeFeatures(store)``.
    """
    # Read and concat the basic features. The first two columns of the PRICE
    # and CALENDAR grids are skipped (presumably keys already present in
    # BASE -- TODO confirm).
    base_grid = pd.read_pickle(BASE)
    price_grid = pd.read_pickle(PRICE).iloc[:, 2:]
    calendar_grid = pd.read_pickle(CALENDAR).iloc[:, 2:]
    df = pd.concat([base_grid, price_grid, calendar_grid], axis=1)
    del base_grid, price_grid, calendar_grid

    # Only the relevant store
    df = df.loc[df['store_id'] == store]

    # The extra feature files are read separately and trimmed to this store's
    # rows straight away so that the memory limit is not reached.
    mean_enc = pd.read_pickle(MEAN_ENC)[mean_features]
    mean_enc = mean_enc.loc[mean_enc.index.isin(df.index)]

    lag_feats = pd.read_pickle(LAGS).iloc[:, 3:]
    lag_feats = lag_feats.loc[lag_feats.index.isin(df.index)]

    df = pd.concat([df, mean_enc], axis=1)
    del mean_enc  # free memory before the next concat

    df = pd.concat([df, lag_feats], axis=1)
    del lag_feats  # free memory

    # Create the features list: everything that is not explicitly excluded
    excluded = set(removeFeatures(store))
    features = [col for col in df.columns if col not in excluded]
    df = df[['id', 'd', TARGET] + features]

    # Skip the rows before START_TRAIN
    df = df.loc[df['d'] >= START_TRAIN].reset_index(drop=True)

    return df, features

# Recombine Test set after training
def get_base_test():
    """Rebuild the combined prediction grid from the per-store pickles.

    For every id in ``STORES_IDS`` the file ``test_<store_id>.pkl`` is read
    from the models directory, a ``store_id`` column holding that id is set on
    it, and all frames are stacked in ``STORES_IDS`` order.

    Returns:
        DataFrame with a fresh 0..n-1 index. An empty DataFrame is returned
        when ``STORES_IDS`` is empty.
    """
    store_frames = []
    for store_id in STORES_IDS:
        temp_df = pd.read_pickle('/My Drive/Walmart_Data/models/test_'+store_id+'.pkl')
        temp_df['store_id'] = store_id
        store_frames.append(temp_df)

    # pd.concat raises on an empty list; keep the old "empty frame" result.
    if not store_frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop: this avoids
    # re-copying all previous rows for every store and avoids concatenating
    # with an empty seed DataFrame.
    return pd.concat(store_frames, ignore_index=True)


########################### dynamic rolling lags
#################################################################################
def make_lag(LAG_DAY):
    """Build a lagged-TARGET feature column from the global ``base_test`` grid.

    Args:
        LAG_DAY: Number of rows TARGET is shifted by within each ``id`` group.

    Returns:
        Single-column float16 DataFrame named ``'sales_lag_<LAG_DAY>'`` that
        shares the index of ``base_test``. ``base_test`` itself is not
        modified.
    """
    col_name = 'sales_lag_'+str(LAG_DAY)
    # groupby().shift() is the built-in equivalent of
    # transform(lambda x: x.shift(n)) without a Python call per group. Building
    # the result directly from the Series also avoids assigning a new column
    # onto a column slice of base_test (which raised SettingWithCopyWarning).
    lagged = base_test.groupby(['id'])[TARGET].shift(LAG_DAY).astype(np.float16)
    return lagged.to_frame(col_name)


def make_lag_roll(LAG_DAY):
    """Build one shifted, smoothed TARGET feature from the global ``base_test``.

    Args:
        LAG_DAY: Pair ``[shift_day, roll_wind]``: the shift applied to TARGET
            within each ``id`` group and the ``span`` of the exponentially
            weighted mean applied afterwards.

    Returns:
        Single-column DataFrame named
        ``'rolling_mean_tmp_<shift_day>_<roll_wind>'`` that shares the index of
        ``base_test``. ``base_test`` itself is not modified.
    """
    shift_day, roll_wind = LAG_DAY[0], LAG_DAY[1]
    col_name = 'rolling_mean_tmp_'+str(shift_day)+'_'+str(roll_wind)

    # NOTE(review): the column is named "rolling_mean" but the value is an
    # exponentially weighted mean (ewm span=roll_wind) -- confirm this matches
    # how the training-time columns of the same name were built.
    #
    # The result is built straight from the grouped Series; the previous
    # version copied three columns of base_test and assigned onto that slice,
    # which triggered SettingWithCopyWarning and an unnecessary copy per task.
    smoothed = base_test.groupby(['id'])[TARGET].transform(
        lambda x: x.shift(shift_day).ewm(span=roll_wind).mean())
    return smoothed.to_frame(col_name)

In [None]:
########################### Model params
#################################################################################
import lightgbm as lgb
# LightGBM training parameters shared by every store/fold model.
# The 'seed' entry is added later, in the Vars cell.
lgb_params = dict(
    # Model family and loss
    boosting_type='gbdt',
    objective='tweedie',
    tweedie_variance_power=1.1,
    # 'custom' registers no built-in metric; evaluation is expected to come
    # from a user-supplied function at train time.
    metric='custom',
    # Row / column subsampling
    subsample=0.5,
    subsample_freq=1,
    feature_fraction=0.5,
    # Tree shape and boosting schedule
    learning_rate=0.03,
    num_leaves=2**11 - 1,
    min_data_in_leaf=2**12 - 1,
    max_bin=100,
    n_estimators=1400,
    boost_from_average=False,
    verbose=1,
)

In [None]:
########################### Vars
#################################################################################
SEED = 42                        # Seed shared by the Python/NumPy RNGs and LightGBM
seed_everything(SEED)
lgb_params['seed'] = SEED
# psutil.cpu_count() may return None when the count cannot be determined;
# fall back to a single worker so the pool size stays a valid integer.
N_CORES = psutil.cpu_count() or 1


#LIMITS and const
TARGET      = 'sales'            # Our target
START_TRAIN = 0                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1941               # End day of our train set
P_HORIZON   = 56                 # Prediction horizon
USE_AUX     = True               # Load models from the AUX_MODELS directory

#PATHS for Features
ORIGINAL = '/My Drive/Walmart_Data/train/'
BASE     = '/My Drive/Walmart_Data/grid_part_1.pkl'
PRICE    = '/My Drive/Walmart_Data/grid_part_2.pkl'
CALENDAR = '/My Drive/Walmart_Data/grid_part_3.pkl'
LAGS     = '/My Drive/Walmart_Data/lags_df_28.pkl'
MEAN_ENC = '/My Drive/Walmart_Data/mean_encoding_df.pkl'

# AUX(pretrained) Models paths
AUX_MODELS = '/My Drive/Walmart_Data/models/'
CV_FOLDS   = [0,1,2]

#STORES ids
# Only the store_id column is needed, so parse just that column instead of
# loading every column of the sales file.
STORES_IDS = pd.read_csv(ORIGINAL+'sales_train_validation.csv', usecols=['store_id'])['store_id']
STORES_IDS = list(STORES_IDS.unique())

#SPLITS for lags creation
SHIFT_DAY  = 28
N_LAGS     = 15
# Lag offsets SHIFT_DAY, SHIFT_DAY+1, ..., SHIFT_DAY+N_LAGS-1
LAGS_SPLIT = list(range(SHIFT_DAY, SHIFT_DAY + N_LAGS))
# Every [shift, window] pair handed to make_lag_roll, shift-major order
ROLS_SPLIT = [[shift, window]
              for shift in [1, 7, 14]
              for window in [7, 14, 30, 60]]

In [None]:
# Train one LightGBM model per (store, fold) and save, for every store, the
# tail of its grid that the prediction cell later reloads via get_base_test().
for store in STORES_IDS:
    print('Train', store)
    # Bind the result to `grid_df`, the name every statement below uses
    # (it was previously bound to `grid`, which fails on a fresh kernel).
    grid_df, features_columns = get_data_by_store(store)

    train_mask = grid_df['d']<=END_TRAIN
    # Rows kept for prediction: the last 100 training days plus everything
    # after END_TRAIN (history is needed for the shifted/rolling features).
    preds_mask = grid_df['d']>(END_TRAIN-100)

    ## We will use oof kfold to find "best round"
    folds = GroupKFold(n_splits=3)

    # Get subgroups for each (week, year) pair so that all rows of one
    # calendar week land in the same fold.
    grid_df['groups'] = grid_df['tm_w'].astype(str) + '_' + grid_df['tm_y'].astype(str)
    split_groups = grid_df[train_mask]['groups']

    # Main Data
    X,y = grid_df[train_mask][features_columns], grid_df[train_mask][TARGET]

    # Persist the prediction grid without the precomputed '_tmp_' columns;
    # those are recomputed day by day during prediction.
    grid_df = grid_df[preds_mask].reset_index(drop=True)
    keep_cols = [col for col in list(grid_df) if '_tmp_' not in col]
    grid_df = grid_df[keep_cols]
    grid_df.to_pickle('/My Drive/Walmart_Data/models/test_'+store+'.pkl')
    del grid_df

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y, groups=split_groups)):
        print('Fold:',fold_)
        print(len(trn_idx),len(val_idx))
        # GroupKFold yields positional indices, so both X and y must be
        # indexed with .iloc; y[trn_idx] was a label lookup that only agreed
        # with X.iloc when the index happened to be 0..n-1.
        tr_x, tr_y = X.iloc[trn_idx,:], y.iloc[trn_idx]
        v_X, v_y   = X.iloc[val_idx,:], y.iloc[val_idx]
        train_data = lgb.Dataset(tr_x, label=tr_y)
        valid_data = lgb.Dataset(v_X, label=v_y)

        # lgb.train() has no `metric` keyword; a custom evaluation function is
        # passed through `feval`.
        # NOTE(review): assumes custom_metric follows the feval contract
        # (preds, eval_data) -> (name, value, is_higher_better) -- confirm
        # against Custom_Metric.
        # NOTE(review): `verbose_eval` exists only in LightGBM < 4; newer
        # versions use callbacks=[lgb.log_evaluation(100)] -- confirm the
        # installed version.
        estimator = lgb.train(
              lgb_params,
              train_data,
              valid_sets = [train_data, valid_data],
              verbose_eval = 100,
              feval = custom_metric
          )

        model_name = '/My Drive/Walmart_Data/models/lgb_model_'+store+'_'+str(fold_)+'.bin'
        # Context manager guarantees the file is flushed and closed.
        with open(model_name, 'wb') as model_file:
            pickle.dump(estimator, model_file)

        # Remove temporary files and objects
        del train_data, valid_data, estimator
        gc.collect()

In [None]:
########################### Predict
#################################################################################
# Recursive day-by-day prediction: for every fold, predictions of day k are
# written back into base_test so the shifted features of day k+1 can use them.
for fold_ in CV_FOLDS:
    print("FOLD:", fold_)
    all_preds = pd.DataFrame()
    base_test = get_base_test()

    # Feature list per store. removeFeatures() returns a different set of
    # state columns for CA / TX / WI, so each store's model must be fed the
    # columns derived from its own id (previously STORES_IDS[0] was used for
    # every store). The column set of base_test does not change inside the
    # loops, so this is computed once per fold.
    store_features = {}
    for store_id in STORES_IDS:
        excluded = set(removeFeatures(store_id))
        store_features[store_id] = [col for col in base_test.columns
                                    if col not in excluded] + rolls

    # Directory the trained models are read from.
    model_dir = AUX_MODELS if USE_AUX else '/My Drive/Walmart_Data/models/'

    # Timer to measure predictions time
    main_time = time.time()

    # Loop over each prediction day.
    # As rolling lags are the most time-consuming part,
    # we calculate them once per day for all stores.
    for PREDICT_DAY in range(1,29):
        print('Predict | Day:', PREDICT_DAY)
        start_time = time.time()

        # Make temporary grid to calculate rolling lags
        grid_df = base_test.copy()
        grid_df = pd.concat([grid_df, df_parallelize_run(make_lag_roll, ROLS_SPLIT)], axis=1)

        # Same for every store, and needed after the store loop as well.
        day_mask = base_test['d']==(END_TRAIN+PREDICT_DAY)

        for store_id in STORES_IDS:

            # Build the path from the bare file name; prefixing AUX_MODELS to
            # an already absolute path produced a doubled, non-existent path.
            model_path = model_dir + 'lgb_model_'+store_id+'_'+str(fold_)+'.bin'

            # NOTE: pickle.load executes code stored in the file -- only load
            # model files produced by the training cell of this notebook.
            with open(model_path, 'rb') as model_file:
                estimator = pickle.load(model_file)

            store_mask = base_test['store_id']==store_id

            mask = (day_mask)&(store_mask)
            # Single .loc assignment instead of chained indexing, which is not
            # guaranteed to write through to base_test.
            base_test.loc[mask, TARGET] = estimator.predict(
                grid_df.loc[mask, store_features[store_id]])


        temp_df = base_test.loc[day_mask, ['id',TARGET]].copy()
        temp_df.columns = ['id','F'+str(PREDICT_DAY)]
        if 'id' in list(all_preds):
            all_preds = all_preds.merge(temp_df, on=['id'], how='left')
        else:
            all_preds = temp_df.copy()
        all_preds = all_preds.reset_index(drop=True)
        print('#'*10, ' %0.2f min round |' % ((time.time() - start_time) / 60),
                      ' %0.2f min total |' % ((time.time() - main_time) / 60),
                      ' %0.2f day sales |' % (temp_df['F'+str(PREDICT_DAY)].sum()))
    all_preds.to_csv('/My Drive/Walmart_Data/models/all_preds_CA_'+str(fold_)+'.csv',index=False)
    del temp_df, all_preds

In [None]:
# Load Predictions
# One CSV per entry of CV_FOLDS, as written by the prediction cell. Indexing
# by 'id' makes the averaging below align rows by id rather than by position.
fold_preds = [pd.read_csv(AUX_MODELS+'all_preds_CA_'+str(fold_)+'.csv').set_index('id')
              for fold_ in CV_FOLDS]

# Equal-weight average of the fold predictions for every F<day> column.
# The weight follows the number of folds instead of a hard-coded 1/3, so the
# cell stays correct when CV_FOLDS changes.
fold_weight = 1/len(fold_preds)
final_all_preds = sum(fold_df*fold_weight for fold_df in fold_preds).reset_index()

# Show a preview only
final_all_preds.head()

In [None]:
########################### Export
#################################################################################

# Use the id column of the sample submission as the row template.
submission = pd.read_csv(ORIGINAL+'sample_submission.csv')[['id']]
# NOTE(review): with how='inner', ids that have no prediction are dropped, so
# the fillna(0) below can only replace NaNs already present in
# final_all_preds -- confirm whether a 'left' join was intended.
submission = pd.merge(submission, final_all_preds, how='inner', on=['id'])
submission = submission.fillna(0)
submission.to_csv(AUX_MODELS+'submission_CA'+'.csv', index=False)