In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue May 26 18:53:11 2020

@author: pp9596
"""

# General imports
from  datetime import datetime, timedelta
import numpy as np
import pandas as pd
import os, sys, gc, time, warnings, pickle, psutil, random
from multiprocessing import Pool        # Multiprocess Runs
import lightgbm as lgb

# hyperopt set-up
from functools import partial
from hyperopt import hp, tpe, space_eval, STATUS_OK, Trials
from hyperopt.fmin import fmin 


# custom imports
import sys
import s3fs

from data_helper import seed_everything
from data_helper import demand, create_dt, create_static_fea, create_dynamic_fea, create_ds
from model_helper import run_model, save_fin_sub, get_sum
from WRMSSEEvaluator import WRMSSEEvaluator, WRMSSEForLightGBM
from aws_helper import write_csv_s3, read_csv_s3
warnings.filterwarnings('ignore')

# Machine-specific locations of the raw data, the pickled models and the
# generated submissions. The branch is chosen from sys.platform.
from sys import platform
if platform == "linux" or platform == "linux2":
    DATA_PATH = "/home/ec2-user/m5/data"
    aux_model_path = r"/home/ec2-user/m5/python/aux_model"
    sub_path = r"/home/ec2-user/m5/submission"
    META_DATA_PATH = "/home/ec2-user/m5/data/meta_data"
elif platform == "win32":
    DATA_PATH = r"C:\Users\PP9596\Documents\Bitbucket\M5\data"
    aux_model_path = r"C:\Users\PP9596\Documents\Bitbucket\M5\python\aux_model"
    sub_path = r"C:\Users\PP9596\Documents\Bitbucket\M5\submission"
    # Previously only defined on Linux. Mirrors the Linux layout
    # (<DATA_PATH>/meta_data) -- NOTE(review): confirm the folder exists on Windows.
    META_DATA_PATH = r"C:\Users\PP9596\Documents\Bitbucket\M5\data\meta_data"
else:
    # Without this branch every path stays undefined and the failure only
    # surfaces later as a NameError inside an unrelated cell.
    raise RuntimeError(
        "No data/model/submission paths are configured for platform %r; "
        "add a branch defining DATA_PATH, aux_model_path, sub_path and META_DATA_PATH."
        % platform
    )
    
########################### Helpers #####################################################
    
## Multiprocess Runs
def df_parallelize_run(func, t_split):
    """Map ``func`` over ``t_split`` in worker processes and join the results column-wise.

    Parameters
    ----------
    func : callable
        Called once per element of ``t_split`` through ``Pool.map``; its results
        are handed to ``pd.concat(..., axis=1)``, so they must be pandas objects.
    t_split : sized iterable
        Work items. ``len(t_split)`` caps the number of worker processes.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the per-item results; an empty frame when
        ``t_split`` is empty.
    """
    # Nothing to do: avoid Pool(0), which raises, and pd.concat([]), which also raises.
    if len(t_split) == 0:
        return pd.DataFrame()

    # Never start more workers than there are work items; cast the numpy
    # scalar returned by np.min to a plain int for Pool.
    num_cores = int(np.min([N_CORES, len(t_split)]))
    pool = Pool(num_cores)
    try:
        df = pd.concat(pool.map(func, t_split), axis=1)
    finally:
        # Release the worker processes even when func or the concat fails.
        pool.close()
        pool.join()
    return df

def get_valid_data(validation_flag=False):
    """Read the sales, calendar and price CSV files found under ``DATA_PATH``.

    Parameters
    ----------
    validation_flag : bool, default False
        When true the sales table is read from ``sales_train_validation.csv``,
        otherwise from ``sales_train_evaluation.csv``.

    Returns
    -------
    tuple
        ``(train_df, calendar, prices)``; ``calendar["date"]`` has been passed
        through ``pd.to_datetime``.
    """
    def _load(file_name):
        # Every input lives directly under DATA_PATH.
        return pd.read_csv(os.path.join(DATA_PATH, file_name))

    sales_file = "sales_train_validation.csv" if validation_flag else "sales_train_evaluation.csv"
    train_df = _load(sales_file)

    calendar = _load("calendar.csv")
    calendar["date"] = pd.to_datetime(calendar["date"])

    prices = _load("sell_prices.csv")
    return train_df, calendar, prices

def gen_eval_obj(validation_flag=True, tr_start_data=120, tr_end_date=1913):
    """Build the WRMSSE evaluator together with the column index and ids used for scoring.

    Parameters
    ----------
    validation_flag : bool, default True
        Forwarded to ``get_valid_data`` to choose the sales file.
    tr_start_data : int, default 120
        First day number ``d_<n>`` included in the training fold.
    tr_end_date : int, default 1913
        Last day number of the training fold; the 28 following days form the
        validation fold.

    Returns
    -------
    tuple
        ``(evaluator, valid_col, ids)`` where ``valid_col`` is the validation
        fold's column index with ``"id"`` inserted at position 0 and ``ids`` is
        the ``id`` column of the sales table after sorting by ``id``.

    NOTE(review): with the defaults the validation sales file is read and columns
    d_1914..d_1941 are requested -- confirm that file actually contains them.
    """
    sales, calendar, prices = get_valid_data(validation_flag)
    # Order the rows by id and renumber them from zero.
    sales = sales.sort_values("id").reset_index(drop=True)

    # Identifier columns plus the daily columns of each fold.
    id_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    train_days = [f"d_{day}" for day in range(tr_start_data, tr_end_date + 1)]
    valid_days = [f"d_{day}" for day in range(tr_end_date + 1, tr_end_date + 29)]

    train_fold_df = sales[id_cols + train_days]
    valid_fold_df = sales[valid_days]

    evaluator = WRMSSEEvaluator(train_fold_df, valid_fold_df, calendar, prices)
    valid_col = valid_fold_df.columns.insert(0, "id")
    return evaluator, valid_col, sales["id"]
        
# Fix the random seed so repeated runs of the notebook are as deterministic as possible.
# seed_everything is imported from data_helper; which generators it seeds is not visible here.
SEED = 1010                        # seed value used for the whole notebook
seed_everything(SEED)            # applied once, when this cell runs

In [4]:

########################### Model params ######################################################
VER = 24                        # Our model version
VERSION_U = 4                    # Uncertainty version
# psutil.cpu_count() returns None when the count cannot be determined; fall back
# to one core so that np.min([N_CORES, ...]) in df_parallelize_run keeps working.
N_CORES = psutil.cpu_count() or 1     # Available CPU cores

# Baseline LightGBM parameters. The modelling cells further down rebind
# lgb_params with their own values before training.
lgb_params = {'boosting_type': 'gbdt', 
                'objective': 'poisson', 
                'learning_rate': 0.07,
                'force_row_wise': True,
                'metric': 'rmse', 
                'n_estimators': 4000, 
                'num_leaves': 2**6-1, 
                'min_child_samples': 2**5-1, 
                'subsample': 0.7,
                'subsample_freq': 1,
                'colsample_bytree': 0.6, 
                'lambda_l2': 0.9,
                'nthread' : 12}

# Disabled logit variant. Note that a plain assignment would alias lgb_params
# rather than copy it, so re-enabling these lines would also change lgb_params.
# lgb_params_logit = lgb_params
# lgb_params_logit['objective'] ='binary' 

# Limits and constants
TARGET      = 'sales'            # Our target
P_HORIZON   = 28                 # Prediction horizon
USE_AUX     = False              # Use or not pretrained models 
max_lags = 60
tagg = 1
END_TRAIN   = 1913               # End day of our train set

In [5]:
def get_pred_loss(evaluator, val_id, sub_fin_dict, Validation_Sum=None, valid_cols=None):
    """Score the collected submissions, both as-is and rescaled to a target total.

    Parameters
    ----------
    evaluator : object
        Must expose ``score(frame)``; in this notebook it is the evaluator
        returned by ``gen_eval_obj``.
    val_id : pandas.Series or DataFrame
        Must yield an ``"id"`` column once wrapped in a DataFrame. A trailing
        ``evaluation`` in each id is rewritten to ``validation`` before merging.
    sub_fin_dict : dict
        Maps a label to a submission frame with an ``id`` column; all frames
        are stacked row-wise.
    Validation_Sum : number, optional
        Target total that the predictions are rescaled to for the projected
        loss. Defaults to 1231764 -- NOTE(review): presumably the known total
        of the scored window; confirm.
    valid_cols : pandas.Index, optional
        ``"id"`` followed by the scored day columns. Falls back to the
        notebook-level ``valid_col`` when omitted, as the original code did.

    Returns
    -------
    tuple
        ``(loss, ploss, val_sum)``: score of the raw predictions, score of the
        rescaled predictions, and the value returned by
        ``get_sum(sub_fin, PRED_DURATION=28, inc_eval=False)``.
    """
    if valid_cols is None:
        # Backward-compatible default: the index produced by gen_eval_obj.
        valid_cols = valid_col
    day_cols = valid_cols[1:]

    # Stack all partial submissions in one go. DataFrame.append is gone in
    # pandas 2 and re-copies the accumulated frame on every iteration.
    frames = list(sub_fin_dict.values())
    sub_fin = pd.concat(frames) if frames else pd.DataFrame()
    val_sum = get_sum(sub_fin, PRED_DURATION=28, inc_eval=False)

    val_id = pd.DataFrame(val_id)
    # "evaluation$" is a regular expression; say so explicitly because the
    # default of ``regex`` differs between pandas versions.
    val_id["id"] = val_id["id"].str.replace("evaluation$", "validation", regex=True)

    # Keep only the validation rows and put them in the order of val_id.
    is_validation = sub_fin["id"].str.contains("validation")
    sub_fin_val = sub_fin[is_validation].reset_index(drop=True)
    sub_fin_val = val_id.merge(sub_fin_val, on="id")
    sub_fin_val.columns = valid_cols

    # Loss of the predictions as submitted.
    loss = evaluator.score(sub_fin_val.loc[:, day_cols])
    print("Test WRMSSE Loss : ", loss)

    # Projected loss: rescale every prediction by one constant factor so the
    # grand total equals Validation_Sum, then score again.
    if Validation_Sum is None:
        Validation_Sum = 1231764
    total_sum = sub_fin_val.loc[:, day_cols].sum().sum()
    factor = Validation_Sum / total_sum
    sub_fin_val_test = sub_fin_val.loc[:, day_cols] * factor
    ploss = evaluator.score(sub_fin_val_test)
    # Labelled separately from the raw loss; both lines used to read "Test WRMSSE Loss".
    print("Projected WRMSSE Loss : ", ploss)

    return loss, ploss, val_sum

In [4]:
# Start an empty loss log and build the feature table for the training window.
# create_ds is imported from data_helper; the log lines recorded below this cell are
# printed while it runs. The modelling cells further down call create_ds again.
loss_summary =[]
START_TRAIN = 150 # first training day passed to create_ds (1183 noted as an alternative)
dt, train_cols, train_mask,cat_feats = create_ds(START_TRAIN=START_TRAIN, END_TRAIN=END_TRAIN, stype=None)

Processing dataset....
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE
rmean with lag -  28
rvar with lag -  28
rpz with lag -  28
rpz_28_28
rcv2_28_28
INFO: MERGING STATICAL FEATURE
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE


In [8]:
loss_summary =[]
START_TRAIN = 150 # 150 1183                  # We can skip some rows (Nans/faster training)

# LightGBM parameters used by this run (replaces the baseline dict defined earlier).
lgb_params = {'boosting_type':'gbdt', 
                'objective': 'poisson', 
                'learning_rate': 0.04,
                'force_row_wise': True,
                'metric': 'rmse', 
                'n_estimators': 4000, 
                'num_leaves': 2**14-1, 
                'min_child_samples': 2**15-1, # 15
                'subsample': 0.7,
                'subsample_freq': 1,
                'colsample_bytree': 0.6, #0.6, 
                'lambda_l2': 0.9,
                'nthread' : 12}

# Evaluator, scoring columns and ids for the 28 days following END_TRAIN.
evaluator, valid_col, val_id = gen_eval_obj(validation_flag=False, tr_start_data=1, tr_end_date=END_TRAIN)
sub_fin_dict={}
get_level_sum = []

# Run at category level. A single pass with stype=None is used here; the
# commented list holds the labels that could be looped over instead.
# STYPE = ['Intermittent', 'Lumpy', 'Erratic', 'Smooth']
STYPE = [None]
for i, s in enumerate(STYPE):
    print("Processing for type - ", s)    
    dt, train_cols, train_mask,cat_feats = create_ds(START_TRAIN=START_TRAIN, END_TRAIN=END_TRAIN, stype=s)
    print("Number of records - ", dt.shape)
    create_dynamic_fea(dt, lags=[7, 14], wins=[7, 14])
    modelname = "estimator_t1_TIME_FULL_DS_VER_"+ str(VER) + "_TIMEAGG_" + str(i) + ".pkl"
    sub_fname = "full_data_t1_TIME_Agg_VER_" + str(VER) + "_TIMEAGG_" + str(i)
    train_cols, pred_val = run_model(dt, modelname=modelname, sub_fname=sub_fname, END_TRAIN=max(dt['d']), P_HORIZON=28, 
                           lgb_params=lgb_params, VER=VER, TIME_FILTER=0, logit_model=False, 
                           use_fake_val=False, val_prop=0.2, wrmssee_flag=True)
    key_level_sub = save_fin_sub(modelname, sub_fname, train_cols, VER=VER, lvl=0, PRED_DURATION=28, fday=max(dt['d']), save_sub=True, stype=s)
    sub_fin_dict[s] = key_level_sub
    get_level_sum.append(get_sum(sub_fin_dict[s], PRED_DURATION=28, inc_eval=False))    
    # The feature table is no longer needed once the submission is built.
    del dt
    loss, ploss, val_sum = get_pred_loss(evaluator, val_id, sub_fin_dict)
    # Tag the record with the loop's own label: the previous `n` is not defined
    # anywhere in this notebook and raises NameError on a fresh kernel.
    loss_summary.append([s, loss, ploss, val_sum])

Processing for type -  None
Processing dataset....
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE
rmean with lag -  28
rvar with lag -  28
rpz with lag -  28
rpz_28_28
rcv2_28_28
INFO: MERGING STATICAL FEATURE
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE
Number of records -  (42267758, 41)
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE
Training until validation scores don't improve for 100 rounds
[20]	valid_0's rmse: 2.81226	valid_0's WRMSSE: 1.64937
[40]	valid_0's rmse: 2.40404	valid_0's WRMSSE: 1.23727
[60]	valid_0's rmse: 2.20036	valid_0's WRMSSE: 0.967703
[80]	valid_0's rmse: 2.10434	valid_0's WRMSSE: 0.794309
[100]	valid_0's rmse: 2.06003	valid_0's WRMSSE: 0.688284
[120]	valid_0's rmse: 2.03949	valid_0's WRMSSE: 0.620416
[140]	valid_0's rmse: 2.02937	valid_0's WRMSSE: 0.578696
[160]	valid_0's rmse: 2.02281	valid_0's WRMSSE: 0.552234
[180]	valid_0's rmse: 2.0191	valid_0's WRMSSE: 0.534957
[200]	valid_0's rmse: 2.01645	valid_0's WRMSSE: 0.523217
[220]	valid_0'

[700]	valid_0's rmse: 2.00757	valid_0's WRMSSE: 0.487561
[720]	valid_0's rmse: 2.00698	valid_0's WRMSSE: 0.487608
[740]	valid_0's rmse: 2.00669	valid_0's WRMSSE: 0.487535
[760]	valid_0's rmse: 2.00613	valid_0's WRMSSE: 0.487276
[780]	valid_0's rmse: 2.00595	valid_0's WRMSSE: 0.487039
[800]	valid_0's rmse: 2.00541	valid_0's WRMSSE: 0.487126
[820]	valid_0's rmse: 2.00507	valid_0's WRMSSE: 0.487481
[840]	valid_0's rmse: 2.0045	valid_0's WRMSSE: 0.487042
[860]	valid_0's rmse: 2.00426	valid_0's WRMSSE: 0.486754
[880]	valid_0's rmse: 2.00392	valid_0's WRMSSE: 0.486718
[900]	valid_0's rmse: 2.00361	valid_0's WRMSSE: 0.486513
[920]	valid_0's rmse: 2.00343	valid_0's WRMSSE: 0.48662
[940]	valid_0's rmse: 2.00314	valid_0's WRMSSE: 0.486725
[960]	valid_0's rmse: 2.00295	valid_0's WRMSSE: 0.486688
[980]	valid_0's rmse: 2.00273	valid_0's WRMSSE: 0.487984
[1000]	valid_0's rmse: 2.00247	valid_0's WRMSSE: 0.487866
Early stopping, best iteration is:
[902]	valid_0's rmse: 2.00359	valid_0's WRMSSE: 0.4864

In [7]:
# Display the records appended to loss_summary by the modelling loop.
loss_summary

Total Sum -  1194711.9260449107
Test WRMSSE Loss :  0.5265785702652702
Test WRMSSE Loss :  0.4915122330684103


# Re-build with re-training

In [6]:
loss_summary =[]
START_TRAIN = 150 # 150 1183                  # We can skip some rows (Nans/faster training)
END_TRAIN   = 1913               # End day of our train set

# LightGBM parameters used by both training passes of this cell.
lgb_params = {'boosting_type':'gbdt', # 'dart', 
                'objective': 'poisson', 
                # 'tweedie_variance_power': 1.52, 
                'learning_rate': 0.04,
                'force_row_wise': True,
                'metric': 'rmse', 
                'n_estimators': 4000, 
                'num_leaves': 2**14-1, 
                'min_child_samples': 2**15-1, 
                'subsample': 0.7,
                'subsample_freq': 1,
                'colsample_bytree': 0.6, #0.6, 
                'lambda_l2': 0.9,
                'nthread' : 12}

# Evaluator, scoring columns and ids for the 28 days following END_TRAIN.
evaluator, valid_col, val_id = gen_eval_obj(validation_flag=False, tr_start_data=1, tr_end_date=END_TRAIN)
sub_fin_dict={}
get_level_sum = []

# Run at category level. A single pass with stype=None is used here; the
# commented list holds the labels that could be looped over instead.
# STYPE = ['Intermittent', 'Lumpy', 'Erratic', 'Smooth']
STYPE = [None]
for i, s in enumerate(STYPE):
    print("Processing for type - ", s)    
    dt, train_cols, train_mask,cat_feats = create_ds(START_TRAIN=START_TRAIN, END_TRAIN=END_TRAIN, stype=s)
    print("Number of records - ", dt.shape)
    create_dynamic_fea(dt, lags=[1, 7, 14], wins=[1, 7, 14])

    # First pass: run_model is called without an existing estimator.
    modelname = "estimator_t1_TIME_FULL_DS_VER_"+ str(VER) + "_TIMEAGG_" + str(i) + ".pkl"
    sub_fname = "full_data_t1_TIME_Agg_VER_" + str(VER) + "_TIMEAGG_" + str(i)
    train_cols, pred_val = run_model(dt, modelname=modelname, sub_fname=sub_fname, END_TRAIN=max(dt['d']), P_HORIZON=28, 
                           lgb_params=lgb_params, VER=VER, TIME_FILTER=0, logit_model=False, 
                           use_fake_val=False, val_prop=0.2, wrmssee_flag=False)
    key_level_sub_val = save_fin_sub(modelname, sub_fname, train_cols, VER=VER, lvl=0, PRED_DURATION=28, fday=max(dt['d']), save_sub=True, stype=s)

    # Reload the estimator stored under `modelname` (assumes run_model pickled it
    # into aux_model_path -- TODO confirm). The file is produced locally by this
    # notebook; never unpickle files from an untrusted source.
    with open(os.path.join(aux_model_path , modelname), 'rb') as model_file:
        estimator = pickle.load(model_file)

    # Second pass: hand the reloaded estimator back to run_model and store the
    # result under a separate model name.
    modelname1 = "full_estimator_t1_TIME_FULL_DS_VER_"+ str(VER) + "_TIMEAGG_" + str(i) + ".pkl"
    train_cols, val_data_reg1 = run_model(dt, modelname=modelname1, sub_fname=sub_fname, END_TRAIN=max(dt['d']), P_HORIZON=28, 
                                     lgb_params=lgb_params, VER=VER, TIME_FILTER=0, logit_model=False, 
                                     estimator=estimator, use_fake_val=False, val_prop=0.2, wrmssee_flag=False)    

    # NOTE(review): sub_fname is rebuilt with exactly the same value as above, so
    # both save_fin_sub calls receive the same submission name -- confirm that the
    # second call is not meant to write to a different file.
    sub_fname = "full_data_t1_TIME_Agg_VER_" + str(VER) + "_TIMEAGG_" + str(i)
    key_level_sub_full = save_fin_sub(modelname1, sub_fname, train_cols, VER=VER, lvl=0, PRED_DURATION=28, fday=max(dt['d']), save_sub=True, stype=s)

    # Only the first-pass submission is scored below.
    sub_fin_dict[s] = key_level_sub_val
    get_level_sum.append(get_sum(sub_fin_dict[s], PRED_DURATION=28, inc_eval=False))    
    del dt
    loss, ploss, val_sum = get_pred_loss(evaluator, val_id, sub_fin_dict)
    # Tag the record with the loop's own label: the previous `n` is not defined
    # anywhere in this notebook and raises NameError on a fresh kernel.
    loss_summary.append([s, loss, ploss, val_sum])

Processing for type -  None
Processing dataset....
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE
rmean with lag -  28
rvar with lag -  28
rpz with lag -  28
rpz_28_28
rcv2_28_28
INFO: MERGING STATICAL FEATURE
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE
Number of records -  (42267758, 47)
INFO: CREATE LAG FEATURE
INFO: CREATE ROLLING FEATURE
Training until validation scores don't improve for 50 rounds
[200]	valid_0's rmse: 1.92119
[400]	valid_0's rmse: 1.90539
[600]	valid_0's rmse: 1.89642
[800]	valid_0's rmse: 1.89134
[1000]	valid_0's rmse: 1.88797
[1200]	valid_0's rmse: 1.88546
[1400]	valid_0's rmse: 1.88325
[1600]	valid_0's rmse: 1.88147
[1800]	valid_0's rmse: 1.87989
[2000]	valid_0's rmse: 1.87856
[2200]	valid_0's rmse: 1.87733
[2400]	valid_0's rmse: 1.87647
[2600]	valid_0's rmse: 1.8755
[2800]	valid_0's rmse: 1.87486
[3000]	valid_0's rmse: 1.87421
[3200]	valid_0's rmse: 1.87362
[3400]	valid_0's rmse: 1.87312
[3600]	valid_0's rmse: 1.87278
[3800]	valid_0's rmse: 1.

Total Sum -  1288897.2706909135
Test WRMSSE Loss :  0.5641493181717369
Test WRMSSE Loss :  0.4724673839051214


In [27]:
# Inspect the scoring columns returned by gen_eval_obj ("id" followed by the day columns).
# NOTE(review): the recorded output lists d_1886..d_1913, whereas the calls above use
# tr_end_date=END_TRAIN=1913, which would give d_1914..d_1941 -- the output is probably
# from an earlier kernel state; re-run to confirm.
valid_col

Index(['id', 'd_1886', 'd_1887', 'd_1888', 'd_1889', 'd_1890', 'd_1891',
       'd_1892', 'd_1893', 'd_1894', 'd_1895', 'd_1896', 'd_1897', 'd_1898',
       'd_1899', 'd_1900', 'd_1901', 'd_1902', 'd_1903', 'd_1904', 'd_1905',
       'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911', 'd_1912',
       'd_1913'],
      dtype='object')

In [28]:
# Write both key-level forecasts produced by the retraining cell into sub_path, without the index.
# NOTE(review): the file names carry the tag "v23" while VER is 24 in the parameter cell -- confirm
# which version these files belong to.
key_level_sub_full.to_csv(os.path.join(sub_path, "Point_forecast_full_ds_pre_validation_v23pks.csv"), index=False)
key_level_sub_val.to_csv(os.path.join(sub_path, "Point_forecast_val_ds_pre_validation_v23pks.csv"), index=False)