In [173]:
import os
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Loading Data

In [174]:
train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv('test.csv', parse_dates=['date'])
sample_sub = pd.read_csv('sample_submission.csv')
print('Train shape:{}, Test shape:{}'.format(train.shape, test.shape))
train.head()

Train shape:(913000, 4), Test shape:(45000, 4)


Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


# Feature Engineering

In [175]:
train['train_or_test'] = 'train'
test['train_or_test'] = 'test'
df = pd.concat([train,test])
print('Combined df shape:{}'.format(df.shape))
del train, test
gc.collect()

Combined df shape:(958000, 6)


89

Date Features

In [176]:
# Extracting date features
df['dayofmonth'] = df.date.dt.day
df['dayofyear'] = df.date.dt.dayofyear
df['dayofweek'] = df.date.dt.dayofweek
df['month'] = df.date.dt.month
df['year'] = df.date.dt.year
df['weekofyear'] = df.date.dt.weekofyear
df['is_month_start'] = (df.date.dt.is_month_start).astype(int)
df['is_month_end'] = (df.date.dt.is_month_end).astype(int)
df.head()

Unnamed: 0,date,id,item,sales,store,train_or_test,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2013-01-01,,1,13.0,1,train,1,1,1,1,2013,1,1,0
1,2013-01-02,,1,11.0,1,train,2,2,2,1,2013,1,0,0
2,2013-01-03,,1,14.0,1,train,3,3,3,1,2013,1,0,0
3,2013-01-04,,1,13.0,1,train,4,4,4,1,2013,1,0,0
4,2013-01-05,,1,10.0,1,train,5,5,5,1,2013,1,0,0


In [177]:
# Sorting the dataframe by store, then item, then date
df.sort_values(by=['store','item','date'], axis=0, inplace=True)

Monthwise aggregated sales values

In [178]:
def create_sales_agg_monthwise_features(df, gpby_cols, target_col, agg_funcs):
    gpby = df.groupby(gpby_cols)
    newdf = df[gpby_cols].drop_duplicates().reset_index(drop=True)
    
    for agg_name, agg_func in agg_funcs.items():
        aggdf = gpby[target_col].agg(agg_func).reset_index()
        aggdf.rename(columns={target_col:target_col+'_'+agg_name}, inplace=True)
        newdf = newdf.merge(aggdf, on=gpby_cols, how='left')
    return newdf

Features constructed from previous sales values

In [179]:
# Creating sales lag features
def create_sales_lag_feats(df, gpby_cols, target_col, lags):
    gpby = df.groupby(gpby_cols)
    for i in lags:
        df['_'.join([target_col, 'lag', str(i)])] = \
            gpby[target_col].shift(i).values
    return df

# Creating sales rolling mean features
def create_sales_rmean_feats(df, gpby_cols, target_col, windows, min_periods=2, shift=1,
                            win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmean', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, min_periods=min_periods, 
                                             win_type=win_type).mean().values +\
        np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating sales rolling median features
def create_sales_rmed_feats(df, gpby_cols, target_col, windows, min_periods=2,
                           shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmed', str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, min_periods=min_periods, 
                                              win_type=win_type).median().values +\
        np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating sales exponentially weighted mean features
def create_sales_ewm_feats(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
    gpby = df.groupby(gpby_cols)
    for a in alpha:
        for s in shift:
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] =\
                gpby[target_col].shift(s).ewm(alpha=a).mean().values
    return df

OHE of categorical features

In [180]:
def one_hot_encoder(df, ohe_cols=['store','item','dayofmonth','dayofweek','month','weekofyear']):
    print('Creating OHE features, dataframe shape : ',df.shape)
    df = pd.get_dummies(df, columns=ohe_cols)
    print('New df shape : ', format(df.shape))
    return df

Log Sales

In [181]:
# Converting sales to log(1+sales)
df['sales'] = np.log1p(df.sales.values)
df.sample(2)

Unnamed: 0,date,id,item,sales,store,train_or_test,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
222870,2013-04-09,,13,4.59512,3,train,9,99,1,4,2013,15,0,0
50229,2015-07-17,,3,4.007333,8,train,17,198,4,7,2015,29,0,0


# Time-based Validation set

For validation we can choose the last 3 months of training period(Oct, Nov, Dec 2017).

Onother choice is to keep months identical to test set, in this case we can choose (Jan, Feb, Mar 2017).

Here we will go with the latter choice

In [182]:
masked_series = (df.year==2017) & (df.month.isin([1,2,3]))
masked_series2 = (df.year==2017) & (~(df.month.isin([1,2,3])))
df.loc[(masked_series), 'train_or_test'] = 'val'
df.loc[(masked_series2), 'train_or_test'] = 'no_train'
print('Train shape : ', df.loc[df.train_or_test=='train'].shape)
print('Validation shape : ', df.loc[df.train_or_test=='val'].shape)
print('No train shape : ', df.loc[df.train_or_test=='no_train'].shape)
print('Test shape : ', df.loc[df.train_or_test=='test'].shape)

Train shape :  (730500, 14)
Validation shape :  (45000, 14)
No train shape :  (137500, 14)
Test shape :  (45000, 14)


# Model Validation

In [185]:
train = df[df.train_or_test.isin(['train', 'val'])]
Y_train = train[train.train_or_test=='train']['sales'].values
Y_val = train.loc[train.train_or_test=='val']['sales'].values

# Creating sales lag, rolling mean, rolling median, ohe features of the above train set
train = create_sales_lag_feats(train, gpby_cols=['store','item'], target_col='sales',
                              lags = [91,98,105,112,119,126,182,364,546,728])

train = create_sales_rmean_feats(train, gpby_cols=['store','item'], target_col='sales',
                                 windows=[364,546], min_periods=10, win_type='triang')

train = create_sales_ewm_feats(train, gpby_cols=['store','item'], target_col='sales',
                              alpha=[0.95, 0.9, 0.8, 0.7, 0.6, 0.5],
                              shift=[91,98,105,112,119,126,182,364,546,728])

train = one_hot_encoder(train, ohe_cols=['store','item','dayofweek','month'])

val = train[train.train_or_test=='val']
train = train[train.train_or_test=='train']
print('Train shape : {}, Val shape : {}'.format(train.shape, val.shape))

Creating OHE features, dataframe shape :  (775500, 86)
New df shape :  (775500, 161)
Train shape : (730500, 161), Val shape : (45000, 161)


# LightGBM Model

In [186]:
avoid_cols = ['date','sales','train_or_test','id','year']
cols = [col for col in train.columns if col not in avoid_cols]
print('No of training features : {}, and they are : {}'.format(len(cols), cols))

No of training features : 156, and they are : ['dayofmonth', 'dayofyear', 'weekofyear', 'is_month_start', 'is_month_end', 'sales_lag_91', 'sales_lag_98', 'sales_lag_105', 'sales_lag_112', 'sales_lag_119', 'sales_lag_126', 'sales_lag_182', 'sales_lag_364', 'sales_lag_546', 'sales_lag_728', 'sales_rmean_364', 'sales_rmean_546', 'sales_lag_91_ewm_0.95', 'sales_lag_98_ewm_0.95', 'sales_lag_105_ewm_0.95', 'sales_lag_112_ewm_0.95', 'sales_lag_119_ewm_0.95', 'sales_lag_126_ewm_0.95', 'sales_lag_182_ewm_0.95', 'sales_lag_364_ewm_0.95', 'sales_lag_546_ewm_0.95', 'sales_lag_728_ewm_0.95', 'sales_lag_91_ewm_0.9', 'sales_lag_98_ewm_0.9', 'sales_lag_105_ewm_0.9', 'sales_lag_112_ewm_0.9', 'sales_lag_119_ewm_0.9', 'sales_lag_126_ewm_0.9', 'sales_lag_182_ewm_0.9', 'sales_lag_364_ewm_0.9', 'sales_lag_546_ewm_0.9', 'sales_lag_728_ewm_0.9', 'sales_lag_91_ewm_0.8', 'sales_lag_98_ewm_0.8', 'sales_lag_105_ewm_0.8', 'sales_lag_112_ewm_0.8', 'sales_lag_119_ewm_0.8', 'sales_lag_126_ewm_0.8', 'sales_lag_182_ewm

In [212]:
def smape(preds, target):
    n = len(preds)
    masked_arr = ~((preds==0)&(target==0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds-target)
    denom = np.abs(preds) + np.abs(target)
    smape_val = (200*np.sum(num/denom))/n
    return smape_val

def lgbm_smape(preds, train_data):
    labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(labels))
    return 'SMAPE', smape_val, False

In [213]:
# LightGBM parameters
lgb_params = {'task':'train',
             'bossting_type':'gbdt',
             'objective':'regression',
             'metric':'mae',
             'num_leaves':10,
             'learning_rate':0.02,
             'feature_fraction':0.8,
             'max_depth':5,
             'verbose':100,
             'num_boost_round':15000,
             'early_stopping_rounds':200,
             'nthread':-1}

In [214]:
train[cols].head()

Unnamed: 0,dayofmonth,dayofyear,weekofyear,is_month_start,is_month_end,sales_lag_91,sales_lag_98,sales_lag_105,sales_lag_112,sales_lag_119,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1,1,1,1,0,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,2,2,1,0,0,,,,,,...,0,0,0,0,0,0,0,0,0,0
2,3,3,1,0,0,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,4,4,1,0,0,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,5,5,1,0,0,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [206]:
# Creating lgbtrain & lgbval
lgbtrain = lgb.Dataset(data=train[cols].values, label=Y_train)
lgbval = lgb.Dataset(data=val[cols].values, label=Y_val)

In [215]:
def lgb_validation(params, lgbtrain, lgbval, X_val, Y_val):
    model = lgb.train(params, lgbtrain, valid_sets=[lgbtrain, lgbval], verbose_eval = 200, 
                     feval = lgbm_smape)
    pred_Y_val = model.predict(X_val, num_iteration=model.best_iteration)
    pred_Y_val = np.expm1(pred_Y_val)
    val_df = pd.DataFrame(columns=['true_Y_val', 'pred_Y_val'])
    val_df['pred_Y_val'] = pred_Y_val
    val_df['true_Y_val'] = Y_val
    print(val_df.shape)
    print('SMAPE for validation data is : ', smape(pred_Y_val, Y_val))
    return model, val_df

In [216]:
model, val_df = lgb_validation(lgb_params, lgbtrain, lgbval, val[cols], Y_val)

Training until validation scores don't improve for 200 rounds.
[200]	training's l1: 0.163577	training's SMAPE: 16.677	valid_1's l1: 0.150426	valid_1's SMAPE: 15.4215
[400]	training's l1: 0.152156	training's SMAPE: 15.5622	valid_1's l1: 0.145717	valid_1's SMAPE: 14.9511
[600]	training's l1: 0.146181	training's SMAPE: 14.971	valid_1's l1: 0.143487	valid_1's SMAPE: 14.7276
[800]	training's l1: 0.142233	training's SMAPE: 14.5761	valid_1's l1: 0.141603	valid_1's SMAPE: 14.5383
[1000]	training's l1: 0.139409	training's SMAPE: 14.2927	valid_1's l1: 0.140173	valid_1's SMAPE: 14.3946
[1200]	training's l1: 0.13731	training's SMAPE: 14.0816	valid_1's l1: 0.139054	valid_1's SMAPE: 14.2821
[1400]	training's l1: 0.135654	training's SMAPE: 13.9149	valid_1's l1: 0.138047	valid_1's SMAPE: 14.1808
[1600]	training's l1: 0.134388	training's SMAPE: 13.7873	valid_1's l1: 0.137359	valid_1's SMAPE: 14.1116
[1800]	training's l1: 0.133396	training's SMAPE: 13.6872	valid_1's l1: 0.136816	valid_1's SMAPE: 14.0569

KeyboardInterrupt: 