Create lagged features
https://www.kaggle.com/c/rossmann-store-sales/discussion/17919

In [2]:
import os
import pandas as pd
import lightgbm as lgb

In [7]:
## Define parameters
NUM_ROUNDS = 12
TRAIN_START_WEEK = 40
TRAIN_END_WEEK_LIST = range(135, 159, 2)
TEST_START_WEEK_LIST = range(137, 161, 2)
TEST_END_WEEK_LIST = range(138, 162, 2)

In [3]:
train_data_path = '../../data/train'
train_df = pd.read_csv(os.path.join(train_data_path, 'train_round_1.csv'))
train_df.head(3)

Unnamed: 0,store,brand,week,logmove,constant,price1,price2,price3,price4,price5,price6,price7,price8,price9,price10,price11,deal,feat,profit
0,2,1,40,9.018695,1,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1,0.0,37.992326
1,2,1,46,8.723231,1,0.060469,0.060312,0.045156,0.046719,0.049531,0.047813,0.045781,0.027969,0.042969,0.042031,0.038984,0,0.0,30.126667
2,2,1,47,8.253228,1,0.060469,0.060312,0.045156,0.046719,0.037344,0.053021,0.045781,0.041406,0.048125,0.032656,0.038984,0,0.0,30.0


In [17]:
## Fill missing values
r = 0
store_list = train_df['store'].unique()
brand_list = train_df['brand'].unique()
week_list = range(TRAIN_START_WEEK, TRAIN_END_WEEK_LIST[r]+1)
d = {'store': store_list,
     'brand': brand_list,
     'week': week_list}

In [25]:
def df_from_cartesian_product(dict_in):
    """Generate a Pandas dataframe from Cartesian product of lists
    
    Args: 
        dict_in: Dictionary containing multiple lists
        
    Returns:
        df: Pandas dataframe corresponding to the Caresian product of the lists
    """
    from collections import OrderedDict
    from itertools import product
    od = OrderedDict(sorted(dict_in.items()))
    cart = list(product(*od.values()))
    df = pd.DataFrame(cart, columns=od.keys())
    return df
data_grid = df_from_cartesian_product(d)
train_filled = pd.merge(data_grid, train_df, how='left', 
                        on=['store', 'brand', 'week'])

In [30]:
train_filled[train_filled.isnull().any(axis=1)].shape

(3465, 19)

In [40]:
train_filled2 = train_filled.groupby(['store','brand']). \
                             apply(lambda x: x.fillna(method='ffill').fillna(method='bfill')) 
train_filled2
train_sub = train_filled2[(train_filled2.store==2) & (train_filled2.brand==1)]

In [41]:
train_sub

Unnamed: 0,brand,store,week,logmove,constant,price1,price2,price3,price4,price5,price6,price7,price8,price9,price10,price11,deal,feat,profit
0,1,2,40,9.018695,1.0,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1.0,0.0,37.992326
1,1,2,41,9.018695,1.0,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1.0,0.0,37.992326
2,1,2,42,9.018695,1.0,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1.0,0.0,37.992326
3,1,2,43,9.018695,1.0,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1.0,0.0,37.992326
4,1,2,44,9.018695,1.0,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1.0,0.0,37.992326
5,1,2,45,9.018695,1.0,0.060469,0.060497,0.042031,0.029531,0.049531,0.053021,0.038906,0.041406,0.028906,0.024844,0.038984,1.0,0.0,37.992326
6,1,2,46,8.723231,1.0,0.060469,0.060312,0.045156,0.046719,0.049531,0.047813,0.045781,0.027969,0.042969,0.042031,0.038984,0.0,0.0,30.126667
7,1,2,47,8.253228,1.0,0.060469,0.060312,0.045156,0.046719,0.037344,0.053021,0.045781,0.041406,0.048125,0.032656,0.038984,0.0,0.0,30.000000
8,1,2,48,8.987197,1.0,0.060469,0.060312,0.049844,0.037344,0.049531,0.053021,0.045781,0.041406,0.042344,0.032656,0.038984,0.0,0.0,29.950000
9,1,2,49,8.987197,1.0,0.060469,0.060312,0.049844,0.037344,0.049531,0.053021,0.045781,0.041406,0.042344,0.032656,0.038984,0.0,0.0,29.950000


In [46]:
train_sub[['logmove','price1']].shift(1)

Unnamed: 0,logmove,price1
0,,
1,9.018695,0.060469
2,9.018695,0.060469
3,9.018695,0.060469
4,9.018695,0.060469
5,9.018695,0.060469
6,9.018695,0.060469
7,8.723231,0.060469
8,8.253228,0.060469
9,8.987197,0.060469


In [51]:
def lagged_features(df, lags):
    """Create lagged features based on time series data
    
    Args:
        df (dataframe): Input time series data sorted by time
        lags (list): Lag lengths
        
    Returns:
        fea (dataframe): Lagged features 
    """
    df_list = []
    for lag in lags:
        df_shifted = df.shift(lag)
        df_shifted.columns = [x + '_lag' + str(lag) for x in df_shifted.columns]
        df_list.append(df_shifted)
    fea = pd.concat(df_list, axis=1)
    return fea

In [56]:
train_sub.columns

Index(['brand', 'store', 'week', 'logmove', 'constant', 'price1', 'price2',
       'price3', 'price4', 'price5', 'price6', 'price7', 'price8', 'price9',
       'price10', 'price11', 'deal', 'feat', 'profit'],
      dtype='object')

In [61]:
lags = [1,2]
lagged_fea = lagged_features(train_sub[['logmove','price1', 'deal', 'feat']], lags)
train_sub2 = pd.concat([train_sub[['week', 'logmove']], lagged_fea], axis=1)
train_sub2

Unnamed: 0,week,logmove,logmove_lag1,price1_lag1,deal_lag1,feat_lag1,logmove_lag2,price1_lag2,deal_lag2,feat_lag2
0,40,9.018695,,,,,,,,
1,41,9.018695,9.018695,0.060469,1.0,0.0,,,,
2,42,9.018695,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
3,43,9.018695,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
4,44,9.018695,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
5,45,9.018695,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
6,46,8.723231,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
7,47,8.253228,8.723231,0.060469,0.0,0.0,9.018695,0.060469,1.0,0.0
8,48,8.987197,8.253228,0.060469,0.0,0.0,8.723231,0.060469,0.0,0.0
9,49,8.987197,8.987197,0.060469,0.0,0.0,8.253228,0.060469,0.0,0.0


In [66]:
train_sub2.drop('logmove', axis=1, inplace=False)

Unnamed: 0,week,logmove_lag1,price1_lag1,deal_lag1,feat_lag1,logmove_lag2,price1_lag2,deal_lag2,feat_lag2
0,40,,,,,,,,
1,41,9.018695,0.060469,1.0,0.0,,,,
2,42,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
3,43,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
4,44,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
5,45,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
6,46,9.018695,0.060469,1.0,0.0,9.018695,0.060469,1.0,0.0
7,47,8.723231,0.060469,0.0,0.0,9.018695,0.060469,1.0,0.0
8,48,8.253228,0.060469,0.0,0.0,8.723231,0.060469,0.0,0.0
9,49,8.987197,0.060469,0.0,0.0,8.253228,0.060469,0.0,0.0


In [76]:
print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 10, #200,
    'learning_rate': 0.001, #0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}
MAX_ROUNDS = 5000
dtrain= lgb.Dataset(
            train_sub2.drop('logmove', axis=1, inplace=False), label=train_sub2['logmove'],
            categorical_feature=['deal_lag1', 'deal_lag2', 'feat_lag1', 'feat_lag2']
)
bst = lgb.train(
    params, dtrain, num_boost_round=MAX_ROUNDS,
    valid_sets=[dtrain], early_stopping_rounds=125, verbose_eval=50
)
   

Training and predicting models...
Training until validation scores don't improve for 125 rounds.
[50]	training's l2: 0.37524
[100]	training's l2: 0.368041
[150]	training's l2: 0.36132




[200]	training's l2: 0.35488
[250]	training's l2: 0.349029
[300]	training's l2: 0.343492
[350]	training's l2: 0.338337
[400]	training's l2: 0.33302
[450]	training's l2: 0.328257
[500]	training's l2: 0.323468
[550]	training's l2: 0.318836
[600]	training's l2: 0.314451
[650]	training's l2: 0.310373
[700]	training's l2: 0.3063
[750]	training's l2: 0.302378
[800]	training's l2: 0.298326
[850]	training's l2: 0.294713
[900]	training's l2: 0.291499
[950]	training's l2: 0.288361
[1000]	training's l2: 0.285173
[1050]	training's l2: 0.28215
[1100]	training's l2: 0.279426
[1150]	training's l2: 0.276575
[1200]	training's l2: 0.273941
[1250]	training's l2: 0.271118
[1300]	training's l2: 0.268525
[1350]	training's l2: 0.265975
[1400]	training's l2: 0.263431
[1450]	training's l2: 0.261057
[1500]	training's l2: 0.25881
[1550]	training's l2: 0.256649
[1600]	training's l2: 0.25424
[1650]	training's l2: 0.252275
[1700]	training's l2: 0.250009
[1750]	training's l2: 0.247866
[1800]	training's l2: 0.245946


In [77]:
bst.predict(train_sub2)

array([8.83564446, 9.00096889, 9.06679147, 9.06679147, 9.06679147,
       9.06679147, 9.04282538, 9.17703801, 9.08244741, 9.08244741,
       9.13604988, 9.08568805, 9.16865963, 9.00826912, 9.10467846,
       9.12256032, 9.12256032, 9.18076645, 9.18124091, 9.12256032,
       9.25593887, 9.18409705, 9.21488136, 9.265507  , 9.24475885,
       9.18766315, 9.21059354, 9.35259654, 9.31579156, 9.30455896,
       9.31594884, 9.33790935, 9.22869741, 9.27351536, 9.33768376,
       9.16302032, 9.12168741, 9.17595537, 9.12168741, 9.10813003,
       9.63312322, 9.59847332, 9.61065089, 9.62717827, 9.53177742,
       9.73162317, 9.64854833, 9.58320511, 9.67742413, 9.54515365,
       9.76711067, 9.7542102 , 9.6659487 , 9.59542762, 9.68846222,
       9.8366578 , 9.8366578 , 9.74913121, 9.72854446, 9.78704466,
       9.70256628, 9.70256628, 9.70256628, 9.73583167, 9.81775351,
       9.49159886, 9.47837845, 9.49227687, 9.3925822 , 9.38323446,
       9.37020492, 9.36515425, 9.4931966 , 9.4589449 , 9.37917

In [79]:
list(train_sub2.logmove)

[9.0186954877,
 9.0186954877,
 9.0186954877,
 9.0186954877,
 9.0186954877,
 9.0186954877,
 8.7232312748,
 8.253227645599999,
 8.987196820700001,
 8.987196820700001,
 9.0933570165,
 8.8773819547,
 9.2946815204,
 8.954673629,
 9.0492322116,
 9.0492322116,
 9.0492322116,
 8.613230379600001,
 8.6806716604,
 9.0340804066,
 8.6914825765,
 8.8317119178,
 9.1286963829,
 9.4059071555,
 9.4471501141,
 8.7838558966,
 8.7232312748,
 9.9579757378,
 9.4267412424,
 9.1560953571,
 9.793672686499999,
 9.1493156701,
 8.743850562,
 8.8410143105,
 9.7272275871,
 8.743850562,
 8.979164649,
 8.7232312748,
 8.979164649,
 8.9629041281,
 8.712759975,
 10.649606618,
 8.5026885052,
 10.292281126,
 9.2087390906,
 10.468801362,
 10.083138881,
 8.8684132847,
 10.106918073,
 8.7540029335,
 8.712759975,
 10.420374768,
 9.491601876599999,
 8.7335940619,
 9.2708708717,
 10.707102186,
 10.707102186,
 9.9082760693,
 9.1217277136,
 9.9966135305,
 9.515469357999999,
 9.515469357999999,
 9.515469357999999,
 8.3332703533,
 1