In [1]:
import pandas as pd
import numpy as no
import lightgbm as lgb
from sklearn import preprocessing, model_selection, metrics
from sklearn.model_selection import KFold,train_test_split,TimeSeriesSplit
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print("Train rows and columns : ", train.shape)
print("Test rows and columns : ", test.shape)

Train rows and columns :  (913000, 4)
Test rows and columns :  (45000, 4)


In [3]:
df = pd.concat([train,test])
print(df.shape)
df.head()

(958000, 5)


Unnamed: 0,date,id,item,sales,store
0,2013-01-01,,1,13.0,1
1,2013-01-02,,1,11.0,1
2,2013-01-03,,1,14.0,1
3,2013-01-04,,1,13.0,1
4,2013-01-05,,1,10.0,1


In [4]:
df['date'] = pd.to_datetime(df['date'],infer_datetime_format=True)

In [5]:
df['month'] = df['date'].dt.month
df['weekday'] = df['date'].dt.dayofweek
df['year'] = df['date'].dt.year
df['week_of_year']  = df.date.dt.weekofyear
df.head()

Unnamed: 0,date,id,item,sales,store,month,weekday,year,week_of_year
0,2013-01-01,,1,13.0,1,1,1,2013,1
1,2013-01-02,,1,11.0,1,1,2,2013,1
2,2013-01-03,,1,14.0,1,1,3,2013,1
3,2013-01-04,,1,13.0,1,1,4,2013,1
4,2013-01-05,,1,10.0,1,1,5,2013,1


In [6]:
df["median-store_item-month"] = df.groupby(['month',"item","store"])["sales"].transform("median")
df["mean-store_item-week"] = df.groupby(['week_of_year',"item","store"])["sales"].transform("mean")
df["item-month-sum"] = df.groupby(['month',"item"])["sales"].transform("sum") # total sales of that item  for all stores
df["store-month-sum"] = df.groupby(['month',"store"])["sales"].transform("sum") # total sales of that store  for all items

In [7]:
# get shifted features for grouped data. Note need to sort first! 
df['store_item_shifted-90'] = df.groupby(["item","store"])['sales'].transform(lambda x:x.shift(90)) # sales for that item 90 days = 3 months ago
df['store_item_shifted-180'] = df.groupby(["item","store"])['sales'].transform(lambda x:x.shift(180)) # sales for that item 180 days = 3 months ago
df['store_item_shifted-365'] = df.groupby(["item","store"])['sales'].transform(lambda x:x.shift(365)) # sales for that 1 year  ago

In [8]:
df["item-week_shifted-90"] = df.groupby(['week_of_year',"item"])["sales"].transform(lambda x:x.shift(12).sum()) # shifted total sales for that item 12 weeks (3 months) ago
df["store-week_shifted-90"] = df.groupby(['week_of_year',"store"])["sales"].transform(lambda x:x.shift(12).sum()) # shifted total sales for that store 12 weeks (3 months) ago
df["item-week_shifted-90"] = df.groupby(['week_of_year',"item"])["sales"].transform(lambda x:x.shift(12).mean()) # shifted mean sales for that item 12 weeks (3 months) ago
df["store-week_shifted-90"] = df.groupby(['week_of_year',"store"])["sales"].transform(lambda x:x.shift(12).mean()) # shifted mean sales for that store 12 weeks (3 months) ago

In [9]:
col = [i for i in df.columns if i not in ['date','id']]
y = 'sales'

In [10]:
train = df.loc[~df.sales.isna()]
print("new train",train.shape)
test = df.loc[df.sales.isna()]
print("new test",test.shape)

new train (913000, 18)
new test (45000, 18)


In [11]:
X_train = train.drop(['date','sales','id'], axis=1)
y_train = train['sales'].values
X_test = test.drop(['id','date','sales'], axis=1)

In [12]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {
             'colsample_bytree': 0.67,
             'gamma': 0.19,
             'learning_rate': 0.1,
             'max_depth': 6,
             'eval_metric' : 'mae',
             'min_child_weight': 3,
             'nthread': 5,
             'objective': 'reg:linear',
             'scale_pos_weight': 1,
             'subsample': 0.60,
             'random_state': 2018,
             'n_estimators': 400,
             'reg_alpha': 1.92,
             'reg_lambda': 6.28
             }
    tr_data = xgb.DMatrix(train_X, train_y)
    va_data = xgb.DMatrix(val_X, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 50, verbose_eval=-1)
    
    dtest = xgb.DMatrix(test_X)
    xgb_pred_y = model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit)
    
    return xgb_pred_y, model_xgb

In [13]:
tscv = TimeSeriesSplit(n_splits=3)
pred_test_full_xgb = 0
for dev_index, val_index in tscv.split(X_train):
    dev_X, val_X = X_train.loc[dev_index,:], X_train.loc[val_index,:]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    pred_test, model = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
    pred_test_full_xgb += pred_test
pred_test_full_xgb /= 3.

[0]	train-mae:46.9971	valid-mae:51.1138
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 50 rounds.
[1]	train-mae:42.3134	valid-mae:46.0432
[2]	train-mae:38.0928	valid-mae:41.6034
[3]	train-mae:34.299	valid-mae:37.5066
[4]	train-mae:30.8839	valid-mae:33.79
[5]	train-mae:27.8086	valid-mae:30.4568
[6]	train-mae:25.0501	valid-mae:27.4732
[7]	train-mae:22.5697	valid-mae:24.7675
[8]	train-mae:20.344	valid-mae:22.3509
[9]	train-mae:18.3534	valid-mae:20.1932
[10]	train-mae:16.5843	valid-mae:18.2527
[11]	train-mae:15.0103	valid-mae:16.5363
[12]	train-mae:13.6319	valid-mae:15.0136
[13]	train-mae:12.4334	valid-mae:13.6879
[14]	train-mae:11.3699	valid-mae:12.5148
[15]	train-mae:10.481	valid-mae:11.5643
[16]	train-mae:9.68911	valid-mae:10.6788
[17]	train-mae:9.01198	valid-mae:9.92317
[18]	train-mae:8.46382	valid-mae:9.29384
[19]	train-mae:7.9692	valid-mae:8.7303
[20]	train-mae:7.56211	valid-mae:8.28136
[21]	train-ma

[196]	train-mae:5.44097	valid-mae:5.85248
[197]	train-mae:5.44055	valid-mae:5.85238
[198]	train-mae:5.44031	valid-mae:5.85246
[199]	train-mae:5.43981	valid-mae:5.8525
[200]	train-mae:5.43928	valid-mae:5.85251
[201]	train-mae:5.43856	valid-mae:5.85246
[202]	train-mae:5.4382	valid-mae:5.85248
[203]	train-mae:5.43767	valid-mae:5.85271
[204]	train-mae:5.43729	valid-mae:5.85281
[205]	train-mae:5.43675	valid-mae:5.85284
[206]	train-mae:5.43649	valid-mae:5.85304
[207]	train-mae:5.43593	valid-mae:5.85312
[208]	train-mae:5.43553	valid-mae:5.85313
[209]	train-mae:5.43504	valid-mae:5.85295
[210]	train-mae:5.43457	valid-mae:5.85328
[211]	train-mae:5.43422	valid-mae:5.85348
[212]	train-mae:5.43354	valid-mae:5.85393
Stopping. Best iteration:
[162]	train-mae:5.45951	valid-mae:5.84937

[0]	train-mae:49.0272	valid-mae:48.4433
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 50 rounds.
[1]	train-mae:44.1342	valid-mae:43.5

[177]	train-mae:5.61706	valid-mae:5.66488
[178]	train-mae:5.61649	valid-mae:5.6645
[179]	train-mae:5.61612	valid-mae:5.66444
[180]	train-mae:5.61582	valid-mae:5.6643
[181]	train-mae:5.61563	valid-mae:5.66428
[182]	train-mae:5.61532	valid-mae:5.66429
[183]	train-mae:5.61479	valid-mae:5.66392
[184]	train-mae:5.61447	valid-mae:5.66393
[185]	train-mae:5.614	valid-mae:5.66369
[186]	train-mae:5.61368	valid-mae:5.66359
[187]	train-mae:5.61332	valid-mae:5.66352
[188]	train-mae:5.61283	valid-mae:5.66333
[189]	train-mae:5.6124	valid-mae:5.66317
[190]	train-mae:5.61212	valid-mae:5.66324
[191]	train-mae:5.61178	valid-mae:5.66318
[192]	train-mae:5.61131	valid-mae:5.66318
[193]	train-mae:5.61099	valid-mae:5.66301
[194]	train-mae:5.61071	valid-mae:5.663
[195]	train-mae:5.6104	valid-mae:5.66296
[196]	train-mae:5.60987	valid-mae:5.66294
[197]	train-mae:5.60944	valid-mae:5.66306
[198]	train-mae:5.60901	valid-mae:5.66294
[199]	train-mae:5.60874	valid-mae:5.66295
[200]	train-mae:5.60853	valid-mae:5.66292


[373]	train-mae:5.55657	valid-mae:5.65772
[374]	train-mae:5.55638	valid-mae:5.6578
[375]	train-mae:5.55608	valid-mae:5.6578
[376]	train-mae:5.5559	valid-mae:5.65783
[377]	train-mae:5.55551	valid-mae:5.65782
[378]	train-mae:5.55527	valid-mae:5.65778
[379]	train-mae:5.55495	valid-mae:5.65785
[380]	train-mae:5.55481	valid-mae:5.65787
[381]	train-mae:5.55453	valid-mae:5.65793
[382]	train-mae:5.55427	valid-mae:5.65797
[383]	train-mae:5.55404	valid-mae:5.65796
[384]	train-mae:5.55373	valid-mae:5.65782
[385]	train-mae:5.55351	valid-mae:5.65771
[386]	train-mae:5.55326	valid-mae:5.65778
[387]	train-mae:5.5531	valid-mae:5.65775
[388]	train-mae:5.55287	valid-mae:5.65787
[389]	train-mae:5.55267	valid-mae:5.65794
[390]	train-mae:5.55246	valid-mae:5.65799
[391]	train-mae:5.55208	valid-mae:5.65811
[392]	train-mae:5.55173	valid-mae:5.65802
[393]	train-mae:5.55152	valid-mae:5.65804
[394]	train-mae:5.55135	valid-mae:5.65803
[395]	train-mae:5.55107	valid-mae:5.65811
[396]	train-mae:5.55072	valid-mae:5.65

[148]	train-mae:5.63371	valid-mae:5.13784
[149]	train-mae:5.63328	valid-mae:5.13782
[150]	train-mae:5.63297	valid-mae:5.1376
[151]	train-mae:5.63251	valid-mae:5.13752
[152]	train-mae:5.63228	valid-mae:5.13726
[153]	train-mae:5.63191	valid-mae:5.13716
[154]	train-mae:5.63147	valid-mae:5.13692
[155]	train-mae:5.63106	valid-mae:5.13668
[156]	train-mae:5.6306	valid-mae:5.13637
[157]	train-mae:5.6303	valid-mae:5.13637
[158]	train-mae:5.62997	valid-mae:5.13626
[159]	train-mae:5.62959	valid-mae:5.13619
[160]	train-mae:5.62927	valid-mae:5.13617
[161]	train-mae:5.62886	valid-mae:5.13604
[162]	train-mae:5.62853	valid-mae:5.13602
[163]	train-mae:5.62829	valid-mae:5.13597
[164]	train-mae:5.62807	valid-mae:5.13598
[165]	train-mae:5.62768	valid-mae:5.13596
[166]	train-mae:5.62736	valid-mae:5.13593
[167]	train-mae:5.62705	valid-mae:5.13581
[168]	train-mae:5.62668	valid-mae:5.13581
[169]	train-mae:5.62638	valid-mae:5.13572
[170]	train-mae:5.62605	valid-mae:5.13559
[171]	train-mae:5.62556	valid-mae:5.1

[344]	train-mae:5.58187	valid-mae:5.12564
[345]	train-mae:5.58171	valid-mae:5.12567
[346]	train-mae:5.58152	valid-mae:5.12561
[347]	train-mae:5.58137	valid-mae:5.12561
[348]	train-mae:5.58116	valid-mae:5.12562
[349]	train-mae:5.58094	valid-mae:5.12564
[350]	train-mae:5.5807	valid-mae:5.12557
[351]	train-mae:5.58051	valid-mae:5.12556
[352]	train-mae:5.58014	valid-mae:5.12539
[353]	train-mae:5.57983	valid-mae:5.12534
[354]	train-mae:5.57963	valid-mae:5.12537
[355]	train-mae:5.57939	valid-mae:5.12544
[356]	train-mae:5.57926	valid-mae:5.12544
[357]	train-mae:5.57899	valid-mae:5.12537
[358]	train-mae:5.57885	valid-mae:5.12528
[359]	train-mae:5.57867	valid-mae:5.12529
[360]	train-mae:5.57851	valid-mae:5.12523
[361]	train-mae:5.5784	valid-mae:5.12525
[362]	train-mae:5.57818	valid-mae:5.12524
[363]	train-mae:5.57804	valid-mae:5.12524
[364]	train-mae:5.57789	valid-mae:5.12523
[365]	train-mae:5.57765	valid-mae:5.12519
[366]	train-mae:5.57739	valid-mae:5.12523
[367]	train-mae:5.57712	valid-mae:5.

In [15]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "mape",
        "boosting_type":"gbdt",
        "learning_rate" : 0.1,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "seed": 2018,
        "num_leaves":7,
        "max_depth":3,
        "eval_freq" : 25,
    }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 2000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=50, 
                      verbose_eval=-1, 
                      evals_result=evals_result)
    
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

tscv = TimeSeriesSplit(n_splits=3)
pred_test_full_lgbm = 0
for dev_index, val_index in tscv.split(X_train):
    dev_X, val_X = X_train.loc[dev_index,:], X_train.loc[val_index,:]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    pred_test, model,evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
    pred_test_full_lgbm += pred_test
pred_test_full_lgbm /= 3.

Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1328]	training's mape: 0.130956	valid_1's mape: 0.121112
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1390]	training's mape: 0.126229	valid_1's mape: 0.126295
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[1430]	training's mape: 0.126167	valid_1's mape: 0.140143


In [16]:
sample = pd.read_csv('sample_submission.csv')
sample['sales'] = (pred_test_full_lgbm + pred_test_full_xgb) / 2
sample.to_csv('Submission_LGBM_XGB.csv', index=False)

In [17]:
df.head(120)

Unnamed: 0,date,id,item,sales,store,month,weekday,year,week_of_year,median-store_item-month,mean-store_item-week,item-month-sum,store-month-sum,store_item_shifted-90,store_item_shifted-180,store_item_shifted-365,item-week_shifted-90,store-week_shifted-90
0,2013-01-01,,1,13.0,1,1,1,2013,1,13.0,13.970588,22987.0,249352.0,,,,15.038235,32.529412
1,2013-01-02,,1,11.0,1,1,2,2013,1,13.0,13.970588,22987.0,249352.0,,,,15.038235,32.529412
2,2013-01-03,,1,14.0,1,1,3,2013,1,13.0,13.970588,22987.0,249352.0,,,,15.038235,32.529412
3,2013-01-04,,1,13.0,1,1,4,2013,1,13.0,13.970588,22987.0,249352.0,,,,15.038235,32.529412
4,2013-01-05,,1,10.0,1,1,5,2013,1,13.0,13.970588,22987.0,249352.0,,,,15.038235,32.529412
5,2013-01-06,,1,12.0,1,1,6,2013,1,13.0,13.970588,22987.0,249352.0,,,,15.038235,32.529412
6,2013-01-07,,1,10.0,1,1,0,2013,2,13.0,13.200000,22987.0,249352.0,,,,14.982857,32.361143
7,2013-01-08,,1,9.0,1,1,1,2013,2,13.0,13.200000,22987.0,249352.0,,,,14.982857,32.361143
8,2013-01-09,,1,12.0,1,1,2,2013,2,13.0,13.200000,22987.0,249352.0,,,,14.982857,32.361143
9,2013-01-10,,1,9.0,1,1,3,2013,2,13.0,13.200000,22987.0,249352.0,,,,14.982857,32.361143
