In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
import os

In [2]:
# Relative locations of the project directories (the literals use doubled slashes).
# data_folder is the directory create_dt reads its CSV files from.
data_folder = '..//data//'
submission_folder = '..//submissions//'
features_folder = '..//features//'
models_path = '..//models//' 

In [3]:
# Column dtypes handed to pd.read_csv when loading calendar.csv.
CAL_DTYPES = {
    "event_name_1": "category",
    "event_name_2": "category",
    "event_type_1": "category",
    "event_type_2": "category",
    "weekday": "category",
    "wm_yr_wk": "int16",
    "wday": "int16",
    "month": "int16",
    "year": "int16",
    "snap_CA": "float32",
    "snap_TX": "float32",
    "snap_WI": "float32",
}

# Column dtypes handed to pd.read_csv when loading sell_prices.csv.
PRICE_DTYPES = {
    "store_id": "category",
    "item_id": "category",
    "wm_yr_wk": "int16",
    "sell_price": "float32",
}

In [4]:
# Let pandas display up to 50 columns before truncating wide frames.
pd.options.display.max_columns = 50

In [5]:
# Global constants read by create_dt and by the prediction cell.
# h: presumably the forecast horizon in days (the prediction cell builds 28 F-columns) — confirm.
h = 28 
# max_lags: number of days of history kept before each predicted day.
max_lags = 57
# tr_last: number of the last d_<n> column read from the sales file.
tr_last = 1913
# fday: first calendar date that the prediction cell forecasts.
fday = datetime(2016,4, 25) 
fday

datetime.datetime(2016, 4, 25, 0, 0)

In [6]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
    """Build the long-format sales frame joined with calendar and price data.

    Reads ``sell_prices.csv``, ``calendar.csv`` and
    ``sales_train_validation.csv`` from ``data_folder``, replaces every
    categorical column by zero-based int16 codes, melts the daily ``d_<n>``
    columns into rows and merges calendar and price information onto them.

    Parameters
    ----------
    is_train : bool
        When True the earliest loaded day is ``max(1, first_day)``.
        When False it is ``max(tr_last - max_lags, first_day)`` and ``h``
        all-NaN day columns are appended after ``tr_last`` so that
        predictions can be written into them later.
    nrows : int or None
        Forwarded to ``pd.read_csv`` for the sales file only.
    first_day : int
        Lower bound on the first ``d_<n>`` day that is loaded.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day); ``sales`` is NaN for the appended days.
        Both merges use the default join type of ``DataFrame.merge``.
    """
    prices = pd.read_csv(os.path.join(data_folder,'sell_prices.csv'), dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            # Category -> integer code, shifted so the smallest code is 0.
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()

    cal = pd.read_csv(os.path.join(data_folder,'calendar.csv'), dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            # Missing categories get code -1; the shift below moves them to 0.
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()

    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols}
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv(os.path.join(data_folder,'sales_train_validation.csv'),
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)

    # NOTE(review): item_id / store_id are encoded independently here and in
    # `prices`; the merge on those codes presumably relies on both files
    # yielding the same category sets — confirm before using a small `nrows`.
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()

    if not is_train:
        # Placeholder columns for the h days to forecast (was a hard-coded 28).
        for day in range(tr_last+1, tr_last+ h +1):
            dt[f"d_{day}"] = np.nan

    # Wide (one column per day) -> long (one row per id and day).
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)

    return dt

In [7]:
def _week_of_year(dates):
    """Return the ISO week number of every timestamp in ``dates``."""
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        # Series.dt.weekofyear no longer exists in pandas 2.x.
        return accessor.isocalendar().week
    # Older pandas without isocalendar(): fall back to the legacy attribute.
    return accessor.weekofyear


def create_fea(dt):
    """Add lag, rolling-mean and calendar features to ``dt`` in place.

    ``dt`` must contain the columns ``id``, ``sales`` and ``date``.
    Lags are positional within each ``id`` group, so the rows of every id
    are assumed to be in chronological order — verify against the caller.

    Columns written:
      * ``lag_<L>`` for L in 7, 14, 21, 28: sales shifted by L rows per id.
      * ``rmean_<L>_<W>`` for W in 7, 28: W-row rolling mean of ``lag_<L>``
        per id.
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``mday`` as
        int16; a column that already exists is only cast, otherwise it is
        derived from ``date``.

    Returns None; the frame passed in is modified.
    """
    lags = [7, 14, 21, 28]
    lag_cols = [f"lag_{lag}" for lag in lags]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id", "sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins:
        for lag, lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x: x.rolling(win).mean())

    # Feature name -> attribute of the Series.dt accessor that produces it.
    date_features = {
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
    }

    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        elif date_feat_func == "weekofyear":
            dt[date_feat_name] = _week_of_year(dt["date"]).astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [8]:
# First d_<n> day loaded for training; passed to create_dt as first_day.
FIRST_DAY = 350

In [9]:
# Build the training frame starting at FIRST_DAY, then show its (rows, columns) shape.
df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

(40718219, 22)

In [10]:
# Preview the first rows of the merged frame before feature engineering.
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.5
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77


In [11]:
# Column dtypes and memory footprint before feature engineering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40718219 entries, 0 to 40718218
Data columns (total 22 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   dept_id       int16         
 3   store_id      int16         
 4   cat_id        int16         
 5   state_id      int16         
 6   d             object        
 7   sales         float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       int16         
 11  wday          int16         
 12  month         int16         
 13  year          int16         
 14  event_name_1  int16         
 15  event_type_1  int16         
 16  event_name_2  int16         
 17  event_type_2  int16         
 18  snap_CA       float32       
 19  snap_TX       float32       
 20  snap_WI       float32       
 21  sell_price    float32       
dtypes: datetime64[ns](1), float32(5), int16(14), object(2)
memory us

In [12]:
# create_fea adds the lag, rolling-mean and date columns to df in place.
create_fea(df)
df.shape

(40718219, 37)

In [13]:
# Column dtypes after the feature columns have been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40718219 entries, 0 to 40718218
Data columns (total 37 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            object        
 1   item_id       int16         
 2   dept_id       int16         
 3   store_id      int16         
 4   cat_id        int16         
 5   state_id      int16         
 6   d             object        
 7   sales         float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      int16         
 10  weekday       int16         
 11  wday          int16         
 12  month         int16         
 13  year          int16         
 14  event_name_1  int16         
 15  event_type_1  int16         
 16  event_name_2  int16         
 17  event_type_2  int16         
 18  snap_CA       float32       
 19  snap_TX       float32       
 20  snap_WI       float32       
 21  sell_price    float32       
 22  lag_7         float32       
 23  lag_14        float32       
 

In [14]:
# Preview the engineered frame; the earliest rows of each id have no lag values yet.
df.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_7,lag_14,lag_21,lag_28,rmean_7_7,rmean_14_7,rmean_21_7,rmean_28_7,rmean_7_28,rmean_14_28,rmean_21_28,rmean_28_28,week,quarter,mday
0,HOBBIES_1_002_CA_1_validation,1,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,3.97,,,,,,,,,,,,,2,1,13
1,HOBBIES_1_004_CA_1_validation,3,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,4.34,,,,,,,,,,,,,2,1,13
2,HOBBIES_1_005_CA_1_validation,4,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,2.48,,,,,,,,,,,,,2,1,13
3,HOBBIES_1_008_CA_1_validation,7,0,0,0,0,d_350,0.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,0.5,,,,,,,,,,,,,2,1,13
4,HOBBIES_1_009_CA_1_validation,8,0,0,0,0,d_350,2.0,2012-01-13,11150,0,7,1,2012,0,0,0,0,0.0,1.0,0.0,1.77,,,,,,,,,,,,,2,1,13


In [15]:
# Drop, in place, every row that has a missing value in any column
# (this includes the leading rows whose lag / rolling features are NaN).
df.dropna(inplace = True)
df.shape

(39041269, 37)

In [16]:
# Columns handed to LightGBM as categorical: the encoded identifiers plus the event fields.
cat_feats = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
]
# Columns kept out of the feature matrix (row identifiers, raw date fields, the target).
useless_cols = ["id", "date", "sales", "d", "wm_yr_wk", "weekday"]

# Every remaining column, in frame order, is a model feature.
is_feature = ~df.columns.isin(useless_cols)
train_cols = df.columns[is_feature]
X_train = df[train_cols]
y_train = df["sales"]

In [17]:
# Fix NumPy's global seed so the random hold-out drawn below is reproducible.
np.random.seed(777)

# Hold out 2,000,000 row labels chosen at random (without replacement) for
# validation; every other row label is used for training.
fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
# NOTE(review): free_raw_data=False presumably keeps the raw data attached to
# each Dataset after the source frames are deleted — confirm.
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)  # random hold-out on purpose: no time-based train/validation split is applied here

In [18]:
# Release the large intermediates (train_data / fake_valid_data are kept) and run the garbage collector.
del df, X_train, y_train, fake_valid_inds,train_inds ; gc.collect()

60

In [19]:
# LightGBM parameters passed to lgb.train.
# "metric" used to appear twice in this literal; the later entry silently
# replaced the first, so only the effective value is kept.
params = {
    "objective": "poisson",
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    # "sub_feature": 0.8,  (disabled)
    "sub_row": 0.75,
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    "nthread": -1,
    "verbosity": 1,
    "num_iterations": 1200,
    "num_leaves": 128,
    "min_data_in_leaf": 100,
}

In [20]:
%%time

# Train the booster, evaluating on the random hold-out and logging every 20 rounds.
# NOTE(review): `verbose_eval` was removed from lgb.train in LightGBM 4.0; on a
# newer install this presumably needs callbacks=[lgb.log_evaluation(20)] — confirm
# against the installed version.
m_lgb = lgb.train(params, train_data, valid_sets = [fake_valid_data], verbose_eval=20) 



[20]	valid_0's rmse: 3.00654
[40]	valid_0's rmse: 2.60826
[60]	valid_0's rmse: 2.50637
[80]	valid_0's rmse: 2.48065
[100]	valid_0's rmse: 2.47027
[120]	valid_0's rmse: 2.46247
[140]	valid_0's rmse: 2.45377
[160]	valid_0's rmse: 2.44524
[180]	valid_0's rmse: 2.43676
[200]	valid_0's rmse: 2.43034
[220]	valid_0's rmse: 2.42635
[240]	valid_0's rmse: 2.42061
[260]	valid_0's rmse: 2.41581
[280]	valid_0's rmse: 2.41086
[300]	valid_0's rmse: 2.40587
[320]	valid_0's rmse: 2.40072
[340]	valid_0's rmse: 2.39507
[360]	valid_0's rmse: 2.39233
[380]	valid_0's rmse: 2.38936
[400]	valid_0's rmse: 2.38574
[420]	valid_0's rmse: 2.38195
[440]	valid_0's rmse: 2.37849
[460]	valid_0's rmse: 2.37568
[480]	valid_0's rmse: 2.37285
[500]	valid_0's rmse: 2.3703
[520]	valid_0's rmse: 2.36689
[540]	valid_0's rmse: 2.36429
[560]	valid_0's rmse: 2.36144
[580]	valid_0's rmse: 2.35825
[600]	valid_0's rmse: 2.35652
[620]	valid_0's rmse: 2.35518
[640]	valid_0's rmse: 2.3532
[660]	valid_0's rmse: 2.35166
[680]	valid_0's 

In [21]:
# Save the trained booster; the path is relative, so models_path is not used here.
m_lgb.save_model("copiedsolvemodel_1.lgb")

<lightgbm.basic.Booster at 0x23200ee48c8>

In [22]:
%%time
# Recursive day-by-day forecast, repeated once per multiplier in `alphas`;
# the per-alpha forecasts are blended with equal weights into `sub`.
alphas = [1.028, 1.023, 1.018]
weights = [1/len(alphas)]*len(alphas)
sub = 0.
# Submission column names F1..F<h>, one per forecast day.
cols = [f"F{i}" for i in range(1, h + 1)]

for icount, (alpha, weight) in enumerate(zip(alphas, weights)):

    # Rebuilt for every alpha: the loop below writes predictions into
    # te["sales"], and later days read them back as lag features.
    te = create_dt(False)

    for tdelta in range(h):
        day = fday + timedelta(days=tdelta)
        print(tdelta, day)
        # Only the last max_lags days are needed to compute the features of `day`.
        tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
        create_fea(tst)
        tst = tst.loc[tst.date == day , train_cols]
        te.loc[te.date == day, "sales"] = alpha*m_lgb.predict(tst) # magic multiplier by kyakovlev

    te_sub = te.loc[te.date >= fday, ["id", "sales"]].copy()

    # Number the forecast rows of each id 1..h in row order, then pivot to one row per id.
    te_sub["F"] = [f"F{rank}" for rank in te_sub.groupby("id")["id"].cumcount()+1]
    te_sub = te_sub.set_index(["id", "F" ]).unstack()["sales"][cols].reset_index()
    te_sub.fillna(0., inplace = True)
    te_sub.sort_values("id", inplace = True)
    te_sub.reset_index(drop=True, inplace = True)
    te_sub.to_csv(f"submission_1_{icount}.csv",index=False)
    if icount == 0 :
        sub = te_sub
        sub[cols] *= weight
    else:
        sub[cols] += te_sub[cols]*weight
    print(icount, alpha, weight)

# Duplicate the rows under "..._evaluation" ids. regex=True is passed
# explicitly: pandas 2.x treats the pattern literally by default, so
# "validation$" would match nothing and the ids would stay unchanged.
sub2 = sub.copy()
sub2["id"] = sub2["id"].str.replace("validation$", "evaluation", regex=True)
sub = pd.concat([sub, sub2], axis=0, sort=False)
sub.to_csv("submission_1.csv",index=False)

0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2016-05-10 00:00:00
16 2016-05-11 00:00:00
17 2016-05-12 00:00:00
18 2016-05-13 00:00:00
19 2016-05-14 00:00:00
20 2016-05-15 00:00:00
21 2016-05-16 00:00:00
22 2016-05-17 00:00:00
23 2016-05-18 00:00:00
24 2016-05-19 00:00:00
25 2016-05-20 00:00:00
26 2016-05-21 00:00:00
27 2016-05-22 00:00:00
0 1.028 0.3333333333333333
0 2016-04-25 00:00:00
1 2016-04-26 00:00:00
2 2016-04-27 00:00:00
3 2016-04-28 00:00:00
4 2016-04-29 00:00:00
5 2016-04-30 00:00:00
6 2016-05-01 00:00:00
7 2016-05-02 00:00:00
8 2016-05-03 00:00:00
9 2016-05-04 00:00:00
10 2016-05-05 00:00:00
11 2016-05-06 00:00:00
12 2016-05-07 00:00:00
13 2016-05-08 00:00:00
14 2016-05-09 00:00:00
15 2

In [23]:
# Preview the first ten rows of the blended submission.
sub.head(10)

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,0.924376,0.812505,0.842155,0.795653,1.032035,1.208074,1.174494,1.01429,0.94161,0.968394,0.905892,1.086157,1.322165,1.167043,1.045473,0.903791,0.950671,0.935604,1.084512,1.322252,1.220886,0.956697,0.829883,0.813368,0.827918,1.020799,1.214159,1.206567
1,FOODS_1_001_CA_2_validation,0.9074,0.949386,0.952461,1.089078,1.135036,1.39473,1.617556,0.965442,0.967987,0.942706,0.952684,1.103983,1.406678,1.328676,0.970489,0.936402,0.966163,0.979162,1.139836,1.457484,1.423784,1.013939,0.964491,0.939544,1.007749,1.23426,1.52041,1.302739
2,FOODS_1_001_CA_3_validation,1.054611,1.008,0.910706,0.899466,0.979719,1.345838,1.229482,1.016088,1.023457,0.932988,0.930335,1.05063,1.308627,1.277563,1.10333,1.096671,0.994463,1.012974,1.103657,1.542354,1.59123,1.120563,1.013249,0.932125,0.926137,1.036229,1.191677,1.225033
3,FOODS_1_001_CA_4_validation,0.407927,0.355031,0.34555,0.337473,0.398993,0.449471,0.474102,0.397965,0.412314,0.415268,0.394165,0.433619,0.458728,0.420849,0.382473,0.381323,0.395804,0.398856,0.453028,0.472586,0.491813,0.378936,0.35429,0.365423,0.369141,0.425287,0.462698,0.472763
4,FOODS_1_001_TX_1_validation,0.204671,0.172103,0.161163,0.157341,0.162186,0.164726,0.197123,0.40555,0.405143,0.428188,0.437954,0.446132,0.447745,0.370832,0.408182,0.456276,0.420531,0.374086,0.393256,0.3674,0.346165,0.29275,0.278274,0.273114,0.275366,0.298286,0.344262,0.333204
5,FOODS_1_001_TX_2_validation,0.448455,0.413596,0.416025,0.39268,0.457195,0.481632,0.53326,0.442461,0.443458,0.413891,0.484601,0.542434,0.590355,0.523871,0.462952,0.442204,0.456858,0.447359,0.51125,0.560843,0.579361,0.43959,0.410526,0.420823,0.410719,0.486712,0.523037,0.527316
6,FOODS_1_001_TX_3_validation,0.387992,0.360171,0.353137,0.417039,0.459702,0.463167,0.553815,0.456191,0.488881,0.446354,0.494592,0.511415,0.560184,0.480846,0.468245,0.440207,0.456491,0.493567,0.508798,0.5469,0.535873,0.446318,0.412041,0.418357,0.427737,0.472473,0.517087,0.497154
7,FOODS_1_001_WI_1_validation,0.320279,0.366089,0.345213,0.352646,0.454076,0.714665,0.567354,0.461692,0.470839,0.421968,0.446744,0.571518,0.844753,0.609846,0.547638,0.557485,0.547188,0.562922,0.67872,0.851233,0.765048,0.563251,0.547457,0.55732,0.568342,0.688668,0.914347,0.816129
8,FOODS_1_001_WI_2_validation,0.307215,0.322557,0.36674,0.333024,0.426817,0.400675,0.411327,0.435082,0.474717,0.433626,0.404509,0.483762,0.498508,0.41965,0.43061,0.405353,0.47711,0.451042,0.479633,0.509972,0.489635,0.387859,0.389887,0.377814,0.377342,0.415317,0.444708,0.420352
9,FOODS_1_001_WI_3_validation,0.234339,0.230831,0.222032,0.224245,0.265544,0.367142,0.360474,0.37328,0.388953,0.342757,0.362457,0.413965,0.46168,0.366616,0.346459,0.351708,0.355939,0.340982,0.409364,0.495913,0.429784,0.33236,0.332411,0.326996,0.324213,0.378321,0.421622,0.374476


In [24]:
# Sanity check: number of distinct ids, and how many ids end in "validation".
# If the evaluation copy was renamed correctly, the second number is half the first.
sub.id.nunique(), sub["id"].str.contains("validation$").sum()

(60980, 30490)

In [25]:
# (rows, columns) of the final submission: the id column plus the F-columns.
sub.shape

(60980, 29)

In [26]:
# Prints a fixed marker string; no other cell uses it.
print('hi')

hi
