# M5 Forecasting - Modelling with Target Encoding

## Load Data

In [1]:
## Load library 
import gc
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import lightgbm as lgb
from datetime import datetime, timedelta
from category_encoders import TargetEncoder 

# Load the three raw tables (paths are relative to the notebook's working directory)
sales = pd.read_csv('data/sales_train_evaluation.csv')
prices = pd.read_csv('data/sell_prices.csv')
calendar = pd.read_csv('data/calendar.csv')

# NOTE: placeholder columns for the forecast days (d_1942-d_1969) are not added
# here; create_data(istrain=False) appends them when it builds the test set.

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
sales.info()
prices.info()
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1947 entries, id to d_1941
dtypes: int64(1941), object(6)
memory usage: 452.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype  
---  ------      -----  
 0   store_id    object 
 1   item_id     object 
 2   wm_yr_wk    int64  
 3   sell_price  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 208.8+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1969 non-null   object
 1   wm_yr_wk      1969 non-null   int64 
 2   weekday       1969 non-null   object
 3   wday          1969 non-null   int64 
 4   month         1969 non-null   int64 
 5   year          1969 non-null   int64 
 6   d             1969 non-null   object
 7   event_n

## Downcasting

Credit: __[Time Series Forecasting - EDA, FE & Modelling](https://www.kaggle.com/anshuls235/time-series-forecasting-eda-fe-modelling)__

We see above that the sales dataset has 1941 int64 columns and 6 object columns corresponding to the categorical variables in the data. The prices dataset has 4 columns: 2 object columns, which are the categorical variables, plus an int64 column and a float64 column, both of which are numeric. 

Using the min and max value of a column, each column can be converted to a subtype that uses less memory, thus reducing the size of that column and saving memory. Subtypes are as follows
- int8/uint8: 1 byte
- float16/int16/uint16: 2 bytes
- float32/int32/uint32: 4 bytes
- float64/int64/uint64: 8 bytes

I will use the function below to downcast the data. This is used again after creating a merged dataset including all the sales, prices and calendar information.

In [2]:
## Function to downcast data
def downcast(df):
    """Narrow the dtypes of ``df``'s columns in place to reduce memory use.

    Rules, applied per column:
      * integer columns -> the smallest of int8/int16/int32 whose range strictly
        contains the column's min and max, otherwise int64;
      * float columns   -> float16 or float32 under the same strict range test,
        otherwise float64 (a column whose min/max is NaN fails every
        comparison and therefore stays float64);
      * object columns  -> the column named 'date' is parsed as '%Y-%m-%d'
        datetimes, every other object column becomes 'category';
      * all other dtypes (category, datetime, bool, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    The float test only checks the value range, not precision: float16 keeps
    roughly three significant decimal digits, so values such as prices are
    rounded when they are narrowed.
    """
    int_ladder = (np.int8, np.int16, np.int32)
    float_ladder = (np.float16, np.float32)

    for col, dtype in df.dtypes.items():
        dtype_name = str(dtype)

        if 'int' in dtype_name:
            # min/max are computed once per column instead of once per comparison
            lo, hi = df[col].min(), df[col].max()
            target = np.int64
            for candidate in int_ladder:
                limits = np.iinfo(candidate)
                if lo > limits.min and hi < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

        elif 'float' in dtype_name:
            lo, hi = df[col].min(), df[col].max()
            target = np.float64
            for candidate in float_ladder:
                limits = np.finfo(candidate)
                if lo > limits.min and hi < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

        # The alias np.object was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `object` compares equal to the object dtype on every version.
        elif dtype == object:
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

In [3]:
%%time

# Shrink each raw table; downcast() edits the frame in place and hands it back.
sales, prices, calendar = [downcast(table) for table in (sales, prices, calendar)]

Wall time: 2min 46s


**After downcasting**

The reduction in size is significant. The sales dataframe reduces from 452.9 MB to 96.6 MB. The prices dataframe reduces from 208.8 MB to 45.8 MB. The calendar dataframe was small to begin with.

In [4]:
# info() prints the summary itself and returns None, so no print() wrapper
# (which would add a stray "None" line after each report).
sales.info()
prices.info()
calendar.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1947 entries, id to d_1941
dtypes: category(6), int16(1317), int8(624)
memory usage: 96.6 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841121 entries, 0 to 6841120
Data columns (total 4 columns):
 #   Column      Dtype   
---  ------      -----   
 0   store_id    category
 1   item_id     category
 2   wm_yr_wk    int16   
 3   sell_price  float16 
dtypes: category(2), float16(1), int16(1)
memory usage: 45.8 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1969 non-null   datetime64[ns]
 1   wm_yr_wk      1969 non-null   int16         
 2   weekday       1969 non-null   category      
 3   wday          1969 non-null   int8          
 4   month         1969 non-null   int8          
 5   year          1969

## Creating Data

Before moving on to modelling, the sales, prices and calendar data sets must be combined together. The function below creates the full dataset used to create the training and test datasets during modelling and prediction. One of the variables it takes as input is `first_day` which is the day number at which the train/test dataset begins (default is day 1200).

The sales data is converted to long format using the pandas `melt` function. Using the pandas `merge` function, the calendar dataset is merged with the sales data on the common key `d`. The prices dataset is then merged on the common keys `store_id`, `item_id` and `wm_yr_wk`.

Finally, categorical variables in the dataset need to be encoded. The categorical variables here are `item_id`, `dept_id`, `cat_id`, `store_id` and `state_id`. For each categorical variable, I encode the variable by calculating the mean sales for each group of the categorical variable. This is a form of target encoding as sales is the target variable in this case. 

In [5]:
## Preliminaries
nhz = 28  # presumably the forecast horizon in days; not referenced by the code shown here — TODO confirm
max_lags = 57  # number of days of history kept before a forecast day when building test windows
tr_last = 1913 + 28  # = 1941, the last day column (d_1941) present in the evaluation sales file
fday = datetime(2016, 4, 25) + timedelta(days=28)  # = 2016-05-23, the first date that is forecast

## Function to Create dataset
def create_data(istrain=True, nrows=True, first_day=1200, encode='mean'):
    """Build the long-format modelling table from the sales, calendar and prices frames.

    Reads the notebook-level ``sales``, ``calendar``, ``prices``, ``tr_last`` and
    ``max_lags`` objects. None of the shared frames is modified: the calendar is
    copied before its categorical columns are turned into integer codes, and the
    placeholder columns for future days are added to a local frame only, so the
    function gives the same result no matter how often or in which order it is
    called.

    Parameters
    ----------
    istrain : bool
        True  -> use day columns ``first_day`` .. ``tr_last``.
        False -> use the last ``max_lags`` days before ``tr_last`` (or from
        ``first_day`` if that is later) plus 28 future days whose sales are NaN.
    nrows : any
        Unused; kept so that existing calls remain valid.
    first_day : int
        Earliest day number to include.
    encode : str
        'mean' adds a ``<col>_enc`` column for each id/event column holding the
        mean of ``sales`` within that column's group. Any other value skips the
        encoding step.

    Returns
    -------
    pandas.DataFrame
        One row per (id, day) that has a matching price record (both merges are
        inner joins), with calendar fields, ``sell_price`` and optional
        ``*_enc`` columns.

    NOTE(review): the ``*_enc`` means are computed from whatever rows this call
    produces. A training call and a prediction call therefore yield different
    encodings for the same item/store (full history vs. the last ``max_lags``
    days), and the training encodings include each row's own target. Consider
    computing the encodings once on training data and mapping them onto the
    test frame — verify the impact before changing.
    """
    horizon = 28  # number of future days appended for prediction

    # Integer-encode the calendar's categorical columns on a private copy.
    cal = calendar.copy()
    for col, col_dtype in cal.dtypes.items():
        if str(col_dtype) == 'category' and col != 'd':
            # Missing values get code -1; subtracting the minimum makes all
            # codes non-negative.
            cal[col] = cal[col].cat.codes.astype('int16')
            cal[col] -= cal[col].min()

    start_day = max(1 if istrain else tr_last - max_lags, first_day)
    catcols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    history_cols = [f'd_{d}' for d in range(start_day, tr_last + 1)]

    # List indexing returns a new frame, so `sales` itself is never touched.
    wide = sales[catcols + history_cols]
    numcols = history_cols
    if not istrain:
        future_cols = [f'd_{d}' for d in range(tr_last + 1, tr_last + 1 + horizon)]
        wide = wide.assign(**{col: np.nan for col in future_cols})
        numcols = history_cols + future_cols

    df = pd.melt(wide,
                 id_vars=catcols,
                 value_vars=numcols,
                 var_name='d',
                 value_name='sales')

    df = df.merge(cal, on='d')
    df = df.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'])

    if encode == 'mean':  # only mean encoding is implemented so far
        meancols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
        for col in meancols:
            # The built-in 'mean' skips NaN like the previous lambda did, but
            # runs in compiled code instead of once per group in Python.
            df[f'{col}_enc'] = df.groupby(col, observed=True)['sales'].transform('mean')

    return df

In [6]:
%%time

# Build the training table from day 350 onwards, with mean target encoding.
df = create_data(istrain=True, first_day=350, encode='mean')

df.head()

Wall time: 2min 42s


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,sell_price,item_id_enc,dept_id_enc,cat_id_enc,store_id_enc,state_id_enc,event_name_1_enc,event_type_1_enc,event_name_2_enc,event_type_2_enc
0,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_350,0,2012-01-13,11150,...,3.970703,0.272927,0.816681,0.671637,1.594058,1.528121,1.39253,1.39253,1.388101,1.388101
1,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_350,2,2012-01-13,11150,...,4.339844,2.09843,0.816681,0.671637,1.594058,1.528121,1.39253,1.39253,1.388101,1.388101
2,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_350,0,2012-01-13,11150,...,2.480469,0.750691,0.816681,0.671637,1.594058,1.528121,1.39253,1.39253,1.388101,1.388101
3,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_350,0,2012-01-13,11150,...,0.5,4.328455,0.816681,0.671637,1.594058,1.528121,1.39253,1.39253,1.388101,1.388101
4,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_350,2,2012-01-13,11150,...,1.769531,0.802002,0.816681,0.671637,1.594058,1.528121,1.39253,1.39253,1.388101,1.388101


In [7]:
# info() prints the summary itself and returns None; print() would add a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 31 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                category      
 1   item_id           category      
 2   dept_id           category      
 3   cat_id            category      
 4   store_id          category      
 5   state_id          category      
 6   d                 object        
 7   sales             int16         
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           int16         
 11  wday              int8          
 12  month             int8          
 13  year              int16         
 14  event_name_1      int16         
 15  event_type_1      int16         
 16  event_name_2      int16         
 17  event_type_2      int16         
 18  snap_CA           int8          
 19  snap_TX           int8          
 20  snap_WI           int8          
 21  sell_p

The full dataset occupied 4.9GB of memory. I will attempt to downcast the data again to reduce the amount of space occupied. 

In [8]:
%%time

# Narrow the dtypes of the merged table (downcast works in place and returns df).
df = downcast(df)

# info() prints the summary itself and returns None; print() would add a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 31 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                category      
 1   item_id           category      
 2   dept_id           category      
 3   cat_id            category      
 4   store_id          category      
 5   state_id          category      
 6   d                 category      
 7   sales             int16         
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           int8          
 11  wday              int8          
 12  month             int8          
 13  year              int16         
 14  event_name_1      int8          
 15  event_type_1      int8          
 16  event_name_2      int8          
 17  event_type_2      int8          
 18  snap_CA           int8          
 19  snap_TX           int8          
 20  snap_WI           int8          
 21  sell_p

After downcasting the dataset now occupies 2.4GB.

## Creating Features

This section creates new features from existing information in the data that can be used to better predict the target variable. In this case the target is the 28-day sales forecast for every item in the dataset, so we want meaningful features that help predict these 28-day sales. The new features include:

* 7 and 28-day lag sales for each unique id.
* Rolling means for the 7 and 28-day lag sales for each unique id.
* Additional time information such as day of the week, week of the year, month , quarter, year and day of the month. These will help capture some of the seasonality present in the data. 

In [9]:
## Function to create new features

def create_features(data):
    """Add lag, rolling-mean and calendar features to ``data`` in place.

    Expects the columns ``id``, ``sales`` and ``date`` (datetime). Rows are
    assumed to be in chronological order within each ``id``, because lags are
    taken by row position inside each group.

    Columns written:
      * ``lag_7``, ``lag_28``: ``sales`` shifted by 7 / 28 rows within each id;
      * ``rmean_<lag>_<win>``: rolling mean over ``win`` rows (7 or 28) of the
        matching lag column, again within each id;
      * ``wday``, ``week``, ``month``, ``quarter``, ``year``, ``days``: cast to
        int16 when the column already exists, otherwise derived from ``date``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend; modified in place.

    Returns
    -------
    None
    """
    # Create lags and rolling means
    lags = [7, 28]
    windows = [7, 28]

    # observed=True only restricts grouping to ids that actually occur; shift
    # and transform return one value per input row either way.
    for lag in lags:
        data[f'lag_{lag}'] = data.groupby('id', observed=True)['sales'].shift(lag)

    # Windows outer / lags inner keeps the original column order.
    for win in windows:
        for lag in lags:
            lag_col = f'lag_{lag}'
            data[f'rmean_{lag}_{win}'] = (
                data.groupby('id', observed=True)[lag_col]
                    .transform(lambda x: x.rolling(win).mean())
            )

    # Create time variables: target column name -> attribute of the .dt accessor
    date_feats = {'wday':'weekday',
                  'week':'weekofyear',
                  'month':'month',
                  'quarter':'quarter',
                  'year':'year',
                  'days':'day'}

    for date_name, date_attr in date_feats.items():
        if date_name in data.columns:
            data[date_name] = data[date_name].astype('int16')
            continue

        accessor = data['date'].dt
        if date_attr == 'weekofyear' and hasattr(accessor, 'isocalendar'):
            # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in
            # 2.0; isocalendar().week yields the same ISO week number.
            values = accessor.isocalendar().week
        else:
            values = getattr(accessor, date_attr)
        data[date_name] = values.astype('int16')
            # returns series indexed like original series and extracts datetime attribute

In [10]:
%%time

# create_features writes the new columns into df in place and returns nothing,
# so its result is not assigned.
create_features(df)

df.head()

Wall time: 1min 21s


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,...,event_type_2_enc,lag_7,lag_28,rmean_7_7,rmean_28_7,rmean_7_28,rmean_28_28,week,quarter,days
0,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_350,0,2012-01-13,11150,...,1.387695,,,,,,,2,1,13
1,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_350,2,2012-01-13,11150,...,1.387695,,,,,,,2,1,13
2,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_350,0,2012-01-13,11150,...,1.387695,,,,,,,2,1,13
3,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_350,0,2012-01-13,11150,...,1.387695,,,,,,,2,1,13
4,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_350,2,2012-01-13,11150,...,1.387695,,,,,,,2,1,13


In [11]:
# Narrow the dtypes of the newly added feature columns.
df = downcast(df)

# info() prints the summary itself and returns None; print() would add a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41571939 entries, 0 to 41571938
Data columns (total 40 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                category      
 1   item_id           category      
 2   dept_id           category      
 3   cat_id            category      
 4   store_id          category      
 5   state_id          category      
 6   d                 category      
 7   sales             int16         
 8   date              datetime64[ns]
 9   wm_yr_wk          int16         
 10  weekday           int8          
 11  wday              int8          
 12  month             int8          
 13  year              int16         
 14  event_name_1      int8          
 15  event_type_1      int8          
 16  event_name_2      int8          
 17  event_type_2      int8          
 18  snap_CA           int8          
 19  snap_TX           int8          
 20  snap_WI           int8          
 21  sell_p

In [12]:
# Drop every row with a missing value in any column. The lag and rolling-mean
# columns are NaN for the earliest days of each id (see the preview above), so
# those rows are removed here. The frame is modified in place.
df.dropna(inplace=True)
df.shape

(39894989, 40)

In [13]:
# Write the finished feature table to disk without the index column.
df.to_csv('data/full_dataset.csv',index=False)

## Modelling

Now that the dataset is ready with its new features I will fit a model to the data and predict the 28-day future sales for all items. The model used here is LightGBM, which is a gradient boosting algorithm using decision trees. Unlike XGBoost where trees are split depth-wise, LightGBM splits the trees leaf-wise and allows better minimization of loss and hence more accurate predictions.

In [14]:
## Choose the model inputs
# Raw id/event columns are excluded from training; only the *_enc versions of
# the id columns are kept as features.
id_feats = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
event_feats = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
cat_feats = id_feats + event_feats
del_cols = (
    ['id', 'date', 'sales', 'd', 'wm_yr_wk', 'weekday']
    + cat_feats
    + [f'{col}_enc' for col in event_feats]
)
is_feature = ~df.columns.isin(del_cols)
train_cols = df.columns[is_feature]

# Feature matrix and target used for fitting
X_train = df.loc[:, train_cols]
y_train = df['sales']

To implement `lightgbm` the dataset needs to be split into training and validation sets. The algorithm fits the model on the train data and validates the model and calculates the RMSE using the valid data.

In [17]:
%%time

## Random hold-out split for LightGBM
np.random.seed(123)

# 2 million row labels drawn without replacement form the validation set;
# every remaining label is used for fitting.
valid_ind = np.random.choice(X_train.index.values, size=2_000_000, replace=False)
train_ind = np.setdiff1d(X_train.index.values, valid_ind)


def _lgb_subset(rows):
    """Wrap the given row labels of X_train / y_train in an lgb.Dataset."""
    return lgb.Dataset(X_train.loc[rows],
                       label=y_train.loc[rows],
                       free_raw_data=False)


# lgb calls the fitting data the 'train' data
train_data = _lgb_subset(train_ind)
valid_data = _lgb_subset(valid_ind)

Wall time: 16.4 s


In [20]:
# Save the X and y train datasets
# X_train.to_csv('data/X_train.csv',index=False)
# y_train.to_csv('data/y_train.csv',index=False)

In [19]:
# Release the large frames and index arrays now that the lgb.Dataset objects
# have been built. train_cols is deliberately kept: the prediction cells use it
# to select the feature columns.
del df, X_train, y_train, valid_ind, train_ind
gc.collect();

In [20]:
%%time

## Run LightGBM

# Hyperparameters
params = {'objective':'poisson',
          'metric':'rmse',
          'force_row_wise':True,
          'learning_rate': 0.075,
          'bagging_fraction':0.75,
          'bagging_freq':1,
          'lambda_l2':0.1,
          'verbosity':1,
          'num_iterations':1500,
          'num_leaves':128,
          'min_data_in_leaf':100}

# The verbose_eval argument was removed from lgb.train in LightGBM 4.0. The
# log_evaluation callback (available since 3.3) prints the validation metric
# every 20 rounds instead; older releases fall back to the legacy argument.
if hasattr(lgb, 'log_evaluation'):
    fit_lgb = lgb.train(params,
                        train_set=train_data,
                        valid_sets=[valid_data],
                        callbacks=[lgb.log_evaluation(period=20)])
else:
    fit_lgb = lgb.train(params, train_set=train_data, valid_sets=[valid_data], verbose_eval=20)



[20]	valid_0's rmse: 2.88948
[40]	valid_0's rmse: 2.5748
[60]	valid_0's rmse: 2.49682
[80]	valid_0's rmse: 2.47225
[100]	valid_0's rmse: 2.46098
[120]	valid_0's rmse: 2.45288
[140]	valid_0's rmse: 2.44543
[160]	valid_0's rmse: 2.43845
[180]	valid_0's rmse: 2.43169
[200]	valid_0's rmse: 2.4251
[220]	valid_0's rmse: 2.419
[240]	valid_0's rmse: 2.41262
[260]	valid_0's rmse: 2.40685
[280]	valid_0's rmse: 2.40131
[300]	valid_0's rmse: 2.3966
[320]	valid_0's rmse: 2.39277
[340]	valid_0's rmse: 2.38876
[360]	valid_0's rmse: 2.38489
[380]	valid_0's rmse: 2.38153
[400]	valid_0's rmse: 2.37746
[420]	valid_0's rmse: 2.37471
[440]	valid_0's rmse: 2.37166
[460]	valid_0's rmse: 2.36878
[480]	valid_0's rmse: 2.36648
[500]	valid_0's rmse: 2.36367
[520]	valid_0's rmse: 2.36045
[540]	valid_0's rmse: 2.35868
[560]	valid_0's rmse: 2.35617
[580]	valid_0's rmse: 2.35459
[600]	valid_0's rmse: 2.3526
[620]	valid_0's rmse: 2.35058
[640]	valid_0's rmse: 2.34909
[660]	valid_0's rmse: 2.34672
[680]	valid_0's rmse

In [21]:
# Save the fitted booster to the working directory in LightGBM's model format.
fit_lgb.save_model('fit_model_meanenc.lgb')

<lightgbm.basic.Booster at 0x1bf03faff48>

In [None]:
# Reload the booster written by save_model above. The file name must match the
# one that was saved ('fit_model_meanenc.lgb'); 'fit_model.lgb' is not produced
# anywhere in this notebook.
fit_lgb = lgb.Booster(model_file='fit_model_meanenc.lgb')

## Prediction

Using the fitted model, we can now predict the sales for the next 28 days. The test dataset used for prediction is created with the `create_data` function written previously. It starts at day `d_1884` and runs until day `d_1969`; the period `d_1942` to `d_1969` is the 28 days we have to predict. During prediction we also use the `create_features` function to create the 7 and 28-day lags and rolling means in the test data. 

In [26]:
%%time

## Prediction
# Three forecasts are produced, each scaled by a different multiplier, and then
# averaged with equal weights.
alphas = [1.023, 1.018, 1.013] # magic multiplier by kyakovlev
weights = [1/len(alphas)]*len(alphas)
sub = 0.

for iters,(alpha,weight) in enumerate(zip(alphas,weights)):
    # Rebuild the test frame for every alpha: the loop below writes predictions
    # into test['sales'], so a fresh frame is needed each time.
    test = create_data(istrain=False, encode='mean')
    cols = [f'F{d}' for d in range(1,29)]
    
    # Forecast one day at a time. Each day's prediction is stored in
    # test['sales'] and therefore feeds the lag features of later days.
    for tdelta in range(0,28):
        day = fday + timedelta(days=tdelta)
        print(tdelta,day)
        # Keep only the last max_lags days up to `day` so that the lag and
        # rolling features can be computed on a small copy.
        test_pred = test[(test.date >= day-timedelta(days=max_lags)) & (test.date <= day)].copy()
        ## Create features in test data
        create_features(test_pred)
        ## End creating features in test data
        # Rows for `day` only, restricted to the columns the model was trained on.
        test_pred = test_pred.loc[test_pred.date == day, train_cols]
        test.loc[test.date == day, 'sales'] = alpha*fit_lgb.predict(test_pred)
        
        
    # Reshape the forecast days to one row per id with columns F1..F28.
    test_sub = test.loc[test.date >= fday, ['id', 'sales']].copy()
    test_sub['F'] = [f'F{rank}' for rank in test_sub.groupby('id')['id'].cumcount()+1]
    test_sub = test_sub.set_index(['id', 'F']).unstack()['sales'][cols].reset_index()
#     test_sub.fillna(0., inplace=True)
    test_sub.sort_values('id', inplace=True)
    test_sub.reset_index(drop=True, inplace=True)
    test_sub.to_csv(f'submission_{iters}.csv', index=False)
    if iters==0:
        # sub is the same object as test_sub here; the scaling below happens
        # after submission_0.csv has already been written.
        sub = test_sub
        sub[cols] *= weight
    else:
        sub[cols] += test_sub[cols]*weight
    print(iters, alpha, weight)
    

# Validation rows of the submission: the actual sales of d_1914-d_1941 relabelled
# as F1..F28, with 'evaluation' replaced by 'validation' in the id.
sub2 = pd.read_csv('data/sales_train_evaluation.csv', usecols=['id']+[f'd_{d}' for d in range(1914, 1914+28)])
sub2.rename(columns={f'd_{d}': f'F{d-1913}' for d in range(1914, 1914+28)}, inplace=True)
sub2['id'] = sub2['id'].str.replace('evaluation', 'validation')
    
# Stack validation rows on top of the forecast (evaluation) rows and save.
sub = pd.concat([sub2, sub], axis=0, sort=False)
sub.to_csv('lgb_submission.csv', index=False)
print(sub.shape)

0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 00:00:00
11 2016-06-03 00:00:00
12 2016-06-04 00:00:00
13 2016-06-05 00:00:00
14 2016-06-06 00:00:00
15 2016-06-07 00:00:00
16 2016-06-08 00:00:00
17 2016-06-09 00:00:00
18 2016-06-10 00:00:00
19 2016-06-11 00:00:00
20 2016-06-12 00:00:00
21 2016-06-13 00:00:00
22 2016-06-14 00:00:00
23 2016-06-15 00:00:00
24 2016-06-16 00:00:00
25 2016-06-17 00:00:00
26 2016-06-18 00:00:00
27 2016-06-19 00:00:00
0 1.023 0.3333333333333333
0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 00:00:00
11 2016-06-03 00:00:00
12 2016-06-04 00:00:00
13 2016-06-05 00:00:00
14 2016-06-06 00:00:00
15 2

In [25]:
%%time 

## Prediction with a single multiplier
# alternative: alpha = (1.023+1.018+1.013)/3
alpha = 1.02  # magic multiplier by kyakovlev
sub = 0.

test = create_data(istrain=False, encode='mean')
cols = [f'F{step}' for step in range(1, 29)]
history = timedelta(days=max_lags)

# Day-by-day forecast: each prediction is written back into test['sales'] so
# that it is available to the lag features of the following days.
for offset in range(28):
    day = fday + timedelta(days=offset)
    print(offset, day)
    in_window = (test.date >= day - history) & (test.date <= day)
    window = test[in_window].copy()
    create_features(window)
    day_features = window.loc[window.date == day, train_cols]
    test.loc[test.date == day, 'sales'] = alpha * fit_lgb.predict(day_features)

# One row per id, columns F1..F28, sorted by id with a fresh index
forecast = test.loc[test.date >= fday, ['id', 'sales']].copy()
forecast['F'] = [f'F{rank}' for rank in forecast.groupby('id')['id'].cumcount() + 1]
sub = (
    forecast.set_index(['id', 'F'])
            .unstack()['sales'][cols]
            .reset_index()
            .sort_values('id')
            .reset_index(drop=True)
)

# Validation rows: actual sales of d_1914-d_1941 relabelled as F1..F28
actual_days = range(1914, 1914 + 28)
validation_rows = pd.read_csv('data/sales_train_evaluation.csv',
                              usecols=['id'] + [f'd_{d}' for d in actual_days])
validation_rows = validation_rows.rename(columns={f'd_{d}': f'F{d-1913}' for d in actual_days})
validation_rows['id'] = validation_rows['id'].str.replace('evaluation', 'validation')

sub = pd.concat([validation_rows, sub], axis=0, sort=False)
sub.to_csv('lgb_enc_submission.csv', index=False)
print(sub.shape)

0 2016-05-23 00:00:00
1 2016-05-24 00:00:00
2 2016-05-25 00:00:00
3 2016-05-26 00:00:00
4 2016-05-27 00:00:00
5 2016-05-28 00:00:00
6 2016-05-29 00:00:00
7 2016-05-30 00:00:00
8 2016-05-31 00:00:00
9 2016-06-01 00:00:00
10 2016-06-02 00:00:00
11 2016-06-03 00:00:00
12 2016-06-04 00:00:00
13 2016-06-05 00:00:00
14 2016-06-06 00:00:00
15 2016-06-07 00:00:00
16 2016-06-08 00:00:00
17 2016-06-09 00:00:00
18 2016-06-10 00:00:00
19 2016-06-11 00:00:00
20 2016-06-12 00:00:00
21 2016-06-13 00:00:00
22 2016-06-14 00:00:00
23 2016-06-15 00:00:00
24 2016-06-16 00:00:00
25 2016-06-17 00:00:00
26 2016-06-18 00:00:00
27 2016-06-19 00:00:00
(60980, 29)
Wall time: 20min 19s


In [26]:
# Preview the first rows of the combined submission (validation rows come first).
sub.head()

F,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.0,0.0,0.0,2.0,0.0,3.0,5.0,0.0,0.0,...,2.0,4.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0
1,HOBBIES_1_002_CA_1_validation,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,0.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,1.0
3,HOBBIES_1_004_CA_1_validation,0.0,0.0,1.0,2.0,4.0,1.0,6.0,4.0,0.0,...,1.0,1.0,0.0,4.0,0.0,1.0,3.0,0.0,2.0,6.0
4,HOBBIES_1_005_CA_1_validation,1.0,0.0,2.0,3.0,1.0,0.0,3.0,2.0,3.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0


In [27]:
# Sanity check: the number of distinct ids, and how many ids end in 'evaluation'.
sub.id.nunique(), sub['id'].str.contains('evaluation$').sum()

(60980, 30490)