In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap

import os
import getpass
import gc
from pathlib import Path

import itertools

import xgboost as xgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
DATA_DIR = Path('/data', getpass.getuser(), 'kaggle', 'future_sales') #path to data files

sales = pd.read_csv(DATA_DIR / 'sales_train.csv')
items = pd.read_csv(DATA_DIR / 'items.csv')
item_categories = pd.read_csv(DATA_DIR / 'item_categories.csv')
shops = pd.read_csv(DATA_DIR /'shops.csv')

test = pd.read_csv(DATA_DIR / 'test.csv').set_index('ID')

In [3]:
sales.columns

Index(['date', 'date_block_num', 'shop_id', 'item_id', 'item_price',
       'item_cnt_day'],
      dtype='object')

In [4]:
items.columns

Index(['item_name', 'item_id', 'item_category_id'], dtype='object')

In [5]:
item_categories.columns

Index(['item_category_name', 'item_category_id'], dtype='object')

In [6]:
shops.columns

Index(['shop_name', 'shop_id'], dtype='object')

**Little preporcessing**

In [7]:
def shop_id_change(sample):
    sample.loc[sample['shop_id']==57, 'shop_id'] = 0
    sample.loc[sample['shop_id']==58, 'shop_id'] = 1
    sample.loc[sample['shop_id']==11, 'shop_id'] = 10
    
    return sample

In [8]:
#get rid of those objects but may be it's better to keep due to clipping to [0,20] all values of cnt
sales = sales[(sales['item_cnt_day'] <= 900) & (sales['item_price'] <= 60000) & ((sales['item_price'] >= 0))]

In [9]:
#change shop_id
sales = shop_id_change(sales)
shops = shop_id_change(shops)
test = shop_id_change(test)

In [10]:
#create feature - city
shops['shop_city'] = shops['shop_name'].apply(lambda x: x.lower()).str.replace('[^\w\s]', '').str.strip()
shops['shop_city'] = shops['shop_city'].apply(lambda x: x.split()[0])
shops['shop_city'] = LabelEncoder().fit_transform(shops['shop_city'])

In [11]:
#create feature - section of products
item_categories['item_section_name'] = item_categories['item_category_name'] \
                                        .apply(lambda x: x.lower().split('-')[0].split()[0].strip())
item_categories['item_section_name'] = LabelEncoder().fit_transform(item_categories['item_section_name'])


Since we need to predict monthly, so aggregate data

In [12]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []

for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(itertools.product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))

grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Aggregations
#sales['item_cnt_day'] = sales['item_cnt_day'].clip(0,20) #seems it's better to clip first
train = sales.groupby(['shop_id', 'item_id', 'date_block_num']) \
            .agg({'item_cnt_day':'sum', 'item_price':'mean'}).reset_index()
train = train.rename(columns = {'item_cnt_day' : 'item_cnt_month'})
train['item_cnt_month'] = train['item_cnt_month'].clip(0,20)

#merge aggregated sales and grid
train = pd.merge(grid, train, how='left',on=index_cols)
train['item_cnt_month'] = train['item_cnt_month'].fillna(0)

In [13]:
#try without price since we dont know on test set (i suppose they are not obliged to be equal)
train = train.drop(['item_price'], axis=1)
train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,59,22154,0,1.0
1,59,2552,0,0.0


In [14]:
train.shape

(10913804, 4)

In [15]:
#add some features to train
train = train.merge(items[['item_id', 'item_category_id']], on='item_id')
train = train.merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id')
train = train.merge(shops[['shop_city', 'shop_id']], on='shop_id')
#is_returned item feature can be added

train = train.drop_duplicates()
train = train.sort_values(by=['date_block_num']).reset_index(drop=True)
train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city
0,59,22154,0,1.0,37,7,30
1,16,6197,0,0.0,55,9,10


In [16]:
train.shape

(10913804, 7)

In [17]:
train[train['item_cnt_month']<=0].shape #seems strange

(9308180, 7)

In [21]:
test['date_block_num']= 34
test['item_cnt_month'] = -1 #since need to predict it
print(test.shape)
test.head(2)

(214200, 4)


Unnamed: 0_level_0,shop_id,item_id,date_block_num,item_cnt_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5,5037,34,-1
1,5,5320,34,-1


In [22]:
test = test.merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
test = test.merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id', how='left')
test = test.merge(shops[['shop_city', 'shop_id']], on='shop_id', how='left')
test = test.drop_duplicates().reset_index(drop=True)

In [23]:
print(test.shape)
test.head()

(214200, 7)


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city
0,5,5037,34,-1,19,5,3.0
1,5,5320,34,-1,55,9,3.0
2,5,5233,34,-1,19,5,3.0
3,5,5232,34,-1,23,5,3.0
4,5,5268,34,-1,20,5,3.0


In [24]:
#concat all data to create mean encoded features and lag ones
all_data = pd.concat([train, test]).reset_index(drop=True)

In [25]:
# List of columns that we will use to create lags
cols_to_rename = ['item_cnt_month']

shift_range = list(range(1, 13))

for month_shift in tqdm(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: f'{x}_lag_{month_shift}' if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




**Let's try simple xgb right now**

In [170]:
train = all_data[all_data['date_block_num'] < 34]
test = all_data[all_data['date_block_num'] == 34]

#check if target is clipped
print(train['item_cnt_month'].min(), train['item_cnt_month'].max())

Xtrain, ytrain = train.drop('item_cnt_month', axis=1), train['item_cnt_month']
Xtest, ytest = test.drop('item_cnt_month', axis=1), test['item_cnt_month']

dtrain = xgb.DMatrix(data=Xtrain.to_numpy(), label=ytrain
                     , feature_names=Xtrain.columns)
dtest = xgb.DMatrix(data=Xtest.to_numpy(), label=ytest
                   , feature_names=Xtrain.columns)

del train, test, Xtrain, ytrain, Xtest, ytest
gc.collect()

In [26]:
default_params = {  'objective': 'reg:squarederror'                    
                    , 'subsample': 0.8
                    , 'colsample_bytree': 0.9
                    , 'learning_rate': 0.1
                    , 'tree_method': 'hist'
                    , 'grow_policy': 'lossguide'
                    , 'max_leaves': 63
                    , 'max_depth': 0
                    , 'gamma': 1
                    , 'reg_alpha': 1.2
                    , 'reg_lambda': 1.3
                    , 'min_child_weight': 30
                    , 'max_delta_step': 2
                    , 'n_jobs': 60
                    , 'random_state': 42
                    , 'eval_metric': 'rmse'
                }   

In [176]:
model1 = xgb.train(default_params, dtrain, num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train-rmse:1.21688
Will train until train-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.93763
[200]	train-rmse:0.91475
[300]	train-rmse:0.90261
[400]	train-rmse:0.89267
[500]	train-rmse:0.88321
[600]	train-rmse:0.87743
[700]	train-rmse:0.87218
[800]	train-rmse:0.86728
[900]	train-rmse:0.86284
[999]	train-rmse:0.85886


In [180]:
ans = pd.DataFrame({'item_cnt_month': model1.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / 'baseline_xgb.csv', index=False)
#0.982 on public

In [27]:
#one more features since we have predict based on pairs shop_id and item_id
all_data['shop_item'] = (all_data['shop_id'].astype(str) + '0' + all_data['item_id'].astype(str)).astype(int)
#order of month
all_data['month'] = all_data['date_block_num'].apply(lambda x: x % 12)

**mean encoding**

In [28]:
def cv_mean_enc(sample, column, kf, globalmean, alpha):
    
    for train_ix, test_ix in kf.split(sample):
        train, test = sample.iloc[train_ix], sample.iloc[test_ix]
        
        #smoothing
        column_enc = (train.groupby(column)['item_cnt_month'].mean()
                        * train.groupby(column)['item_cnt_month'].count()
                        + globalmean * alpha) / (train.groupby(column)['item_cnt_month'].count() + alpha)
        #column_enc = train.groupby(column)['item_cnt_month'].mean()


        
        sample.loc[test_ix, column+'_enc'] = sample.loc[test_ix, column].map(column_enc)

    #fill nan with global mean
    sample[column+'_enc'].fillna(globalmean, inplace=True) 
    
    return sample
    
def mean_enconding(sample, column, alpha=5):

    kf = KFold(n_splits=5, shuffle=False)
    
    
    #globalmean_part = sample_part[column].mean()
    globalmean = sample[column].mean()
    
    #sample_part = cv_mean_enc(sample_part, column, kf, globalmean_part, alpha)
    sample = cv_mean_enc(sample, column, kf, globalmean, alpha)

    return sample

In [29]:
all_data_part = all_data[all_data['date_block_num'] < 34] #needs to create validation data for date_block_num=33

In [30]:
mean_enc_features = ['item_id', 'shop_id', 'item_category_id', 'item_section_name', 'shop_city', 'shop_item']
for col in tqdm(mean_enc_features):
    all_data = mean_enconding(all_data.copy(), col)
for col in tqdm(mean_enc_features):
    all_data_part = mean_enconding(all_data_part.copy(), col)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [31]:
all_data_part['month'] = all_data_part['date_block_num'].apply(lambda x: x % 12)
all_data['month'] = all_data['date_block_num'].apply(lambda x: x % 12)

In [32]:
train = all_data[all_data['date_block_num'] < 34]
test = all_data[all_data['date_block_num'] == 34]

#check if target is clipped
print(train['item_cnt_month'].min(), train['item_cnt_month'].max())

#Xtrain, ytrain = train.drop(['item_cnt_month'] + mean_enc_features, axis=1), train['item_cnt_month']
#Xtest, ytest = test.drop(['item_cnt_month'] + mean_enc_features, axis=1), test['item_cnt_month']

Xtrain, ytrain = train.drop(['item_cnt_month'] , axis=1), train['item_cnt_month']
Xtest, ytest = test.drop(['item_cnt_month'] , axis=1), test['item_cnt_month']

dtrain = xgb.DMatrix(data=Xtrain.to_numpy(), label=ytrain
                     , feature_names=Xtrain.columns)
dtest = xgb.DMatrix(data=Xtest.to_numpy(), label=ytest
                   , feature_names=Xtrain.columns)

del train, test, ytrain, Xtest, ytest
gc.collect()

0.0 20.0


31

In [35]:
#without mean_enc_features
model2 = xgb.train(default_params, dtrain, num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train-rmse:1.21609
Will train until train-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.91115
[200]	train-rmse:0.88865
[300]	train-rmse:0.87355
[400]	train-rmse:0.86097
[500]	train-rmse:0.85054
[600]	train-rmse:0.84281
[700]	train-rmse:0.83558
[800]	train-rmse:0.82845
[900]	train-rmse:0.82256
[999]	train-rmse:0.81743


In [39]:
#with mean_enc_features
model3 = xgb.train(default_params, dtrain, num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train-rmse:1.21622
Will train until train-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.90701
[200]	train-rmse:0.87937
[300]	train-rmse:0.86149
[400]	train-rmse:0.84731
[500]	train-rmse:0.83543
[600]	train-rmse:0.82463
[700]	train-rmse:0.81567
[800]	train-rmse:0.80654
[900]	train-rmse:0.79845
[999]	train-rmse:0.79209


In [41]:
ans = pd.DataFrame({'item_cnt_month': model3.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / 'baseline_xgb3.csv', index=False)
#0.968316

In [33]:
#with mean_enc_features and without clip sales during aggregation
model3_1 = xgb.train(default_params, dtrain, num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train-rmse:1.21524
Will train until train-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.90551
[200]	train-rmse:0.87611
[300]	train-rmse:0.85987
[400]	train-rmse:0.84678
[500]	train-rmse:0.83572
[600]	train-rmse:0.82509
[700]	train-rmse:0.81562
[800]	train-rmse:0.80677
[900]	train-rmse:0.79910
[999]	train-rmse:0.79248


In [34]:
ans = pd.DataFrame({'item_cnt_month': model3_1.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / 'baseline_xgb2.csv', index=False)
#0.971078

Seems that mean encoding improve a litle, it's better to create lag features. But for kaggle a little is already much :)

Increase learning rate

In [30]:
params = default_params.copy()
params['learning_rate'] = 0.3

In [31]:
train = all_data[all_data['date_block_num'] < 34]
test = all_data[all_data['date_block_num'] == 34]

Xtrain, ytrain = train.drop(['item_cnt_month'] , axis=1), train['item_cnt_month']
Xtest, ytest = test.drop(['item_cnt_month'] , axis=1), test['item_cnt_month']

dtrain = xgb.DMatrix(data=Xtrain.to_numpy(), label=ytrain
                     , feature_names=Xtrain.columns)
dtest = xgb.DMatrix(data=Xtest.to_numpy(), label=ytest
                   , feature_names=Xtrain.columns)

del train, test, Xtrain, ytrain, Xtest, ytest
gc.collect()

34

In [32]:
model4 = xgb.train(params, dtrain, num_boost_round=1000, early_stopping_rounds=50, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train-rmse:1.17345
Will train until train-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.86292
[200]	train-rmse:0.82747
[300]	train-rmse:0.80212
[400]	train-rmse:0.78391
[500]	train-rmse:0.77124
[600]	train-rmse:0.75901
[700]	train-rmse:0.74942
[800]	train-rmse:0.74120
[900]	train-rmse:0.73349
[999]	train-rmse:0.72710


In [33]:
ans = pd.DataFrame({'item_cnt_month': model4.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / 'baseline_xgb4.csv', index=False)
#overfit 1.016534