In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap

import os
import getpass
import gc
from pathlib import Path

import itertools

import xgboost as xgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tqdm.notebook import tqdm
import warnings
#warnings.filterwarnings('ignore')

In [2]:
#with such encoding got better results
def cv_mean_enc(sample, column, kf, globalmean, alpha):
    
    for train_ix, test_ix in kf.split(sample):
        train, test = sample.iloc[train_ix], sample.iloc[test_ix]
        
        #smoothing
        column_enc = (train.groupby(column)['item_cnt_month'].mean()
                        * train.groupby(column)['item_cnt_month'].count()
                        + globalmean * alpha) / (train.groupby(column)['item_cnt_month'].count() + alpha)
        #column_enc = train.groupby(column)['item_cnt_month'].mean()


        
        sample.loc[test_ix, column+'_enc'] = sample.loc[test_ix, column].map(column_enc)

    #fill nan with global mean
    sample[column+'_enc'].fillna(globalmean, inplace=True) 
    
    return sample
    
def mean_enconding(sample, column, alpha=5):

    kf = KFold(n_splits=5, shuffle=False)
    
    
    #globalmean_part = sample_part[column].mean()
    globalmean = sample[column].mean()
    
    #sample_part = cv_mean_enc(sample_part, column, kf, globalmean_part, alpha)
    sample = cv_mean_enc(sample, column, kf, globalmean, alpha)

    return sample

In [3]:
DATA_DIR = Path('/data', getpass.getuser(), 'kaggle', 'future_sales') #path to data files

sales = pd.read_csv(DATA_DIR / 'sales_train.csv')
items = pd.read_csv(DATA_DIR / 'items.csv')
item_categories = pd.read_csv(DATA_DIR / 'item_categories.csv')
shops = pd.read_csv(DATA_DIR /'shops.csv')

test = pd.read_csv(DATA_DIR / 'test.csv').set_index('ID')

**Little preporcessing**

In [8]:
def shop_id_change(sample):
    sample.loc[sample['shop_id']==57, 'shop_id'] = 0
    sample.loc[sample['shop_id']==58, 'shop_id'] = 1
    sample.loc[sample['shop_id']==11, 'shop_id'] = 10
    
    return sample

In [9]:
sales = sales[(sales['item_cnt_day'] <= 900) & (sales['item_price'] <= 60000) & ((sales['item_price'] >= 0))]

In [10]:
#change shop_id
sales = shop_id_change(sales)
shops = shop_id_change(shops)
test = shop_id_change(test)

In [11]:
#create feature - city
shops['shop_city'] = shops['shop_name'].apply(lambda x: x.lower()).str.replace('[^\w\s]', '').str.strip()
shops['shop_city'] = shops['shop_city'].apply(lambda x: x.split()[0])
shops['shop_city'] = LabelEncoder().fit_transform(shops['shop_city'])

In [12]:
#create feature - section of products
item_categories['item_section_name'] = item_categories['item_category_name'] \
                                        .apply(lambda x: x.lower().split('-')[0].split()[0].strip())
item_categories['item_section_name'] = LabelEncoder().fit_transform(item_categories['item_section_name'])


Since we need to predict monthly, so aggregate data

In [13]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []

for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(itertools.product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))

grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)


#create binary feature if item is returned
sales['item_is_less_zero'] = np.where(sales['item_cnt_day'] < 0, 1, 0)

# Aggregations
sales['item_cnt_day'] = sales['item_cnt_day'].clip(0,20) #seems it's better to clip here
train = sales.groupby(['shop_id', 'item_id', 'date_block_num'], as_index=False) \
            .agg({'item_cnt_day':'sum', 'item_price':'mean', 'item_is_less_zero': 'sum'})
train = train.rename(columns = {'item_cnt_day' : 'item_cnt_month'})
train['item_cnt_month'] = train['item_cnt_month'].clip(0,20) #one more clip

#merge aggregated sales and grid
train = pd.merge(grid, train, how='left',on=index_cols)
train['item_cnt_month'] = train['item_cnt_month'].fillna(0)

train['item_is_less_zero'] = train['item_is_less_zero'].fillna(0)
train['item_is_less_zero'] = np.where(train['item_is_less_zero'] > 0, 1, 0)

In [14]:
#try without price since we dont know on test set (i suppose they are not obliged to be equal)
#train = train.drop(['item_price'], axis=1)
train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_is_less_zero
0,59,22154,0,1.0,999.0,0
1,59,2552,0,0.0,,0


In [15]:
train.shape

(10913804, 6)

In [16]:
#add some features to train
train = train.merge(items[['item_id', 'item_category_id']], on='item_id')
train = train.merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id')
train = train.merge(shops[['shop_city', 'shop_id']], on='shop_id')

train = train.drop_duplicates()
train = train.sort_values(by=['date_block_num']).reset_index(drop=True)
train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_is_less_zero,item_category_id,item_section_name,shop_city
0,59,22154,0,1.0,999.0,0,37,7,30
1,16,6197,0,0.0,,0,55,9,10


In [17]:
test['date_block_num']= 34
test['item_cnt_month'] = -1 #since need to predict it
test['item_cnt_less_zero'] = 0 #check if it helps
print(test.shape)

(214200, 5)


In [18]:
test = test.merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
test = test.merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id', how='left')
test = test.merge(shops[['shop_city', 'shop_id']], on='shop_id', how='left')
test = test.drop_duplicates().reset_index(drop=True)

In [19]:
test.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_cnt_less_zero,item_category_id,item_section_name,shop_city
0,5,5037,34,-1,0,19,5,3
1,5,5320,34,-1,0,55,9,3


**Add mean/median item_price per shop in test**

In [None]:
temp = train.groupby(['shop_id', 'item_id']).agg({'item_price' : 'median'}).reset_index()
test = test.merge(temp, on=['shop_id', 'item_id'], how='left')
print(test.shape)
test.head(2)

**Combine data**

In [21]:
#concat all data to create mean encoded features and lag ones
all_data = pd.concat([train, test]).reset_index(drop=True)

In [22]:
# List of columns that we will use to create lags
#using 'item_price here' seems to lead to overfit
lag_features = ['item_cnt_month', 'item_price', 'item_is_less_zero']
#lag_features = ['item_cnt_month']

shift_range = list(range(1, 13))

for month_shift in tqdm(shift_range):
    train_shift = all_data[index_cols + lag_features].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: f'{x}_lag_{month_shift}' if x in lag_features else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [23]:
#one more features since we have predict based on pairs shop_id and item_id
all_data['shop_item'] = (all_data['shop_id'].astype(str) + '0' + all_data['item_id'].astype(str)).astype(int)
#order of month
all_data['month'] = all_data['date_block_num'].apply(lambda x: x % 12)

In [24]:
all_data.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_price,item_is_less_zero,item_category_id,item_section_name,shop_city,item_cnt_less_zero,...,item_price_lag_10,item_is_less_zero_lag_10,item_cnt_month_lag_11,item_price_lag_11,item_is_less_zero_lag_11,item_cnt_month_lag_12,item_price_lag_12,item_is_less_zero_lag_12,shop_item,month
0,59,22154,0,1.0,999.0,0.0,37,7,30,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59022154,0
1,16,6197,0,0.0,0.0,0.0,55,9,10,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1606197,0


**mean encoding**

In [25]:
#train_part, val = all_data[all_data['date_block_num'] < 33], all_data[all_data['date_block_num'] == 33]
#train, test = all_data[all_data['date_block_num'] < 34], all_data[all_data['date_block_num'] == 34]
#############################################################################################
all_data_part = all_data[all_data['date_block_num'] < 34]

In [26]:
mean_enc_features = ['item_id', 'shop_id', 'item_category_id', 'item_section_name', 'shop_city', 'shop_item']
#encode for all train data and test
#for col in tqdm(mean_enc_features):
#    train, test = mean_enconding(train.copy(), test.copy(), col)

#encode for train_part data and validation
#for col in tqdm(mean_enc_features):
#    train_part, val = mean_enconding(train_part.copy(), val.copy(), col)
#####################################################################################################
for col in tqdm(mean_enc_features):
    all_data = mean_enconding(all_data.copy(), col)

for col in tqdm(mean_enc_features):
    all_data_part = mean_enconding(all_data_part.copy(), col)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [27]:
#to validate
train_part = all_data_part[all_data_part['date_block_num'] < 33]
val = all_data_part[all_data_part['date_block_num'] == 33]

#to fit on all data
train, test = all_data[all_data['date_block_num'] < 34], all_data[all_data['date_block_num'] == 34]

In [28]:
#check if target is clipped
print(train['item_cnt_month'].min(), train['item_cnt_month'].max())

0.0 20.0


In [29]:
Xtrain_part, ytrain_part = train_part.drop(['item_cnt_month'] , axis=1), train_part['item_cnt_month']
Xval, yval = val.drop(['item_cnt_month'] , axis=1), val['item_cnt_month']

Xtrain, ytrain = train.drop(['item_cnt_month'] , axis=1), train['item_cnt_month']
Xtest, ytest = test.drop(['item_cnt_month'] , axis=1), test['item_cnt_month']

dtrain_part = xgb.DMatrix(data=Xtrain_part.to_numpy(), label=ytrain_part
                     , feature_names=Xtrain.columns)
dval = xgb.DMatrix(data=Xval.to_numpy(), label=yval
                     , feature_names=Xtrain.columns)

dtrain = xgb.DMatrix(data=Xtrain.to_numpy(), label=ytrain
                     , feature_names=Xtrain.columns)
dtest = xgb.DMatrix(data=Xtest.to_numpy(), label=ytest
                   , feature_names=Xtrain.columns)

In [30]:
default_params = {  'objective': 'reg:squarederror'                    
                    , 'subsample': 0.8
                    , 'colsample_bytree': 0.9
                    , 'learning_rate': 0.1
                    , 'tree_method': 'hist'
                    , 'grow_policy': 'lossguide'
                    , 'max_leaves': 63
                    , 'max_depth': 0
                    , 'gamma': 1
                    , 'reg_alpha': 1.2
                    , 'reg_lambda': 1.3
                    , 'min_child_weight': 30
                    , 'max_delta_step': 2
                    , 'n_jobs': 60
                    , 'random_state': 42
                    , 'eval_metric': 'rmse'
                }   

In [33]:
model1_part = xgb.train(default_params, dtrain_part, num_boost_round=400, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain_part,'train_part'), (dval,'val')])

[0]	train_part-rmse:1.19765	val-rmse:1.11913
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.70051	val-rmse:0.73171
[200]	train_part-rmse:0.66199	val-rmse:0.71723
[300]	train_part-rmse:0.64300	val-rmse:0.71067
[399]	train_part-rmse:0.62857	val-rmse:0.70721


In [34]:
model1 = xgb.train(default_params, dtrain, num_boost_round=400, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train-rmse:1.19595
Will train until train-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.69898
[200]	train-rmse:0.66226
[300]	train-rmse:0.64454
[399]	train-rmse:0.63063


In [35]:
ans = pd.DataFrame({'item_cnt_month': model1.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / '22_06_2020_1.csv', index=False)
#1.366496 LB - seems that using item_price like that leads to overfitting

**the same as above but drop item_price features**

In [36]:
l = ['item_price_lag_' + str(i) for i in range(1, 13)] + ['item_price']
Xtrain_part, ytrain_part = train_part.drop(l + ['item_cnt_month'] , axis=1), train_part['item_cnt_month']
Xval, yval = val.drop(l+['item_cnt_month'] , axis=1), val['item_cnt_month']

Xtrain, ytrain = train.drop(l+['item_cnt_month'] , axis=1), train['item_cnt_month']
Xtest, ytest = test.drop(l+['item_cnt_month'] , axis=1), test['item_cnt_month']

dtrain_part = xgb.DMatrix(data=Xtrain_part.to_numpy(), label=ytrain_part
                     , feature_names=Xtrain.columns)
dval = xgb.DMatrix(data=Xval.to_numpy(), label=yval
                     , feature_names=Xtrain.columns)

dtrain = xgb.DMatrix(data=Xtrain.to_numpy(), label=ytrain
                     , feature_names=Xtrain.columns)
dtest = xgb.DMatrix(data=Xtest.to_numpy(), label=ytest
                   , feature_names=Xtrain.columns)

In [37]:
model2_part = xgb.train(default_params, dtrain_part, num_boost_round=400, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain_part,'train_part'), (dval,'val')])

[0]	train_part-rmse:1.21758	val-rmse:1.14240
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.90476	val-rmse:0.94841
[200]	train_part-rmse:0.87558	val-rmse:0.95621
Stopping. Best iteration:
[165]	train_part-rmse:0.88583	val-rmse:0.94589



In [42]:
model2 = xgb.train(default_params, dtrain, num_boost_round=165, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain,'train_part')])

[0]	train_part-rmse:1.21611
Will train until train_part-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.90736
[200]	train_part-rmse:0.88100
[300]	train_part-rmse:0.86350
[400]	train_part-rmse:0.85052
[499]	train_part-rmse:0.83788


In [43]:
ans = pd.DataFrame({'item_cnt_month': model2.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / '22_06_2020_3.csv', index=False)
#0.98228 - 165 trees, 0.976359 - 500 trees

**drop item_price and item_is_less_zero**

In [48]:
l = ['item_price_lag_' + str(i) for i in range(1, 13)] + ['item_price']\
    + ['item_is_less_zero_lag_' + str(i) for i in range(1, 13)] + ['item_is_less_zero']
Xtrain_part, ytrain_part = train_part.drop(l + ['item_cnt_month'] , axis=1), train_part['item_cnt_month']
Xval, yval = val.drop(l+['item_cnt_month'] , axis=1), val['item_cnt_month']

Xtrain, ytrain = train.drop(l+['item_cnt_month'] , axis=1), train['item_cnt_month']
Xtest, ytest = test.drop(l+['item_cnt_month'] , axis=1), test['item_cnt_month']

dtrain_part = xgb.DMatrix(data=Xtrain_part.to_numpy(), label=ytrain_part
                     , feature_names=Xtrain.columns)
dval = xgb.DMatrix(data=Xval.to_numpy(), label=yval
                     , feature_names=Xtrain.columns)

dtrain = xgb.DMatrix(data=Xtrain.to_numpy(), label=ytrain
                     , feature_names=Xtrain.columns)
dtest = xgb.DMatrix(data=Xtest.to_numpy(), label=ytest
                   , feature_names=Xtrain.columns)

In [49]:
model3_part = xgb.train(default_params, dtrain_part, num_boost_round=400, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain_part,'train_part'), (dval,'val')])

[0]	train_part-rmse:1.21776	val-rmse:1.14271
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.90643	val-rmse:0.94459
[200]	train_part-rmse:0.87604	val-rmse:0.95315
Stopping. Best iteration:
[143]	train_part-rmse:0.89197	val-rmse:0.94215



In [53]:
model3 = xgb.train(default_params, dtrain, num_boost_round=143, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain,'train_part')])

[0]	train_part-rmse:1.21644
Will train until train_part-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.90651
[142]	train_part-rmse:0.89209


In [55]:
ans = pd.DataFrame({'item_cnt_month': model3.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / '22_06_2020_4.csv', index=False)
# 0.978727 - 143 trees, 0.973552 - 500 trees

It seems thar using item_price and item_is_less_zero are useless in a such way in this case

There is the result i got using almost all as for model3 above or like that, lost a bit

In [31]:
model1_part = xgb.train(default_params, dtrain, num_boost_round=1000, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain_part,'train_part'), (dval,'val')])

[0]	train_part-rmse:1.21844	val-rmse:1.14205
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.93454	val-rmse:0.91436
[200]	train_part-rmse:0.93094	val-rmse:0.86595
[300]	train_part-rmse:0.93203	val-rmse:0.84951
[400]	train_part-rmse:0.94141	val-rmse:0.84116
[500]	train_part-rmse:0.95617	val-rmse:0.83298
[600]	train_part-rmse:0.96739	val-rmse:0.82580
[700]	train_part-rmse:0.97089	val-rmse:0.82122
[800]	train_part-rmse:0.98814	val-rmse:0.81956
[900]	train_part-rmse:1.02199	val-rmse:0.81698
[999]	train_part-rmse:1.03429	val-rmse:0.81378


In [32]:
model1 = xgb.train(default_params, dtrain, num_boost_round=1000, early_stopping_rounds=100, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train_part-rmse:1.21644
Will train until train_part-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.90624
[200]	train_part-rmse:0.88000
[300]	train_part-rmse:0.86189
[400]	train_part-rmse:0.84833
[500]	train_part-rmse:0.83575
[600]	train_part-rmse:0.82518
[700]	train_part-rmse:0.81165
[800]	train_part-rmse:0.80382
[900]	train_part-rmse:0.79622
[999]	train_part-rmse:0.78983


In [30]:
ans = pd.DataFrame({'item_cnt_month': model1.predict(dtest)})
ans['item_cnt_month'] = ans['item_cnt_month'].clip(0, 20)
ans.insert(0, 'ID', ans.index)
ans.to_csv(DATA_DIR / 'scores.csv', index=False)
#96706