In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap

import os
import getpass
import gc
from pathlib import Path

import itertools

import xgboost as xgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tqdm.notebook import tqdm
import warnings
#warnings.filterwarnings('ignore')

In [3]:
DATA_DIR = Path('/data') / getpass.getuser() / 'kaggle' / 'future_sales' #path to data files


def _read_table(filename):
    '''Read one CSV file located in DATA_DIR into a dataframe.'''
    return pd.read_csv(DATA_DIR / filename)


# Raw input tables.
sales = _read_table('sales_train.csv')
items = _read_table('items.csv')
item_categories = _read_table('item_categories.csv')
shops = _read_table('shops.csv')

# The test table is indexed by its 'ID' column.
test = _read_table('test.csv').set_index('ID')

**A little preprocessing**

In [4]:
def shop_id_change(sample):
    '''Relabel shop ids 57, 58 and 11 as 0, 1 and 10 respectively.

    NOTE(review): presumably each pair denotes the same physical shop listed
    under two ids - confirm against shops.csv.

    Parameters:
        sample - dataframe with a 'shop_id' column

    Return:
        a new dataframe with the remapped 'shop_id'; the input is left untouched
    '''
    # Work on a copy: callers pass filtered slices, and writing into those in place
    # triggers SettingWithCopyWarning and also changes the caller's frame as a side effect.
    sample = sample.copy()
    sample['shop_id'] = sample['shop_id'].replace({57: 0, 58: 1, 11: 10})

    return sample

In [5]:
# Drop outlier rows: daily counts above 900, prices above 60000 and negative prices.
# .copy() makes the result an independent frame, so later column assignments on `sales`
# do not write into a slice of the raw table (avoids SettingWithCopyWarning).
sales = sales[(sales['item_cnt_day'] <= 900) & (sales['item_price'] <= 60000) & (sales['item_price'] >= 0)].copy()

In [6]:
# Apply the same shop_id remapping to every table that carries a shop_id column.
sales, shops, test = (shop_id_change(frame) for frame in (sales, shops, test))

In [7]:
#create feature - city
# The city is taken as the first word of the lower-cased shop name with punctuation removed.
# regex=True is passed explicitly: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would silently leave the punctuation in place.
# The pattern is a raw string so that \w and \s are not parsed as (invalid) string escapes.
shops['shop_city'] = (
    shops['shop_name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.strip()
    .str.split()
    .str[0]
)
# Encode the city token as an integer label.
shops['shop_city'] = LabelEncoder().fit_transform(shops['shop_city'])

In [8]:
#create feature - section of products
def _section_name(category_name):
    '''Return the first whitespace-separated token of the text before the first '-', lower-cased.'''
    head = category_name.lower().split('-')[0]
    return head.split()[0].strip()


item_categories['item_section_name'] = item_categories['item_category_name'].map(_section_name)
# Encode the section token as an integer label.
item_categories['item_section_name'] = LabelEncoder().fit_transform(item_categories['item_section_name'])


Since we need to predict monthly sales, aggregate the daily data by month.

In [9]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
grid = []

# For every month, build all (shop, item, month) combinations from the shops and
# items that appear in that month's sales.
for block_num in sales['date_block_num'].unique():
    block_sales = sales[sales['date_block_num']==block_num]  # filter once per month, reuse for both columns
    cur_shops = block_sales['shop_id'].unique()
    cur_items = block_sales['item_id'].unique()
    grid.append(np.array(list(itertools.product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))

grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)


# Aggregations
# assign() builds a new frame instead of writing a column into `sales` in place,
# which avoids SettingWithCopyWarning when `sales` is a filtered slice.
sales = sales.assign(item_cnt_day=sales['item_cnt_day'].clip(0,20)) #seems it's better to clip here
# Monthly totals per (shop, item, month): summed counts and mean price.
train = sales.groupby(index_cols, as_index=False) \
            .agg({'item_cnt_day':'sum', 'item_price':'mean'})
train = train.rename(columns = {'item_cnt_day' : 'item_cnt_month'})
train['item_cnt_month'] = train['item_cnt_month'].clip(0,20) #one more clip

#merge aggregated sales and grid
# Left merge keeps every grid row; combinations without sales get a count of 0.
train = pd.merge(grid, train, how='left',on=index_cols)
train['item_cnt_month'] = train['item_cnt_month'].fillna(0)

In [10]:
# The aggregated mean price is not used further, so remove the column.
train = train.drop(columns='item_price')
print(train.shape)
train.head(n=2)

(10913804, 4)


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,59,22154,0,1.0
1,59,2552,0,0.0


In [11]:
#add some features to train
# shop_id_change maps several original ids onto one, so `shops` can hold more than one
# row per shop_id. Keep a single row per key so the merge cannot multiply train rows
# (previously a full-frame drop_duplicates() was needed afterwards to undo that, and it
# only worked when the duplicated shop rows carried identical shop_city values).
shop_features = shops[['shop_city', 'shop_id']].drop_duplicates('shop_id')

# validate='many_to_one' makes pandas raise if a lookup table has duplicate keys.
train = train.merge(items[['item_id', 'item_category_id']], on='item_id', validate='many_to_one')
train = train.merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id',
                    validate='many_to_one')
train = train.merge(shop_features, on='shop_id', validate='many_to_one')

train = train.sort_values(by=['date_block_num']).reset_index(drop=True)
train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city
0,59,22154,0,1.0,37,7,30
1,16,6197,0,0.0,55,9,10


In [12]:
# Test rows belong to month block 34; -1 is a placeholder target since it has to be predicted.
test = test.assign(date_block_num=34, item_cnt_month=-1)
print(test.shape)

(214200, 4)


In [13]:
# Enrich the test rows with the item category, section and shop city features.
# The submission later uses the row position as the ID, so these merges must keep
# exactly one output row per test row, in the original order.
# shop_id_change maps several original ids onto one, so `shops` can hold more than one
# row per shop_id; keep a single row per key so the merge cannot multiply test rows.
shop_features = shops[['shop_city', 'shop_id']].drop_duplicates('shop_id')

n_test_rows = len(test)
# validate='many_to_one' makes pandas raise if a lookup table has duplicate keys.
test = test.merge(items[['item_id', 'item_category_id']], on='item_id', how='left', validate='many_to_one')
test = test.merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id', how='left',
                  validate='many_to_one')
test = test.merge(shop_features, on='shop_id', how='left', validate='many_to_one')
# No drop_duplicates() here: removing rows would shift every later row to a different ID.
test = test.reset_index(drop=True)
assert len(test) == n_test_rows, 'feature merges must not add or drop test rows'

In [14]:
# Preview the first two test rows after the feature merges.
test.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city
0,5,5037,34,-1,19,5,3
1,5,5320,34,-1,55,9,3


**Combine data**

In [15]:
#concat all data to create mean encoded features and lag ones
# Train rows come first, test rows after them; the index is renumbered from 0.
all_data = pd.concat([train, test], ignore_index=True)

**Add lag features**

In [16]:
def create_lag_features(sample, group_cols, statistics=['median'], shift_range=list(range(1,13))):
    '''Create lag features based on grouped columns and statistics

    For every statistic and every shift, 'item_cnt_month' is aggregated per group,
    the aggregate is moved forward by `shift` months and merged back on group_cols,
    so each row receives the statistic of its group from `shift` months earlier.
    New columns are named '<group_cols[1:] joined by "_">_cnt_<stat>_<shift>'.

    Parameters:
        sample - dataframe containing 'date_block_num', 'item_cnt_month' and all group_cols
        group_cols - list of columns which must be used for group by operation, the first one is 'date_block_num'
        statistics - list of statistic names (strings understood by groupby().agg) which must be calculated
        shift_range - orders of lags, in months

    Return:
        sample with new features; rows with no matching group `shift` months earlier get 0
    '''

    gb = sample.groupby(group_cols, as_index=False)

    #to create name of new cols
    name = '_'.join(group_cols[1:])

    for stat in statistics:
        # The aggregate does not depend on the shift, so compute it once per statistic
        # instead of once per (statistic, shift) pair.
        stat_table = gb.agg({'item_cnt_month': stat})

        for month_shift in tqdm(shift_range):
            feature = name+'_cnt_'+stat+'_'+str(month_shift)
            res = stat_table.rename(columns={'item_cnt_month': feature})
            # Assign a new column (rather than += in place) so stat_table is never modified.
            res['date_block_num'] = res['date_block_num'] + month_shift
            sample = sample.merge(res, on=group_cols, how='left')
            sample[feature] = sample[feature].fillna(0)

    return sample

In [17]:
# Grouping keys for the lagged statistics: per (month, shop, item) and per (month, item).
# NOTE(review): if every (month, shop, item) row is unique, the median of the first
# grouping is just the row's own value - confirm these columns are not redundant
# with the plain item_cnt_month lags created later.
columns = [
    ['date_block_num', 'shop_id', 'item_id'],
    ['date_block_num', 'item_id'],
]
for cols in columns:
    all_data = create_lag_features(all_data, group_cols=cols)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [18]:
# List of columns that we will use to create lags
lag_features = ['item_cnt_month']

shift_range = list(range(1, 13))

for month_shift in tqdm(shift_range):
    # Key columns + lagged columns, moved forward by `month_shift` months so that each
    # shifted row lines up with the row that should receive it as a lag.
    lag_names = {col: f'{col}_lag_{month_shift}' for col in lag_features}
    train_shift = (
        all_data[index_cols + lag_features]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month_shift)
        .rename(columns=lag_names)
    )

    # Rows without a match `month_shift` months earlier get 0
    # (fillna is applied to the whole merged frame).
    all_data = all_data.merge(train_shift, on=index_cols, how='left').fillna(0)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




**Add more features**

In [19]:
#one more features since we have predict based on pairs shop_id and item_id
# Built as the digits of shop_id, a literal '0', then the digits of item_id, read back as an integer.
all_data['shop_item'] = (
    all_data['shop_id'].astype(str)
    .str.cat(all_data['item_id'].astype(str), sep='0')
    .astype(int)
)
#order of month
all_data['month'] = all_data['date_block_num'] % 12

In [20]:
# Preview the first two rows with all engineered features.
all_data.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city,shop_id_item_id_cnt_median_1,shop_id_item_id_cnt_median_2,shop_id_item_id_cnt_median_3,...,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_7,item_cnt_month_lag_8,item_cnt_month_lag_9,item_cnt_month_lag_10,item_cnt_month_lag_11,item_cnt_month_lag_12,shop_item,month
0,59,22154,0,1.0,37,7,30,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59022154,0
1,16,6197,0,0.0,55,9,10,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1606197,0


In [21]:
# Drop month block 0 (since lag features created).
all_data = all_data.loc[all_data['date_block_num'].gt(0)]
# Everything before block 34, i.e. without the rows that carry the placeholder target.
all_data_part = all_data.loc[all_data['date_block_num'].lt(34)]

In [26]:
#to validate
# Blocks before 33 are used for fitting, block 33 is held out.
train_part = all_data_part.query('date_block_num < 33')
val = all_data_part.query('date_block_num == 33')

#to fit on all data
train = all_data.query('date_block_num < 34')
test = all_data.query('date_block_num == 34')

In [27]:
def _split_xy(frame):
    '''Return (features, target) with the 'item_cnt_month' column separated out.'''
    return frame.drop(['item_cnt_month'] , axis=1), frame['item_cnt_month']


Xtrain_part, ytrain_part = _split_xy(train_part)
Xval, yval = _split_xy(val)

Xtrain, ytrain = _split_xy(train)
Xtest, ytest = _split_xy(test)


def _as_dmatrix(features, target):
    '''Wrap a feature frame and its target in an xgb.DMatrix; columns are named after Xtrain.'''
    return xgb.DMatrix(data=features.to_numpy(), label=target
                       , feature_names=Xtrain.columns)


# Validation setup.
dtrain_part = _as_dmatrix(Xtrain_part, ytrain_part)
dval = _as_dmatrix(Xval, yval)

# Full-data setup; the test label is only the placeholder target.
dtrain = _as_dmatrix(Xtrain, ytrain)
dtest = _as_dmatrix(Xtest, ytest)

In [28]:
# Booster parameters shared by the validation run and the final fit.
default_params = dict(
    objective='reg:squarederror',
    subsample=0.8,
    colsample_bytree=0.9,
    learning_rate=0.1,
    tree_method='hist',
    grow_policy='lossguide',
    max_leaves=63,
    max_depth=0,
    gamma=1,
    reg_alpha=1.2,
    reg_lambda=1.3,
    min_child_weight=30,
    max_delta_step=2,
    n_jobs=60,
    random_state=42,
    eval_metric='rmse',
)

In [29]:
# Validation run: fit on train_part and monitor both sets; 'val' is listed last
# and is the one used for early stopping.
model1_part = xgb.train(
    params=default_params,
    dtrain=dtrain_part,
    num_boost_round=1000,
    evals=[(dtrain_part,'train_part'), (dval,'val')],
    early_stopping_rounds=100,
    verbose_eval=100,
)

[0]	train_part-rmse:1.21511	val-rmse:1.14208
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 100 rounds.
[100]	train_part-rmse:0.90857	val-rmse:0.94096
[200]	train_part-rmse:0.88163	val-rmse:0.93121
[300]	train_part-rmse:0.86940	val-rmse:0.92739
[400]	train_part-rmse:0.85977	val-rmse:0.92443
[500]	train_part-rmse:0.85049	val-rmse:0.92228
[600]	train_part-rmse:0.84333	val-rmse:0.92001
[700]	train_part-rmse:0.83654	val-rmse:0.91753
[800]	train_part-rmse:0.83087	val-rmse:0.91611
Stopping. Best iteration:
[776]	train_part-rmse:0.83187	val-rmse:0.91590



In [30]:
# Final fit on all labelled months for the submission model.
# There is no hold-out set at this point, so early stopping is not used: the only
# evaluation set is the training data itself, whose RMSE keeps improving, so it could
# never trigger a meaningful stop. The number of rounds is fixed instead; `evals` is
# kept purely for progress logging.
# NOTE(review): consider deriving num_boost_round from model1_part.best_iteration
# instead of the hard-coded 500.
model1 = xgb.train(default_params, dtrain, num_boost_round=500, verbose_eval=100
           , evals=[(dtrain,'train')])

[0]	train-rmse:1.21350
Will train until train-rmse hasn't improved in 100 rounds.
[100]	train-rmse:0.90961
[200]	train-rmse:0.88386
[300]	train-rmse:0.87090
[400]	train-rmse:0.86092
[499]	train-rmse:0.85242


In [31]:
# Predict the test month, clip to [0, 20] and use the row position as the submission ID.
ans = (
    pd.DataFrame({'item_cnt_month': model1.predict(dtest)})
    .clip(lower=0, upper=20)
    .rename_axis('ID')
    .reset_index()
)
ans.to_csv(DATA_DIR / 'final_ans_0.csv', index=False)
# NOTE(review): presumably the score obtained by this submission - confirm.
#0.968062