In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap

import os
import getpass
import gc
from pathlib import Path

import itertools

import xgboost as xgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tqdm.notebook import tqdm
import warnings
#warnings.filterwarnings('ignore')

In [2]:
#with such encoding got better results
def cv_mean_enc(sample, column, kf, globalmean, alpha):
    '''Add an out-of-fold, smoothed mean encoding of `column` as `<column>_enc`.

    For every (train, test) split produced by `kf`, the per-category statistics
    of 'item_cnt_month' are computed on the train part only and mapped onto the
    test part:

        enc = (sum_of_target + globalmean * alpha) / (count + alpha)

    Parameters:
        sample - dataframe containing `column` and 'item_cnt_month'
        column - name of the categorical column to encode
        kf - splitter exposing split(sample) that yields positional
             (train_ix, test_ix) index arrays (e.g. sklearn KFold)
        globalmean - prior used for smoothing and for categories unseen in a fold
        alpha - smoothing strength (weight of the prior, in pseudo-observations)

    Return:
        the same `sample` object with the `<column>_enc` column added
    '''
    encoded = np.full(len(sample), np.nan, dtype='float64')

    for train_ix, test_ix in kf.split(sample):
        train = sample.iloc[train_ix]

        #smoothing; group once instead of three times (mean * count == sum)
        stats = train.groupby(column)['item_cnt_month'].agg(['sum', 'count'])
        column_enc = (stats['sum'] + globalmean * alpha) / (stats['count'] + alpha)

        #kf yields positions, not labels, so the result is written positionally;
        #this stays correct when sample does not carry a 0..n-1 index
        encoded[test_ix] = sample[column].iloc[test_ix].map(column_enc).to_numpy(dtype='float64')

    #fill nan (unseen categories, rows not covered by any fold) with global mean
    sample[column+'_enc'] = np.where(np.isnan(encoded), globalmean, encoded)

    return sample
    
def mean_enconding(sample, column, alpha=5):
    '''Mean-encode `column` against 'item_cnt_month' with 5 unshuffled folds.

    Parameters:
        sample - dataframe containing `column` and 'item_cnt_month'
        column - name of the categorical column to encode
        alpha - smoothing strength passed on to cv_mean_enc

    Return:
        sample with the new `<column>_enc` column
    '''
    #no shuffling: folds are contiguous blocks of rows in their current order
    kf = KFold(n_splits=5, shuffle=False)

    #the smoothing prior has to be the mean of the TARGET; the mean of the
    #encoded column itself (a mean of ids) is not on the target's scale
    #NOTE(review): rows carrying a placeholder target take part in this mean -
    #confirm with the caller whether they should be excluded
    globalmean = sample['item_cnt_month'].mean()

    return cv_mean_enc(sample, column, kf, globalmean, alpha)

In [3]:
#path to data files: /data/<current user>/kaggle/future_sales
DATA_DIR = Path('/data') / getpass.getuser() / 'kaggle' / 'future_sales'

#raw competition tables
sales, items, item_categories, shops = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ('sales_train.csv', 'items.csv', 'item_categories.csv', 'shops.csv')
)

#the test table is indexed by its 'ID' column
test = pd.read_csv(DATA_DIR / 'test.csv').set_index('ID')

**A little preprocessing**

In [4]:
def shop_id_change(sample):
    '''Remap selected shop ids: 57 -> 0, 58 -> 1, 11 -> 10.

    NOTE(review): presumably these ids are duplicates of the same physical
    shops - confirm against the shops table.

    Parameters:
        sample - dataframe with a 'shop_id' column

    Return:
        new dataframe with the remapped 'shop_id'; the input frame is left
        untouched (no in-place write into a possibly filtered slice)
    '''
    shop_id_map = {57: 0, 58: 1, 11: 10}

    #no target id is itself a key, so one simultaneous replace is equivalent
    #to applying the three remaps one after another
    return sample.assign(shop_id=sample['shop_id'].replace(shop_id_map))

In [5]:
#drop rows with item_cnt_day above 900 or item_price outside [0, 60000]
rows_to_keep = sales['item_cnt_day'].le(900) & sales['item_price'].between(0, 60000)
sales = sales[rows_to_keep]

In [6]:
#change shop_id the same way in every table that carries it
sales, shops, test = (shop_id_change(frame) for frame in (sales, shops, test))

In [7]:
#create feature - city: first word of the lower-cased shop name without punctuation
#regex=True is required: since pandas 2.0 str.replace treats the pattern as a
#literal string by default, which would leave the punctuation in place
shops['shop_city'] = (shops['shop_name']
                      .str.lower()
                      .str.replace(r'[^\w\s]', '', regex=True)
                      .str.strip()
                      .str.split()
                      .str[0])
#integer-encode the city names
shops['shop_city'] = LabelEncoder().fit_transform(shops['shop_city'])

In [8]:
#create feature - section of products: first word of the part of the
#lower-cased category name that precedes the first '-'
section_names = [
    category_name.lower().split('-')[0].split()[0].strip()
    for category_name in item_categories['item_category_name']
]
#integer-encode the section names
item_categories['item_section_name'] = LabelEncoder().fit_transform(section_names)


Since we need to predict monthly sales, we aggregate the daily data by month.

In [9]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

#for every month: all combinations of the shops and items that have sales in that month
monthly_blocks = []
for block_num in sales['date_block_num'].unique():
    month_sales = sales[sales['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    combos = list(itertools.product(cur_shops, cur_items, [block_num]))
    monthly_blocks.append(np.array(combos, dtype='int32'))

grid = pd.DataFrame(np.vstack(monthly_blocks), columns=index_cols, dtype=np.int32)


# Aggregations
sales['item_cnt_day'] = sales['item_cnt_day'].clip(0, 20) #seems it's better to clip here
train = (
    sales
    .groupby(index_cols, as_index=False)
    .agg(item_cnt_month=('item_cnt_day', 'sum'), item_price=('item_price', 'mean'))
)
train['item_cnt_month'] = train['item_cnt_month'].clip(0, 20) #one more clip

#merge aggregated sales and grid; combinations without sales get a count of 0
train = grid.merge(train, how='left', on=index_cols)
train['item_cnt_month'] = train['item_cnt_month'].fillna(0)

In [10]:
#drop the aggregated price column
train = train.drop(columns=['item_price'])
print(train.shape)
train.head(2)

(10913804, 4)


Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,59,22154,0,1.0
1,59,2552,0,0.0


In [11]:
#add some features to train: item category, item section and shop city,
#then order the rows by month
train = (
    train
    .merge(items[['item_id', 'item_category_id']], on='item_id')
    .merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id')
    .merge(shops[['shop_city', 'shop_id']], on='shop_id')
    .drop_duplicates()
    .sort_values(by=['date_block_num'])
    .reset_index(drop=True)
)
train.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city
0,59,22154,0,1.0,37,7,30
1,16,6197,0,0.0,55,9,10


In [12]:
#test rows get month 34 and a placeholder target of -1 (since need to predict it)
test = test.assign(date_block_num=34, item_cnt_month=-1)
print(test.shape)

(214200, 4)


In [13]:
#same category, section and city lookups as for train, kept as left joins
test = (
    test
    .merge(items[['item_id', 'item_category_id']], on='item_id', how='left')
    .merge(item_categories[['item_category_id', 'item_section_name']], on='item_category_id', how='left')
    .merge(shops[['shop_city', 'shop_id']], on='shop_id', how='left')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [14]:
#show the first two test rows after the merges
test.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city
0,5,5037,34,-1,19,5,3
1,5,5320,34,-1,55,9,3


**Combine data**

In [15]:
#stack train and test (fresh 0..n-1 index) to create mean encoded features and lag ones
all_data = pd.concat([train, test], ignore_index=True)

**Add lag features**

In [16]:
def create_lag_features(sample, group_cols, statistics=('median',), shift_range=tuple(range(1, 13))):
    '''Create lag features based on grouped columns and statistics

    For every statistic, 'item_cnt_month' is aggregated per group once; the
    aggregate is then shifted forward by each lag in shift_range and joined
    back, so a row of month t receives the group statistic of month t - lag.
    Rows without a matching earlier group get 0.

    Parameters:
        sample - dataframe with 'item_cnt_month' and all of group_cols
        group_cols - list of columns which must be used for group by operation,
                     the first one is 'date_block_num'
        statistics - iterable of statistics which must be calculated
        shift_range - orders of lags

    Return:
        new dataframe: sample plus one '<cols>_cnt_<lag>_<stat>' column per
        (statistic, lag) pair, where <cols> is group_cols[1:] joined by '_'
    '''
    group_cols = list(group_cols)
    gb = sample.groupby(group_cols, as_index=False)

    #to create name of new cols
    name = '_'.join(group_cols[1:])

    for stat in statistics:
        #the aggregate does not depend on the lag: compute it once per statistic
        base = gb.agg({'item_cnt_month': stat})

        for month_shift in shift_range:
            feature = name+'_cnt_'+str(month_shift)+'_'+stat
            #shifting the month forward makes the join pick up the value of
            #`month_shift` months earlier
            res = (base.rename(columns={'item_cnt_month': feature})
                       .assign(date_block_num=base['date_block_num'] + month_shift))
            sample = sample.merge(res, on=group_cols, how='left')
            sample[feature] = sample[feature].fillna(0)

    return sample

In [17]:
# List of columns that we will use to create lags
lag_features = ['item_cnt_month']

shift_range = list(range(1, 13))

for month_shift in tqdm(shift_range):
    train_shift = all_data[index_cols + lag_features].copy()

    #moving the month forward lets the join fetch the value from `month_shift` months earlier
    train_shift['date_block_num'] += month_shift
    train_shift = train_shift.rename(
        columns={col: f'{col}_lag_{month_shift}' for col in lag_features}
    )

    #left join; every missing value in the merged frame becomes 0
    all_data = all_data.merge(train_shift, on=index_cols, how='left').fillna(0)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [18]:
#groupings for the aggregated lag features
#NOTE(review): if every (date_block_num, shop_id, item_id) triple is unique in
#all_data, the four statistics of the first grouping all equal the plain lag
#value - confirm that these columns are intended
columns = [
    ['date_block_num', 'shop_id', 'item_id'],
    ['date_block_num', 'item_id'],
    ['date_block_num', 'shop_id'],
]
lag_statistics = ['mean', 'median', 'max', 'min']

for cols in tqdm(columns):
    all_data = create_lag_features(all_data, cols, lag_statistics)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




**Add more features**

In [19]:
#one more feature since we have to predict for pairs of shop_id and item_id:
#both ids as strings, joined by a '0' separator and read back as an integer
shop_str = all_data['shop_id'].astype(str)
item_str = all_data['item_id'].astype(str)
all_data['shop_item'] = shop_str.str.cat(item_str, sep='0').astype(int)

In [20]:
#show the first two rows with all the lag columns and shop_item
all_data.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_category_id,item_section_name,shop_city,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,...,shop_id_cnt_4_min,shop_id_cnt_5_min,shop_id_cnt_6_min,shop_id_cnt_7_min,shop_id_cnt_8_min,shop_id_cnt_9_min,shop_id_cnt_10_min,shop_id_cnt_11_min,shop_id_cnt_12_min,shop_item
0,59,22154,0,1.0,37,7,30,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59022154
1,16,6197,0,0.0,55,9,10,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1606197


**mean encoding**

In [21]:
#rows of the months before date_block_num 34
all_data_part = all_data.loc[all_data['date_block_num'].lt(34)]

In [22]:
#columns that get a '<column>_enc' mean-encoded companion
mean_enc_features = [
    'item_id',
    'shop_id',
    'item_category_id',
    'item_section_name',
    'shop_city',
    'shop_item',
]

#NOTE(review): all_data still holds the rows with the -1 placeholder target, so
#they take part in the fold statistics here - confirm this is intended
for enc_col in tqdm(mean_enc_features):
    all_data = mean_enconding(all_data.copy(), enc_col)

#same encoding on the rows before block 34 only
for enc_col in tqdm(mean_enc_features):
    all_data_part = mean_enconding(all_data_part.copy(), enc_col)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




**save data**

In [23]:
#it's much faster than csv
for frame, pickle_name in ((all_data, 'all_data.pkl'), (all_data_part, 'all_data_part.pkl')):
    frame.to_pickle(DATA_DIR / pickle_name)