In [None]:
#Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plots
import sklearn as sk # models
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from string import punctuation # punctuation array
from xgboost import XGBRegressor # XGBoost Regression
from xgboost import plot_importance
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

%matplotlib inline 
import gc, warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
def plot_features(booster, figsize):
    """Draw the feature-importance chart of ``booster`` on a fresh figure.

    booster: model object forwarded to ``plot_importance``.
    figsize: figure size forwarded to ``plt.subplots``.
    Returns the value returned by ``plot_importance``.
    """
    _figure, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    chart = plot_importance(booster=booster, ax=axis)
    return chart
def print_files():
    """Print the full path of every file found below ``/kaggle/input``."""
    import os
    for folder, _subfolders, names in os.walk('/kaggle/input'):
        full_paths = [os.path.join(folder, name) for name in names]
        for path in full_paths:
            print(path)

def _smallest_safe_dtype(series, candidates, limits_of):
    """Return the first dtype in ``candidates`` whose range holds every finite value.

    ``limits_of`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate is wide enough, in which case the column should be left alone.
    """
    values = series.to_numpy()
    # NaN / inf survive any float cast, so only finite values decide the width.
    values = values[np.isfinite(values)]
    if values.size == 0:
        return candidates[0]
    lowest, highest = values.min(), values.max()
    for candidate in candidates:
        limits = limits_of(candidate)
        if limits.min <= lowest and highest <= limits.max:
            return candidate
    return None


def downcast_dtypes(df, inplace=False):
    '''
    Reduce the memory footprint of the numeric columns of a DataFrame.

    input  df: pd.DataFrame
           inplace: when True the columns of ``df`` itself are replaced,
                    otherwise a copy is modified and ``df`` is left untouched
    output df: pd.DataFrame (``df`` itself when ``inplace`` is True)

    float32/float64 columns become float16 and int32/int64 columns become int16
    when every value fits into that range. Columns with larger values fall
    back to float32 / int32 instead of silently overflowing (float16 tops out
    at 65504, int16 at 32767). Columns that fit neither are left unchanged.
    Note: float16 still rounds values, only overflow is prevented here.
    '''
    data = df if inplace else df.copy()
    for col in data.columns:
        dtype = data[col].dtype
        if dtype in ("float32", "float64"):
            target = _smallest_safe_dtype(data[col], (np.float16, np.float32), np.finfo)
        elif dtype in ("int64", "int32"):
            target = _smallest_safe_dtype(data[col], (np.int16, np.int32), np.iinfo)
        else:
            continue
        if target is not None and np.dtype(target) != dtype:
            data[col] = data[col].astype(target)
    return data

In [None]:
print_files()
# Every competition CSV lives in the same Kaggle input folder.
INPUT_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'
items = pd.read_csv(INPUT_DIR + '/items.csv')
shops = pd.read_csv(INPUT_DIR + '/shops.csv')
sales_train = pd.read_csv(INPUT_DIR + '/sales_train.csv')
# The test file is indexed by its ID column.
test = pd.read_csv(INPUT_DIR + '/test.csv', index_col="ID")
sample_submission = pd.read_csv(INPUT_DIR + '/sample_submission.csv')
item_categories = pd.read_csv(INPUT_DIR + '/item_categories.csv')

## Sales_train processing
<hr>

In [None]:
# Reduce data size (sales_train is modified in place)
downcast_dtypes(sales_train, inplace=True)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
sales_train.info()
# Author's note: no missing values in sales_train
print("Num of missing values in sales_train: %d" %sales_train.isnull().sum().sum())
# Author's note: 6 duplicated rows in sales_train
print("Num of duplicated rows in sales_train: %d" %sales_train.duplicated().sum())

### Sales_train outliers

In [None]:
# Side-by-side horizontal boxplots of price and daily sales count.
fig = plt.figure(figsize=(25,5))
(ax1,ax2) = fig.subplots(1,2)
ax1.boxplot(x=sales_train.item_price, vert=False)
ax1.set_xlabel("item_price")
ax2.boxplot(x=sales_train.item_cnt_day, vert=False)
# The label belongs on the second axis; it used to overwrite ax1's label.
ax2.set_xlabel("item_cnt_day")
# The values printed are row index labels of sales_train, not item ids.
print("item_price outliers row index", *sales_train[sales_train.item_price>45000].index.values)
print("item_cnt_day outliers row index", *sales_train[sales_train.item_cnt_day>999].index.values)

There are items with large prices and sales.   
I've researched this in detail and decided to keep only rows with `item_price` < 100000 and `item_cnt_day` < 1001.  
We have item_name = 'Доставка (EMS)' - delivery in every case.

In [None]:
# Rows whose price is negative.
negative_price = sales_train.item_price < 0
print("Price less then zero index", *sales_train[negative_price].index.values)
# Reference rows: same shop (32), item (11365) and month (4), valid prices only.
reference_rows = (sales_train.shop_id==32)&(sales_train.item_id==11365)\
            &(sales_train.item_price>0)&(sales_train.date_block_num==4)
# Take the median of the price column only: a median over the whole frame also
# hits the non-numeric `date` column, and writing the resulting row back through
# a hard-coded iloc position overwrote every field of that sale.
med = sales_train.loc[reference_rows, 'item_price'].median()
# Cast to the column's own dtype so the assignment does not change it.
sales_train.loc[negative_price, 'item_price'] = sales_train['item_price'].dtype.type(med)

In [None]:
# Build both conditions into ONE mask: chaining `df[mask1][mask2]` applies a mask
# built on the full frame to an already filtered frame (pandas has to re-align it).
keep_rows = (sales_train.item_price < 100000) & (sales_train.item_cnt_day < 1001)
# Every later cell reads `sales_train`, so the filtered rows must be stored there;
# previously the result only went into `train`, which nothing else uses.
sales_train = sales_train[keep_rows].copy()
train = sales_train  # kept so the original name still exists

## Shops processing
<hr>

Carefully examining `shops`, you can see that there are duplicates  


In [None]:
# Row positions of the shop pairs that look like duplicates of each other.
duplicate_positions = [10, 11, 23, 24, 39, 40, 0, 57, 1, 58]
print(shops.iloc[duplicate_positions])

The structure of the `shops`  
Shop City | Shop type | Shop name

Here we clean duplicates and extract feature `shop_type` , `shop_city`, `shop_name_c`  
Encode all data with LabelEncoder()

In [None]:
# Duplicated shops: each redundant shop_id (key) is replaced by the id that is kept (value)
d = {0:57, 1:58, 10:11, 23:24, 39:40}
for frame in (shops, sales_train, test):
    frame["shop_id"] = frame["shop_id"].map(lambda shop_id: d.get(shop_id, shop_id))

# Cleaned shop name: punctuation removed, lower-cased
strip_punctuation = str.maketrans("", "", punctuation)
shops['shop_name_c'] = shops['shop_name'].map(lambda name: name.translate(strip_punctuation))
shops['shop_name_c'] = shops['shop_name_c'].str.lower()

# Extract new features

# shop_type: first marker found in the cleaned name, 'тц' when none matches.
# 'мтрц' is tested before 'трц' because it contains it.
shop_type_markers = ('мтрц', 'трц', 'трк', 'тц', 'тк')
shops['shop_type'] = shops['shop_name_c'].map(
    lambda name: next((marker for marker in shop_type_markers if marker in name), 'тц'))

# shop_city: everything before the first space of the cleaned name
shops['shop_city'] = shops['shop_name_c'].str.split(' ', n=1).str[0]


In [None]:
# One-hot encode shop_type
OHE = OneHotEncoder(handle_unknown='ignore', dtype=np.int8)
OneHot_transform = OHE.fit_transform(shops[['shop_type']].to_numpy())

# Name the columns from the categories the encoder actually learned instead of
# relying on a hard-coded order.
type_to_column = {'мтрц': 'lab_mtrc', 'тк': 'lab_tk', 'трк': 'lab_trk', 'трц': 'lab_trc', 'тц': 'lab_tc'}
onehot_columns = [type_to_column[shop_type] for shop_type in OHE.categories_[0]]
OneHot_transform = pd.DataFrame(data=OneHot_transform.toarray(), columns=onehot_columns, index=shops.index)
# Later cells expect all five columns, even if a type does not occur in the data.
OneHot_transform = OneHot_transform.reindex(columns=list(type_to_column.values()), fill_value=0)

# Row i of the encoding belongs to row i of `shops`, so join by row position.
# Merging on shop_id matched the *remapped* ids against row positions, which is
# only correct while both shops of a duplicate pair share the same shop_type.
shops_merged = pd.concat([shops, OneHot_transform], axis=1)

In [None]:
# Integer-encode the city extracted from each shop name
city_codes = LabelEncoder().fit_transform(shops['shop_city'])
shops_merged['shop_city_code'] = city_codes

In [None]:
# Drop the text columns that were only needed to derive the encoded features
shop_text_columns = ["shop_name", "shop_name_c", "shop_type", "shop_city"]
shop_prepared = shops_merged.drop(columns=shop_text_columns)
gc.collect()

## Item_categories processing
<hr>

In [None]:
# Sanity checks noted by the author: shape (84, 2) and no duplicated names.
# (Neither expression is the last line of the cell, so nothing is displayed.)
item_categories.shape
item_categories["item_category_name"].duplicated().sum()
# Use the same key name for the category id in both tables
category_key_rename = {'item_category_id': 'category_item_id'}
for frame in (item_categories, items):
    frame.rename(columns=category_key_rename, inplace=True)

In [None]:
# Let pandas display as many rows as item_categories has
pd.set_option("display.max_rows", item_categories.shape[0])

In [None]:
# Split "category - subcategory" names once and reuse the pieces
name_parts = item_categories["item_category_name"].str.split("-")

# Category: text before the first "-", label-encoded
item_categories["category"] = name_parts.str[0].str.strip()
item_categories["category_code"] = LabelEncoder().fit_transform(item_categories['category'])

# Subcategory: text between the first and second "-", 'no_SUB' when there is no "-"
item_categories["subcat"] = name_parts.str[1].str.strip().fillna('no_SUB')
item_categories["subcat_code"] = LabelEncoder().fit_transform(item_categories['subcat'])

item_categories_prepared = item_categories.drop(columns=["item_category_name", "category", "subcat"])

## Items processing
<hr>
We encode `additional` feature in brackets () - (UNV).

In [None]:
# Show the items table as the cell output
items

In [None]:
import re

# First "(...)" or "[...]" fragment of an item name
bracket_fragment = re.compile(r"(\(.*?\))|(\[(.*?)\])")

def _bracket_feature(name):
    """Return the first bracketed fragment of ``name`` without its brackets, or "no_feature"."""
    found = bracket_fragment.search(name)
    fragment = "no_feature" if found is None else found.group()
    return fragment.strip("().[]+")

# Work on a copy so that items.item_name keeps the original text
itemsa = items.copy()
itemsa.item_name = itemsa.item_name.map(_bracket_feature)
items["item_name_add_code"] = LabelEncoder().fit_transform(itemsa.item_name)
items_prepared = items.drop(columns=["item_name"])

## Joining to train
<hr>
So, what do we have at this point?
We've prepared a data set with standard features. Now we are going to combine all the data into one set.  
  
But first, look at the test set. It has shape (214200,1). It is the product of some shops and some items for month 34.

In [None]:
# Size of the full shop x item grid spanned by the test set
n_test_items = test["item_id"].nunique()
n_test_shops = test["shop_id"].nunique()
n_test_items * n_test_shops

 The test set has 363 unique `item_id` values that never appear in the train set. We can calculate monthly sales for the train set and extend it with *zero* sales for each unique shop/item pair. Following this idea, the train set will be similar to the test set.

In [None]:
from itertools import product

colms = ['date_block_num', 'shop_id', 'item_id']
# One block per month: every shop seen that month x every item seen that month
monthly_grids = []
for month in range(34):
    month_sales = sales_train.loc[sales_train.date_block_num == month]
    month_grid = product([month], month_sales.shop_id.unique(), month_sales.item_id.unique())
    monthly_grids.append(np.array(list(month_grid)))
product_train = pd.DataFrame(np.vstack(monthly_grids), columns=colms)
product_train = product_train.sort_values(colms)

Now we need to aggregate the train set by shop/item pairs to calculate the monthly target sum, then clip the target value to the range [0, 20]. This way the train target will be similar to the test predictions.

In [None]:
# Total items sold per (month, shop, item)
group = (
    sales_train
    .groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)

# Attach the totals to the grid; pairs without sales stay NaN for now
product_train = product_train.merge(group, on=colms, how='left')

In [None]:
# Pairs without sales count as 0; the target is limited to the range [0, 20]
monthly_target = product_train['item_cnt_month'].fillna(0)
product_train['item_cnt_month'] = monthly_target.clip(lower=0, upper=20)

Append test set as 34th month

In [None]:
# Tag every test row with month index 34 (stored as a new column on `test`)
test["date_block_num"] = 34

In [None]:
# Stack the test rows under the training grid with a fresh 0..n-1 index.
# The former `keys=colms` argument was dropped: `keys` labels the concatenated
# objects (three labels were passed for two frames) and has no effect on the
# result when ignore_index=True.
product_train = pd.concat([product_train, test], ignore_index=True, sort=False)
# Test rows have no item_cnt_month yet; all missing values become 0.
product_train = product_train.fillna(0)

Join other data to set

In [None]:
# Left-join every prepared lookup table on its key, in this order
# (category_item_id only exists after the items join)
lookup_tables = (
    (shop_prepared, 'shop_id'),
    (items_prepared, 'item_id'),
    (item_categories_prepared, 'category_item_id'),
)
for lookup, key in lookup_tables:
    product_train = product_train.merge(lookup, on=[key], how='left')

New duplicated rows appeared, drop them!

In [None]:
# Remove fully identical rows, then renumber the index from 0
product_train = product_train.drop_duplicates()
product_train = product_train.reset_index(drop=True)

## LAG FEATURES  
### item_cnt_month lag

In [None]:
def lag_feature(df, lags, col):
    '''
    Add lagged copies of a column.

    input: df: pd.DataFrame with date_block_num, shop_id, item_id and `col`,
           lags: list of lags (in months), col: name of the column to lag (string)
    output: df with one extra column `<col>_lag_<lag>` per lag, holding the value
            the same shop/item pair had `lag` months earlier (NaN when absent)
    '''
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        lagged_name = col + '_lag_' + str(lag)
        shifted = base.rename(columns={col: lagged_name})
        # Moving the month forward makes month m line up with month m + lag
        shifted['date_block_num'] = shifted['date_block_num'] + lag
        df = df.merge(shifted, on=keys, how='left')
    return df

In [None]:
# Data downcast: pick a narrow dtype for every column
int8_columns = ['date_block_num','shop_id', 'lab_mtrc', 'lab_tk', 'lab_trk', 'lab_trc', 'lab_tc',
                'shop_city_code', 'category_item_id', 'category_code', 'subcat_code']
narrow_dtypes = {name: np.int8 for name in int8_columns}
narrow_dtypes['item_name_add_code'] = np.int16
narrow_dtypes['item_cnt_month'] = np.float16
narrow_dtypes['item_id'] = np.int16
product_train = product_train.astype(narrow_dtypes)

In [None]:
# Add the target value of 1, 2, 3, 7 and 12 months earlier as item_cnt_month_lag_<n> columns
product_train = lag_feature(product_train, [1,2,3,7,12], 'item_cnt_month')

## Mean encoded features

Calculate mean encoded features and find most interesting lags with acf and pacf plots

In [None]:
# List the current column names as a one-column table
product_train.columns.to_frame()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# NOTE: this cast is kept by product_train afterwards, not only for the plot
product_train.item_cnt_month = product_train.item_cnt_month.astype(np.int16)

# Mean target per (month, group); both plots use the same series
cols = ['subcat_code']
monthly_mean = product_train.groupby(['date_block_num', *cols]).agg({'item_cnt_month': ['mean']})

fig, (ax1, ax2) = plt.subplots(1, 2,figsize = (16,6), dpi = 80)
plot_acf(monthly_mean, ax = ax1, lags = 13)
plot_pacf(monthly_mean, ax = ax2, lags = 13);


In [None]:
def mean_encode(df, grop_b:list, lags:list):
    """Add lagged monthly mean-target features for a group of columns.

    df: frame with date_block_num, shop_id, item_id, item_cnt_month and the
        columns named in ``grop_b``.
    grop_b: columns that, together with date_block_num, define the groups.
    lags: lags (in months) passed on to ``lag_feature``.

    The mean of item_cnt_month per (date_block_num, *grop_b) is computed, lagged
    with ``lag_feature`` and the un-lagged mean is dropped again. The feature is
    named ``date_<first word of each group column>_avg_item_cnt``.
    Returns the frame with the new ``..._lag_<n>`` columns.
    """
    group_keys = ['date_block_num', *grop_b]
    prefixes = [name.split('_')[0] for name in grop_b]
    col_name = 'date_' + '_'.join(prefixes) + '_avg_item_cnt'

    grouped = df.groupby(group_keys).agg({'item_cnt_month': ['mean']})
    grouped.columns = [col_name]
    grouped = grouped.reset_index()

    encoded = df.merge(grouped, on=group_keys, how='left')
    encoded[col_name] = encoded[col_name].astype(np.float16)
    encoded = lag_feature(encoded, lags, col_name)
    # Only the lagged versions are kept
    return encoded.drop(columns=[col_name])

In [None]:
%%time
product_train = mean_encode(product_train, ['item_id'], [1,2,3])
product_train = mean_encode(product_train, ['shop_id','category_item_id'], [1,6])
product_train = mean_encode(product_train, ['shop_id','shop_city_code'], [1,6])
product_train = mean_encode(product_train, ['shop_city_code'], [1])
product_train = mean_encode(product_train, ['item_id', 'category_item_id'], [1])
product_train = mean_encode(product_train, ['category_item_id'], [1,2,3])
product_train = mean_encode(product_train, ['subcat_code'], [1])



## Trend item price features
> I was looking for the closest non-null price to compare with the average price. If an item costs less than in the past, it is a positive trend; otherwise it is a negative one. The larger the difference between the closest price and the average price, the larger the trend value.

In [None]:
# Average price of every item over all training sales
group = sales_train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

product_train = pd.merge(product_train, group, on=['item_id'], how='left')
product_train['item_avg_item_price'] = product_train['item_avg_item_price'].astype(np.float16)

# Average price of every item per month
group = sales_train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

product_train = pd.merge(product_train, group, on=['date_block_num','item_id'], how='left')
product_train['date_item_avg_item_price'] = product_train['date_item_avg_item_price'].astype(np.float16)

lags = [1,2,3,4,5,6]
product_train = lag_feature(product_train, lags, 'date_item_avg_item_price')

# Relative difference between the monthly price `i` months back and the overall average
delta_columns = []
for i in lags:
    delta_name = 'delta_price_lag_'+str(i)
    product_train[delta_name] = \
        (product_train['date_item_avg_item_price_lag_'+str(i)] - product_train['item_avg_item_price']) / product_train['item_avg_item_price']
    delta_columns.append(delta_name)

# Take the delta of the closest month that has one (lag 1 first, then 2, ...), 0 when none has.
# The former row-wise helper tested `if row[...]`; NaN is truthy, so a missing lag-1
# value was returned immediately and the later lags were never consulted.
# Back-filling across the columns moves the first non-null value of each row into the
# first column (https://stackoverflow.com/questions/31828240). It is done on float32
# because the author noted "Invalid dtype for backfill_2d [float16]".
closest_delta = product_train[delta_columns].astype(np.float32).bfill(axis=1).iloc[:, 0]
product_train['delta_price_lag'] = closest_delta.fillna(0).astype(np.float16)

# Only the combined trend feature is kept
fetures_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    fetures_to_drop += ['date_item_avg_item_price_lag_'+str(i)]
    fetures_to_drop += ['delta_price_lag_'+str(i)]

product_train.drop(fetures_to_drop, axis=1, inplace=True)


In [None]:
# Revenue of each sales row. Both factors are widened to float32 first: after the
# float16 downcast of sales_train a product above 65504 would overflow to inf.
sales_train['revenue'] = sales_train['item_price'].astype(np.float32) * sales_train['item_cnt_day'].astype(np.float32)

In [None]:
# Month revenue trend in shop

# Total revenue per (month, shop)
group = (
    sales_train
    .groupby(['date_block_num','shop_id'])['revenue']
    .sum()
    .rename('date_shop_revenue')
    .reset_index()
)
product_train = product_train.merge(group, on=['date_block_num','shop_id'], how='left')
product_train['date_shop_revenue'] = product_train['date_shop_revenue'].astype(np.float32)

# Mean of the monthly revenue per shop
group = (
    group
    .groupby(['shop_id'])['date_shop_revenue']
    .mean()
    .rename('shop_avg_revenue')
    .reset_index()
)
product_train = product_train.merge(group, on=['shop_id'], how='left')
product_train['shop_avg_revenue'] = product_train['shop_avg_revenue'].astype(np.float32)

# Relative deviation of the month from the shop's average month
relative_revenue = (product_train['date_shop_revenue'] - product_train['shop_avg_revenue']) / product_train['shop_avg_revenue']
product_train['delta_revenue'] = relative_revenue.astype(np.float16)

# Only last month's deviation is kept as a feature
product_train = lag_feature(product_train, [1], 'delta_revenue')
product_train = product_train.drop(columns=['date_shop_revenue','shop_avg_revenue','delta_revenue'])

Months since the last sale for each shop/item pair and for item only.

In [None]:
%%time
cache = {}
product_train['item_shop_last_sale'] = -1
product_train['item_shop_last_sale'] = product_train['item_shop_last_sale'].astype(np.int8)
for idx, row in product_train.iterrows():    
    key = str(row.item_id)+' '+str(row.shop_id)
    if key not in cache:
        if row.item_cnt_month!=0:
            cache[key] = row.date_block_num
    else:
        last_date_block_num = cache[key]
        product_train.at[idx, 'item_shop_last_sale'] = row.date_block_num - last_date_block_num
        cache[key] = row.date_block_num       

Months since the first sale for each shop/item pair and for item only.

In [None]:
# Earliest month in which the shop/item pair, respectively the item, occurs in product_train
first_pair_month = product_train.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
first_item_month = product_train.groupby('item_id')['date_block_num'].transform('min')
# Months elapsed since that earliest month
product_train['item_shop_first_sale'] = product_train['date_block_num'] - first_pair_month
product_train['item_first_sale'] = product_train['date_block_num'] - first_item_month

### Date features

In [None]:
# Month of the year as 0..11 (presumably 0 = January, given the day counts below)
product_train['month'] = product_train['date_block_num'] % 12
# Number of days per month; the Series index 0..11 lines up with `month`
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
days_in_month = product_train['month'].map(days)
product_train['days'] = days_in_month.astype(np.int8)

## Fill na in lag and del first 12 month
This is done to avoid data leakage.

In [None]:
def fill_na(df):
    """Replace missing values with 0 in every lagged item-count column.

    Only columns whose name contains both '_lag_' and 'item_cnt' are touched;
    other columns keep their NaN. The columns are replaced on ``df`` itself and
    ``df`` is returned.
    """
    for col in df.columns:
        is_count_lag = ('_lag_' in col) and ('item_cnt' in col)
        if is_count_lag and df[col].isnull().any():
            # Assign the filled column back: `df[col].fillna(0, inplace=True)` acts
            # on a temporary Series and does not update df under pandas Copy-on-Write.
            df[col] = df[col].fillna(0)
    return df

# Keep months 12 and later, then zero-fill the lagged item-count columns
after_first_year = product_train.date_block_num > 11
product_train = fill_na(product_train[after_first_year])

## Dataset prepared

In [None]:
# Print the columns, dtypes and memory usage of the finished feature table
product_train.info()

In [None]:
import pickle

# Store the finished feature table next to the notebook
product_train.to_pickle('data.pkl')

# Release the intermediate objects; `test` is kept because its index is used for the submission
del (cache, OneHot_transform, shops_merged, shop_prepared, item_categories_prepared,
     group, items_prepared, items, shops, item_categories, sales_train)
gc.collect();

In [None]:
import os

# Prefer the pickle written by the cell above, so a fresh "Restart & Run All" does
# not depend on a separately uploaded copy; fall back to that copy otherwise.
dataset_path = 'data.pkl' if os.path.exists('data.pkl') else '/kaggle/input/datapkl/data.pkl'
dataset = pd.read_pickle(dataset_path)

In [None]:
# Columns kept for modelling (keys, target and features), in this order
selected_columns = [
    'date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
    'shop_city_code', 'category_item_id', 'item_name_add_code',
    'category_code', 'subcat_code',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
    'date_item_avg_item_cnt_lag_1', 'date_item_avg_item_cnt_lag_2', 'date_item_avg_item_cnt_lag_3',
    'date_shop_category_avg_item_cnt_lag_1',
    'date_shop_shop_avg_item_cnt_lag_1',
    'date_shop_avg_item_cnt_lag_1',
    'date_item_category_avg_item_cnt_lag_1',
    'date_category_avg_item_cnt_lag_1', 'date_category_avg_item_cnt_lag_2', 'date_category_avg_item_cnt_lag_3',
    'date_subcat_avg_item_cnt_lag_1',
    'delta_price_lag', 'delta_revenue_lag_1',
    'item_shop_last_sale', 'item_shop_first_sale', 'item_first_sale',
    'month', 'days',
]
# Switched off (were commented out of the selection):
#   lab_mtrc, lab_tk, lab_trk, lab_trc, lab_tc,
#   item_cnt_month_lag_7, item_cnt_month_lag_12,
#   date_shop_category_avg_item_cnt_lag_6, date_shop_shop_avg_item_cnt_lag_6
dataset = dataset[selected_columns]

Train: months 13-33 (`date_block_num` 12-32)   
Validation: month 34 (`date_block_num` 33)  
Test: month 35 (`date_block_num` 34)  

In [None]:
# Split by month: < 33 train, 33 validation, 34 test
target = 'item_cnt_month'
train_rows = dataset.date_block_num < 33
valid_rows = dataset.date_block_num == 33
test_rows = dataset.date_block_num == 34

X_train = dataset.loc[train_rows].drop(columns=[target])
Y_train = dataset.loc[train_rows, target]
X_valid = dataset.loc[valid_rows].drop(columns=[target])
Y_valid = dataset.loc[valid_rows, target]
X_test = dataset.loc[test_rows].drop(columns=[target])

In [None]:
# The splits above hold everything that is needed; release the full table
del dataset
gc.collect();

In [None]:
# Hyperparameters of the XGBoost regressor
xgb_params = dict(
    tree_method='gpu_hist',
    max_depth=10,
    n_estimators=800,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=29,
)
xbg_model = XGBRegressor(**xgb_params)

# RMSE is reported on both sets; early stopping is configured with 10 rounds.
# NOTE(review): newer xgboost releases expect eval_metric / early_stopping_rounds
# on the constructor instead of fit() - confirm against the installed version.
evaluation_sets = [(X_train, Y_train), (X_valid, Y_valid)]
xbg_model.fit(
    X_train,
    Y_train,
    eval_metric="rmse",
    eval_set=evaluation_sets,
    verbose=True,
    early_stopping_rounds=10)

In [None]:
from catboost import CatBoostRegressor

# Hyperparameters of the CatBoost regressor
catboost_params = dict(
    iterations=1000,
    max_ctr_complexity=8,
    random_seed=29,
    od_type='Iter',
    od_wait=25,
    verbose=50,
    task_type='GPU',
    loss_function='RMSE',
    learning_rate=0.3,
)
catboost_model = CatBoostRegressor(**catboost_params)
# The validation month is passed as the evaluation set
catboost_model.fit(X_train, Y_train, eval_set=(X_valid, Y_valid))

## Hyperparameter tuning

In [None]:
##XGBoost
# from sklearn.model_selection import GridSearchCV
# xgb = XGBRegressor(
#     tree_method = 'gpu_hist'
#     )
# params = {'max_depth':[4,8,10],
#     'n_estimators':[800,1000],
#     'min_child_weight':[300], 
#     'colsample_bytree':[0.7,0.8], 
#     'subsample':[0.8], 
#     'eta':[0.1,0.2,0.3],    
#     'seed':[27]}
# model = GridSearchCV(xgb, param_grid=params, n_jobs=1)


# model.fit(
#     X_train, 
#     Y_train,  
#     eval_set=[(X_train, Y_train), (X_valid, Y_valid)], 
#     verbose=10, 
#     eval_metric='rmse',
#     early_stopping_rounds = 5)

##CatBoost

# from catboost import CatBoostRegressor
# catboost_model = CatBoostRegressor()
# params = {
#     'iterations':[1000,800],
#     'max_ctr_complexity':[8,5,6],
#     'random_seed':[29],
#     'od_type':['Iter'],
#     'od_wait':[25,15,20],
#     'verbose':[50],
#     'task_type':['GPU'],
#     'loss_function':['RMSE'],
# }
# catboost_model.fit(
#     X_train, Y_train,
#     eval_set=(X_valid, Y_valid)
# )


## Stacking

In [None]:
# Level-1 predictions of both models, clipped to the target range [0, 20]
Y_pred_xgb = xbg_model.predict(X_valid).clip(0, 20)
Y_test_xgb = xbg_model.predict(X_test).clip(0, 20)
Y_pred_cat = catboost_model.predict(X_valid).clip(0, 20)
Y_test_cat = catboost_model.predict(X_test).clip(0, 20)

# Level-2 inputs: one column per level-1 model
level2_columns = ['XBG', 'CAT']
X_lev2_train = pd.DataFrame(np.column_stack([Y_pred_xgb, Y_pred_cat]), columns=level2_columns)
X_lev2_test = pd.DataFrame(np.column_stack([Y_test_xgb, Y_test_cat]), columns=level2_columns)

# The linear blender is fitted on the validation month and applied to the test month
model_level2 = LinearRegression()
model_level2.fit(X_lev2_train, Y_valid)
Y_answer = model_level2.predict(X_lev2_test).clip(0, 20)

# Submission file: test ID and predicted monthly count
submission = pd.DataFrame({"ID": test.index, "item_cnt_month": Y_answer})
submission.to_csv('submission.csv', index=False)
