In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import matplotlib.pyplot as plt
import seaborn as sns

import catboost
from catboost import CatBoostRegressor
from catboost import Pool, cv
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from itertools import product
import gc
import shap
shap.initjs()


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Read the datasets

In [None]:
# Load every competition table from the read-only Kaggle input directory.
items = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
sample_submission = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv')
item_categories = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv')
train = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')
shops = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')
# The test file's ID column becomes the index; it is reused as the submission ID at the end.
test = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv').set_index('ID')

Lets investigate a little bit!

In [None]:
items.head()
sample_submission.head()
item_categories.head()
train.head()
shops.head()
test.head()

Lets see more about the `train` dataset

In [None]:
train.head()
train.shape
train.isnull().sum()
train.info()
train.describe()

Lets see about `test` dataset

In [None]:
test.head()
test.shape
test.isnull().sum()
test.info()
test.describe()

Great, we don't have any NaNs! Now we will investigate the values of the columns in more detail.

In [None]:
sns.histplot(train, x="item_price")


The graph is heavily skewed to the right! Looks like we have some outliers! Lets dig deeper.

In [None]:
plt.figure(figsize=(10,4))
sns.boxplot(x=train["item_price"])


Yeah! we have some outliers! Let's see the rows with price greater than 100000

In [None]:
# Strictly greater than 100000, mirroring the `<= 100000` filter applied right after,
# so the rows displayed here are exactly the rows that get removed.
train_item_price_gt_100K = train[train['item_price'] > 100000]
train_item_price_gt_100K

Let's remove that point!

In [None]:
train = train[train['item_price'] <= 100000]

In [None]:
plt.figure(figsize=(10,4))
sns.boxplot(x=train["item_price"])

In [None]:
plt.xlim(0,10000)
plt.ylim(0, 50000)
sns.histplot(train, x="item_price")

Lets see more info about `item_cnt_day`

In [None]:
train_item_cnt_day_gt_1000 = train[train.item_cnt_day > 1000]
train_item_cnt_day_gt_1000

In [None]:
# Keep counts up to and including 1000: only rows with item_cnt_day > 1000 were inspected
# as outliers, so a strict `< 1000` would silently drop rows equal to 1000 as well.
train = train[train.item_cnt_day <= 1000]

In [None]:
plt.figure(figsize=(10,4))
sns.boxplot(x=train.item_cnt_day)

Let's dig deeper into `item_cnt_day`

In [None]:
train.item_cnt_day.value_counts()

We have negative values for `item_cnt_day`! it seems like they are error points!

In [None]:
train.item_cnt_day[train.item_cnt_day < 0].value_counts()

What is the percentage of these points of the total dataset?

In [None]:
print('% of error points {}'.format(round(100 * len(train[train.item_cnt_day < 0]) / len(train), 5)))

I'm thinking of removing these points but we may have some shops and products in the test set that are using the same rows! Lets see more.

In [None]:
train_item_cnt_day_lt_0 = train[train.item_cnt_day < 0]

In [None]:
# Share of test rows whose shop also appears among the negative-count rows.
# The mean of the boolean mask is a fraction, so scale by 100 to match the "%" label.
shop_overlap_pct = 100 * test.shop_id.isin(train_item_cnt_day_lt_0.shop_id).mean()
print('{} % of the test set exists in the shop ids of error points'.format(round(shop_overlap_pct, 2)))

Now! It's impossible to remove these points, but let's have another look at the `item_id`!

In [None]:
# Share of test rows whose item also appears among the negative-count rows.
# Scaled by 100 to match the "%" label; the message now names item ids, which is what is checked.
item_overlap_pct = 100 * test.item_id.isin(train_item_cnt_day_lt_0.item_id).mean()
print('{} % of the test set exists in the item ids of error points'.format(round(item_overlap_pct, 2)))

Lets see more about `item_price`

In [None]:
train[train.item_price < 0].value_counts()

Also, we have one point in `item_price` in negative! Now! Let's see the best imputation technique.

In [None]:
train_shop_id_eq_32_item_id_2973 = train[(train.shop_id==32) & (train.item_id==2973)]

In [None]:
sns.histplot(data=train_shop_id_eq_32_item_id_2973, x='item_price', bins=10)

I think `median` will be better!

In [None]:
# Impute the negative price with the median of the *valid* prices of the same shop/item.
# The negative value itself is excluded so it cannot bias the median it is replaced with.
valid_same_shop_item = (train.shop_id == 32) & (train.item_id == 2973) & (train.item_price >= 0)
train.loc[train.item_price < 0, 'item_price'] = train.loc[valid_same_shop_item, 'item_price'].median()

In [None]:
train[train.item_price < 0].value_counts()

Nice!

Let's fix some duplication issues in the `shop_id`. Find out more in this [kernel](https://www.kaggle.com/dlarionov/feature-engineering-xgboost?scriptVersionId=4396431&cellId=12)

In [None]:
# Merge shop ids that the referenced kernel identifies as duplicates, in both train and test.
# Yakutsk, Ordzhonikidze 56
train.loc[train.shop_id == 0, 'shop_id'] = 57
test.loc[test.shop_id == 0, 'shop_id'] = 57
# Yakutsk, "Tsentralny" shopping centre
train.loc[train.shop_id == 1, 'shop_id'] = 58
test.loc[test.shop_id == 1, 'shop_id'] = 58
# Zhukovsky, Chkalova St. 39 m²
train.loc[train.shop_id == 10, 'shop_id'] = 11
test.loc[test.shop_id == 10, 'shop_id'] = 11

Great! Lets see the `item_cnt_day` best imputation technique

In [None]:
shops_item_cnt_day_lt_0  = list(train_item_cnt_day_lt_0.shop_id.unique())
for i, shop in enumerate(shops_item_cnt_day_lt_0):
    plt.figure(i)
    plt.title('Shop #{}'.format(shop))
    sns.histplot(data=train[train.shop_id == shop], x = 'item_cnt_day')

I would go with `median` imputation. `mean` imputation here will be very unrealistic

In [None]:
# Replace negative daily counts with the median daily count of the same shop.
# The per-shop median is taken over all of that shop's rows (exactly as before), but it is
# computed once per shop with a grouped transform instead of re-filtering the whole frame
# for every negative row inside a Python loop.
shop_median_cnt = train.groupby('shop_id')['item_cnt_day'].transform('median')
train['item_cnt_day'] = train['item_cnt_day'].mask(train['item_cnt_day'] < 0, shop_median_cnt)

Lets convert `date` column in the `train` dataset

In [None]:
train.date = pd.to_datetime(train.date,  format='%d.%m.%Y')
# Lets see the max and minumum date
train.date.min()
train.date.max()
# Train shape
train.shape

No data leakage found. The data exactly matches the description!

Lets sort `train` according to date

In [None]:
train.sort_values(by=['date'], ascending=True, inplace=True)
train.tail()

In [None]:
train['month'] = train['date'].dt.month
train['days_in_month'] = train['date'].dt.daysinmonth

In [None]:
train.head()
train.tail()

Lets drop the `date` column as we will work monthly.

In [None]:
train.drop(['date'], inplace=True, axis = 1)

Now, lets set the data types for all columns for better memory use!

In [None]:
train.head()

In [None]:
# Target dtype per column; more entries are added to this dict further down.
dtypes_dict = {'date_block_num': np.int8,
               'shop_id': np.int8,
               'item_id': np.int16,
               'item_price': np.float32,
               # int16, not int8: daily counts of up to several hundred survive the outlier
               # filter, and int8 only holds -128..127, so larger values would wrap around.
               'item_cnt_day': np.int16,
               'month': np.int8,
               'days_in_month': np.int8
              }

In [None]:
train = train.astype(dtypes_dict)
train.info()

Lets see more about `shops` dataset

In [None]:
shops.head()
shops.tail()
shops.isnull().sum()

Of course, I don't understand Russian but lets try searching and translating some names.

The first word of the shop name is the city! Let's extract it as a new feature.

In [None]:
# Join the two-word city name into one token so that splitting on ' ' keeps it intact.
shops.loc[shops.shop_name == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'
# The city is taken as the first space-separated token of the shop name.
shops['city'] = shops['shop_name'].str.split(' ').map(lambda x: x[0])
# Normalise the variant with a leading '!' to the plain city name.
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])
# Only the id and the encoded city are kept for the later merge.
shops = shops[['shop_id','city_code']]

Lets see the `item_categories`

In [None]:
item_categories.head()
item_categories.tail()

`item_category_name` has the category and sub-category on it. 

In [None]:
item_categories['split'] = item_categories['item_category_name'].str.split('-')
item_categories['category'] = item_categories['split'].map(lambda x: x[0].strip())
item_categories['category_code'] = LabelEncoder().fit_transform(item_categories['category'])

In [None]:
# If the name has no second '-'-separated part, reuse the first part as the sub-category.
item_categories['sub_category'] = item_categories['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
item_categories['sub_category_code'] = LabelEncoder().fit_transform(item_categories['sub_category'])


In [None]:
# Final item_categories dataset
item_categories = item_categories[['item_category_id','category_code', 'sub_category_code']]

Lets see `items` dataset

In [None]:
items.head()
items.tail()

Lets drop the `item_name`

In [None]:
items.drop(['item_name'], axis=1, inplace=True)

Now, let's construct a `comb` dataframe that has all the combinations of `shop_id` and `item_id` for each month.

In [None]:
# For every month, take the cartesian product of the shops and items seen in that month.
cols = ['date_block_num','shop_id','item_id']
monthly_grids = []
for block in range(34):
    month_sales = train[train.date_block_num == block]
    grid = list(product([block], month_sales.shop_id.unique(), month_sales.item_id.unique()))
    monthly_grids.append(np.array(grid, dtype='int16'))

# Stack the monthly grids, downcast the key columns and order rows by month, shop, item.
comb = (
    pd.DataFrame(np.vstack(monthly_grids), columns=cols)
      .astype({'date_block_num': np.int8, 'shop_id': np.int8, 'item_id': np.int16})
      .sort_values(cols)
)

In [None]:
comb.head()
comb.shape

Now, let's calculate the `revenue`

In [None]:
train['revenue'] = train['item_price'] *  train['item_cnt_day']

Now, let's construct a monthly dataset by grouping `train` and then merging it with `comb`.

In [None]:
# Monthly sales per (month, shop, item): sum of the daily counts.
group = (
    train.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day']
         .sum()
         .rename('item_cnt_month')
         .reset_index()
)

# Grid rows without any sale get 0; the target is clipped to [0, 20] here.
comb = pd.merge(comb, group, on=cols, how='left')
monthly_cnt = comb['item_cnt_month'].fillna(0)
monthly_cnt = monthly_cnt.clip(0,20) # NB clip target here
comb['item_cnt_month'] = monthly_cnt.astype(np.float16)

In [None]:
comb.head()

In [None]:
comb.shape

This is perfect!

Now, lets append the `test` set to add features to all of the data.

In [None]:
test['date_block_num'] = 34
test['date_block_num'] = test['date_block_num'].astype(np.int8)
test['shop_id'] = test['shop_id'].astype(np.int8)
test['item_id'] = test['item_id'].astype(np.int16)

In [None]:
# Append the test month below the training grid.
# `keys=` is omitted: it labels the concatenated pieces (not columns) and has no effect
# together with ignore_index=True.
comb = pd.concat([comb, test], ignore_index=True, sort=False)
# The test rows have no target yet, so their item_cnt_month is filled with 0.
comb = comb.fillna(0)

In [None]:
comb.head()
comb.tail()

Lets merge `items`, `item_categories`, and `shops` to the `comb`.

In [None]:
comb = pd.merge(comb, shops, on=['shop_id'], how='left')
comb = pd.merge(comb, items, on=['item_id'], how='left')
comb = pd.merge(comb, item_categories, on=['item_category_id'], how='left')
# Lets see the dataframe now
comb.head()
comb.shape
comb.info()

In [None]:
# Lets update the dtypes_dict
dtypes_dict['city_code'] = np.int8
dtypes_dict['item_category_id'] = np.int8
dtypes_dict['category_code'] = np.int8
dtypes_dict['sub_category_code'] = np.int8
dtypes_dict['item_cnt_month'] = np.int8

In [None]:
dtypes_dict

In [None]:
comb = comb.astype(dict((k, dtypes_dict[k]) for k in comb.columns))
comb.info()

Lets create some lag features for the target!

In [None]:
def create_lag_features(df, time_feature, keys, lags, on):
    """
    Add lagged copies of one or more columns to `df`.

    For every lag `k` and every column `c` in `on`, a column `c_lag_k` is added that holds
    the value `c` had `k` time steps earlier for the same `keys` combination. Rows with no
    matching earlier record get NaN (left merge).

    df : pd.DataFrame, frame to add the lag columns to (not modified; a new frame is returned)
    time_feature : str, name of the time column that is shifted; it must be one of `keys`
    keys : list, columns identifying a record (including `time_feature`) used as merge keys
    lags : list, number of time steps to shift by, one set of new columns per entry
    on : list, names of the columns to lag
    """
    base = df[keys + on]
    for lag in lags:
        lagged_names = {col: col + '_lag_' + str(lag) for col in on}
        # Moving the time key forward by `lag` makes a record from t line up with t + lag.
        shifted = base.rename(columns=lagged_names).assign(
            **{time_feature: base[time_feature] + lag}
        )
        df = pd.merge(df, shifted, on=keys, how='left')
    return df

In [None]:
comb = create_lag_features(df = comb, keys=['date_block_num','shop_id','item_id'], time_feature = 'date_block_num',lags = [1, 2, 3, 6, 12],  on = ['item_cnt_month'])

In [None]:
# Downcast every lagged target column to float16 to save memory.
for lag in (1, 2, 3, 6, 12):
    lag_col = 'item_cnt_month_lag_' + str(lag)
    comb[lag_col] = comb[lag_col].astype(np.float16)


comb.head()
comb.info()

Lets create some `Mean Encoded Features`

Lets create `avg_item_cnt_per_month` with some lag features

In [None]:
group = comb.groupby(['date_block_num']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'avg_item_cnt_per_month' ]
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_per_month'] = comb['avg_item_cnt_per_month'].astype(np.float16)
comb = create_lag_features(df = comb, keys=['date_block_num','shop_id','item_id'], time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_per_month'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_per_month'])
comb.head()

Now, lets create `avg_item_cnt_per_month_per_shop`

In [None]:
group = comb.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'avg_item_cnt_per_month_per_shop' ]
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num','shop_id'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_per_month_per_shop'] = comb['avg_item_cnt_per_month_per_shop'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1,2,3,6,12],  on = ['avg_item_cnt_per_month_per_shop'])
# Keep only the lagged versions: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_per_month_per_shop'])
comb.head()

In [None]:
comb.info()

Now, we will create `avg_item_cat_id_item_cnt_per_month`

In [None]:
group = comb.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'avg_item_cat_id_item_cnt_per_month' ]
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num','item_category_id'], how='left')
comb.head()

In [None]:
comb['avg_item_cat_id_item_cnt_per_month'] = comb['avg_item_cat_id_item_cnt_per_month'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cat_id_item_cnt_per_month'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cat_id_item_cnt_per_month'])
comb.head()

Now, will create `avg_item_cnt_month_per_shop_per_item_cat`

In [None]:
group = comb.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['avg_item_cnt_month_per_shop_per_item_cat']
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
comb.head()


In [None]:
comb['avg_item_cnt_month_per_shop_per_item_cat'] = comb['avg_item_cnt_month_per_shop_per_item_cat'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_month_per_shop_per_item_cat'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_month_per_shop_per_item_cat'])
comb.head()

Now, will create `avg_item_cnt_month_per_cat_per_shop`

In [None]:
group = comb.groupby(['date_block_num', 'shop_id', 'category_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['avg_item_cnt_month_per_cat_per_shop']
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num', 'shop_id', 'category_code'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_month_per_cat_per_shop'] = comb['avg_item_cnt_month_per_cat_per_shop'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_month_per_cat_per_shop'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_month_per_cat_per_shop'])
comb.head()

In [None]:
comb.info()

Now, lets calculate `avg_item_cnt_month_per_sub_cat_per_shop`

In [None]:
group = comb.groupby(['date_block_num', 'shop_id', 'sub_category_code']).agg({'item_cnt_month': ['mean']})
group.columns = ['avg_item_cnt_month_per_sub_cat_per_shop']
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num', 'shop_id', 'sub_category_code'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_month_per_sub_cat_per_shop'] = comb['avg_item_cnt_month_per_sub_cat_per_shop'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_month_per_sub_cat_per_shop'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_month_per_sub_cat_per_shop'])
comb.head()

Lets calculate, `avg_item_cnt_month_per_city`

In [None]:
group = comb.groupby(['date_block_num', 'city_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'avg_item_cnt_month_per_city' ]
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num', 'city_code'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_month_per_city'] = comb['avg_item_cnt_month_per_city'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_month_per_city'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_month_per_city'])
comb.head()

Lets calculate, `avg_item_cnt_month_per_item_per_city`

In [None]:
group = comb.groupby(['date_block_num', 'item_id', 'city_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'avg_item_cnt_month_per_item_per_city' ]
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num', 'item_id', 'city_code'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_month_per_item_per_city'] = comb['avg_item_cnt_month_per_item_per_city'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_month_per_item_per_city'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_month_per_item_per_city'])
comb.head()

`avg_item_cnt_month_per_cat_per_month`

In [None]:
group = comb.groupby(['date_block_num', 'category_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'avg_item_cnt_month_per_cat_per_month' ]
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num', 'category_code'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_month_per_cat_per_month'] = comb['avg_item_cnt_month_per_cat_per_month'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_month_per_cat_per_month'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_month_per_cat_per_month'])
comb.head()

In [None]:
del train_item_cnt_day_gt_1000, train_item_cnt_day_lt_0, train_item_price_gt_100K, train_shop_id_eq_32_item_id_2973, shops, item_categories, group
gc.collect()

`avg_item_cnt_month_per_sub_cat_per_month`

In [None]:
group = comb.groupby(['date_block_num', 'sub_category_code']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'avg_item_cnt_month_per_sub_cat_per_month' ]
group.reset_index(inplace=True)
comb = pd.merge(comb, group, on=['date_block_num', 'sub_category_code'], how='left')
comb.head()

In [None]:
comb['avg_item_cnt_month_per_sub_cat_per_month'] = comb['avg_item_cnt_month_per_sub_cat_per_month'].astype(np.float16)
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['avg_item_cnt_month_per_sub_cat_per_month'])
# Keep only the lagged version: the un-lagged mean is computed from the same month's target
# (and from the zero-filled targets of the test month), so using it as a feature leaks the label.
comb = comb.drop(columns=['avg_item_cnt_month_per_sub_cat_per_month'])
comb.head()

In [None]:
# import sys
# def sizeof_fmt(num, suffix='B'):
#     ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
#     for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
#         if abs(num) < 1024.0:
#             return "%3.1f %s%s" % (num, unit, suffix)
#         num /= 1024.0
#     return "%.1f %s%s" % (num, 'Yi', suffix)

# for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
#                          key= lambda x: -x[1])[:10]:
#     print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
# del group, items, sales
# gc.collect()

In [None]:
# comb.to_pickle('/kaggle/input/competitive-data-science-predict-future-sales/comb.pkl')

In [None]:
# del comb
# gc.collect()

In [None]:
# comb = pd.read_pickle('/kaggle/input/competitive-data-science-predict-future-sales/comb.pkl')
# comb.head()

In [None]:
# Mean price of each item over the whole training period.
group = train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['avg_item_price']
group.reset_index(inplace=True)

comb = pd.merge(comb, group, on=['item_id'], how='left')
# float32 rather than float16: prices up to 100000 are kept in `train`, while float16
# tops out at 65504 (larger values become inf) and resolves large prices only coarsely.
comb['avg_item_price'] = comb['avg_item_price'].astype(np.float32)

In [None]:
comb.head()

In [None]:
# Mean price of each item within each month.
group = train.groupby(['date_block_num','item_id']).agg({'item_price': ['mean']})
group.columns = ['avg_item_price_per_month']
group.reset_index(inplace=True)

comb = pd.merge(comb, group, on=['date_block_num','item_id'], how='left')
# float32 rather than float16: prices up to 100000 are kept in `train`, while float16
# tops out at 65504 (larger values become inf) and resolves large prices only coarsely.
comb['avg_item_price_per_month'] = comb['avg_item_price_per_month'].astype(np.float32)
comb.head()

In [None]:
lags = [1,2,3,4,5,6]
comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = lags,  on = ['avg_item_price_per_month'])
comb.head()

In [None]:
for i in lags:
    comb['delta_price_lag_'+str(i)] = \
        (comb['avg_item_price_per_month_lag_'+str(i)] - comb['avg_item_price']) / comb['avg_item_price']

In [None]:
def select_trend(row, trend_lags=None):
    """
    Return the most recent usable price delta of a row.

    The `delta_price_lag_<i>` entries are scanned in the order given by `trend_lags` and the
    first one that is neither missing nor zero is returned; 0 is returned if there is none.

    row : pd.Series (or any mapping) holding the `delta_price_lag_<i>` values
    trend_lags : iterable of lag numbers to scan; defaults to the notebook-level `lags`
    """
    if trend_lags is None:
        trend_lags = lags
    for i in trend_lags:
        delta = row['delta_price_lag_' + str(i)]
        # NaN is truthy in Python, so a bare truth test would return a missing lag
        # instead of falling through to the next available one.
        if pd.notna(delta) and delta != 0:
            return delta
    return 0

In [None]:
# Pick one price-trend value per row, store it as float16 and treat "no trend" as 0.
# The result is assigned back instead of calling fillna(inplace=True) on the selected column,
# which acts on an intermediate object and is not guaranteed to update `comb`.
comb['delta_price_lag'] = (
    comb.apply(select_trend, axis=1)
        .astype(np.float16)
        .fillna(0)
)
comb.head()

In [None]:
fetures_to_drop = ['avg_item_price', 'avg_item_price_per_month']
for i in lags:
    fetures_to_drop += ['avg_item_price_per_month_lag_'+str(i)]
    fetures_to_drop += ['delta_price_lag_'+str(i)]

comb.drop(fetures_to_drop, axis=1, inplace=True)
comb.head()

`sum_revenue_per_month_per_shop`

In [None]:
group = train.groupby(['date_block_num','shop_id']).agg({'revenue': ['sum']})
group.columns = ['sum_revenue_per_month_per_shop']
group.reset_index(inplace=True)

comb = pd.merge(comb, group, on=['date_block_num','shop_id'], how='left')
comb['sum_revenue_per_month_per_shop'] = comb['sum_revenue_per_month_per_shop'].astype(np.float32)
comb.head()

`avg_shop_revenue`

In [None]:
group = group.groupby(['shop_id']).agg({'sum_revenue_per_month_per_shop': ['mean']})
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

comb = pd.merge(comb, group, on=['shop_id'], how='left')
comb['shop_avg_revenue'] = comb['shop_avg_revenue'].astype(np.float32)



In [None]:
# Relative deviation of the shop's monthly revenue from its average monthly revenue.
comb['delta_revenue'] = (comb['sum_revenue_per_month_per_shop'] - comb['shop_avg_revenue']) / comb['shop_avg_revenue']
comb['delta_revenue'] = comb['delta_revenue'].astype(np.float16)

comb = create_lag_features(df = comb, keys=cols, time_feature = 'date_block_num',lags = [1],  on = ['delta_revenue'])
# Keep only the lagged delta: the current-month revenue columns are built from the same
# month's sales (the target) and have no value for the test month, which is absent from `train`.
comb = comb.drop(columns=['sum_revenue_per_month_per_shop', 'shop_avg_revenue', 'delta_revenue'])
comb.head()

Now, lets add the `month` and `days_in_month` features

In [None]:
# Build date_block_num -> calendar month from the (block, month) pairs found in train.
month_dict = dict(train[['date_block_num', 'month']].values)
# Block 34 (the test month) does not occur in train, so its calendar month is set by hand.
month_dict[34] = 11
# Build month -> days_in_month; when a month occurs with several values, the last row wins.
days_in_month_dict = dict(train[['month', 'days_in_month']].values)
comb['month'] = comb.date_block_num.map(month_dict)
comb['days_in_month'] = comb.month.map(days_in_month_dict)
comb.head()

`item_shop_last_sale`

In [None]:
# Months since the previous month in which this (item, shop) pair had sales; -1 if there is none.
# Relies on the rows of `comb` being ordered by date_block_num (it was sorted on it when built
# and the test month was appended last).
cache = {}  # (item_id, shop_id) -> date_block_num of the latest month with sales seen so far
item_shop_last_sale = np.full(len(comb), -1, dtype=np.int8)
pair_rows = zip(comb['item_id'].values, comb['shop_id'].values,
                comb['date_block_num'].values, comb['item_cnt_month'].values)
for pos, (item_id, shop_id, block, cnt) in enumerate(pair_rows):
    key = (item_id, shop_id)
    if key in cache:
        item_shop_last_sale[pos] = block - cache[key]
    # Only a month with sales moves the "last sale" marker. Advancing it on every row would
    # make the feature measure the gap to the previous grid row instead of the previous sale.
    if cnt != 0:
        cache[key] = block
comb['item_shop_last_sale'] = item_shop_last_sale

comb.head()

`item_last_sale`

In [None]:
# Months since the previous month in which this item had sales in any shop; -1 if there is none.
# Relies on the rows of `comb` being ordered by date_block_num (it was sorted on it when built
# and the test month was appended last).
item_last_sold = {}      # item_id -> latest *earlier* month with sales
sold_this_month = {}     # sales seen in the month currently being scanned
current_block = None
item_last_sale = np.full(len(comb), -1, dtype=np.int8)
item_rows = zip(comb['item_id'].values, comb['date_block_num'].values, comb['item_cnt_month'].values)
for pos, (item_id, block, cnt) in enumerate(item_rows):
    if block != current_block:
        # A new month starts: sales of the finished month now count as "earlier" sales.
        # Committing them only here gives every shop's row of the same item and month the
        # same value, instead of only the first row encountered.
        item_last_sold.update(sold_this_month)
        sold_this_month = {}
        current_block = block
    if item_id in item_last_sold:
        item_last_sale[pos] = block - item_last_sold[item_id]
    if cnt != 0:
        sold_this_month[item_id] = block
comb['item_last_sale'] = item_last_sale
comb.head()

`item_shop_first_sale` and `item_first_sale`

In [None]:
# Earliest month in which each (item, shop) pair / each item appears in `comb`.
first_block_of_pair = comb.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
first_block_of_item = comb.groupby('item_id')['date_block_num'].transform('min')
# Months elapsed since that first appearance.
comb['item_shop_first_sale'] = comb['date_block_num'] - first_block_of_pair
comb['item_first_sale'] = comb['date_block_num'] - first_block_of_item
comb.head()

Because we have generated lags up to 12 month, we have a lot of NaNs so, I will keep only the data for more than 11 months.

In [None]:
comb = comb[comb.date_block_num > 11]

In [None]:
comb.head()

In [None]:
def fill_na(df):
    """
    Replace NaNs with 0 in the lagged item-count columns.

    Only columns whose name contains both '_lag_' and 'item_cnt' are filled; all other
    columns (including other lag features) keep their NaNs.

    df : pd.DataFrame
    Returns a new DataFrame; `df` itself is left unchanged. This avoids calling
    fillna(inplace=True) on a selected column, which is unreliable on a sliced frame.
    """
    lag_count_cols = [col for col in df.columns if '_lag_' in col and 'item_cnt' in col]
    if not lag_count_cols:
        return df
    return df.fillna({col: 0 for col in lag_count_cols})

In [None]:
comb = fill_na(comb)
comb.head()

In [None]:
# Time-based split: blocks before 33 train the model, block 33 validates it, block 34 is the test month.
is_train = comb.date_block_num < 33
is_valid = comb.date_block_num == 33
is_test = comb.date_block_num == 34

X_train = comb.loc[is_train].drop(columns=['item_cnt_month'])
Y_train = comb.loc[is_train, 'item_cnt_month']
X_valid = comb.loc[is_valid].drop(columns=['item_cnt_month'])
Y_valid = comb.loc[is_valid, 'item_cnt_month']
X_test = comb.loc[is_test].drop(columns=['item_cnt_month'])

In [None]:
# Gradient-boosted regressor trained on RMSE with early stopping.
# NOTE: this rebinds the name `catboost`, shadowing the `catboost` module imported in the
# first cell; the following cells refer to the model through this name.
catboost = CatBoostRegressor(iterations=1000, 
                            learning_rate=0.1, 
                            max_depth= 8, 
                            loss_function='RMSE', 
                            early_stopping_rounds=10, 
                            verbose=True, 
                            )

In [None]:
catboost.fit(X= X_train, 
             y=Y_train, 
            eval_set=(X_valid, Y_valid ),
            verbose=True, 
            plot = True)

In [None]:
mean_squared_error(Y_valid, catboost.predict(X_valid).clip(0,20))**0.5

In [None]:
y_pred = catboost.predict(X_test).clip(0,20)

In [None]:
def plot_feature_importance(importance,names,model_type):
    """
    Draw a horizontal bar chart of feature importances, most important feature first.

    importance : array-like, one importance score per feature
    names : array-like, feature names aligned with `importance`
    model_type : str, model label used as the prefix of the chart title
    """
    # Pair names with scores and order them by decreasing importance
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    # Define size of bar plot
    plt.figure(figsize=(12,10))
    # Plot Seaborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])
    # Add chart labels; the separating space keeps the model label from running into the title
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
plot_feature_importance(catboost.feature_importances_, X_train.columns,'CATBOOST')


In [None]:
# Pair each test ID (the index of `test`) with its prediction and write the submission file.
# NOTE(review): assumes y_pred is in the same row order as `test` — confirm that no step
# between appending `test` to `comb` and building X_test reorders the rows.
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": y_pred
})
submission.to_csv('catboost_submission.csv', index=False)
