### Introduction

The goal of the competition is to predict one month of future item sales for each store of the company '1C', given historical sales data.

#### Libraries and data

In [None]:
import gc
import re #regular expressions
import os
import time
import pickle 
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product
from xgboost import XGBRegressor, plot_importance
from matplotlib.pylab import rcParams
from sklearn.preprocessing import LabelEncoder

# Global plotting defaults: seaborn dark grid and a wide, short default figure.
sns.set(style="darkgrid")
rcParams.update({'figure.figsize': (12, 4)})

In [None]:
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Reference tables: items, shops and item categories.
items = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/items.csv'
)
shops = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/shops.csv'
)
cats = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv'
)

# Daily sales history and the shop/item pairs to predict.
train = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv'
)
test = pd.read_csv(
    '/kaggle/input/competitive-data-science-predict-future-sales/test.csv'
)

## 1. Data Cleaning

### Outliers

In [None]:
# Marker style shared by both box plots so that outliers stand out.
flierprops = dict(marker='o', markerfacecolor='green', markersize=6,
                  linestyle='none', markeredgecolor='black')

# Distribution of the daily sold quantity.
cnt_fig, cnt_ax = plt.subplots(figsize=(10, 4))
cnt_ax.set_xlim(-100, 3000)
sns.boxplot(x=train.item_cnt_day, flierprops=flierprops, ax=cnt_ax)

# Distribution of the item price, with 10% head-room on the right.
price_fig, price_ax = plt.subplots(figsize=(10, 4))
price_ax.set_xlim(train.item_price.min(), train.item_price.max()*1.1)
sns.boxplot(x=train.item_price, flierprops=flierprops, ax=price_ax)

Remove outliers, choosing thresholds visually: rows where 1000 or more items were sold in one day, and rows with an item price of 300,000 or more.

In [None]:
# Keep only rows with a positive price below 300000 and fewer than 1000
# items sold in a day; renumber the index afterwards.
price_ok = (train['item_price'] > 0) & (train['item_price'] < 300000)
count_ok = train['item_cnt_day'] < 1000
train = train[price_ok & count_ok].reset_index(drop=True)

# Negative daily counts are zeroed rather than dropped.
negative_count = train['item_cnt_day'] < 0
train.loc[negative_count, 'item_cnt_day'] = 0

Also remove rows with a non-positive price and set negative `item_cnt_day` values to zero.

### Shop Dataframe Cleaning

Several shop entries look like the same store recorded under different IDs for different periods, so they are merged into a single ID.

In [None]:
# Duplicate shop id -> id that is kept. No target id is itself a source id,
# so a single dictionary replacement is equivalent to sequential updates.
duplicate_shops = {0: 57, 1: 58, 10: 11}
train['shop_id'] = train['shop_id'].replace(duplicate_shops)
test['shop_id'] = test['shop_id'].replace(duplicate_shops)

Change some shop names and add 'city' and 'category' columns to dataframe.

In [None]:
# Join the two-word city name so that splitting on spaces keeps it together.
two_word_city = shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"'
shops.loc[two_word_city, 'shop_name'] = 'СергиевПосад ТЦ "7Я"'

# First token of the shop name is the city, second token the shop category.
name_parts = shops['shop_name'].str.split(' ')
shops['city'] = name_parts.map(lambda parts: parts[0])
shops['category'] = name_parts.map(lambda parts: parts[1])

# Strip the stray leading '!' from one city spelling.
shops.loc[shops['city'] == '!Якутск', 'city'] = 'Якутск'

Use only large enough categories

In [None]:
# Keep shop categories that occur in more than 4 shops; everything else
# is collapsed into 'other'.
shop_category = shops['category']
categories = [
    categ
    for categ in shop_category.unique()
    if (shop_category == categ).sum() > 4
]
shops['category'] = shop_category.where(shop_category.isin(categories), 'other')

In [None]:
# Integer-encode the textual shop category and city, then keep only the
# id plus the two encoded columns.
for encoded_col, source_col in (('shop_category', 'category'), ('shop_city', 'city')):
    shops[encoded_col] = LabelEncoder().fit_transform(shops[source_col])
shops = shops.loc[:, ['shop_id', 'shop_category', 'shop_city']]
#shops

### Item Categories Data Cleaning

In [None]:
# The first word of the category name is used as the coarse category type.
cats['type_code'] = (
    cats['item_category_name']
    .apply(
        lambda x: x.split(' ')[0]
    )
    .astype(str)
)

# Fold game consoles ('Игровые') and accessories ('Аксессуары') into the
# games type. The original wrote this value to a separate 'category' column
# that is dropped later, so the merge never reached 'type_code'.
cats.loc[
    cats['type_code'].isin(['Игровые', 'Аксессуары']),
    'type_code'
] = 'Игры'
#cats.head()

In [None]:
# Keep category types shared by more than 4 categories; the rest become 'etc'.
type_code = cats['type_code']
categories = [
    categ
    for categ in type_code.unique()
    if (type_code == categ).sum() > 4
]
cats['type_code'] = type_code.where(type_code.isin(categories), 'etc')

In [None]:
# Integer-encode the category type.
cats['type_code'] = LabelEncoder().fit_transform(cats['type_code'])

# The subtype is the text after the first '-' in the category name, or the
# whole name when there is no '-'.
cats['split'] = cats['item_category_name'].map(lambda name: name.split('-'))
cats['subtype'] = cats['split'].map(
    lambda parts: parts[1 if len(parts) >= 2 else 0].strip()
)
cats['subtype_code'] = LabelEncoder().fit_transform(cats['subtype'])

# Only the id and the two encoded columns are needed downstream.
cats = cats.loc[:, ['item_category_id', 'subtype_code', 'type_code']]

### Item Data Cleaning

In [None]:
def name_correction(x):
    """Normalise an item name to its plain-text core.

    The name is lower-cased, cut at the first '[' and then at the first '(',
    every run of non-word characters is replaced by a single space, and
    surrounding whitespace is removed.

    Parameters
    ----------
    x : str
        Raw item name.

    Returns
    -------
    str
        Cleaned item name.
    """
    x = x.lower()
    x = x.partition('[')[0]  # drop everything from the first square bracket
    x = x.partition('(')[0]  # drop everything from the first parenthesis
    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    x = re.sub(r'\W+', ' ', x)
    x = x.replace('  ', ' ')  # collapse any remaining double spaces
    return x.strip()

Clean item names

In [None]:
# Split item names at the first square bracket and at the first parenthesis.
# name1 is the text before the first '(', name2 the text after the first '['
# and name3 the text after the first '('. The parts are selected explicitly
# because tuple-unpacking the `.str` accessor and passing `n` positionally
# are not supported by pandas 2.x.
bracket_parts = items['item_name'].str.split('[', n=1)
paren_parts = items['item_name'].str.split('(', n=1)
items['name1'] = paren_parts.str.get(0)
items['name2'] = bracket_parts.str.get(1)
items['name3'] = paren_parts.str.get(1)

# Replace runs of non-word characters with a space and lower-case the text.
# regex=True is explicit because pandas 2.x treats the pattern literally by default.
items['name2'] = items['name2'].str.replace(r'\W+', ' ', regex=True).str.lower()
items['name3'] = items['name3'].str.replace(r'\W+', ' ', regex=True).str.lower()

# Names without a bracket / parenthesis have no name2 / name3: mark them '0'.
items = items.fillna('0')

items['item_name'] = items['item_name'].apply(name_correction)

# Drop the last character of name2 unless it is the '0' placeholder.
# NOTE(review): presumably this removes the space left by the closing
# bracket after the replacement above - confirm for names whose bracket
# is not at the very end.
items['name2'] = items['name2'].apply(lambda x: x[:-1] if x != '0' else '0')

Clean item type

In [None]:
# Derive a coarse 'type' from name2: the first 8 characters when the first
# word is 'xbox', otherwise the first word.
items['type'] = (
    items['name2']
    .apply(
        lambda x: x[0:8] if x.split(' ')[0] == 'xbox' else x.split(' ')[0]
    )
)

# Unify the different Xbox 360 spellings.
items.loc[
    (items['type'] == 'x360') |
    (items['type'] == 'xbox360') |
    (items['type'] == 'xbox 360'),
    'type'
] = 'xbox 360'
# An empty type is labelled 'mac'.
items.loc[items['type'] == '', 'type'] = 'mac'
# Remove all spaces from the type label (e.g. 'xbox 360' -> 'xbox360').
items.type = (
    items['type']
    .apply(
        lambda x: x.replace(' ', '')
    )
)
# NOTE(review): the literals below look alike but appear to mix Latin and
# Cyrillic letters; verify the actual code points before editing them.
items.loc[
    (items['type'] == 'pc' ) |
    (items['type'] == 'pс') |
    (items['type'] == 'pс'),
    'type'
] = 'pс'

# NOTE(review): presumably maps a look-alike spelling to the canonical one;
# if both literals are the same code points this line is a no-op - confirm.
items.loc[items['type'] == 'рs3' , 'type'] = 'рs3'

In [None]:
# Number of items per type.
group_sum = items.groupby('type')['item_id'].count().reset_index()

# Types with 39 items or fewer are considered rare.
drop_cols = [
    categ
    for categ, n_items in zip(group_sum['type'], group_sum['item_id'])
    if n_items <= 39
]

# name2 values that equal a rare type label are replaced by 'other';
# the helper 'type' column is not kept.
items['name2'] = items['name2'].where(~items['name2'].isin(drop_cols), 'other')
items = items.drop(columns=['type'])

In [None]:
# Integer-encode the two bracket-derived name parts.
for name_col in ('name2', 'name3'):
    items[name_col] = LabelEncoder().fit_transform(items[name_col])

# The raw text columns are no longer needed.
items = items.drop(columns=['item_name', 'name1'])
#items.head()

## 2. Data preparation & Feature Engineering

Create a matrix dataframe with one row for every month, shop and item ID combination, so that the daily data can be aggregated to monthly data: `item_cnt_day` is summed up into `item_cnt_month`.

In [None]:
cols  = ['date_block_num', 'shop_id', 'item_id']

# For each of the 34 month blocks, enumerate every (month, shop, item)
# combination of the shops and items that had sales rows in that month.
monthly_grids = []
for block_num in range(34):
    month_sales = train[train['date_block_num'] == block_num]
    combos = product(
        [block_num],
        month_sales['shop_id'].unique(),
        month_sales['item_id'].unique()
    )
    monthly_grids.append(np.array(list(combos), dtype=np.int16))

# Stack the monthly grids into one frame, shrink the dtypes and sort by key.
matrix = (
    pd.DataFrame(np.vstack(monthly_grids), columns=cols)
    .astype({
        'date_block_num': np.int8,
        'shop_id': np.int8,
        'item_id': np.int16
    })
    .sort_values(cols)
)

In [None]:
# Revenue of each sales row: units sold times unit price.
train['revenue'] = train['item_cnt_day'].mul(train['item_price'])

In [None]:
# Monthly sold quantity per (month, shop, item).
group = (
    train
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# Attach to the grid; combinations without sales rows get a count of 0.
matrix = matrix.merge(group, on=cols, how='left')
matrix['item_cnt_month'] = matrix['item_cnt_month'].fillna(0).astype(np.float16)

In [None]:
# The test rows are treated as month block 34, with the same compact key
# dtypes as the training grid.
test = (
    test
    .assign(date_block_num=34)
    .astype({
        'date_block_num': np.int8,
        'shop_id': np.int8,
        'item_id': np.int16
    })
)

In [None]:
# Append the test rows (without their ID column) below the training grid.
# `keys` is not passed: the original gave three keys for two frames, and
# with ignore_index=True keys have no effect on the result anyway.
matrix = pd.concat(
    [matrix, test.drop(columns=['ID'])],
    ignore_index=True, sort=False
)
# Test rows have no item_cnt_month yet; fill it (and any other gap) with 0.
matrix = matrix.fillna(0)

In [None]:
# Attach the encoded shop, item and category attributes to every row and
# store them with compact integer dtypes.
matrix = (
    matrix
    .merge(shops, on='shop_id', how='left')
    .merge(items, on='item_id', how='left')
    .merge(cats, on='item_category_id', how='left')
    .astype({
        'shop_city': np.int8,
        'shop_category': np.int8,
        'item_category_id': np.int8,
        'subtype_code': np.int8,
        'name2': np.int8,
        'name3': np.int16,
        'type_code': np.int8
    })
)

Feature Engineering. Add lag features to the matrix.

In [None]:
def lag_feature(df, lags, cols):
    """Append lagged copies of the given columns to ``df``.

    For every column in ``cols`` and every lag ``i`` in ``lags`` a column
    ``<col>_lag_<i>`` is added. It holds the value the same
    (shop_id, item_id) pair had ``i`` month blocks earlier, or NaN when no
    such row exists.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'date_block_num', 'shop_id', 'item_id' and every
        column named in ``cols``.
    lags : iterable of int
        Month offsets to look back.
    cols : iterable of str
        Columns to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns appended.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    for col in cols:
        # Snapshot taken before any lag of this column is merged in.
        base = df[keys + [col]]
        for lag in lags:
            lagged = base.rename(columns={col: col + "_lag_" + str(lag)})
            # Moving the month forward by `lag` makes the row at month m
            # pick up the value recorded at month m - lag.
            lagged['date_block_num'] = lagged['date_block_num'] + lag
            df = pd.merge(df, lagged, on=keys, how='left')
    return df

In [None]:
# Sales of the same shop/item pair one, two and three months earlier.
target_lags = [1, 2, 3]
matrix = lag_feature(matrix, target_lags, ['item_cnt_month'])

In [None]:
# Previous month's mean item_cnt_month over all rows of the matrix.
group = (
    matrix
    .groupby('date_block_num')['item_cnt_month']
    .mean()
    .rename('date_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on='date_block_num', how='left')
matrix['date_avg_item_cnt'] = matrix['date_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], ['date_avg_item_cnt'])
# Only the lagged value is kept as a feature.
matrix = matrix.drop(columns=['date_avg_item_cnt'])

In [None]:
# Mean item_cnt_month per (month, item), lagged by 1-3 months.
group = (
    matrix
    .groupby(['date_block_num', 'item_id'])['item_cnt_month']
    .mean()
    .rename('date_item_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'item_id'], how='left')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], ['date_item_avg_item_cnt'])
# Only the lagged values are kept as features.
matrix = matrix.drop(columns=['date_item_avg_item_cnt'])

In [None]:
# Mean item_cnt_month per (month, shop), lagged by 1-3 months.
group = (
    matrix
    .groupby(['date_block_num', 'shop_id'])['item_cnt_month']
    .mean()
    .rename('date_shop_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id'], how='left')
matrix['date_shop_avg_item_cnt'] = matrix['date_shop_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], ['date_shop_avg_item_cnt'])
# Only the lagged values are kept as features.
matrix = matrix.drop(columns=['date_shop_avg_item_cnt'])

In [None]:
# Mean item_cnt_month per (month, shop, item), lagged by 1-3 months.
# NOTE(review): if every (month, shop, item) row is unique in the matrix this
# mean equals item_cnt_month itself and duplicates its lag features - confirm.
group = (
    matrix
    .groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_month']
    .mean()
    .rename('date_shop_item_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id', 'item_id'], how='left')
matrix['date_shop_item_avg_item_cnt'] = matrix['date_shop_item_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1, 2, 3], ['date_shop_item_avg_item_cnt'])
# Only the lagged values are kept as features.
matrix = matrix.drop(columns=['date_shop_item_avg_item_cnt'])

In [None]:
# Mean item_cnt_month per (month, shop, item subtype), lagged by one month.
group = (
    matrix
    .groupby(['date_block_num', 'shop_id', 'subtype_code'])['item_cnt_month']
    .mean()
    .rename('date_shop_subtype_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id', 'subtype_code'], how='left')
matrix['date_shop_subtype_avg_item_cnt'] = matrix['date_shop_subtype_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], ['date_shop_subtype_avg_item_cnt'])
# Only the lagged value is kept as a feature.
matrix = matrix.drop(columns=['date_shop_subtype_avg_item_cnt'])

In [None]:
# Mean item_cnt_month per (month, city), lagged by one month.
group = (
    matrix
    .groupby(['date_block_num', 'shop_city'])['item_cnt_month']
    .mean()
    .rename('date_city_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'shop_city'], how='left')
matrix['date_city_avg_item_cnt'] = matrix['date_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], ['date_city_avg_item_cnt'])
# Only the lagged value is kept as a feature.
matrix = matrix.drop(columns=['date_city_avg_item_cnt'])

In [None]:
# Mean item_cnt_month per (month, item, city), lagged by one month.
group = (
    matrix
    .groupby(['date_block_num', 'item_id', 'shop_city'])['item_cnt_month']
    .mean()
    .rename('date_item_city_avg_item_cnt')
    .reset_index()
)

matrix = matrix.merge(group, on=['date_block_num', 'item_id', 'shop_city'], how='left')
matrix['date_item_city_avg_item_cnt'] = matrix['date_item_city_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], ['date_item_city_avg_item_cnt'])
# Only the lagged value is kept as a feature.
matrix = matrix.drop(columns=['date_item_city_avg_item_cnt'])

* Add the average item price to the matrix.
* Add lag values of the item price per month.
* Add delta price values - how the monthly average price relates to the item's global average price.

In [None]:
# Average price of each item over the whole training period.
group = (
    train
    .groupby('item_id')
    .agg({
        'item_price': 'mean'
    })
)
group.columns = ['item_avg_item_price']
group.reset_index(inplace=True)

matrix = matrix.merge(group, on='item_id', how='left')
# float32, not float16: float16 overflows to inf above 65504, while the
# outlier filter keeps prices up to 300000. An inf price would turn the
# relative deltas computed below into NaN.
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float32)


# Average price of each item within each month.
group = (
    train
    .groupby(['date_block_num', 'item_id'])
    .agg({
        'item_price': 'mean'
    })
)
group.columns = ['date_item_avg_item_price']
group.reset_index(inplace=True)

matrix = matrix.merge(group, on=['date_block_num', 'item_id'], how='left')
matrix['date_item_avg_item_price'] = matrix['date_item_avg_item_price'].astype(np.float32)
lags = [1, 2, 3]
matrix = lag_feature(matrix, lags, ['date_item_avg_item_price'])

# Relative deviation of each lagged monthly price from the item's overall
# average price.
for i in lags:
    matrix['delta_price_lag_' + str(i)] = (
        matrix['date_item_avg_item_price_lag_' + str(i)] -\
        matrix['item_avg_item_price']
    ) / matrix['item_avg_item_price']

def select_trends(row, lag_steps=None):
    """Return the most recent usable price delta of a row.

    The columns ``delta_price_lag_<i>`` are inspected in the order given by
    ``lag_steps``; the first value that is neither missing nor zero is
    returned. The original tested plain truthiness, and because NaN is
    truthy a missing lag-1 delta was returned immediately, so the later
    lags were never consulted.

    Parameters
    ----------
    row : mapping
        Row (e.g. a pandas Series) holding the ``delta_price_lag_<i>`` values.
    lag_steps : iterable of int, optional
        Lags to inspect, most recent first. Defaults to the notebook-level
        ``lags`` list.

    Returns
    -------
    float or int
        The selected delta, or 0 when no lag holds a usable value.
    """
    if lag_steps is None:
        lag_steps = lags
    for i in lag_steps:
        delta = row['delta_price_lag_' + str(i)]
        if pd.notna(delta) and delta:
            return delta
    return 0

# Collapse the per-lag deltas into one feature; rows without any usable
# delta end up as 0. The result is assigned back instead of calling
# fillna(inplace=True) on a selected column, which is chained assignment
# and does not update the frame under pandas copy-on-write.
matrix['delta_price_lag_'] = (
    matrix
    .apply(select_trends, axis=1)
    .astype(np.float16)
    .fillna(0)
)

# The intermediate price columns are not used as model features.
features_to_drop = ['item_avg_item_price', 'date_item_avg_item_price']
for i in lags:
    features_to_drop.append('date_item_avg_item_price_lag_' + str(i))
    features_to_drop.append('delta_price_lag_' + str(i))
matrix = matrix.drop(columns=features_to_drop)

* Add total shop revenue per month to matrix. 
* Add lag values of revenue per month.
* Add delta revenue values - how current month revenue relates to global average. 

In [None]:
# Total revenue of each shop in each month.
group = (
    train
    .groupby(['date_block_num', 'shop_id'])
    .agg({
        'revenue': 'sum'
    })
)
group.columns = ['date_shop_revenue']
group.reset_index(inplace=True)

matrix = matrix.merge(group, on=['date_block_num', 'shop_id'], how='left')
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

# Average monthly revenue of each shop. The original averaged
# 'date_block_num' here, so the column named 'shop_avg_revenue' actually
# held the mean month index.
group = (
    group
    .groupby('shop_id')
    .agg({
        'date_shop_revenue': 'mean'
    })
)
group.columns = ['shop_avg_revenue']
group.reset_index(inplace=True)

matrix = matrix.merge(group, on='shop_id', how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float32)
# Relative deviation of the month's revenue from the shop's average.
matrix['delta_revenue'] = (
    matrix['date_shop_revenue'] - matrix['shop_avg_revenue']
) / matrix['shop_avg_revenue']
matrix['delta_revenue'] = matrix['delta_revenue'].astype(np.float32)

# Only the previous month's deviation is kept as a feature.
matrix = lag_feature(matrix, [1], ['delta_revenue'])
matrix['delta_revenue_lag_1'] = matrix['delta_revenue_lag_1'].astype(np.float32)
matrix = matrix.drop(
    columns=['date_shop_revenue', 'shop_avg_revenue', 'delta_revenue']
)

In [None]:
# Month index 0-11 derived from the month block number, plus the number of
# days in that month (28 is always used for index 1).
matrix['month'] = matrix['date_block_num'].mod(12)
days = pd.Series([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
matrix['days'] = matrix['month'].map(days).astype(np.int8)

In [None]:
# Months elapsed since the (item, shop) pair, and the item alone, first
# appear in the matrix. Note this is the first month with a matrix row,
# which is taken from the grid rather than from an actual sale.
first_item_shop_block = (
    matrix.groupby(['item_id', 'shop_id'])['date_block_num'].transform('min')
)
first_item_block = matrix.groupby(['item_id'])['date_block_num'].transform('min')

matrix['item_shop_first_sale'] = matrix['date_block_num'] - first_item_shop_block
matrix['item_first_sale'] = matrix['date_block_num'] - first_item_block

In [None]:
# Keep month blocks 4 and later, i.e. drop the first four blocks (0-3).
# NOTE(review): with a maximum lag of 3 only blocks 0-2 lack full lag
# values; confirm whether block 3 is meant to be dropped as well.
has_history = matrix['date_block_num'] >= 4
matrix = matrix[has_history]
matrix.head().T

## 3. Modelling

### xgboost

In [None]:
# Work on an independent (deep) copy so the feature matrix stays untouched.
data = matrix.copy(deep=True)

In [None]:
# Shape of the rows that belong to the month to predict (block 34).
data.loc[data['date_block_num'].eq(34)].shape

In [None]:
# Time-based split: blocks up to 32 for training, block 33 for validation
# and block 34 (the test rows) for prediction.
train_rows = data['date_block_num'] <= 32
valid_rows = data['date_block_num'] == 33
test_rows = data['date_block_num'] == 34

features = data.drop(columns=['item_cnt_month'])
target = data['item_cnt_month']

X_train, Y_train = features[train_rows], target[train_rows]
X_valid, Y_valid = features[valid_rows], target[valid_rows]
X_test = features[test_rows]

In [None]:
# Limit the monthly targets to the range [0, 20].
Y_train, Y_valid = (
    target.clip(lower=0, upper=20) for target in (Y_train, Y_valid)
)

In [None]:
# Gradient-boosted regressor. The evaluation metric and early stopping are
# configured on the estimator: passing `eval_metric` / `early_stopping_rounds`
# to fit() was deprecated in xgboost 1.6 and is rejected by xgboost 2.x.
# `learning_rate` / `random_state` are the scikit-learn API names for the
# original `eta` / `seed`.
model = XGBRegressor(
    max_depth=10,
    n_estimators=1000,
    min_child_weight=0.5,
    colsample_bytree=0.8,
    subsample=0.8,
    learning_rate=0.1,
    random_state=42,
    eval_metric='rmse',
    early_stopping_rounds=20
)

# The validation set comes last in eval_set, so it is the one that drives
# early stopping.
model.fit(
    X_train,
    Y_train,
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True
)

In [None]:
# Predictions limited to the same [0, 20] range as the training target.
Y_pred = model.predict(X_valid).clip(0, 20)
Y_test = model.predict(X_test).clip(0, 20)

# Use the real ID column rather than the positional index, so the file stays
# correct even if the index of `test` stops coinciding with the IDs.
submission = pd.DataFrame({
    'ID': test['ID'].to_numpy(),
    'item_cnt_month': Y_test
})
submission.to_csv('xgb_submission.csv', index=False)

In [None]:
def plot_features(booster, figsize):
    """Draw the feature-importance chart of ``booster`` on a new figure.

    Parameters
    ----------
    booster
        Fitted model, passed on to ``plot_importance``.
    figsize : tuple
        Figure size, passed on to ``plt.subplots``.

    Returns
    -------
    The value returned by ``plot_importance``.
    """
    _figure, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    return plot_importance(booster=booster, ax=axis)

plot_features(booster=model, figsize=(10, 14))