# Loading Initial Data

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from calendar import monthrange
from itertools import product

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

%matplotlib inline

In [None]:
# All competition files are read from the same Kaggle input directory; keeping
# the directory in one constant makes it easy to point the notebook elsewhere.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

# Lookup tables (shops, items, item categories) and the training sales records.
shops = pd.read_csv(DATA_DIR + '/shops.csv')
items = pd.read_csv(DATA_DIR + '/items.csv')
catgs = pd.read_csv(DATA_DIR + '/item_categories.csv')
sales = pd.read_csv(DATA_DIR + '/sales_train.csv')

# Test set to predict and the sample submission file.
testd = pd.read_csv(DATA_DIR + '/test.csv')
sampl = pd.read_csv(DATA_DIR + '/sample_submission.csv')

# EDA: Search for Outliers

Search for NaN values

In [None]:
# Count the missing values per column, first for the training sales and then for the test set.
sales_missing = sales.isna().sum()
testd_missing = testd.isna().sum()

print(sales_missing, '\n')
print(testd_missing)

No NaN values were found. Next, let's look at the distribution of the data.

In [None]:
# Box plot of the daily item counts; the x-axis is limited to [-100, 3000].
fig_cnt, ax_cnt = plt.subplots(figsize=(10, 4))
ax_cnt.set_xlim(-100, 3000)
sns.boxplot(x=sales.item_cnt_day, ax=ax_cnt)

cnt_min, cnt_max = sales.item_cnt_day.min(), sales.item_cnt_day.max()
print('Item count day - Min: {}, Max: {}'.format(cnt_min, cnt_max))

# Box plot of the item prices; the x-axis runs from the minimum price to 110% of the maximum.
fig_price, ax_price = plt.subplots(figsize=(10, 4))
ax_price.set_xlim(sales.item_price.min(), sales.item_price.max() * 1.1)
sns.boxplot(x=sales.item_price, ax=ax_price)

price_min, price_max = sales.item_price.min(), sales.item_price.max()
print('Item price - Min: {}, Max: {}'.format(price_min, price_max))

As can be seen on the graphs, there are some high outliers on prices and item count. 

There are also some negative values in both prices and counts. Negative values are expected for counts (they represent returned items), but they are not expected for prices.

Let's remove the highest outliers and change the strange price values for a common value.

In [None]:
# Remove outliers: keep only rows whose price and daily count are within the caps.
within_caps = (sales.item_price <= 100000) & (sales.item_cnt_day <= 1000)
sales = sales[within_caps]

# Adjust negative prices: replace them with the median positive price observed
# for shop 32 / item 2973 in date block 4.
reference_rows = sales[
    (sales.shop_id == 32)
    & (sales.item_id == 2973)
    & (sales.date_block_num == 4)
    & (sales.item_price > 0)
]
median = reference_rows.item_price.median()
sales.loc[sales.item_price < 0, 'item_price'] = median

# Shops dataset preprocessing

Since I don't speak Russian, I relied on other people's work to help extract these features. A large part of this code was taken from [this notebook](https://www.kaggle.com/karell/xgb-baseline-advanced-feature-engineering).

Several shops are duplicates of each other (according to their names). Let's fix this in both the sales and testd sets.

In [None]:
# (duplicate shop_id, shop_id to keep) pairs for shops that appear twice under
# slightly different names.
duplicate_shops = [
    (0, 57),   # Yakutsk, Ordzhonikidze 56
    (1, 58),   # Yakutsk, "Tsentralny" shopping center
    (10, 11),  # Zhukovsky, Chkalova St. 39m²
    (39, 40),  # Rostov-on-Don, "Megacenter Horizon" mall
]

# Apply every replacement to both the training sales and the test set.
for duplicate_id, kept_id in duplicate_shops:
    for frame in (sales, testd):
        frame.loc[frame.shop_id == duplicate_id, 'shop_id'] = kept_id

In [None]:
# Show the distinct shop names so the naming pattern can be inspected.
shops['shop_name'].unique()

Let's categorize the shops into ['Орджоникидзе,', 'ТЦ', 'ТРК', 'ТРЦ', 'ул.', 'Магазин', 'ТК', 'склад'],
then map every other value to 'etc'.

In [None]:
# Join the two-word city name into one token so that the second
# space-separated token of this shop name is the shop type, as for the others.
shops['shop_name'] = shops['shop_name'].replace('Сергиев Посад ТЦ "7Я"', 'СергиевПосад ТЦ "7Я"')

# The second token of the shop name is taken as the shop category.
second_token = shops['shop_name'].str.split(' ').map(lambda tokens: tokens[1])
shops['shop_category'] = second_token.astype(str)

# Anything outside the known list of categories is grouped under 'etc'.
categories = ['Орджоникидзе,', 'ТЦ', 'ТРК', 'ТРЦ','ул.', 'Магазин', 'ТК', 'склад']
is_known = shops['shop_category'].isin(categories)
shops['shop_category'] = shops['shop_category'].where(is_known, 'etc')
shops.shop_category.unique()

In [None]:
# Number of shops in each category. `.size()` counts rows per group, whereas
# `.sum()` would add up shop ids and concatenate shop names, which says nothing
# about how many shops a category holds.
shops.groupby(['shop_category']).size()

However, some categories contain only a few shops, so we reduce the number of categories from 9 to 5:
['Орджоникидзе,', 'ТЦ', 'ТРК', 'ТРЦ', 'ул.', 'Магазин', 'ТК', 'склад', 'etc'] => ['ТЦ', 'ТРК', 'ТРЦ', 'ТК', 'etc']

In [None]:
# Keep only these shop categories; every other value is folded into 'etc'.
category = ['ТЦ', 'ТРК', 'ТРЦ', 'ТК']
shops.shop_category = shops.shop_category.apply(lambda x: x if (x in category) else 'etc')

# Report how many shops fall in each remaining category (row counts per group,
# not sums of the id/name columns).
print('Category Distribution', shops.groupby(['shop_category']).size(), sep='\n')

# Integer code for the shop category, used as a model feature.
shops['shop_category_code'] = LabelEncoder().fit_transform(shops['shop_category'])

Extract City name information from the Shop name

In [None]:
# The first space-separated token of the shop name is taken as the city.
name_tokens = shops['shop_name'].str.split(' ')
shops['city'] = name_tokens.map(lambda tokens: tokens[0])

# One spelling carries a leading "!"; map it onto the plain city name.
shops['city'] = shops['city'].replace('!Якутск', 'Якутск')
shops['city_code'] = LabelEncoder().fit_transform(shops['city'])

# Keep only the shop id and the two encoded features.
shops = shops[['shop_id', 'city_code', 'shop_category_code']]

shops.head()

# Categories dataset preprocessing

In [None]:
# Print how many distinct item category names exist, then display them.
category_names = catgs.item_category_name.unique()
print(len(category_names))
category_names

We treat the categories 'Игровые консоли' and 'Аксессуары' as being the same as 'Игры',
so we map both of them to 'Игры'.
Likewise, 'PC - Гарнитуры/Наушники' is treated as 'Музыка - Гарнитуры/Наушники', i.e. its type becomes 'Музыка'.

In [None]:
# The first word of the category name is used as the coarse category type.
catgs['type'] = catgs.item_category_name.apply(lambda x: x.split(' ')[0]).astype(str)

# Fold console and accessory categories into 'Игры' and the 'PC' category into
# 'Музыка'. The remapped value has to be written to the `type` column itself:
# writing it to a separate `category` column (as the code did before) leaves
# `type` untouched, so these rows were never merged as the narrative describes.
catgs.loc[(catgs.type == 'Игровые') | (catgs.type == 'Аксессуары'), 'type'] = 'Игры'
catgs.loc[catgs.type == 'PC', 'type'] = 'Музыка'

# Types outside this list are grouped under 'etc'.
category = ['Игры', 'Карты', 'Кино', 'Книги','Музыка', 'Подарки', 'Программы', 'Служебные', 'Чистые', 'Аксессуары']
catgs['type'] = catgs.type.apply(lambda x: x if (x in category) else 'etc')

# Number of item categories per type (row counts, not sums of ids/names).
print(catgs.groupby(['type']).size())
catgs['type_code'] = LabelEncoder().fit_transform(catgs['type'])

# The subtype is the part after the first '-' in the category name; when there
# is no '-', the whole (stripped) name is used instead.
catgs['split'] = catgs.item_category_name.apply(lambda x: x.split('-'))
catgs['subtype'] = catgs['split'].map(lambda x: x[1].strip() if len(x) > 1 else x[0].strip())
catgs['subtype_code'] = LabelEncoder().fit_transform(catgs['subtype'])

# Keep only the category id and the two encoded features.
catgs = catgs[['item_category_id','type_code', 'subtype_code']]

catgs.head()

# Append train and test data

Concatenate train (sales) and test (testd) data. Also add manually some missing information on the test data like: date_block_num, year, month, item_cnt_day, item_price.

`item_price` is missing from the test data, and `item_cnt_day` is part of what we're trying to predict (in fact, we're looking for `item_cnt_month`, which is the sum of `item_cnt_day` for a given shop and item over a month). For now, we're going to fill these values with 0.


In [None]:
# Derive the calendar month and year from the dd.mm.yyyy date string, then drop the date column.
sale_dates = pd.to_datetime(sales['date'], format='%d.%m.%Y')
sales['month'] = sale_dates.dt.month
sales['year'] = sale_dates.dt.year
sales = sales.drop(columns=['date'])

# Placeholder rows for the test pairs: they are placed in the block right after
# the last training block, with count and price set to 0 for now.
to_append = testd[['shop_id', 'item_id']].assign(
    date_block_num=sales['date_block_num'].max() + 1,
    year=2015,
    month=11,
    item_cnt_day=0,
    item_price=0,
)

sales = pd.concat([sales, to_append], ignore_index=True, sort=False)
sales.head()

# Date dataset preprocessing

Let's remove all date data (except `date_block_num`) from sales and store it in `period`.

In [None]:
# One row per date block with its calendar year and month.
period = (
    sales[['date_block_num', 'year', 'month']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Number of days in each month (second element returned by calendar.monthrange).
period['days'] = [
    monthrange(year, month)[1]
    for year, month in zip(period['year'], period['month'])
]

# Year and month now live in `period`; `date_block_num` remains the join key in sales.
sales = sales.drop(columns=['month', 'year'])

period.head()

Now let's group and summarize the sales dataset. The new dataset (agg_sales) will contain the mean price and the total number of items sold for every shop/item pair in every single month.

After grouping the sales dataset, it's time to convert shop_id and item_id into columns using a pivot function (shop_item_sales).

In [None]:
# Monthly aggregate per (date block, shop, item): mean price and total count.
# The aggregations are given by name ('mean'/'sum'); passing the NumPy
# callables np.mean/np.sum to `agg` is deprecated in recent pandas releases.
agg_sales = (
    sales
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)
    .agg({'item_price': 'mean', 'item_cnt_day': 'sum'})
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# Wide table of prices: one row per date block, one column per (shop, item) pair.
# Months without a record for a pair are left as NaN.
shop_item_sales = pd.pivot_table(agg_sales, values='item_price', index=['date_block_num'],
                    columns=['shop_id', 'item_id'], fill_value=np.nan)
shop_item_sales

As can be seen in this dataset, there are a lot of NaN values. For instance, for shop_id=2 and item_id=30, the column shows that the price of this shop/item pair has changed over time, but it also shows missing data for some of the months.

The intuition of the next steps is that once a price is defined it will be the same until it is explicitly changed, and the missing data (between this period) means that no item was sold (then item count will be zero).

That being said, let's forward-fill the prices: each known price is carried forward to later periods until a new value is defined.

In [None]:
# Carry the last known price of every (shop, item) column forward along the
# date blocks. `DataFrame.ffill()` is the supported spelling; the
# `fillna(method='ffill')` form is deprecated in recent pandas releases.
shop_item_sales = shop_item_sales.ffill()
shop_item_sales

As can be observed, on the same situation of shop_id=2 and item_id=30, almost all prices with NaN values are now filled with the most recent defined price, the exception are the first rows, since there are no previous information and no item sold we can't infer this information.

Now that the missing information is filled in, it's time to convert the dataset back to its original format ("unpivot" the table).

In [None]:
# Unpivot the wide price table back to one row per (date block, item, shop).
# Each stack() moves the innermost remaining column level into the row index.
# NOTE(review): this relies on stack() discarding the NaN cells that are still
# empty after the forward fill — confirm with the installed pandas version.
prices_by_shop = shop_item_sales.stack()
prices_long = prices_by_shop.stack()
agg_sales_future = prices_long.reset_index().rename(columns={0 : 'item_price'})

# The intermediates are no longer needed once the long table exists.
del prices_by_shop, prices_long

print('agg_sales shape: ', agg_sales.shape, '\n')
print('agg_sales_future shape: ',agg_sales_future.shape)

agg_sales_future.head(10)

Since a lot of missing data was inferred, agg_sales_future has many more rows than the original agg_sales dataset; however, agg_sales still has one extra column, item_cnt_month.

The next step joins these two tables and fills all missing values (in the item_cnt_month column) with zero (due to the assumption that no items were sold in those cases).

In [None]:
# Attach the monthly counts to the forward-filled price table. Rows that exist
# only because of the forward fill have no count and are filled with 0.
monthly_counts = agg_sales.drop(columns='item_price')
month_summary = agg_sales_future.merge(
    monthly_counts, how='left', on=['date_block_num', 'shop_id', 'item_id']
)
month_summary = month_summary.fillna(0.0)
month_summary = month_summary.rename(columns= {'item_price' : 'item_price_month'})
month_summary = month_summary.sort_values(by=['shop_id', 'item_id', 'date_block_num'])

del monthly_counts

Now, let's add more dimensional data (`date_block_num`, `year`, `month`, `days`, `city_code`, `shop_category_code`, `shop_id`, `item_category_id`, `type_code`, `subtype_code`, `item_id`) to the above dataset. To achieve this goal, let's join it with the other preprocessed datasets.

Also, let's downcast this dataset.

In [None]:
# Join dimensional data
month_summary = pd.merge(month_summary, shops, on='shop_id')
month_summary = pd.merge(month_summary, items, on='item_id')
month_summary = pd.merge(month_summary, catgs, on='item_category_id')
month_summary = pd.merge(month_summary, period, on='date_block_num')

# Adjusting columns order
month_summary = month_summary[['date_block_num', 'year', 'month', 'days', 'city_code', 'shop_category_code', 'shop_id', 'item_category_id', 
                               'type_code', 'subtype_code', 'item_id', 'item_price_month', 'item_cnt_month']]

# Downcasting values to reduce memory usage
downcast_dtypes = {
    c: np.int8
    for c in ['date_block_num', 'month', 'days', 'city_code', 'shop_category_code',
              'shop_id', 'item_category_id', 'type_code', 'subtype_code']
}
downcast_dtypes.update({
    'item_id': np.int16,
    'year': np.int16,
    'item_cnt_month': np.float16,
    # Prices stay in float32: float16 cannot represent values above 65504
    # (the outlier filter keeps prices up to 100000) and already rounds
    # integer prices above 2048, which would distort the price averages.
    'item_price_month': np.float32,
})
month_summary = month_summary.astype(downcast_dtypes)

# Remove unused and temporary datasets
del shops, items, catgs, to_append, shop_item_sales, agg_sales, agg_sales_future

month_summary.head()

In [None]:
# Range of the monthly target before clipping.
cnt_month_min = month_summary['item_cnt_month'].min()
cnt_month_max = month_summary['item_cnt_month'].max()
print('Min: {} and Max: {} item_cnt_month values'.format(cnt_month_min, cnt_month_max))

As stated on the problem [evaluation section](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/overview/evaluation):

    Submissions are evaluated by root mean squared error (RMSE). True target values are clipped into [0,20] range.
    
then, let's **clip(0,20)** target (`item_cnt_month`) values. This way train target will be similar to the test predictions.

In [None]:
# Clip the target into the [0, 20] range used by the competition metric.
month_summary['item_cnt_month'] = month_summary['item_cnt_month'].clip(lower=0, upper=20)

# Mean Encoded Features

This section is focused on the generation of new features (measures) based on the existing ones. For instance, we can create a generalization feature that calculate the mean of `item_cnt_month` for every shop on a specific month and add this as a new feature `date_shop_avg_item_cnt`. This technique can be used with other dimensions (`item_id`, `item_category_id`, `city_code`) or a combination of features (`shop_id` + `item_category_id`).

This is a powerful technique to help generalize the prediction capabilities of a model. However, our test data does not contain any `item_price` or `item_cnt_month` data (in fact, the latter is what we're trying to predict). That being said, we can't rely on any feature computed from the current month, **but we can** still rely on existing or generated features of **past data**. This means that, for instance, we can use the last 12 months of prices of an `item_id`, or the last 3 months of any feature combination (`shop_id` + `item_category_id`).

To achieve this goal, let's define the `agg_by_and_lag` function. It generates mean encoded features based on a given `group_cols` list of columns and "lags" the data by the numbers of months given in the `lags` parameter.

In [None]:
def agg_by(month_summary, group_cols, new_col, target_col = 'item_cnt_month', agg_func = 'mean'):
    """Add a group-level aggregate of `target_col` as a new column.

    Parameters:
        month_summary: DataFrame holding `group_cols` and `target_col`.
        group_cols: list of column names to group by.
        new_col: name of the column that receives the aggregate.
        target_col: column to aggregate.
        agg_func: aggregation passed to `agg` for `target_col`.

    Returns:
        A new DataFrame: `month_summary` left-joined with the per-group
        aggregate (stored as float16) on `group_cols`.
    """
    grouped = month_summary.groupby(group_cols, as_index=False)
    encoded = grouped.agg({target_col : agg_func})
    encoded = encoded.rename(columns= {target_col : new_col})
    # Stored as float16 to keep the many generated feature columns small.
    encoded[new_col] = encoded[new_col].astype(np.float16)

    return month_summary.merge(encoded, how='left', on=group_cols)

def lag_feature(df, col, lags=[1,2,3,6,12]):
    """Append lagged copies of `col`, one column per entry in `lags`.

    For each lag `i`, the value of `col` from `i` date blocks earlier (matched on
    shop_id and item_id) is added as `<col>_lag_<i>`. Rows without a matching
    earlier record get 0.0.

    Parameters:
        df: DataFrame with 'date_block_num', 'shop_id', 'item_id' and `col`.
        col: name of the column to lag.
        lags: iterable of lag sizes, in date blocks.

    Returns:
        A DataFrame with the lag columns appended.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    # The lag source is taken once, from the frame as it was passed in.
    base = df[keys + [col]]
    for lag in lags:
        lag_col = '{}_lag_{}'.format(col, lag)
        lagged = base.copy()
        lagged.columns = keys + [lag_col]
        # Moving the block forward by `lag` aligns past values with the current rows.
        lagged['date_block_num'] = lagged['date_block_num'] + lag
        df = df.merge(lagged, on=keys, how='left')
        df = df.fillna(value={lag_col : 0.0})
    return df

def agg_by_and_lag(month_summary, group_cols, new_col, lags=[1,2,3,6,12], target_col = 'item_cnt_month', agg_func = 'mean'):
    """Create lagged group-aggregate features without keeping the current-month value.

    The aggregate of `target_col` over `group_cols` is computed with `agg_by`,
    lagged with `lag_feature`, and the unlagged aggregate column `new_col` is
    dropped so that only the `<new_col>_lag_<i>` columns remain.

    Returns:
        `month_summary` with one extra column per entry in `lags`.
    """
    encoded = agg_by(month_summary, group_cols, new_col, target_col, agg_func)
    lagged = lag_feature(encoded, new_col, lags)
    # Only the lagged columns are kept; the same-month aggregate is removed.
    return lagged.drop(columns=[new_col])

Mean encode and lag `item_cnt_month` data.

In [None]:
# (group columns, feature name, lags) for every mean-encoded `item_cnt_month`
# feature. The entries are applied in the order listed.
item_cnt_encodings = [
    (['date_block_num'], 'date_avg_item_cnt', [1]),
    (['date_block_num', 'item_id'], 'date_item_avg_item_cnt', [1,2,3,6,12]),
    (['date_block_num', 'city_code'], 'date_city_avg_item_cnt', [1]),
    (['date_block_num', 'shop_id'], 'date_shop_avg_item_cnt', [1,2,3,6,12]),
    (['date_block_num', 'item_category_id'], 'date_cat_avg_item_cnt', [1]),
    (['date_block_num', 'type_code'], 'date_type_avg_item_cnt', [1]),
    (['date_block_num', 'subtype_code'], 'date_subtype_avg_item_cnt', [1]),
    (['date_block_num', 'shop_category_code'], 'date_shop_category_avg_item_cnt', [1]),
    (['date_block_num', 'shop_id', 'item_category_id'], 'date_shop_cat_avg_item_cnt', [1]),
    (['date_block_num', 'shop_id', 'type_code'], 'date_shop_type_avg_item_cnt', [1]),
    (['date_block_num', 'shop_id', 'subtype_code'], 'date_shop_subtype_avg_item_cnt', [1]),
    (['date_block_num', 'shop_category_code', 'subtype_code'], 'date_shop_category_subtype_avg_item_cnt', [1]),
    (['date_block_num', 'city_code', 'item_id'], 'date_item_city_avg_item_cnt', [1]),
]

for group_cols, new_col, lags in item_cnt_encodings:
    month_summary = agg_by_and_lag(month_summary, group_cols, new_col, lags)

Mean encode and lag `item_price_month` data.

In [None]:
# (group columns, feature name, lags) for every mean-encoded `item_price_month`
# feature. The entries are applied in the order listed.
item_price_encodings = [
    (['date_block_num'], 'date_avg_item_price', [1]),
    (['date_block_num', 'item_id'], 'date_item_avg_item_price', [1,2,3,6,12]),
    (['date_block_num', 'city_code'], 'date_city_avg_item_price', [1]),
    (['date_block_num', 'shop_id'], 'date_shop_avg_item_price', [1,2,3,6,12]),
    (['date_block_num', 'item_category_id'], 'date_cat_avg_item_price', [1]),
    (['date_block_num', 'type_code'], 'date_type_avg_item_price', [1]),
    (['date_block_num', 'subtype_code'], 'date_subtype_avg_item_price', [1]),
    (['date_block_num', 'shop_category_code'], 'date_shop_category_avg_item_price', [1]),
    (['date_block_num', 'shop_id', 'item_category_id'], 'date_shop_cat_avg_item_price', [1]),
    (['date_block_num', 'shop_id', 'type_code'], 'date_shop_type_avg_item_price', [1]),
    (['date_block_num', 'shop_id', 'subtype_code'], 'date_shop_subtype_avg_item_price', [1]),
    (['date_block_num', 'shop_category_code', 'subtype_code'], 'date_shop_category_subtype_avg_item_price', [1]),
    (['date_block_num', 'city_code', 'item_id'], 'date_item_city_avg_item_price', [1]),
]

for group_cols, new_col, lags in item_price_encodings:
    month_summary = agg_by_and_lag(month_summary, group_cols, new_col, lags, 'item_price_month')

# Extra features

Let's extract some extra features, such as the number of months between a given sale and the first time the item was sold.

In [None]:
# Earliest date block present in month_summary for each (item, shop) pair and for each item.
first_block_item_shop = month_summary.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
first_block_item = month_summary.groupby('item_id')['date_block_num'].transform('min')

# Number of date blocks elapsed since that earliest block.
month_summary['item_shop_first_sale'] = month_summary['date_block_num'] - first_block_item_shop
month_summary['item_first_sale'] = month_summary['date_block_num'] - first_block_item

Our dataset is now ready, let's summarize it.

In [None]:
# Persist the finished feature table so later cells can reload it, then print its schema and memory usage.
pd.to_pickle(month_summary, 'month_summary.pkl')
month_summary.info()

# Split Data

Let's split the generated data into train, validation and test data.

For test data we will take the last month (34), this is the month we must predict the `item_cnt_month`.

For the validation data we will use the last month in the original training set (33).

And for the training data we will use all data between month 12 (since we have lagged some features in 12 months) and 32.

In [None]:
# Reload the feature table written by the earlier `to_pickle` cell.
month_summary = pd.read_pickle('month_summary.pkl')

In [None]:
def generate_subsample(month_summary, target='item_cnt_month', train_start=12, val_block=33, test_block=34):
    """Split the feature table into train, validation and test sets by date block.

    Parameters:
        month_summary: DataFrame with a 'date_block_num' column and the `target` column.
        target: name of the column to predict; it is removed from every feature frame.
        train_start: first date block (inclusive) used for training. The default
            of 12 skips the months that cannot have a full 12-month lag.
        val_block: date block used for validation; training stops right before it.
        test_block: date block whose target has to be predicted.

    Returns:
        (X_train, y_train, X_val, y_val, X_test), where the X frames hold the
        features and the y series hold the target.
    """
    blocks = month_summary['date_block_num']

    X_test = month_summary[blocks == test_block].drop(columns=[target])

    val_rows = month_summary[blocks == val_block]
    y_val = val_rows[target]
    X_val = val_rows.drop(columns=[target])

    train_rows = month_summary[(blocks >= train_start) & (blocks < val_block)]
    y_train = train_rows[target]
    X_train = train_rows.drop(columns=[target])

    return X_train, y_train, X_val, y_val, X_test

In [None]:
# The current-month price is not used as a model input, so it is removed before splitting.
model_frame = month_summary.drop(columns=['item_price_month'])
X_train, y_train, X_val, y_val, X_test = generate_subsample(model_frame, 'item_cnt_month')

# Free the full feature tables; only the splits are needed from here on.
del month_summary, model_frame

# Train Model

Let's use the train and validation data to train a simple LightGBM model.

In [None]:
def train_gbmodel(X_train, y_train, X_val, y_val):
    """Train a LightGBM regressor with early stopping on the validation set.

    Parameters:
        X_train, y_train: training features and target.
        X_val, y_val: validation features and target, used to monitor RMSE.

    Returns:
        The booster returned by `lgb.train`.
    """

    RAND_SEED = 42

    lgb_params = {'num_leaves': 2**8, 'max_depth': 19, 'max_bin': 107, #'n_estimators': 3747,
              'bagging_freq': 1, 'bagging_fraction': 0.7135681370918421, 
              'feature_fraction': 0.49446461478601994, 'min_data_in_leaf': 2**8, # 88
              'learning_rate': 0.015980721586917768, 'num_threads': 2, 
              'min_sum_hessian_in_leaf': 6,
              'random_state' : RAND_SEED,
              'bagging_seed' : RAND_SEED,
              'boost_from_average' : 'true',
              'boost' : 'gbdt',
              'metric' : 'rmse',
              'verbose' : 1}

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val)

    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds` keyword of `lgb.train` was removed in
    # LightGBM 4.0, while the callback is also available in earlier releases.
    return lgb.train(lgb_params, lgb_train,
                      num_boost_round=300,
                      valid_sets=[lgb_train, lgb_val],
                      callbacks=[lgb.early_stopping(stopping_rounds=20)])

In [None]:
gbm_model = train_gbmodel(X_train, y_train, X_val, y_val)

# Validation RMSE with predictions and targets both clipped to [0, 20].
y_hat = gbm_model.predict(X_val).clip(0, 20)
val_rmse = np.sqrt(mean_squared_error(y_val.clip(0, 20), y_hat))
print(val_rmse)

# Save the trained model to disk.
with open('./gbm_model.pickle', 'wb') as model_file:
    pickle.dump(gbm_model, model_file)

With the trained model, let's finally use it to predict the `item_cnt_month` of the test dataset.

In [None]:
# Predict the test block and clip to the [0, 20] range.
y_pred = gbm_model.predict(X_test).clip(0, 20)

# Attach the predictions to their (shop, item) keys and align them with the test IDs.
test_predictions = X_test[['shop_id', 'item_id']].assign(item_cnt_month=y_pred)
result = testd.merge(test_predictions, how='left', on=['shop_id', 'item_id'])
result = result[['ID', 'item_cnt_month']]
result.to_csv('submission.csv', index=False)