# Predicting Future Sales - First Competition Entry
I broke this notebook down into multiple stages.  For the sake of RAM, I've error'd out many of the cells which clean the data, but keep the code for others to see.  I used Microsoft's LightGBM for my predictions, and much of the processing was done on a Google cloud instance.

In [1]:
import os
import re
import gc
import math
import numpy as np
import pandas as pd
import datetime
from itertools import product
from tqdm import tqdm_notebook as tqdm

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

import stldecompose
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm

# When True, every guarded data-processing cell below raises immediately instead of
# recomputing its features.
skip_processing = False

# Show which datasets are mounted under ../input.
print(os.listdir('../input'))

In [23]:
# Daily sales log; the 'date' column is stored as DD.MM.YYYY text.
sales = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
# Parse the whole column in one vectorised call rather than one strptime call per row.
sales['date'] = pd.to_datetime(sales.date, format='%d.%m.%Y')
categories = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
sales.head()

### What is the frequency of sales data?
365

In [None]:
# Number of distinct month-day combinations that occur in the sales dates.
len(sales.date.dt.strftime('%m-%d').unique())

## How do monthly total sales look?
What months are there?

In [None]:
# Distinct month-block numbers present in the training data.
sales.date_block_num.unique()

In [None]:
def plot_seasonal(res, axes, axes_col):
    """Draw the four components of a decomposition down one column of a subplot grid.

    Parameters
    ----------
    res : object
        Decomposition result exposing ``observed``, ``trend``, ``seasonal`` and
        ``resid`` attributes.
    axes : indexable
        Grid of axes addressed as ``axes[row, column]``; rows 0-3 are used.
    axes_col : int
        Column to draw into. Y-axis labels are only written when this is falsy
        (i.e. the first column).
    """
    components = ('observed', 'trend', 'seasonal', 'resid')
    label_rows = not axes_col
    for row, component in enumerate(components):
        axes[row, axes_col].plot(getattr(res, component))
        if label_rows:
            axes[row, axes_col].set_ylabel(component.title())

In [None]:
# Total units sold per month block, summed over every shop and item.
total_ts = sales.groupby('date_block_num').item_cnt_day.sum()
# 4 rows (one per component) x 3 columns (one per decomposition method).
fig, axes = plt.subplots(4, 3, figsize=(12, 6))

# Decompose the same series three ways, each with a 12-observation seasonal period.
add_decomposed = sm.tsa.seasonal_decompose(total_ts.values, freq=12, model="additive")
mul_decomposed = sm.tsa.seasonal_decompose(total_ts.values, freq=12, model="multiplicative")
stl_decompose = stldecompose.decompose(total_ts.values, period=12)

# One decomposition per column.
plot_seasonal(add_decomposed, axes, 0)
plot_seasonal(mul_decomposed, axes, 1)
plot_seasonal(stl_decompose, axes, 2)

axes[0, 0].set_title('Additive')
axes[0, 1].set_title('Multiplicative')
axes[0, 2].set_title('Lowess')
fig.tight_layout();

## How do daily sales by shop look?

In [None]:
def plot_monthly_shop_sales(shops_per_row=3, height_scalar=15):
    """Plot each shop's daily unit sales, grouping several shops per subplot.

    Despite the name, the series are daily: the global ``sales`` frame is summed
    per (date, shop_id). Subplots are stacked vertically, ``shops_per_row``
    shops to a subplot.

    Parameters
    ----------
    shops_per_row : int
        Number of shops drawn together in each subplot.
    height_scalar : int or float
        Multiplied by ``shops_per_row`` to give the figure height.
    """
    shop_sales = sales.groupby(['date', 'shop_id']).agg({'item_cnt_day': 'sum'})
    # Rows become dates and columns become shops.
    shop_sales = shop_sales.unstack(level=0).transpose()

    num_shops = len(shop_sales.columns)
    nrows = math.ceil(num_shops / shops_per_row)
    height = height_scalar * shops_per_row

    # squeeze=False always returns a 2-D axes array, so indexing also works when
    # every shop fits in a single subplot (nrows == 1).
    fig, axes = plt.subplots(nrows, 1, figsize=(10, height), squeeze=False)
    for row, start in enumerate(range(0, num_shops, shops_per_row)):
        ax_row = axes[row, 0]
        shop_sales.iloc[:, start:start + shops_per_row].plot(ax=ax_row, alpha=0.8)
        ax_row.set_xlabel('')
        ax_row.set_xticklabels([])
    fig.tight_layout()
plot_monthly_shop_sales()

### Many shops are limited and infrequent

In [None]:
# One summary row per shop: how many distinct months it has sales in, the first and
# last such month, and whether any month inside that span is absent.
shop_months = pd.DataFrame([
    {
        'shop_id': shop_id,
        'total_months': len(months),
        'min': min(months),
        'max': max(months),
        'missing_months': 1 + max(months) - min(months) != len(months),
    }
    for shop_id, months in sales.groupby(['shop_id']).date_block_num.unique().items()
])
# Shops whose last month with sales is before block 33.
shop_months[shop_months['max'] < 33]

### Clearly significant missing data
## What are these items?

In [None]:
# Item catalogue; peek at the last rows.
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
items.tail()

* Item names have inconsistent formats
* One category per item

## What are item categories?

In [None]:
# (Re)load the category table from disk so the cells below start from the raw file.
categories = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
categories.head()

### Category names have a consistent pattern: Main Category -/() Subcategory

In [None]:
# Category names containing neither ' - ' nor '(' (i.e. no visible subcategory separator).
categories[categories.item_category_name.apply(lambda x: ' - ' not in x and '(' not in x)]

### ...with some exceptions

In [None]:
def get_main_cat(name):
    """Return the main-category part of a category name.

    The text before the first ' - ' is returned as-is. If there is no ' - ',
    the text before the first '(' is returned with surrounding whitespace
    stripped. If neither separator is present, the name is returned unchanged.
    """
    head, dash, _ = name.partition(' - ')
    if dash:
        return head
    head, paren, _ = name.partition('(')
    if paren:
        return head.strip()
    return name
# Derive the main category for every row and list the distinct values found.
categories['main_category'] = categories.item_category_name.apply(get_main_cat)
categories.main_category.unique()

#### Better decipher these...

In [None]:
# Russian main-category name -> English label.
translate_categories = {
    'PC': 'PC',
    'Аксессуары': 'Accessories',
    'Цифра': 'Figure',
    'Доставка товара': 'Delivery of goods',
    'Игровые консоли': 'Game consoles',
    'Игры': 'Games',
    'Игры Android': 'Android games',
    'Игры MAC': 'Games MAC',
    'Игры PC': 'Games PC',
    'Кино, Музыка, Игры': 'Cinema, Music, Games',
    'Карты оплаты': 'Payment cards',
    'Кино': 'Cinema',
    'Билеты': 'Tickets',
    'Книги': 'Books',
    'Музыка': 'Music',
    'Подарки': 'Gifts',
    'Программы': 'Programs',
    'Служебные': 'Utilities',
    'Чистые носители': 'Clean Media',
    'Элементы питания': 'Batteries'
}
# Derive from the untouched Russian names every time, so re-running this cell does
# not try to translate already-translated labels (which raised KeyError before).
# An unknown main category still raises KeyError, so gaps in the table stay visible.
categories['main_category'] = (categories.item_category_name
                               .apply(get_main_cat)
                               .apply(lambda x: translate_categories[x]))
categories.main_category.unique()

## What are shops?

In [None]:
# Shop table, indexed by shop_id.
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv').set_index('shop_id')
shops.tail()

### Shops have a mostly consistent nomenclature: City "Shop name"
Need to fix some exceptions:

In [None]:
# Shop names that break the `City "Shop name"` pattern, mapped to a rewritten form
# that follows it. Names not listed here are left untouched.
_shop_name_replace = {
    'Выездная Торговля': 'Выездная Торговля ""',
    'Жуковский ул. Чкалова 39м?': 'Жуковский ул. "Чкалова 39м?"',
    'Жуковский ул. Чкалова 39м²': 'Жуковский ул. "Чкалова 39м²"',
    'Воронеж (Плехановская, 13)': 'Воронеж "Плехановская, 13"',
    'Интернет-магазин ЧС': 'Интернет-магазин "ЧС"',
    'Москва Магазин С21': 'Москва "Магазин С21"',
    'Цифровой склад 1С-Онлайн': 'Цифровой склад "1С-Онлайн"',
    'Якутск Орджоникидзе, 56': 'Якутск "Орджоникидзе, 56"',
    '!Якутск Орджоникидзе, 56 фран': 'Якутск "Орджоникидзе, 56 фран"',
    'Воронеж ТРЦ Сити-Парк "Град"': 'Воронеж ТРЦ "Сити-Парк Град"'
}
shops['shop_name'] = shops.shop_name.map(lambda name: _shop_name_replace.get(name, name))

In [None]:
# Abbreviations / keywords that can appear in a shop name, mapped to an English
# shop-type label.
_shop_type = {
    'ТЦ': 'Shopping center',
    'ТРК': 'Dispenser',
    'ТРЦ': 'Shopping mall',
    'ТК': 'TC',
    'МТРЦ': 'MTRC',
    'Цифровой': 'Digital Warehouse',
    'Интернет-магазин': 'Online'
}
# Shop type: the first whitespace-separated token of the name that is a known
# keyword; NaN when the name contains none.
_type = shops.shop_name.apply(lambda x: [w for w in x.split() if w in _shop_type])
shops['shop_type'] = _type.apply(lambda x: _shop_type[x[0]] if len(x) else np.nan)
# City: the text before the quoted shop name with shop-type tokens removed,
# e.g. '!Moscow TC "Store 26"' -> 'Moscow'.
_city = shops.shop_name.apply(lambda x: x.split(' "')[0])
_city = _city.apply(lambda x: " ".join([w for w in x.split() if w not in _shop_type]))
# Remove any '!' characters and title-case the result.
_city = _city.str.replace('!', '').str.title()
shops['shop_city'] = _city
shops.head()

In [None]:
# Population figure for each derived shop_city value; NaN for entries that are not
# mapped to a population. A city missing from this table raises KeyError below.
# NOTE(review): 'Адыгея' and 'Якутск' carry the identical value 282419 — possibly a
# copy/paste slip; verify against the source of these figures.
_pop_replace = {
    '': np.nan, 'Адыгея': 282419, 'Балашиха': 228567, 'Волжский': 320761, 'Вологда': 305397, 
    'Воронеж': 997447, 'Выездная Торговля': np.nan, 'Жуковский Ул.': 107994, 'Казань': 1169000, 
    'Калуга': 328871, 'Коломна': 144838, 'Красноярск': 1007000, 'Курск': 425950, 
    'Москва': 11920000, 'Мытищи': 176825, 'Н.Новгород': 1257000, 'Новосибирск': 1511000, 
    'Омск': 1159000, 'Ростовнадону': 1100000, 'Спб': 4991000, 'Самара': 1170000, 
    'Сергиев Посад': 109076, 'Сургут': 321062, 'Томск': 543596, 'Тюмень': 621918, 'Якутск': 282419, 
    'Уфа': 1075000, 'Химки': 218275, 'Склад': np.nan, 'Чехов': 71301, 'Ярославль': 597161
}
shops['shop_city_pop'] = shops.shop_city.map(lambda x: _pop_replace[x])
shops.head()

## Finally, what does the test set look like?

In [None]:
# Test set, indexed by submission ID.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv').set_index('ID')
test.head()

Need to set date_block_num to 34.  Are there any new items or shops?

In [None]:
# (number of test items never seen in training, number of test shops never seen in training)
len(set(test.item_id) - set(sales.item_id)), len(set(test.shop_id) - set(sales.shop_id))

363 new items in the test set. No new shops.

In [None]:
# True when the test row count equals (#distinct shops) x (#distinct items).
test.shop_id.nunique() * test.item_id.nunique() == test.shape[0]

Test contains all combinations of shops and items, but train contains only combinations which resulted in a sale.

## Gather all features together
Test set contains many new items, but train set only includes data where sales have been made.

Need to index out every shop/item combination for each month.

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Build one (shop_id, item_id, date_block_num) row for every pairing of a shop and an
# item that each have at least one sales record in that month.
index = []
for dbn in sales.date_block_num.unique():
    shop_ids = sales[sales.date_block_num == dbn].shop_id.unique()
    item_ids = sales[sales.date_block_num == dbn].item_id.unique()
    # Cartesian product of this month's shops and items, stored compactly as int16.
    index.append(np.array(list(product(shop_ids, item_ids, [dbn])), dtype='int16'))
index = pd.DataFrame(np.vstack(index), columns=['shop_id', 'item_id', 'date_block_num'])
# Keep only the MultiIndex; it is used to reindex the aggregated sales later.
index = index.set_index(['shop_id', 'item_id', 'date_block_num']).index
index.shape

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Monthly unit sales per shop/item, expanded to the full shop x item grid.
sales_train = sales.groupby(['shop_id', 'item_id', 'date_block_num'])
sales_train = sales_train.agg({'item_cnt_day':'sum'})
# Grid rows without any sale appear as NaN after the reindex.
sales_train = sales_train.reindex(index=index)
sales_train = sales_train.reset_index()
sales_train = sales_train.rename(columns={'item_cnt_day': 'item_cnt_month'})
# Clip the target to [0, 20] and treat rows with no sales as 0.
sales_train['item_cnt_month'] = sales_train.item_cnt_month.clip(0, 20).fillna(0)
sales_train = sales_train.set_index(['shop_id', 'item_id', 'date_block_num'])
sales_train.shape

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Attach each item's category id. Running this cell twice would merge the column in a
# second time, so run it once per fresh sales_train.
sales_train = sales_train.reset_index().merge(items[['item_id', 'item_category_id']], on = 'item_id')
sales_train.head()

## Quick Baseline

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
sales_train = sales_train.fillna(0)
# Features are every column except the target; the target is clipped to [0, 20].
train_X, train_y = sales_train.drop('item_cnt_month', axis=1), sales_train.item_cnt_month.clip(0, 20)
model = lgb.LGBMModel(objective='regression', max_depth=10, n_estimators=100, min_child_weight=0.5, 
                         random_state=40, n_jobs=-1, silent=False)
model.fit(train_X, train_y, eval_metric='rmse')

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Rebuild the test frame with the same features the baseline model was trained on.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
test = test.merge(items[['item_id', 'item_category_id']], on='item_id')
test = test.set_index('ID')
test['date_block_num'] = 34
# The merge leaves item_category_id before date_block_num, the reverse of train_X.
# Reorder to the training layout so every feature reaches the model in the position
# it had during fit.
test = test[list(train_X.columns)]
pred = model.predict(test)
# Index the predictions by the test IDs themselves rather than a fresh 0..n-1 range.
pred = pd.DataFrame(pred, index=test.index, columns=['item_cnt_month']).fillna(0).clip(0, 20)
pred.index.names = ['ID']
pred.to_csv('baseline.csv')

In [None]:
def set_type(series, to_float=False):
    """Return `series` converted to a smaller dtype that can still hold its values.

    Parameters
    ----------
    series : pandas.Series
        Column to shrink.
    to_float : bool, default False
        When True, integer columns are converted to a float dtype instead of a
        smaller integer dtype.

    Returns
    -------
    pandas.Series or numpy.ndarray
        * int input (and not ``to_float``): the narrowest signed integer dtype whose
          maximum is >= the largest absolute value.
        * float input, or int input with ``to_float``: float16 / float32 / float64
          chosen by the largest absolute value.
        * object / category input: an ndarray of label codes (missing values are
          labelled 'Other' first) in the narrowest signed integer dtype that fits.
        * anything else, or a column whose maximum is NaN: returned unchanged.
    """
    int_types = ('int8', 'int16', 'int32', 'int64')
    # NOTE: float16 keeps only about three significant decimal digits, so values
    # below the 32767 cut-off are stored approximately.
    floats = [('float16', 32767), ('float32', 2147483647), ('float64', np.inf)]
    dtype = series.dtype.name
    if dtype.startswith('int') and not to_float:
        maxval = series.abs().max()
        for key in int_types:
            # Compare against the real signed limit (127, 32767, ...). The previous
            # thresholds (255, 65535) were unsigned limits and let values overflow.
            if maxval <= np.iinfo(key).max:
                return series.astype(key)
    if dtype.startswith('float') or (to_float and dtype.startswith('int')):
        maxval = series.abs().max()
        for key, val in floats:
            if maxval < val:
                return series.astype(key)
    if dtype in {'object', 'category'}:
        codes = LabelEncoder().fit_transform(series.fillna('Other'))
        max_code = codes.max() if len(codes) else 0
        # int8 only holds 128 distinct codes; widen when there are more classes.
        for key in int_types:
            if max_code <= np.iinfo(key).max:
                return codes.astype(key)
    return series
def minimize_memory(df, reset_index=True, to_float=False):
    """Shrink every column of `df` with `set_type` and return the resulting frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact.
    reset_index : bool, default True
        When True the index is first moved into ordinary columns (producing a new
        frame) so it is compacted too. When False the columns of the passed frame
        itself are reassigned.
    to_float : bool, default False
        Forwarded to `set_type` for every column.
    """
    frame = df.reset_index() if reset_index else df
    for column in frame.columns:
        frame[column] = set_type(frame[column], to_float)
    return frame

In [None]:
# Release the exploration and baseline objects before building the full feature set.
# Every name listed must exist, so the cells defining them have to have run first.
del add_decomposed, mul_decomposed, stl_decompose, shop_months, train_X, train_y, model, pred, shop_ids, item_ids
gc.collect()

# Building the Model
#### Feature ideas
1. Monthly shop and/or item sales lag
2. Monthly shop and/or item price lag
3. Number of active days per month
4. Months/days since last item sale
5. Months/days since last shop sale
6. Mean encodings of shops and items
7. End-of-month percent-of-mean encodings
8. Shop population, category, and type
9. Item categories
10. Monthly revenue lag
11. Total monthly sale lag
12. Months since first sale
13. Month number

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Month offsets used by default for the lag features below.
lags = [1, 2, 3, 6, 12]
#train
train = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
# Drop extreme price / quantity outliers. The file was previously re-read right after
# these filters, which silently discarded them; it is now read only once.
train = train[(train.item_price < 100000) & (train.item_cnt_day <= 1000)].copy()
# Negative prices are replaced by the median positive price of shop 32 / item 2973 in
# month 4 (presumably where the bad record lives — verify against the data).
median = train.loc[(train.shop_id==32) & (train.item_id==2973) & 
                   (train.date_block_num==4) & (train.item_price>0)].item_price.median()
train.loc[train.item_price < 0, 'item_price'] = median
train['item_revenue_month'] = train.item_cnt_day * train.item_price
# Aggregate to one row per shop/item/month: units, mean price and revenue.
train = train.groupby(['shop_id', 'item_id', 'date_block_num'])
train = train.agg({'item_cnt_day': 'sum', 'item_price': 'mean', 'item_revenue_month': 'sum'})
train.rename(columns={'item_cnt_day': 'item_cnt_month', 'item_price': 'item_mean_price'}, inplace=True)
# Expand to the full monthly shop x item grid; rows without sales become NaN.
train = train.reindex(index=index)
#test
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
test_ids = test.ID
test['date_block_num'] = 34
test = test.set_index(['shop_id', 'item_id', 'date_block_num']).drop('ID', axis=1)
#join together
# The test rows contribute only index keys; their feature columns start as NaN -> 0.
train_test = pd.concat([train, test])
train_test = train_test.fillna(0).astype({'item_cnt_month': 'int32', 'item_mean_price': 'int32'})
train_test = minimize_memory(train_test)
train_test.head()

In [None]:
# Fold shop ids 0, 1 and 10 into 57, 58 and 11 (presumably duplicate listings of the
# same shops — verify against shops.csv), then clip the target.
if skip_processing:
    raise Exception("Skipping data processing steps")
train_test.loc[train_test.shop_id == 0, 'shop_id'] = 57
train_test.loc[train_test.shop_id == 1, 'shop_id'] = 58
train_test.loc[train_test.shop_id == 10, 'shop_id'] = 11
# Target is limited to the range [0, 20].
train_test['item_cnt_month'] = train_test.item_cnt_month.fillna(0).clip(0, 20)
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)

## 1 & 2. Monthly shop and/or item sales + price lag

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
def add_aggregate_lags(df, gb_cols, target_col, prefix, astype='float64', 
                       fillna=np.nan, lags=lags, agg='mean'):
    """Append lagged group aggregates of `target_col` as new columns.

    `target_col` is aggregated with `agg` over `gb_cols` (which must include
    'date_block_num'). For each lag, the aggregate observed `lag` months earlier is
    joined onto every row as column ``prefix + str(lag)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index (after reset) or columns contain shop_id, item_id and
        date_block_num.
    gb_cols : list of str
        Grouping / join keys.
    target_col : str
        Column to aggregate.
    prefix : str
        Prefix of the generated column names.
    astype : str
        Dtype of the generated columns.
    fillna : scalar
        Value used where no aggregate exists for the lagged month.
    lags : iterable of int
        Month offsets; defaults to the module-level `lags` at definition time.
    agg : str
        Aggregation passed to ``DataFrame.agg``.

    Returns
    -------
    pandas.DataFrame indexed by (shop_id, item_id, date_block_num).
    """
    frame = df.reset_index()
    aggregated = frame.groupby(gb_cols).agg({target_col: agg}).reset_index()
    for lag in lags:
        lag_col = prefix + str(lag)
        # rename() returns a new frame, so shifting the month leaves `aggregated` intact.
        shifted = aggregated.rename(columns={target_col: lag_col})
        # Moving the month forward makes month m's aggregate line up with month m + lag.
        shifted['date_block_num'] += lag
        frame = frame.merge(shifted, on=gb_cols, how='left')
        frame[lag_col] = frame[lag_col].fillna(fillna).astype(astype)
    return frame.set_index(['shop_id', 'item_id', 'date_block_num'])
# Lagged unit sales per shop/item, per shop and per item (default lags; gaps filled with 0).
train_test = add_aggregate_lags(train_test, ['shop_id', 'item_id', 'date_block_num'], 'item_cnt_month', 
                                'shop_item_sales_lag_', fillna=0, astype='int16', agg='sum')
train_test = add_aggregate_lags(train_test, ['shop_id', 'date_block_num'], 'item_cnt_month', 'shop_sales_lag_', 
                                fillna=0, astype='int32', agg='sum')
train_test = add_aggregate_lags(train_test, ['item_id', 'date_block_num'], 'item_cnt_month', 'item_sales_lag_', 
                                fillna=0, astype='int32', agg='sum')
# Lagged mean prices per shop/item and per item (lags 1-3; gaps stay NaN).
train_test = add_aggregate_lags(train_test, ['shop_id', 'item_id', 'date_block_num'], 'item_mean_price', 
                                'shop_item_price_lag_', lags=[1, 2, 3])
train_test = add_aggregate_lags(train_test, ['item_id', 'date_block_num'], 'item_mean_price', 
                                'item_price_lag_', lags=[1, 2, 3])
train_test.tail()

# 3. Number of active days per month

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Number of distinct calendar days with at least one sales record in each month block.
days_per_block = pd.DataFrame({
    'days_per_block': sales.date.dt.strftime('%m-%d'),
    'date_block_num': sales.date_block_num,
})
days_per_block = days_per_block.groupby('date_block_num')[['days_per_block']].nunique().reset_index()
# Block 34 (the test month) has no sales rows, so add it by hand with a value of 30.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.
days_per_block = pd.concat(
    [days_per_block, pd.DataFrame({'date_block_num': [34], 'days_per_block': [30]})],
    ignore_index=True)
train_test = train_test.reset_index().merge(days_per_block, on='date_block_num')
# In train_test the feature is stored as an offset from 30 so it fits in int8;
# the days_per_block lookup frame itself keeps the raw day counts.
train_test['days_per_block'] = (train_test.days_per_block - 30).astype('int8')
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)
train_test[['days_per_block']].tail()

# 4 & 5. Months since last item and/or shop sale

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
#Inspired by dlarionov
def add_since_features(df, key_func, name):
    """Add column `name`: months since the key's most recent sale in an earlier month.

    For each row, the value is ``date_block_num`` minus the latest earlier month in
    which any row with the same key had ``item_cnt_month > 0``; it is -1 when no such
    month exists. Sales in the row's own month are never used, so the feature does
    not depend on the current target.

    The previous implementation advanced its cache on every row (sale or not) and
    filled only the first row of each key per month, so the stored value was not the
    distance to the last sale.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_block_num`` and ``item_cnt_month`` and be ordered by
        non-decreasing ``date_block_num``.
    key_func : callable
        Maps a row (as yielded by ``DataFrame.iterrows``) to a hashable key.
    name : str
        Name of the int8 column to create.

    Returns
    -------
    The same DataFrame object, with the new column added.
    """
    last_sale = {}  # key -> most recent month with a sale seen so far
    prev_sale = {}  # key -> the sale month before last_sale[key]
    values = []
    for _, row in df.iterrows():
        key = key_func(row)
        month = row.date_block_num
        latest = last_sale.get(key)
        if latest is not None and latest == month:
            # A sale was already recorded for this key in the current month;
            # fall back to the sale month before it.
            latest = prev_sale.get(key)
        values.append(-1 if latest is None else month - latest)
        if row.item_cnt_month > 0 and last_sale.get(key) != month:
            if key in last_sale:
                prev_sale[key] = last_sale[key]
            last_sale[key] = month
    # Assign positionally in one go instead of one df.at write per row.
    df[name] = np.array(values, dtype=np.int8)
    return df
# The helper reads shop_id / item_id / date_block_num as ordinary columns.
train_test.reset_index(inplace=True)
# One feature per key granularity: shop+item pair, shop only, item only.
train_test = add_since_features(train_test, lambda r: str(r.shop_id) + '_' + str(r.item_id), 'm_since_last_shop_item_sale')
train_test = add_since_features(train_test, lambda r: str(r.shop_id), 'm_since_last_shop_sale')
train_test = add_since_features(train_test, lambda r: str(r.item_id), 'm_since_last_item_sale')
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)
# Last five rows of the three columns just added.
train_test.iloc[-5:,-3:]

# 6. Mean encodings of shops and items

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
def add_mean_encodings(df, index_cols, target_col, name):
    """Return `df` with a new column `name` holding the group mean of `target_col`.

    The mean is taken over every row of `df` that shares the same `index_cols`
    values and is left-joined back on those columns.

    NOTE(review): the mean covers all rows passed in, including each row's own
    target value and any later months or test rows present in `df`.
    """
    group_means = (
        df.groupby(index_cols)[[target_col]]
          .mean()
          .rename(columns={target_col: name})
          .reset_index()
    )
    return df.merge(group_means, on=index_cols, how='left')
train_test.reset_index(inplace=True)
# Mean monthly target per shop and per item, computed over all rows of train_test.
train_test = add_mean_encodings(train_test, ['shop_id'], 'item_cnt_month', 'shop_mean')
train_test = add_mean_encodings(train_test, ['item_id'], 'item_cnt_month', 'item_mean')
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)
# minimize_memory resets the index itself, so the index is restored again afterwards.
train_test = minimize_memory(train_test).set_index(['shop_id', 'item_id', 'date_block_num'])
train_test[['shop_mean', 'item_mean']].tail()

# 7. End-of-month Percent encodings

In [26]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Sort chronologically (stable) so that 'last' really is the final recorded sale of the
# month for each shop/item, regardless of the row order in the CSV.
_gb = (sales.sort_values('date', kind='mergesort')
            .groupby(['shop_id', 'item_id', 'date_block_num'])
            .agg({'item_cnt_day': ['sum', 'last']}))
_gb.columns = ['month_sum', 'month_last']
_gb = pd.merge(_gb.reset_index(), days_per_block, on='date_block_num')
# Shift forward one month so each row receives the previous month's value (a lag feature).
_gb['date_block_num'] += 1
# Last-day sales relative to the month's average daily sales. The days_per_block
# lookup frame already holds raw day counts (the -30 offset was only applied to the
# train_test column), so no +30 correction belongs here.
_gb['end_of_month_percent'] = (
    _gb.month_last / (_gb.month_sum / _gb.days_per_block)).fillna(0)
_gb = _gb[['shop_id', 'item_id', 'date_block_num', 'end_of_month_percent']]
train_test = pd.merge(train_test.reset_index(), _gb, 
                      on=['shop_id', 'item_id', 'date_block_num'], how='left')
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)
train_test[['item_cnt_month', 'end_of_month_percent']].head()

# 8. Shop population, category, and type

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
train_test.reset_index(inplace=True)
# Attach the shop attributes; validate guards against duplicated shop rows on the right.
train_test = pd.merge(train_test, shops[['shop_type', 'shop_city', 'shop_city_pop']], 
         on='shop_id', validate='many_to_one', how='left')
# Mean target per shop type and per city.
train_test = add_mean_encodings(train_test, ['shop_type'], 'item_cnt_month', 'shop_type_mean')
train_test = add_mean_encodings(train_test, ['shop_city'], 'item_cnt_month', 'shop_city_mean')
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)
# Spot-check a slice from the middle of the frame.
train_test[['shop_type', 'shop_city', 'shop_city_pop']].iloc[20000:20010,:]

# 9. Item categories

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
train_test.reset_index(inplace=True)
# Reload items and attach the category name and main category to each item.
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
items = items.merge(categories, on='item_category_id', validate='m:1', how='left')
train_test = train_test.merge(items[['item_id', 'item_category_name', 'main_category']], 
                              on='item_id', how='left', validate='m:1')
train_test = train_test.rename(columns={'item_category_name': 'item_category_full', 'main_category': 'item_category_main'})
# Mean target per full category and per main category.
train_test = add_mean_encodings(train_test, ['item_category_full'], 'item_cnt_month', 'item_category_full_mean')
train_test = add_mean_encodings(train_test, ['item_category_main'], 'item_cnt_month', 'item_category_main_mean')
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)
# minimize_memory also label-encodes the text columns added above.
train_test = minimize_memory(train_test).set_index(['shop_id', 'item_id', 'date_block_num'])
train_test.iloc[:5,-4:]

# 10. Revenue lags

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Revenue is cast to float64 before summing; the column may have been stored in a
# smaller float dtype by minimize_memory.
# Lagged total revenue per month across all shops and items.
train_test = add_aggregate_lags(train_test.astype({'item_revenue_month': 'float64'}), ['date_block_num'], 
                                'item_revenue_month', 'monthly_revenue_lag_', 
                                astype='int32', fillna=0, agg='sum')
# Lagged revenue per shop/item pair.
train_test = add_aggregate_lags(train_test.astype({'item_revenue_month': 'float64'}), 
                                ['shop_id', 'item_id', 'date_block_num'], 'item_revenue_month', 
                                'shop_item_revenue_lag_', astype='int32', fillna=0, agg='sum')
train_test.iloc[:5,:][['item_revenue_month'] + [c for c in train_test.columns if c.startswith('shop_item_rev')]]

# 11. Total monthly sale lags

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Lagged total unit sales per month across all shops and items. The prefix now ends
# with '_' like every other lag prefix, giving 'monthly_sales_lag_1' rather than
# 'monthly_sales_lag1'.
train_test = add_aggregate_lags(train_test, ['date_block_num'], 'item_cnt_month', 
                   'monthly_sales_lag_', fillna=0, agg='sum', lags=[1, 2, 3, 6, 12])

# 12. Months since first shop/item sale

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
#Credit to dlarionov
train_test.reset_index(inplace=True)
# Months elapsed since the earliest month in which the shop/item pair (or the item)
# appears in train_test.
train_test['m_since_shop_item_first_sale'] = train_test['date_block_num'] - train_test.groupby(['item_id','shop_id'])['date_block_num'].transform('min')
train_test['m_since_item_first_sale'] = train_test['date_block_num'] - train_test.groupby('item_id')['date_block_num'].transform('min')

# 13. Month Number

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Position of the month within a 12-month cycle (0-11).
train_test['month'] = train_test.date_block_num % 12
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)

## Data Checkpoint

In [None]:
if skip_processing:
    raise Exception("Skipping data processing steps")
# Free the raw inputs; only train_test is needed from here on.
del _gb, sales, items, categories, test, train, shops
gc.collect()
train_test.reset_index(inplace=True)
# Infinite values (e.g. from divisions by zero) are turned into NaN before saving.
train_test.replace([np.inf, -np.inf], np.nan, inplace=True)
train_test = minimize_memory(train_test, reset_index=False)
train_test.set_index(['shop_id', 'item_id', 'date_block_num'], inplace=True)
# Persist the finished feature table.
train_test.to_pickle('data.pkl')

In [21]:
# Load the previously saved feature table from the attached dataset.
train_test = pd.read_pickle('../input/traintestset/data.pkl')
train_test.info()

# Model Validation
#### Strategy is to train on months 12-32 and validate on month 33

In [None]:
train_test.drop(['item_mean_price', 'item_revenue_month'], axis=1, inplace=True) #Only used for lag features
train_test.reset_index(inplace=True)
# Train on month blocks 12-32, hold out block 33 for validation.
X_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 32)].drop('item_cnt_month', axis=1)
y_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 32)].item_cnt_month
X_eval = train_test[train_test.date_block_num == 33].drop('item_cnt_month', axis=1)
y_eval = train_test[train_test.date_block_num == 33].item_cnt_month
# The full table is no longer needed once the splits exist.
del train_test
gc.collect()

In [9]:
# LightGBM regressor with an L2 objective; no L1/L2 regularisation, 80% row subsampling.
lgb_model = lgb.LGBMRegressor(objective='regression_l2', n_estimators=1000, reg_alpha=0.0, reg_lambda=0.0, 
                              random_state=40, n_jobs=-1, silent=False, max_depth=15, num_leaves=70, 
                              subsample=0.8, learning_rate=0.1)

In [None]:
# Names of the label-encoded columns, then their positions within X_train.
categoricals = {'shop_type','shop_city','item_category_full','item_category_main'}
categoricals = [i for i, n in enumerate(X_train.columns) if n in categoricals]
# Track both sets; early stopping uses a patience of 100 rounds.
lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_eval, y_eval)], 
              verbose=True, eval_names=['Train', 'Eval'], early_stopping_rounds=100,
              categorical_feature=categoricals)

In [None]:
# The recorded metric is 'l2', i.e. mean squared error. Take the square root so the
# curve really shows RMSE, as the title and axis label say.
lgb_results = pd.DataFrame({
    'Eval': np.sqrt(lgb_model.evals_result_['Eval']['l2']),
    'Train': np.sqrt(lgb_model.evals_result_['Train']['l2']),
})
ax = lgb_results.plot(figsize=(12, 6))
ax.set_title('LightGBM Model RMSE')
ax.set_xlabel('Cycle')
ax.set_ylabel('RMSE');

In [None]:
# Pair every feature with its importance and plot them as horizontal bars; sorting
# ascending puts the most important feature at the top of the chart.
features = pd.DataFrame(list(zip(X_train.columns, lgb_model.feature_importances_)), columns=['Feature', 'Importance'])
features.sort_values('Importance', ascending=True, inplace=True)
ax = features.plot('Feature', 'Importance', kind='barh', figsize=(15, 12))
ax.set_xlabel('Importance (Higher is better)')
ax.set_ylabel('')
ax.set_title('Feature Importance');

This model looks very strong, with no signs of overfitting.  Let's train it on months 12-33 and predict month 34.
# Model Predictions

In [11]:
# Reload the feature table and the raw test keys (indexed by submission ID).
train_test = pd.read_pickle('../input/traintestset/data.pkl')
pred_index = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv').set_index('ID')

In [12]:
train_test.drop(['item_mean_price', 'item_revenue_month'], axis=1, inplace=True) #Only used for lag features
train_test.reset_index(inplace=True)
# Train on month blocks 12-33; block 34 holds the rows to predict.
X_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 33)].drop('item_cnt_month', axis=1)
y_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 33)].item_cnt_month
X_eval = train_test[train_test.date_block_num == 34].drop('item_cnt_month', axis=1)
del train_test
gc.collect()

In [13]:
# Names of the label-encoded columns, then their positions within X_train.
categoricals = {'shop_type','shop_city','item_category_full','item_category_main'}
categoricals = [i for i, n in enumerate(X_train.columns) if n in categoricals]
lgb_model.set_params(n_estimators=332) #Strongest number of trees in last run
# NOTE(review): the only eval_set is the training data itself, so early stopping here
# is driven by training loss — confirm this is intended.
lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train)], 
              verbose=True, eval_names=['Train'], early_stopping_rounds=50,
              categorical_feature=categoricals)

In [None]:
# Predict the month-34 rows.
pred = lgb_model.predict(X_eval)

In [None]:
# Keep only the join keys and the prediction.
X_eval['item_cnt_month'] = pred
X_eval = X_eval[['shop_id', 'item_id', 'item_cnt_month']]

In [None]:
# The processing cells fold shop ids 0, 1 and 10 into 57, 58 and 11 inside train_test
# (test rows included), whereas pred_index holds the raw test.csv keys. Apply the same
# mapping before joining; otherwise any test row for those shops finds no match and
# is silently written out as 0. (Assumes data.pkl came from those processing cells.)
_shop_id_remap = {0: 57, 1: 58, 10: 11}
pred_index = pred_index.reset_index()
pred_index['shop_id'] = pred_index.shop_id.replace(_shop_id_remap)
pred_index = pred_index.merge(X_eval, on=['shop_id', 'item_id'], how='left')
pred_index = pred_index[['ID', 'item_cnt_month']].set_index('ID')
# Predictions are limited to [0, 20]; any row still unmatched falls back to 0.
pred_index['item_cnt_month'] = pred_index.item_cnt_month.clip(0, 20).fillna(0)
pred_index.to_csv('final.csv')

# XGBoost Predictions

In [53]:
# Same validation split as for LightGBM: train on blocks 12-32, validate on block 33.
train_test = pd.read_pickle('../input/traintestset/data.pkl')
train_test.drop(['item_mean_price', 'item_revenue_month'], axis=1, inplace=True) #Only used for lag features
train_test.reset_index(inplace=True)
X_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 32)].drop('item_cnt_month', axis=1)
y_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 32)].item_cnt_month
X_eval = train_test[train_test.date_block_num == 33].drop('item_cnt_month', axis=1)
y_eval = train_test[train_test.date_block_num == 33].item_cnt_month
del train_test
gc.collect()

In [2]:
# XGBoost regressor used for the validation run (depth 9, up to 200 trees, no subsampling).
xgb_model = xgb.XGBRegressor(max_depth=9, learning_rate=0.1, n_estimators=200, subsample=1,
                             colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                             verbosity=1, n_jobs=-1, gamma=0, random_state=40, 
                             min_child_weight=1, max_delta_step=0, 
                             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5)

In [None]:
# Evaluate on both the training set and the held-out month; early-stopping patience is 10 rounds.
xgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_eval, y_eval)], 
              verbose=True, early_stopping_rounds=10)

In [3]:
# Results noted from earlier validation runs:
#max_depth=3, n_estimators=100 => 0.797904  0.838999 no early stopping, 
#max_depth=9, n_estimators=200 => 0.630982  0.780855 at stop 78
# Final split: train on blocks 12-33, predict block 34.
train_test = pd.read_pickle('../input/traintestset/data.pkl')
pred_index = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv').set_index('ID')
train_test.drop(['item_mean_price', 'item_revenue_month'], axis=1, inplace=True) #Only used for lag features
train_test.reset_index(inplace=True)
X_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 33)].drop('item_cnt_month', axis=1)
y_train = train_test[(12 <= train_test.date_block_num) & (train_test.date_block_num <= 33)].item_cnt_month
X_eval = train_test[train_test.date_block_num == 34].drop('item_cnt_month', axis=1)
del train_test
gc.collect()

In [4]:
# Final XGBoost model (depth 10, 80 trees).
xgb_model = xgb.XGBRegressor(max_depth=10, learning_rate=0.1, n_estimators=80, subsample=1,
                             colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1,
                             verbosity=1, n_jobs=-1, gamma=0, random_state=40, 
                             min_child_weight=1, max_delta_step=0, 
                             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5)
# NOTE(review): the only eval_set is the training data itself, so early stopping here
# is driven by training loss — confirm this is intended.
xgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train)], 
              verbose=True, early_stopping_rounds=50)

In [5]:
pred = xgb_model.predict(X_eval)
X_eval['item_cnt_month'] = pred
X_eval = X_eval[['shop_id', 'item_id', 'item_cnt_month']]
# The processing cells fold shop ids 0, 1 and 10 into 57, 58 and 11 inside train_test
# (test rows included), whereas pred_index holds the raw test.csv keys. Apply the same
# mapping before joining; otherwise any test row for those shops finds no match and
# is silently written out as 0. (Assumes data.pkl came from those processing cells.)
_shop_id_remap = {0: 57, 1: 58, 10: 11}
pred_index = pred_index.reset_index()
pred_index['shop_id'] = pred_index.shop_id.replace(_shop_id_remap)
pred_index = pred_index.merge(X_eval, on=['shop_id', 'item_id'], how='left')
pred_index = pred_index[['ID', 'item_cnt_month']].set_index('ID')
# Predictions are limited to [0, 20]; any row still unmatched falls back to 0.
pred_index['item_cnt_month'] = pred_index.item_cnt_month.clip(0, 20).fillna(0)
pred_index.to_csv('xgboost_predictions.csv')

In [6]:
# Persist the fitted XGBoost model to disk with pickle.
import pickle
with open(r"xgboost_model.pickle", "wb") as output_file:
    pickle.dump(xgb_model, output_file)

In [7]:
# Second submission variant: the same predictions rounded to whole numbers.
pred_index['item_cnt_month'] = pred_index.item_cnt_month.round()
pred_index.to_csv('xgboost_predictions_rounded.csv')

# Stacked Ensemble