In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from itertools import product

from sklearn.preprocessing import LabelEncoder

import lightgbm as lgbm
from lightgbm import plot_importance
from sklearn.metrics import mean_squared_error

import joblib

from math import sqrt

from time import time

In [None]:
# Daily sales history; `date` is parsed day-first into a datetime column.
train_sales = (
    pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
      .assign(date=lambda sales: pd.to_datetime(sales['date'], dayfirst=True))
)

# Test pairs, indexed by their submission ID.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv',
                   index_col='ID')

# Lookup tables for items, shops and item categories.
df_items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
df_shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
df_categories = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')

In [None]:
# Sanity check of the raw daily sales: (rows, columns) first, then per-column summary statistics.
print(train_sales.shape)
train_sales.describe()

In [None]:
# Upper bounds: the 99.9th percentile of daily counts and of prices,
# both measured on the unfiltered frame.
max_cnt = train_sales["item_cnt_day"].quantile(0.999)
max_price = train_sales["item_price"].quantile(0.999)

print(max_cnt)
print(max_price)

# Keep rows whose count is within the bound and whose price is strictly
# positive and within the bound.
count_ok = train_sales["item_cnt_day"].le(max_cnt)
price_ok = train_sales["item_price"].gt(0) & train_sales["item_price"].le(max_price)
train_sales = train_sales[count_ok & price_ok]

print(train_sales.shape)

In [None]:
# Shape of the daily sales after outlier filtering.
train_sales.shape

### Get monthly sales train and test sets
* Get date_block_num, shops and items combination

In [None]:
# For every month (date_block_num 0..33) build the full cartesian product of the
# shops and the items that appear in that month's sales, so that pairs with no
# sale in the month still get a row.
regs = [
    np.array(list(product(
        [date_block],
        train_sales.loc[train_sales['date_block_num'] == date_block, 'shop_id'].unique(),
        train_sales.loc[train_sales['date_block_num'] == date_block, 'item_id'].unique(),
    )))
    for date_block in range(34)
]

train_monthly = pd.DataFrame(np.vstack(regs),
                             columns=['date_block_num', 'shop_id', 'item_id'])

train_monthly.head()

In [None]:
# Size of the month x shop x item grid.
train_monthly.shape

* Add monthly sales

In [None]:
# Sum daily counts into one monthly total per (month, shop, item).
grid_keys = ['date_block_num', 'shop_id', 'item_id']

group_sales = (
    train_sales
    .groupby(grid_keys, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)

# Attach the totals to the grid; pairs without any sale get 0, and the target
# is clipped to the [0, 20] range.
train_monthly = pd.merge(train_monthly, group_sales, how='left', on=grid_keys)

train_monthly['item_cnt_month'] = (train_monthly['item_cnt_month']
                                   .fillna(0)
                                   .clip(lower=0, upper=20))

* Concat train and test

In [None]:
# The test rows are tagged as month 34 (the month right after the last training
# block) and stacked below the training grid with a fresh 0..N-1 index.
test = test.assign(date_block_num=34)
train_test_monthly = pd.concat([train_monthly, test], ignore_index=True)

### Get prices feats

In [None]:
# Average selling price of each item over the whole (filtered) sales history.
prices = (
    train_sales
    .groupby('item_id', as_index=False)['item_price']
    .mean()
    .rename(columns={'item_price': 'item_price_mean'})
    .fillna(0)
)

# Items that never appear in the sales history end up with NaN after this merge.
train_test_monthly = pd.merge(train_test_monthly, prices, how='left', on=['item_id'])

### Get the shop feats
* Formatting the shop dataset

In [None]:
# Display the full shop table to eyeball shop names.
df_shops

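Some shops appear under two different ids; each pair is merged into a single id:
* shop 0 is the same shop as 57
* shop 1 is the same shop as 58
* shop 39 is the same shop as 40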

In [None]:
# Collapse each duplicated shop id onto its canonical id.
duplicate_shop_ids = {0: 57, 1: 58, 39: 40}
train_test_monthly['shop_id'] = train_test_monthly['shop_id'].replace(duplicate_shop_ids)

In [None]:
# Join the two-word city name so that the first space-separated token is the whole city.
df_shops['shop_name'] = df_shops['shop_name'].replace(
    {'Сергиев Посад ТЦ "7Я"': 'СергиевПосад ТЦ "7Я"'})

# City = first token of the shop name; one spelling carries a stray leading '!'.
df_shops['city'] = (df_shops['shop_name']
                    .str.split(' ')
                    .str[0]
                    .replace({'!Якутск': 'Якутск'}))

* Merge with the train/test dataset

In [None]:
# Bring the city of each shop onto the monthly frame.
shop_cities = df_shops[['shop_id', 'city']]
train_test_monthly = pd.merge(train_test_monthly, shop_cities, on='shop_id', how='left')

* Encode

In [None]:
# Replace the city names by integer codes.
city_encoder = LabelEncoder()
train_test_monthly['city'] = city_encoder.fit_transform(train_test_monthly['city'])

### Get items categories and types

* Formatting the categories dataset

In [None]:
# Split each category name on '-' once and reuse the pieces.
category_parts = df_categories['item_category_name'].str.split('-')

# `type` is the text before the first '-'.
df_categories['type'] = category_parts.str[0].str.strip()

# `sub_type` is the second piece when there is one, otherwise it falls back to
# the first piece (same value as `type`).
df_categories['sub_type'] = (category_parts.str[1]
                             .fillna(category_parts.str[0])
                             .str.strip())

df_categories.head()

* Merge with the items dataset

In [None]:
# Give every item the type / sub_type of its category.
category_types = df_categories[['item_category_id', 'type', 'sub_type']]
df_items = pd.merge(df_items, category_types, how='left', on='item_category_id')

* Merge with the train/test dataset

In [None]:
# Attach category id, type and sub_type of each item to the monthly frame.
item_info = df_items[['item_id', 'item_category_id', 'type', 'sub_type']]
train_test_monthly = pd.merge(train_test_monthly, item_info, how='left', on='item_id')

* Encode

In [None]:
# Integer-encode the textual type and sub_type, each with its own encoder.
for text_col in ('type', 'sub_type'):
    train_test_monthly[text_col] = LabelEncoder().fit_transform(train_test_monthly[text_col])

### Get month

In [None]:
# Position of the block inside a 12-month cycle, numbered 1..12.
# NOTE(review): this equals the calendar month only if date_block_num 0 is a January - confirm.
train_test_monthly['month'] = train_test_monthly['date_block_num'].mod(12).add(1)

### Pair some feats

In [None]:
# Each new column is "<left>_<right>" built from the string form of two existing
# columns. Order matters: the *_month pairs reuse the id pairs created just before.
pair_specs = [
    ('shop_item_id',    'shop_id',      'item_id'),
    ('shop_cat_id',     'shop_id',      'item_category_id'),
    ('shop_type_id',    'shop_id',      'type'),
    ('item_city_id',    'item_id',      'city'),
    ('item_city_month', 'item_city_id', 'month'),
    ('shop_item_month', 'shop_item_id', 'month'),
    ('shop_cat_month',  'shop_cat_id',  'month'),
    ('shop_type_month', 'shop_type_id', 'month'),
]

for pair_col, left_col, right_col in pair_specs:
    train_test_monthly[pair_col] = (
        train_test_monthly[left_col].astype(str)
        .str.cat(train_test_monthly[right_col].astype(str), sep='_')
    )

In [None]:
# Smallest item_id present in the combined train/test frame.
train_test_monthly['item_id'].min()

In [None]:
# `month` was only needed to build the *_month pair columns above.
train_test_monthly = train_test_monthly.drop(columns=['month'])

* Encode pairs

In [None]:
# Turn every string pair column into integer codes, one fresh encoder per column.
pair_columns = [
    'shop_item_id',
    'shop_cat_id',
    'shop_type_id',
    'item_city_id',
    'item_city_month',
    'shop_item_month',
    'shop_cat_month',
    'shop_type_month',
]

for pair_col in pair_columns:
    train_test_monthly[pair_col] = LabelEncoder().fit_transform(train_test_monthly[pair_col])

#### Fill NaNs
* Some information in the test set is not present in the train set, so the test part of our dataset now contains some NaN values

In [None]:
# Number of missing values in each column of the combined frame.
train_test_monthly.isna().sum()

In [None]:
# Rows without a known item price receive the average price of their item type.
group_mean_type = train_test_monthly.groupby('type')['item_price_mean'].mean()

type_level_price = train_test_monthly['type'].map(group_mean_type)
train_test_monthly['item_price_mean'] = (train_test_monthly['item_price_mean']
                                         .fillna(type_level_price))

In [None]:
# First rows, transposed so that every column is visible as a row.
train_test_monthly.head().T

In [None]:
# Distinct cities / types / sub_types in the training months (date_block_num < 34).
train_part = train_test_monthly[train_test_monthly['date_block_num'] < 34]

for template, column in (('%i cities', 'city'),
                         ('%i types', 'type'),
                         ('%i sub_types', 'sub_type')):
    print(template % (train_part[column].nunique()))

In [None]:
# Distinct cities / types / sub_types in the test month (date_block_num == 34).
test_part = train_test_monthly[train_test_monthly['date_block_num'] == 34]

for template, column in (('%i cities', 'city'),
                         ('%i types', 'type'),
                         ('%i sub_types', 'sub_type')):
    print(template % (test_part[column].nunique()))

### Mean monthly sales per city

In [None]:
fig, ax = plt.subplots(figsize=(20, 5))

# Total sales per (month, city). Two deliberate choices:
#  * only training months are used - the test month (34) has no target, its sum
#    would be 0 and would pull every bar down;
#  * the target column is selected before summing, so the other columns are
#    not aggregated for nothing.
group_city = (
    train_test_monthly[train_test_monthly['date_block_num'] < 34]
    .groupby(by=['date_block_num', 'city'])[['item_cnt_month']]
    .sum()
    .reset_index()
)

# barplot averages the monthly totals of each city over the months.
sns.barplot(x='city', y='item_cnt_month', data=group_city, ax=ax)
ax.set(title='Mean monthly sales per city',
       xlabel='city (encoded)',
       ylabel='item_cnt_month (monthly total)');

### Mean monthly sales per type

In [None]:
fig, ax = plt.subplots(figsize=(20, 5))

# Total sales per (month, type). Two deliberate choices:
#  * only training months are used - the test month (34) has no target, its sum
#    would be 0 and would pull every bar down;
#  * the target column is selected before summing, so the other columns are
#    not aggregated for nothing.
group_type = (
    train_test_monthly[train_test_monthly['date_block_num'] < 34]
    .groupby(by=['date_block_num', 'type'])[['item_cnt_month']]
    .sum()
    .reset_index()
)

# barplot averages the monthly totals of each type over the months.
sns.barplot(x='type', y='item_cnt_month', data=group_type, ax=ax)
ax.set(title='Mean monthly sales per type',
       xlabel='type (encoded)',
       ylabel='item_cnt_month (monthly total)');

### Mean monthly sales per sub_type

In [None]:
fig, ax = plt.subplots(figsize=(20, 5))

# Total sales per (month, sub_type). Two deliberate choices:
#  * only training months are used - the test month (34) has no target, its sum
#    would be 0 and would pull every bar down;
#  * the target column is selected before summing, so the other columns are
#    not aggregated for nothing.
group_sub_type = (
    train_test_monthly[train_test_monthly['date_block_num'] < 34]
    .groupby(by=['date_block_num', 'sub_type'])[['item_cnt_month']]
    .sum()
    .reset_index()
)

# barplot averages the monthly totals of each sub_type over the months.
sns.barplot(x='sub_type', y='item_cnt_month', data=group_sub_type, ax=ax)
ax.set(title='Mean monthly sales per sub_type',
       xlabel='sub_type (encoded)',
       ylabel='item_cnt_month (monthly total)');

### Lag features

In [None]:
def col_months_lag(col, dataset, date_block_mean=True, use_similar=False):
    """Add lagged ``item_cnt_month`` features for the key column ``col``.

    For every row, the value that ``col`` had 1, 2, 3 and 12 months earlier
    (in ``date_block_num`` terms) is looked up and added as a new column.
    Months in which the key has no row contribute 0.

    Parameters
    ----------
    col : str
        Name of the key column (e.g. ``'item_id'``).
    dataset : pandas.DataFrame
        Must contain ``date_block_num``, ``col`` and ``item_cnt_month``.
        It is not modified; a copy is returned.
    date_block_mean : bool, default True
        Only selects the naming of the lag columns:
        ``<col>_date_block_mean_lag_<n>`` when True,
        ``<col>_month_lag_<n>`` when False.
        In both cases the lagged value is the mean of ``item_cnt_month`` over
        the rows sharing the same ``(date_block_num, col)``; when that pair is
        unique this is simply the row's own value.
    use_similar : bool, default False
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` with the lag 1, 2 and 12 columns plus
        ``<col>_3_months_mean``, ``<col>_3_months_std`` (over lags 1-3) and
        ``<col>_diff_1_2_months``. The lag-3 column is only an intermediate
        and is dropped.
    """
    lag_months = [1, 2, 3, 12]
    lag_prefix = col + ('_date_block_mean_lag_' if date_block_mean else '_month_lag_')

    df = dataset.copy()

    # One value per (month, key). The lags are then joined on the month number
    # itself, so "lag n" always means "n months earlier" - a positional
    # groupby().shift(n) would instead return the n-th previous *row* of the
    # key, which is wrong whenever the key is missing in some months.
    monthly_value = (df.groupby(['date_block_num', col],
                                as_index=False)['item_cnt_month']
                       .mean()
                       .fillna(0))

    for lag in lag_months:
        lag_col = lag_prefix + str(lag)

        shifted = monthly_value.rename(columns={'item_cnt_month': lag_col})
        # Month m of the lookup table describes what month m + lag sees as its lag.
        shifted['date_block_num'] = shifted['date_block_num'] + lag

        df = df.merge(shifted, on=['date_block_num', col], how='left')
        # Assign back instead of a chained inplace fillna, which is not
        # guaranteed to write through to the frame.
        df[lag_col] = df[lag_col].fillna(0)

    recent_lags = [lag_prefix + str(lag) for lag in (1, 2, 3)]
    df[col + '_3_months_mean'] = df[recent_lags].mean(axis=1)
    df[col + '_3_months_std'] = df[recent_lags].std(axis=1)
    df[col + '_diff_1_2_months'] = df[lag_prefix + '1'] - df[lag_prefix + '2']

    df = df.drop(columns=[lag_prefix + '3'])

    if not date_block_mean:
        # This mode historically kept the caller's index. The left merges above
        # keep row order and row count (the lookup keys are unique), so the
        # original index can be put back as is.
        df.index = dataset.index

    return df

In [None]:
# Lag features for three keys, applied in this order. The shop/item pair uses the
# `_month_lag_` naming, the two others the `_date_block_mean_lag_` naming.
for lag_key, use_block_mean in (('shop_item_id', False),
                                ('shop_type_id', True),
                                ('item_id', True)):
    train_test_monthly = col_months_lag(lag_key, train_test_monthly, use_block_mean)

### Adding similar items lag feature
* Assuming that item_id is similar to item_id-1, we will add the sales from the previous month from these similar items

In [None]:
similar_keys = ['date_block_num', 'shop_id', 'item_id']

# Last recorded monthly count per (month, shop, item), re-keyed to
# (month + 1, shop, item_id + 1): after the merge each row sees what the
# previous item id sold in the same shop one month earlier.
similar_group_mean = (
    train_test_monthly
    .groupby(by=similar_keys, as_index=False)['item_cnt_month']
    .last()
    .fillna(0)
    .rename(columns={'item_cnt_month': 'shop_similar_item_id_month_lag_1'})
    .assign(item_id=lambda grouped: grouped['item_id'] + 1,
            date_block_num=lambda grouped: grouped['date_block_num'] + 1)
)

# Rows without a match keep NaN in the new column.
train_test_monthly = pd.merge(train_test_monthly, similar_group_mean,
                              how='left', on=similar_keys)

In [None]:
def RMSE(y_true, y_pred):
    """Root mean squared error between the true and the predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

### Validation split
* To simulate the test split, our validation will be all months of 2015 in the train set, and the train data will be all the months before the month of validation
#### Example:
   **Validation:** date_block_num=24;
   **Train:** date_block_num<24
   
   **Validation:** date_block_num=25;
   **Train:** date_block_num<25
   
   **Validation:** date_block_num=26;
   **Train:** date_block_num<26
   
   **.**
   **.**
   **.**

In [None]:
def split_date_block_num(df, date_block_num):
    """Split ``df`` around one month.

    Returns a pair of independent copies:
    the rows strictly before ``date_block_num`` (original index kept), and
    the rows of exactly ``date_block_num`` (index reset to 0..n-1).
    Rows of later months are in neither part.
    """
    months = df['date_block_num']

    earlier = df.loc[months < date_block_num].copy()
    current = df.loc[months == date_block_num].copy().reset_index(drop=True)

    return earlier, current

In [None]:
# Separate the engineered frame again: months before 34 become the training set and
# month 34 (the test rows, index reset to 0..n-1) becomes the test set.
# Note: this rebinds `train_sales` and `test`, which previously held the raw inputs.
train_sales, test = split_date_block_num(train_test_monthly, 34)

In [None]:
# Months 0-2 are dropped from training.
# presumably because their 1-3 month lags fall before the start of the data - TODO confirm
train_sales = train_sales.loc[lambda frame: frame['date_block_num'] > 2]

In [None]:
# Identifier-like features (raw ids and encoded pairs).
id_feats = [
    'shop_id',
    'item_id',
    'shop_item_id',
    'shop_cat_id',
    'shop_type_id',
    'item_city_month',
    'shop_item_month',
]

# Lag features of the shop/item pair.
shop_item_lag_feats = [
    'shop_item_id_month_lag_1',
    'shop_item_id_month_lag_2',
    'shop_item_id_month_lag_12',
    'shop_item_id_3_months_mean',
    'shop_item_id_3_months_std',
    'shop_item_id_diff_1_2_months',
]

# Lag features of the shop/type pair.
shop_type_lag_feats = [
    'shop_type_id_date_block_mean_lag_1',
    'shop_type_id_date_block_mean_lag_2',
    'shop_type_id_date_block_mean_lag_12',
    'shop_type_id_3_months_mean',
    'shop_type_id_3_months_std',
    'shop_type_id_diff_1_2_months',
]

# Lag features of the item.
item_lag_feats = [
    'item_id_date_block_mean_lag_1',
    'item_id_date_block_mean_lag_2',
    'item_id_date_block_mean_lag_12',
    'item_id_3_months_mean',
    'item_id_3_months_std',
    'item_id_diff_1_2_months',
]

# Final model inputs, in the order the model receives them.
select_feats = (id_feats
                + shop_item_lag_feats
                + shop_type_lag_feats
                + item_lag_feats
                + ['shop_similar_item_id_month_lag_1'])

* Best feats found for lgbm

In [None]:
# LightGBM training parameters.
params = dict(
    num_iterations=140,
    max_depth=15,
    objective='rmse',
    metric='rmse',
    num_leaves=2 ** 11 - 1,
    learning_rate=0.025,
    feature_fraction=0.4,
    bagging_fraction=0.2,
    bagging_freq=5,
    seed=1,
    verbose=1,
)

# Features that LightGBM must treat as categorical rather than ordinal.
cat_cols = [
    'shop_id',
    'item_id',
    'shop_item_id',
    'shop_cat_id',
    'shop_type_id',
    'item_city_month',
    'shop_item_month',
]

In [None]:
# Expanding-window validation: for each month 24..33, train on every earlier
# month and score on that month. Validation RMSEs are collected in `errors`.
errors = []

for block_num in range(24, 34):

    train, validation = split_date_block_num(train_sales, block_num)

    x_train = train[select_feats].values
    y_train = train['item_cnt_month'].values

    x_val = validation[select_feats].values
    y_val = validation['item_cnt_month'].values

    # The frames are no longer needed once the arrays are extracted.
    del train, validation

    print('TRAINING %i MONTHS:'%(block_num))

    start = time()

    # Feature names and categorical columns are declared on the Dataset, where
    # LightGBM expects them; passing them to train() is deprecated in newer
    # releases. No validation Dataset is built, because train() was never
    # given one - the month is scored explicitly below.
    lgbmr_train = lgbm.Dataset(x_train,
                               y_train,
                               feature_name=select_feats,
                               categorical_feature=cat_cols)

    lgbmr_trained = lgbm.train(params, lgbmr_train)
    end = time()

    del lgbmr_train

    # Predictions are clipped to [0, 20], the same range as the target.
    y_train_pred = lgbmr_trained.predict(x_train).clip(0, 20)

    train_rmse = RMSE(y_train, y_train_pred)
    print('Train RMSE = %.5f'%(train_rmse))

    y_val_pred = lgbmr_trained.predict(x_val).clip(0, 20)

    val_rmse = RMSE(y_val, y_val_pred)
    print('Validation RMSE = %.5f'%(val_rmse))
    errors.append(val_rmse)

    print('Training time = %.2f s'%(end - start))

    print()

    # Release the fold's arrays and model before the next, larger fold.
    del x_train, y_train, x_val, y_val
    del lgbmr_trained, y_train_pred, y_val_pred

In [None]:
# Mean and standard deviation of the validation RMSE over all folds.
fold_errors = np.array(errors)
print('RMSE = %.5f +/- %.5f'%(fold_errors.mean(), fold_errors.std()))

In [None]:
# Arrays for the final fit on all training months, and for the test month.
x_train = train_sales[select_feats].to_numpy()
y_train = train_sales['item_cnt_month'].to_numpy()

x_test = test[select_feats].to_numpy()

In [None]:
# Final model: same parameters as in validation, trained on every training month.
start = time()

# Feature names and categorical columns are declared on the Dataset, where
# LightGBM expects them; passing them to train() is deprecated in newer releases.
lgbmr_train = lgbm.Dataset(x_train,
                           y_train,
                           feature_name=select_feats,
                           categorical_feature=cat_cols)

lgbmr_trained = lgbm.train(params, lgbmr_train)
end = time()

print('Training time = %.2f s'%(end - start))

In [None]:
# Plot the feature importances of the final model.
plot_importance(lgbmr_trained)

In [None]:
# `test` comes out of split_date_block_num with a fresh 0..n-1 index, and that
# position is used as the submission ID. Predictions are clipped to [0, 20].
test_predictions = lgbmr_trained.predict(x_test).clip(0, 20)

test = test.assign(ID=test.index, item_cnt_month=test_predictions)

In [None]:
# Submission file: one row per test ID with its predicted monthly count.
submission = test[['ID', 'item_cnt_month']]
submission.to_csv("lgbm_submission.csv", index=False)

In [None]:
# Persist the trained model to disk with joblib.
joblib.dump(lgbmr_trained, 'lgbm_model.joblib')