In [None]:
import pandas as pd
import numpy as np

from itertools import product
from sklearn.preprocessing import LabelEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import time
import sys
import gc
import pickle
sys.version_info

# Loading Data

In [None]:
# Read the shops table from the competition input folder and display summary statistics.
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
shops.describe()

In [None]:
# Read the item-category table from the competition input folder and display summary statistics.
categories = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
categories.describe()

In [None]:
# Read the items table from the competition input folder and display summary statistics.
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
items.describe()

In [None]:
# Read the daily sales records (the training data) and display summary statistics.
training_set = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')
training_set.describe()

In [None]:
# Read the test table (the rows to predict) and display summary statistics.
test = pd.read_csv('../input/competitive-data-science-predict-future-sales/test.csv')
test.describe()

# Data Exploration & EDA

## Training Dataset

In [None]:
# Overview figure on a 3x3 grid: the two wide rows show normalized value counts,
# the middle row shows raw histograms of three columns.
fig = plt.figure(figsize=(18,9))
plt.subplots_adjust(hspace=.5)
grid_shape = (3,3)

# Top row: share of training rows per shop.
ax_shops = plt.subplot2grid(grid_shape, (0,0), colspan=3)
training_set['shop_id'].value_counts(normalize=True).plot(kind='bar', alpha=0.7, ax=ax_shops)
ax_shops.set_title('Shop ID Values in the Training Set (Normalized)')

# Middle row: one histogram per column; an empty dict keeps the default colour.
histogram_specs = [
    ('item_id', 'Item ID Histogram', {}),
    ('item_price', 'Item Price Histogram', {'color': 'orange'}),
    ('item_cnt_day', 'Item Count Day Histogram', {'color': 'green'}),
]
for grid_col, (column, title, extra_style) in enumerate(histogram_specs):
    ax_hist = plt.subplot2grid(grid_shape, (1, grid_col))
    training_set[column].plot(kind='hist', alpha=0.7, ax=ax_hist, **extra_style)
    ax_hist.set_title(title)

# Bottom row: share of training rows per month index.
ax_months = plt.subplot2grid(grid_shape, (2,0), colspan=3)
training_set['date_block_num'].value_counts(normalize=True).plot(kind='bar', alpha=0.7, ax=ax_months)
ax_months.set_title('Month (date_block_num) Values in the Training Set (Normalized)')

plt.show()

From the graphs above,

* The 60 different shop IDs are unevenly distributed in the dataset. Four of these shops account for around 25 percent of the rows (the four leftmost bars of the first plot, since the bars are sorted by frequency).
* The item IDs seem to vary in frequency, but it is not possible to draw any further conclusions yet.
* From the vast empty spaces in the histograms of 'item_price' and 'item_cnt_day', it is possible to argue that there are outliers in their distributions.
* Plotting the individual months from January 2013 to October 2015, it is possible to see that the December months are the ones with the highest number of sales records.

### Outliers by price and sales volume

Following the previous intuition about outliers, some boxplots show that there are quite a few! Because of that, an empirical estimate is made (by looking at the boxplots) to identify the outliers.

In [None]:
# Box plot of the daily sold quantity; the x-range is fixed before plotting.
fig_cnt, ax_cnt = plt.subplots(figsize=(10,4))
ax_cnt.set_xlim(-100, 3000)
sns.boxplot(x=training_set['item_cnt_day'], ax=ax_cnt)
volume_outlier_items = training_set.loc[training_set['item_cnt_day'] >= 1000, 'item_id'].unique()
print('Sale volume outliers:', volume_outlier_items)

# Box plot of the item price across its full observed range.
fig_price, ax_price = plt.subplots(figsize=(10,4))
ax_price.set_xlim(training_set['item_price'].min(), training_set['item_price'].max())
sns.boxplot(x=training_set['item_price'], ax=ax_price)
price_outlier_items = training_set.loc[training_set['item_price'] >= 100000, 'item_id'].unique()
print('Item price outliers:', price_outlier_items)

From that, I proceed to remove outliers from the training data set. Additionally, there is one price below zero (as seen when describing the data), so I change the value with the median

In [None]:
# Keep only rows below the outlier thresholds read off the box plots
# (price < 100000, daily count < 1001). The explicit .copy() makes the result an
# independent frame, so the .loc / column assignments in this and later cells
# write to it directly instead of to a slice of the raw frame
# (which triggers SettingWithCopyWarning).
training_set = training_set[
    (training_set['item_price'] < 100000) & (training_set['item_cnt_day'] < 1001)
].copy()

# Replace the negative price with the median positive price of one specific
# (shop 32, item 2973, month 4) group.
# NOTE(review): these ids are hard-coded; presumably they identify the row with the
# negative price — confirm against the data.
median = training_set[(training_set['shop_id']==32)&(training_set['item_id']==2973)&(training_set['date_block_num']==4)&(training_set['item_price']>0)]['item_price'].median()
training_set.loc[training_set['item_price']<0, 'item_price'] = median

## Revenue
I believe that an interesting feature to have would be the revenue (total amount of money) from each transaction/sale

In [None]:
# Revenue of each record: unit price times the quantity sold that day.
training_set['revenue'] = training_set['item_price'] *  training_set['item_cnt_day']

## Shops Dataset

In [None]:
# Peek at the first rows of the shops table.
shops.head()

Additionally, by looking at the actual names and reading the community forums and notebooks, it was possible to determine that some shops have duplicated id/name.
* 11 and 10
* 1 and 58
* 0 and 57

In [None]:
# Duplicated shops: each key is rewritten to the id that is kept, in both the
# training and the test frame so the two stay consistent.
duplicate_shop_ids = {0: 57, 1: 58, 10: 11}
for duplicate_id, kept_id in duplicate_shop_ids.items():
    for frame in (training_set, test):
        frame.loc[frame['shop_id'] == duplicate_id, 'shop_id'] = kept_id

### Shop Names
After using Google (and Google Maps) for a while, it was possible to understand that the structure of `shop_name` is "City" - "Type" - "Name". With that, I use a **Label Encoder** to encode the city of each store.

In [None]:
# Join the two-word city name so that the first space-separated token is the whole city.
is_sergiev_posad = shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"'
shops.loc[is_sergiev_posad, 'shop_name'] = 'СергиевПосад ТЦ "7Я"'

# City = first token of the shop name; one city carries a stray leading '!'.
shops['city'] = shops['shop_name'].str.split(' ').str[0]
shops['city'] = shops['city'].replace('!Якутск', 'Якутск')

# Integer-encode the city and keep only the columns needed for the later merge.
city_encoder = LabelEncoder()
shops['city_code'] = city_encoder.fit_transform(shops['city'])
shops = shops[['shop_id','city_code']]

shops.head()

## Categories Dataset

As done with the shops, I encode information about the categories, such as the name, type and subtype

In [None]:
# Category names are split on '-': the first part is the type, the second (when present) the subtype.
name_parts = categories['item_category_name'].str.split('-')
type_name = name_parts.str[0].str.strip()

# When there is no second part, the type doubles as the subtype.
subtype_name = name_parts.map(lambda parts: parts[1 if len(parts) > 1 else 0].strip())

# Keep only the id and the two integer encodings.
categories = pd.DataFrame({
    'item_category_id': categories['item_category_id'],
    'type_code': LabelEncoder().fit_transform(type_name),
    'subtype_code': LabelEncoder().fit_transform(subtype_name),
})

categories.head()

## Items Dataset
For now, I didn't find any good or relevant use to the items names, so I proceeded to remove them from the data set

In [None]:
# The item name is not used as a feature, so only the remaining columns are kept.
items = items.drop(columns=['item_name'])
items.head()

## Test Dataset

I just change the dtypes of the test data for future convenience. Additionally, I add the *date block* number of the month that the test data will be predicting.

In [None]:
# Tag every test row with month index 34 and downcast the key columns
# to the same compact dtypes used for the training grid.
test = test.assign(date_block_num=34).astype({
    'date_block_num': np.int8,
    'shop_id': np.int8,
    'item_id': np.int16,
})

## Feature Engineering
Since we have to calculate monthly sales, the creation and modification of features consists of using and extending the information we have for each unique (item, shop) pair within a month, i.e. each (item, shop, month) triple. This way the training data will be similar to the test data.

In [None]:
## Creation of the data structure for feature engineering
# Placeholder for the (month, shop, item) grid that the following cells build.
matrix = []

In [None]:
## Generating the pairs
# For every month, take the cartesian product of the shops and items that had at
# least one sale record in that month. The per-month arrays are collected in a
# local list (not in `matrix` itself), so this cell can be re-run on its own
# without first resetting `matrix` to an empty list.
cols = ['date_block_num','shop_id','item_id']
monthly_grids = []
for i in range(34):
    sales = training_set[training_set['date_block_num']==i]
    monthly_grids.append(np.array(list(product([i], sales['shop_id'].unique(), sales['item_id'].unique())), dtype='int16'))

# Stack all months into one frame and downcast the key columns to save memory.
matrix = pd.DataFrame(np.vstack(monthly_grids), columns=cols)
matrix['date_block_num'] = matrix['date_block_num'].astype(np.int8)
matrix['shop_id'] = matrix['shop_id'].astype(np.int8)
matrix['item_id'] = matrix['item_id'].astype(np.int16)
matrix = matrix.sort_values(cols)

With the matrix generated, I proceed to generate new aggregated features

In [None]:
# Monthly target: total quantity sold per (month, shop, item).
temp_group = (
    training_set
    .groupby(['date_block_num','shop_id','item_id'])
    .agg(item_cnt_month=('item_cnt_day', 'sum'))
    .reset_index()
)

# Grid rows without any sale get 0; the target is clipped to [0, 20] and stored as float16.
matrix = matrix.merge(temp_group, on=cols, how='left')
clipped_target = matrix['item_cnt_month'].fillna(0).clip(0,20)
matrix['item_cnt_month'] = clipped_target.astype(np.float16)

Likewise, I use the previous modification in the test dataset to add the month to predict to the matrix.

In [None]:
# Append the test rows (month 34) below the training grid. `keys=` is dropped:
# it has no effect together with ignore_index=True, and the three column names
# passed before did not match the two frames being concatenated.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)

# Columns that exist in only one of the two frames come out of the concat as NaN
# on the other frame's rows (e.g. item_cnt_month on the test rows); fill them with 0.
matrix = matrix.fillna(0)

Afterwards, using the EDA done to the other datasets, I proceed to join them with the feature-engineered datastructure

In [None]:
# Attach the encoded shop, item and category attributes via left joins,
# so every grid row is kept.
matrix = (
    matrix
    .merge(shops, on=['shop_id'], how='left')
    .merge(items, on=['item_id'], how='left')
    .merge(categories, on=['item_category_id'], how='left')
)

In [None]:
## Downcast the joined code columns to int8 to ease future processing
for code_column in ('city_code', 'item_category_id', 'type_code', 'subtype_code'):
    matrix[code_column] = matrix[code_column].astype(np.int8)

### Mean Encodings

In [None]:
def lags(df, lags, col):
    """Add lagged copies of ``col`` to ``df``.

    For every step ``k`` in ``lags`` a column ``<col>_lag_<k>`` is added that holds
    the value ``col`` had for the same (shop_id, item_id) ``k`` months earlier.
    Rows with no matching earlier month get NaN (left join).

    Parameters: ``df`` must contain date_block_num, shop_id, item_id and ``col``;
    ``lags`` is an iterable of integer month offsets.
    Returns the merged frame; the frame passed in is not modified.
    """
    keys = ['date_block_num','shop_id','item_id']
    history = df[keys + [col]]
    for step in lags:
        # Moving the month index forward by `step` makes month m line up with month m+step.
        lagged = history.rename(columns={col: col+'_lag_'+str(step)})
        lagged = lagged.assign(date_block_num=lagged['date_block_num'] + step)
        df = df.merge(lagged, on=keys, how='left')
    return df

In [None]:
# Add the monthly target from 1, 2, 3, 6 and 12 months earlier as features.
matrix = lags(matrix, [1,2,3,6,12], 'item_cnt_month')

I create some additional aggregates for their posterior encoding

In [None]:
# Mean monthly item count over all grid rows, per month
group_keys = ['date_block_num']
feature = 'date_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

## Only the 1-month lag is kept; the same-month mean is computed from the target itself and is dropped
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, item)
group_keys = ['date_block_num', 'item_id']
feature = 'date_item_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

## Keep the 1, 2, 3, 6 and 12 month lags; drop the same-month mean
matrix = lags(matrix, [1,2,3,6,12], feature).drop(columns=[feature])
matrix.head()

In [None]:
## Mean monthly item count per (month, shop)
group_keys = ['date_block_num', 'shop_id']
feature = 'date_shop_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1, 2, 3, 6 and 12 month lags; drop the same-month mean
matrix = lags(matrix, [1,2,3,6,12], feature).drop(columns=[feature])
matrix.head()

In [None]:
## Mean monthly item count per (month, item category)
group_keys = ['date_block_num', 'item_category_id']
feature = 'date_cat_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

## Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, shop, item category)
group_keys = ['date_block_num', 'shop_id', 'item_category_id']
feature = 'date_shop_cat_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, shop, category type code)
group_keys = ['date_block_num', 'shop_id', 'type_code']
feature = 'date_shop_type_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, shop, category subtype code)
group_keys = ['date_block_num', 'shop_id', 'subtype_code']
feature = 'date_shop_subtype_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, city code)
group_keys = ['date_block_num', 'city_code']
feature = 'date_city_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, item, city code)
group_keys = ['date_block_num', 'item_id', 'city_code']
feature = 'date_item_city_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, category type code)
group_keys = ['date_block_num', 'type_code']
feature = 'date_type_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

In [None]:
# Mean monthly item count per (month, category subtype code)
group_keys = ['date_block_num', 'subtype_code']
feature = 'date_subtype_avg_item_cnt'
temp_group = matrix.groupby(group_keys).agg(**{feature: ('item_cnt_month', 'mean')}).reset_index()

matrix = matrix.merge(temp_group, on=group_keys, how='left')
matrix[feature] = matrix[feature].astype(np.float16)

# Keep the 1-month lag only; drop the same-month mean
matrix = lags(matrix, [1], feature).drop(columns=[feature])
matrix.head()

### Additional Features

In [None]:
## Feature to relate the months along the dataset's years
# date_block_num modulo 12 gives a 0-11 position within the year.
matrix['month'] = matrix['date_block_num'] % 12

In [None]:
## Feature to indicate the number of days per month
# Position k of `days` holds the day count for month value k (0-11); February is fixed at 28.
days = pd.Series([31,28,31,30,31,30,31,31,30,31,30,31])
days_by_month = days.to_dict()
matrix['days'] = matrix['month'].map(days_by_month).astype(np.int8)

### Final Preprocessing before training

Because 12 is used as a lag value, the first 12 months (date_block_num 0–11) have no complete lag history, so it is necessary to drop them. Likewise, I remove the columns holding the current month's calculated values.

In [None]:
# Keep months 12 and later; earlier months cannot have a 12-month lag value.
matrix = matrix[matrix.date_block_num > 11]

In [None]:
## Filling NAs from the lags
def process_nas(df):
    """Replace missing values with 0 in the lagged item-count columns of ``df``.

    A column is filled only if its name contains both ``'_lag_'`` and
    ``'item_cnt'`` and it has at least one missing value; every other column is
    left untouched. The columns are reassigned on ``df`` itself and ``df`` is
    returned.
    """
    for col in df.columns:
        if '_lag_' in col and 'item_cnt' in col and df[col].isnull().any():
            # Assign the filled column back instead of calling fillna(inplace=True)
            # on df[col]: the in-place call acts on a temporary object and is not
            # guaranteed to update df (notably when df is a filtered slice).
            df[col] = df[col].fillna(0)
    return df

# Zero-fill the missing lag values across the whole feature matrix.
matrix = process_nas(matrix)

Ready to play! But first, remove unnecessary information to avoid running out of memory (happened a couple of times, unfortunately).

In [None]:
# Persist the feature matrix, then release the large intermediates.
# `test` is intentionally kept: the submission cell still needs it.
matrix.to_pickle('data.pkl')

# The previous names (`group`, `cats`, `train`) were never defined in this
# notebook and raised NameError; these are the names the cells above actually use.
del matrix
del temp_group
del items
del shops
del categories
del training_set

gc.collect();

# Training the model

In [None]:
# Columns fed to the models, in a fixed order: keys and target, static codes,
# target lags, lagged group means, and the two calendar features.
lag_steps = (1, 2, 3, 6, 12)
feature_columns = (
    ['date_block_num', 'shop_id', 'item_id', 'item_cnt_month',
     'city_code', 'item_category_id', 'type_code', 'subtype_code']
    + ['item_cnt_month_lag_' + str(step) for step in lag_steps]
    + ['date_avg_item_cnt_lag_1']
    + ['date_item_avg_item_cnt_lag_' + str(step) for step in lag_steps]
    + ['date_shop_avg_item_cnt_lag_' + str(step) for step in lag_steps]
    + ['date_cat_avg_item_cnt_lag_1',
       'date_shop_cat_avg_item_cnt_lag_1',
       'date_city_avg_item_cnt_lag_1',
       'date_item_city_avg_item_cnt_lag_1',
       'month',
       'days']
)

# Reload the matrix written by the feature-engineering part and keep only those columns.
data = pd.read_pickle('data.pkl')
data = data[feature_columns]

## Data Splitting
The validation strategy (in terms of `date_block_num`) is:
* month 34 for the test set
* month 33 for the validation set
* months 12-32 for the training set.

In [None]:
# Time-based split on the month index: < 33 train, == 33 validation, == 34 test.
target = 'item_cnt_month'
month_index = data['date_block_num']

train_rows = data[month_index < 33]
X_train, Y_train = train_rows.drop(columns=[target]), train_rows[target]

valid_rows = data[month_index == 33]
X_valid, Y_valid = valid_rows.drop(columns=[target]), valid_rows[target]

# The test month has no real target, so only the features are kept.
X_test = data[month_index == 34].drop(columns=[target])

## Remove unnecessary variables to free memory
del data, month_index, train_rows, valid_rows
gc.collect();

For the training, the hyperparameters were adjusted using the insights gathered during the course and from the documentation.

In [None]:
## First Model

# Booster configuration, kept in one place so it is easy to tune.
xgb_params = dict(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
)
model = XGBRegressor(**xgb_params)

# RMSE is tracked on both the training and the validation month; training stops
# once the score on the last eval set has not improved for 10 rounds.
# NOTE(review): newer xgboost releases reportedly expect `eval_metric` and
# `early_stopping_rounds` on the constructor instead of fit() — confirm against
# the installed version.
fit_options = dict(
    eval_metric="rmse",
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
    early_stopping_rounds=10,
)
model.fit(X_train, Y_train, **fit_options)

In [None]:
## Second Model
# A shallow random forest trained on all cores, with a fixed seed for reproducibility.
rf_params = dict(
    n_estimators=50,
    max_depth=7,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)
rf_model = RandomForestRegressor(**rf_params)

rf_model.fit(X_train, Y_train)

# Ensembling Models

To combine the first-level model predictions, I'll use a simple linear regression (as I'm only feeding the model with predictions, a complex model is not necessary).

## Models Results

In [None]:
## Predictions from XGB on the validation and test months
xgb_val_pred, xgb_test_pred = (model.predict(features) for features in (X_valid, X_test))

## Predictions from RF on the validation and test months
rf_val_pred, rf_test_pred = (rf_model.predict(features) for features in (X_valid, X_test))

In [None]:
# Feature importance of the XGBoost model, drawn on a tall single-axes figure
fig, ax = plt.subplots(figsize=(10, 14))
plot_importance(model, ax=ax)

## **Ensembling Architecture**

1st level:
* XGBM
* Random forest

2nd level:
* Linear Regression

In [None]:
# Training set of the ensemble model: one column per first-level model's
# validation prediction, plus the true validation target as 'label'.
first_level = pd.DataFrame({
    'xgbm': xgb_val_pred,
    'random_forest': rf_val_pred,
    'label': Y_valid.values,
})
first_level.head(20)

In [None]:
# Test set of the ensemble model: the same two prediction columns, for the test month.
first_level_test = pd.DataFrame({
    'xgbm': xgb_test_pred,
    'random_forest': rf_test_pred,
})

This is the model that will combine the other ones to hopefully make an overall better prediction.

In [None]:
# Second-level model: a linear combination of the two first-level predictions,
# fitted on the validation month (the first-level models never saw its target).
meta_model = LinearRegression(n_jobs=-1)

# Remove the target column before fitting. errors='ignore' keeps this cell
# re-runnable: on a second run 'label' is already gone and the drop is a no-op
# instead of a KeyError.
first_level = first_level.drop(columns=['label'], errors='ignore')
meta_model.fit(first_level, Y_valid)

In [None]:
# Ensemble output for the test month (used for the submission) ...
final_predictions = meta_model.predict(first_level_test)

# ... and for the validation month, i.e. the same rows the meta-model was fitted on,
# so the reported RMSE is an in-sample score.
ensemble_pred = meta_model.predict(first_level)
ensemble_rmse = np.sqrt(mean_squared_error(ensemble_pred, Y_valid))
print('Train rmse:', ensemble_rmse)

# Submission

In [None]:
# Build the submission: one row per test row, predictions clipped to [0, 20]
# (the same range the training target was clipped to).
# NOTE(review): this uses the test frame's index as the ID — presumably it matches
# the ID column of test.csv; confirm.
submission = pd.DataFrame({
    "ID": test.index,
    "item_cnt_month": final_predictions.clip(0., 20.)
})
submission.to_csv('submission.csv', index=False)

# Save the predictions for a later ensemble. Each file is opened in a `with`
# block so its handle is flushed and closed deterministically (the bare
# open(...) calls used before were never closed).
saved_predictions = [
    ('xgb_val.pickle', xgb_val_pred),
    ('xgb_test.pickle', xgb_test_pred),
    ('rf_val.pickle', rf_val_pred),
    ('rf_test.pickle', rf_test_pred),
    ('ensemble_pred.pickle', final_predictions),
]
for filename, predictions in saved_predictions:
    with open(filename, 'wb') as handle:
        pickle.dump(predictions, handle)