In [None]:
# Sources
# https://www.kaggle.com/dimitreoliveira/model-stacking-feature-engineering-and-eda
# https://www.kaggle.com/dlarionov/feature-engineering-xgboost

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
from itertools import product
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split

import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

from sklearn.metrics import mean_squared_error as mse

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.metrics import confusion_matrix, make_scorer
import shap

def plot_features(booster, figsize):
    """Plot feature importances of `booster` on a fresh single-axes figure.

    `figsize` is forwarded to `plt.subplots`; the value returned by
    `plot_importance` is handed back to the caller unchanged.
    """
    _, importance_ax = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    drawn = plot_importance(booster=booster, ax=importance_ax)
    return drawn

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Directory that holds the competition CSV files. Keeping it in one constant
# means the notebook can be pointed at another location with a single edit.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
cats = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))
train = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
# The test rows are indexed by their submission ID.
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv')).set_index('ID')

In [None]:
train.head()

In [None]:
cats.head()

In [None]:
shops.head()

In [None]:
items.head()

In [None]:
test.head()

In [None]:
train.groupby(['shop_id'])['item_cnt_day'].sum().plot(kind='bar', figsize=(15,5))

In [None]:
train.groupby(['item_id'])['item_cnt_day'].sum().sort_values(ascending=False)

In [None]:
# Keep only the sales rows whose price is strictly positive.
train = train.query('item_price > 0')

In [None]:
# Box plot of the daily unit counts; x-limits are set to [-100, 3000] before drawing.
plt.figure(figsize=(10,4))
plt.xlim(-100, 3000)
sns.boxplot(x=train.item_cnt_day)

# Box plot of item prices; x-limits run from the observed minimum to 110% of the observed maximum.
plt.figure(figsize=(10,4))
plt.xlim(train.item_price.min(), train.item_price.max()*1.1)
sns.boxplot(x=train.item_price)

In [None]:
# Discard extreme rows: prices of 100000 or more and daily counts above 1000.
is_sane_price = train.item_price < 100000
is_sane_count = train.item_cnt_day < 1001
train = train[is_sane_price & is_sane_count]

In [None]:
# Median of the positive prices recorded for item 2973 in shop 32 during month 4;
# it is the replacement value used on the next line.
median = train[(train.shop_id==32)&(train.item_id==2973)&(train.date_block_num==4)&(train.item_price>0)].item_price.median()
# Overwrite negative prices with that median.
# NOTE(review): the earlier cell `train = train.query('item_price > 0')` already removes
# every non-positive price, so once it has run this mask selects no rows.
# Confirm which of the two treatments (drop vs. impute) is intended.
train.loc[train.item_price<0, 'item_price'] = median

median

In [None]:
print(shops['shop_name'].duplicated().sum())
print(shops['shop_id'].duplicated().sum())

In [None]:
# Per item: median price and total units sold over the whole training period.
group = (train.groupby('item_id')
              .agg({'item_price': ['median'], 'item_cnt_day': ['sum']}))
group.columns = ['median_price', 'sum_sales']
group = group.reset_index()

# Bar chart of the 50 best-selling items; each bar is labelled with the item's median price.
top_sellers = (group[['median_price', 'sum_sales']]
               .sort_values(by=['sum_sales'], ascending=False)
               .round(1)
               .head(50))
top_sellers.set_index('median_price').plot(kind='bar', figsize=(20,8))

In [None]:
group[['item_id', 'median_price', 'sum_sales']].sort_values(by=['sum_sales'], ascending=False).round(1).head(10).set_index('median_price')

In [None]:
# Revenue proxy per item: median price multiplied by total units sold.
group['total_item_revenue'] = group['median_price'] * group['sum_sales']

# Bar chart of the 50 items with the largest revenue proxy, labelled by item_id.
top_revenue = (group[['item_id', 'total_item_revenue']]
               .sort_values(by=['total_item_revenue'], ascending=False)
               .round(1)
               .head(50))
top_revenue.set_index('item_id').plot(kind='bar', figsize=(20,8))

In [None]:
# Most sold item: 
print(items[['item_name', 'item_id']][items['item_id'] == 20949])
# Free translation: Branded package T-shirt 1C white (34 * 42)
print('-'*75)
# Most revenue generated by a single item:
print(items[['item_name', 'item_id']][items['item_id'] == 6675])
# Sony Playstation 4 (500 Gb) Black

The data is spread across several tables: items, cats, shops and train.
In addition, the sales in train are recorded as daily counts rather than monthly totals. Since the objective is to predict the sales of month 34 (`date_block_num == 34`), it makes sense to aggregate the values by month.

In [None]:
# For every month, enumerate each (shop, item) pair built from the shops and
# items that recorded at least one sale in that month.
cols = ['date_block_num','shop_id','item_id']
monthly_grids = []
for block in range(34):
    month_sales = train[train.date_block_num == block]
    pairs = product([block], month_sales.shop_id.unique(), month_sales.item_id.unique())
    monthly_grids.append(np.array(list(pairs), dtype='int16'))

# Stack the monthly grids into one frame, shrink the dtypes and order by the key columns.
matrix = (pd.DataFrame(np.vstack(monthly_grids), columns=cols)
            .astype({'date_block_num': np.int8, 'shop_id': np.int8, 'item_id': np.int16})
            .sort_values(cols))

print(matrix)

In [None]:
# Sum the daily counts into one monthly total per (month, shop, item).
group = (train.groupby(['date_block_num','shop_id','item_id'])
              .agg({'item_cnt_day': ['sum']}))
group.columns = ['item_cnt_month']
group = group.reset_index()
print(group)

# Attach the totals to the grid; pairs without any sale get 0.
matrix = matrix.merge(group, on=cols, how='left')
filled_counts = matrix['item_cnt_month'].fillna(0)
# NB clip target here
matrix['item_cnt_month'] = filled_counts.clip(0, 20).astype(np.float16)

In [None]:
# Recompute the monthly totals, then unstack the innermost index level with
# fill_value=0 and stack it back, so combinations missing after the pivot are
# reported as 0 instead of being absent. This overwrites the `group` frame
# built in the previous cell.
group = train.groupby(['date_block_num','shop_id','item_id']).agg({'item_cnt_day': ['sum']}).unstack(fill_value=0).stack()
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)
group

In [None]:
# For each month, collect the distinct shop_ids that appear in `group`.
gun = group.groupby('date_block_num').agg({'shop_id': ['unique']})
gun.columns = ['unique']
# After the reset, the row index and the date_block_num column sit side by side.
gun.reset_index(inplace=True)

In [None]:
# Presence table: one row per shop, one column per month, initialised to 1.
cl = group['shop_id'].unique()
smoplt = pd.DataFrame(1, index=np.sort(cl), columns=np.arange(34))

# For each month, switch to 0 the cells of the shops that are not listed for that month.
for month in range(34):
    shops_seen = gun.loc[gun.index == month, 'unique'].to_numpy()[0].astype(np.int64)
    shops_missing = np.setdiff1d(cl, shops_seen)
    smoplt.loc[shops_missing, month] = 0

In [None]:
# Heatmap of the shop/month presence table (rows: shops, columns: months).
fig, ax = plt.subplots(figsize=(15,15))

sns.heatmap(smoplt, cbar=False, ax=ax, robust=True, cmap="GnBu", linewidths=1, linecolor='#000000')

# Based on the heatmap, it's safe to assume that stores 8, 9, 13, 17, 20, 23, 29, 30, 32, 33, 40, 43 and 55 will have no sales in month 34.

In [None]:
# Tag the test rows as month 34 and use the same compact dtypes as the training grid.
test['date_block_num'] = 34
test = test.astype({'date_block_num': np.int8,
                    'shop_id': np.int8,
                    'item_id': np.int16})

In [None]:
# Append the month-34 rows below the training grid.
# `keys=` is intentionally not passed: with ignore_index=True the result gets a
# fresh 0..n-1 index, and the three labels previously supplied never matched
# the two frames being concatenated.
matrix = pd.concat([matrix, test], ignore_index=True, sort=False)
# Month-34 rows have no target yet; 0 is used as a placeholder.
matrix = matrix.fillna(0) # 34 month

In [None]:
matrix

In [None]:
# Join shop, item and category metadata onto every row of the grid.
matrix = (matrix.merge(shops, on=['shop_id'], how='left')
                .merge(items, on=['item_id'], how='left')
                .merge(cats, on=['item_category_id'], how='left'))
matrix['item_category_id'] = matrix['item_category_id'].astype(np.int8)

# Keeping just the modelling columns also discards the descriptive name
# columns (shop_name, item_name, item_category_name) brought in by the joins.
matrix = matrix[['date_block_num', 'shop_id', 'item_category_id', 'item_id', 'item_cnt_month']]

In [None]:
matrix

In [None]:
matrix_train = matrix[matrix['date_block_num'] < 34]

In [None]:
matrix_train

In [None]:
# Total (clipped) units per month; the values are widened to float64 before summing.
monthly_values = matrix_train[['date_block_num', 'item_cnt_month']].astype('float64')
sell_month = monthly_values.groupby('date_block_num')['item_cnt_month'].sum()

sell_month.plot()

In [None]:
# Months ordered from the highest to the lowest total.
sell_month.sort_values(ascending=False)

# The months with peak sales are 11 and 23; both correspond to December. The predicted month, 34, corresponds to November.
# Based on the past 2 years, a slight increase in sales is expected compared to month 33.

In [None]:
def lag_feature(df, lags, col):
    """Append lagged copies of `col` to `df`.

    For every lag `k` in `lags`, the value `col` had for the same
    (shop_id, item_id) pair `k` months earlier is added as a new column named
    `<col>_lag_<k>`. Rows with no matching earlier month get NaN. The input
    frame is not modified; the merged frame is returned.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    history = df[keys + [col]]
    for lag in lags:
        lag_name = col + '_lag_' + str(lag)
        # Moving the month forward by `lag` lines each past value up with the
        # row that should receive it.
        moved = (history.rename(columns={col: lag_name})
                        .assign(date_block_num=lambda frame: frame['date_block_num'] + lag))
        df = df.merge(moved, on=keys, how='left')
    return df

matrix = lag_feature(matrix, [1,2], 'item_cnt_month')

matrix

In [None]:
# Mean monthly count per (month, item) across all shops.
group = matrix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_item_avg_item_cnt' ]
group = group.reset_index()

matrix = pd.merge(matrix, group, on=['date_block_num','item_id'], how='left')
matrix['date_item_avg_item_cnt'] = matrix['date_item_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1,2], 'date_item_avg_item_cnt')
# Keep only the lagged copies. The same-month mean is computed from the target
# itself (leakage on the training rows) and equals 0 on every month-34 row,
# whose target is only a placeholder.
matrix = matrix.drop(columns=['date_item_avg_item_cnt'])

matrix

In [None]:
# Mean monthly count per (month, item category).
group = matrix.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = [ 'date_cat_avg_item_cnt' ]
group = group.reset_index()

matrix = pd.merge(matrix, group, on=['date_block_num','item_category_id'], how='left')
matrix['date_cat_avg_item_cnt'] = matrix['date_cat_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_cat_avg_item_cnt')
# Keep only the lagged copy. The same-month mean is derived from the target
# (leakage on the training rows) and is 0 on every month-34 row.
matrix = matrix.drop(columns=['date_cat_avg_item_cnt'])


matrix

In [None]:
# Mean monthly count per (month, shop, item category).
group = matrix.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_month': ['mean']})
group.columns = ['date_shop_cat_avg_item_cnt']
group = group.reset_index()

matrix = pd.merge(matrix, group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')
matrix['date_shop_cat_avg_item_cnt'] = matrix['date_shop_cat_avg_item_cnt'].astype(np.float16)
matrix = lag_feature(matrix, [1], 'date_shop_cat_avg_item_cnt')
# Keep only the lagged copy. The same-month mean is derived from the target
# (leakage on the training rows) and is 0 on every month-34 row.
matrix = matrix.drop(columns=['date_shop_cat_avg_item_cnt'])

matrix

In [None]:
# Average price of each item over the whole training period.
group = train.groupby(['item_id']).agg({'item_price': ['mean']})
group.columns = ['item_avg_item_price']
group = group.reset_index()

matrix = pd.merge(matrix, group, on=['item_id'], how='left')
# float32 rather than float16: half precision cannot represent values above
# 65504 (they become inf) and keeps only about three significant digits,
# while the outlier filter lets prices of up to 100000 through.
matrix['item_avg_item_price'] = matrix['item_avg_item_price'].astype(np.float32)

In [None]:
matrix

In [None]:
# Revenue of each sales row: price times units sold. `assign` builds a new
# frame instead of writing a column into `train`, which at this point is the
# result of earlier filtering and would trigger pandas' chained-assignment warning.
train = train.assign(revenue=train['item_price'] * train['item_cnt_day'])

In [None]:
# Total revenue of each shop in each month.
group = train.groupby(['date_block_num','shop_id']).agg({'revenue': ['sum']})
group.columns = ['date_shop_revenue']
group = group.reset_index()

matrix = pd.merge(matrix, group, on=['date_block_num','shop_id'], how='left')
matrix['date_shop_revenue'] = matrix['date_shop_revenue'].astype(np.float32)

# Average of those monthly revenues, per shop.
group = group.groupby(['shop_id']).agg({'date_shop_revenue': ['mean']})
group.columns = ['shop_avg_revenue']
group = group.reset_index()

matrix = pd.merge(matrix, group, on=['shop_id'], how='left')
matrix['shop_avg_revenue'] = matrix['shop_avg_revenue'].astype(np.float32)

# Relative deviation of a shop's month from its own average.
matrix['delta_revenue'] = (matrix['date_shop_revenue'] - matrix['shop_avg_revenue']) / matrix['shop_avg_revenue']
matrix['delta_revenue'] = matrix['delta_revenue'].astype(np.float16)

# Same-month revenue is derived from the very sales being predicted and does
# not exist for month 34 (the sales data stops at month 33), so only the
# previous month's deviation is kept as a feature.
matrix = lag_feature(matrix, [1], 'delta_revenue')
matrix = matrix.drop(columns=['date_shop_revenue', 'delta_revenue'])

In [None]:
matrix

In [None]:
def fill_na(df):
    """Replace missing values with 0 in the lagged item-count columns of `df`.

    A column is treated only when its name contains both '_lag_' and
    'item_cnt'; all other columns keep their missing values. `df` is updated
    column by column and also returned.
    """
    for col in df.columns:
        if '_lag_' in col and 'item_cnt' in col and df[col].isnull().any():
            # Assign the filled column back. `df[col].fillna(..., inplace=True)`
            # acts on a temporary Series and leaves `df` untouched when pandas
            # Copy-on-Write is active.
            df[col] = df[col].fillna(0)
    return df

matrix = fill_na(matrix)

In [None]:
matrix.info()

In [None]:
# Months 0-33 are the training rows; month 34 holds the rows to predict.
# `drop(columns=...)` returns new frames, so nothing is mutated in place on a
# slice of `matrix` (which is what raises SettingWithCopyWarning).
# Note that `train` and `test` no longer hold the raw CSV data after this cell.
train = matrix[matrix['date_block_num'] < 34].drop(columns=['date_block_num'])
test = matrix[matrix['date_block_num'] == 34].drop(columns=['date_block_num', 'item_cnt_month'])

# Target and feature matrix.
y = train['item_cnt_month']
X = train.drop(columns=['item_cnt_month'])

First, we'll check the feature importances and drop the least relevant features.

In [None]:
'''
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True, stratify=y , random_state=42)

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)

model.fit(
    X_train, 
    y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_test, y_test)], 
    verbose=True, 
    early_stopping_rounds = 3)
'''

In [None]:
'''
plot_features(model, (10,14))

# Highest importance features:
# date_item_avg_item_cnt
# date_shop_cat_avg_item_cnt
# date_cat_avg_item_cnt
# item_cnt_month_lag_2
# 
# Lowest importance features:
# date_shop_revenue
# lag_6 features
# lag_3 features
# date_item_avg_item_price
# delta features
# date_shop_avg_item_cnt
# date_avg_item_cnt
# month
# days
'''

In [None]:
# MSE is a loss, so the scorer must be flagged with greater_is_better=False.
# It then returns the negated MSE, and a search that maximises the score
# selects the lowest error instead of the highest.
scorer = make_scorer(mse, greater_is_better=False)

linreg = LinearRegression()
elnet = ElasticNet(random_state=42)
dectree = DecisionTreeRegressor(random_state=42)
forest = RandomForestRegressor(random_state=42)
adab = AdaBoostRegressor(random_state=42)
# XGBoost takes a metric name (or a function of labels and predictions) here,
# not a scikit-learn scorer object.
gb = XGBRegressor(eval_metric='rmse', random_state=42)


# Create the train_test_split for model evaluation
# NOTE(review): this is a shuffled split of rows from every month, so the
# hold-out rows are not later in time than the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,shuffle=True, stratify=y , random_state=42)

In [None]:
def bayes_search(model, param_grid, n_iter=5, n_splits=5, random_state=42):
    """Run a Bayesian hyper-parameter search for `model` and print the outcome.

    The search is fitted on the notebook-level `X` and `y` and scored with the
    notebook-level `scorer`.

    Parameters
    ----------
    model : estimator handed to BayesSearchCV.
    param_grid : search space handed to BayesSearchCV.
    n_iter : number of parameter settings the search samples.
    n_splits : number of cross-validation folds.
    random_state : seed for both the fold shuffling and the search, so that
        repeated runs sample the same settings.

    Returns
    -------
    None. The best score and best parameters are printed.
    """
    # Fold count and number of search iterations are independent settings;
    # both default to 5.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Execute the bayes search
    bsearch = BayesSearchCV(model, param_grid, n_iter=n_iter, scoring=scorer, cv=cv,
                            n_jobs=-1, verbose=True, random_state=random_state)
    bsearch.fit(X, y)

    # Print the values to be used in each parameter for best result in the final fitting
    print(' ',bsearch.best_score_)
    print(' ',bsearch.best_params_)

    return None

In [None]:
'''
# Searching for LinearRegression

linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
result = mse(y_pred, y_test, squared=False)
print(result)
'''
# Result:
# rmse: 0.7568274

In [None]:
# Searching for ElasticNet

'''
# Define the parameters to be tested in the bayes search
param_grid = {'alpha': Real(0.1, 1, prior='log-uniform'),
              'l1_ratio': Real(0, 1),
              'max_iter': Integer(50, 2000),
              }

bayes_search(elnet, param_grid)


# Results:
# ([('alpha', 0.6185535843131738), ('l1_ratio', 0.9459955732871277), ('max_iter', 1511)])

elnet = ElasticNet(alpha=0.618, l1_ratio=0.945, max_iter=1511, random_state=42)
elnet.fit(X_train, y_train)
y_pred = elnet.predict(X_test)
result = mse(y_pred, y_test, squared=False)
print(result)
'''

# Result:
# rmse: 1.0813551

In [None]:
# Searching for DecisionTreeRegressor

'''
# Define the parameters to be tested in the bayes search
param_grid = {
              'max_leaf_nodes': Integer(10, 200),
              }

bayes_search(dectree, param_grid)


# Results:
# ([('max_leaf_nodes', 54)])

dectree = DecisionTreeRegressor(max_leaf_nodes=54, n_jobs=-1, random_state=42, verbose=True)
dectree.fit(X_train, y_train)
y_pred = dectree.predict(X_test)
result = mse(y_pred, y_test, squared=False)
print(result)
'''

# Result:
# rmse: 0.7187903167054043

In [None]:
# Searching for RandomForestRegressor
'''

# Define the parameters to be tested in the bayes search
param_grid = {'n_estimators': Integer(100, 1000),
              #'max_leaf_nodes': Integer(10, 200),
              }

bayes_search(forest, param_grid)


# Results:
# ([('max_leaf_nodes', 54), ('n_estimators', 200)])

forest = RandomForestRegressor(max_leaf_nodes=54, n_estimators=200, random_state=42, n_jobs=-1, verbose=True)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)
result = mse(y_pred, y_test, squared=False)
print(result)
'''

# Result
# rmse: 0.7028950041966859

In [None]:
# Searching for AdaBoostRegressor
'''

# Define the parameters to be tested in the bayes search
param_grid = {'n_estimators': Integer(50, 500),
              'learning_rate': Real(0.01, 1, prior='log-uniform')
              }

bayes_search(adab, param_grid)

# Results:
# ([('n_estimators', 200), ('learning_rate', 0.05)])

adab = AdaBoostRegressor(learning_rate=0.1, n_estimators=200, random_state=42)
adab.fit(X_train, y_train)
y_pred = adab.predict(X_test)
result = mse(y_pred, y_test, squared=False)
print(result)
'''

# Result
# rmse: 0.9776510406903495

In [None]:
# Searching for XGBoost

'''
# Define the parameters to be tested in the bayes search
param_grid = {'max_depth': Integer(1, 90),
              'learning_rate': Real(0.01, 1, prior='log-uniform'),
              'n_estimators': Integer(50, 200)
             }

bayes_search(gb, param_grid)


# Results:
# ([('max_depth', 8), ('n_estimators', 200), ('min_child_weight', 300), ('colsample_bytree', 0.8), ('subsample', 0.8), ('eta', 0.3)])


gb = XGBRegressor(colsample_bytree=0.8, max_depth=8, n_estimators=200, eta=0.3, subsample=0.8, n_jobs=-1, random_state=42)
gb.fit(X_train, y_train, verbose=True)
y_pred = gb.predict(X_test)
result = mse(y_pred, y_test, squared=False)
print(result)

'''
# Result:
# rmse: 0.6327427

In [None]:
# Out of all models tried, XGBRegressor got the better result

gb = XGBRegressor(colsample_bytree=0.8, max_depth=8, n_estimators=200, eta=0.3, subsample=0.8, n_jobs=-1, random_state=42)
# Fit on the training part only. Early stopping watches the last entry of
# eval_set, and (X_test, y_test) is a subset of X, so fitting on all of X
# would validate on rows the model has already been trained on.
gb.fit(X_train, y_train,
    eval_metric="rmse", 
    eval_set=[(X_train, y_train), (X_test, y_test)], 
    verbose=True, 
    early_stopping_rounds = 10)

# Present the month-34 features in the column order the model was trained with.
cols_when_model_builds = gb.get_booster().feature_names
test = test[cols_when_model_builds]
# The training target is clipped to [0, 20]; keep the predictions in the same range.
y_pred = gb.predict(test).clip(0, 20)

In [None]:
pd.Series(y_pred).describe()

In [None]:
plot_features(gb, (10,14))

In [None]:
# Renumber the month-34 rows 0..n-1. With drop=True the old index is discarded
# rather than inserted as a column, so the cell can be re-run safely.
test = test.reset_index(drop=True)

# The renumbered position of each row is used as its submission ID.
submission = pd.DataFrame({
    "ID": test.index, 
    "item_cnt_month": y_pred
})

In [None]:
submission.to_csv('xgb_submission.csv', index=False)