In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for file_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import additional libraries and read the data files

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb

# Directory that holds the competition CSV files.
path = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Load each CSV into its own DataFrame (read in the order listed).
items, sales_train, item_categories, test, shops = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'items.csv',
        'sales_train.csv',
        'item_categories.csv',
        'test.csv',
        'shops.csv',
    )
)


In [None]:
# Preview the first two rows of the items table.
items.head(2)

In [None]:
# Preview the first two rows of the training sales records.
sales_train.head(2)

In [None]:
# Preview the first two rows of the item-categories table.
item_categories.head(2)

In [None]:
# Preview the first two rows of the test set.
test.head(2)

In [None]:
# Preview the first two rows of the shops table.
shops.head(2)

In [None]:
# Column dtypes as read from the CSV, before the date column is parsed below.
sales_train.dtypes

In [None]:
# Show a single row, e.g. to see the raw `date` value that the next cell parses.
sales_train.head(1)

In [None]:
# Parse the day.month.year date strings into proper datetime values.
sales_train['date'] = pd.to_datetime(sales_train['date'], format="%d.%m.%Y")


In [None]:
# Textual overview of the training data: sample rows, dtypes, info report and shape.
print("---------------------TRAIN-INFO----------------------")
print(sales_train.head())
print(sales_train.dtypes)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
sales_train.info()
print(sales_train.shape)

In [None]:
# Textual overview of the test data: sample rows, dtypes, info report and shape.
print("---------------------TEST-INFO----------------------")
print(test.head())
print(test.dtypes)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
test.info()
print(test.shape)

Memory consumption reduction

In [None]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` in place to reduce memory use.

    * ``float64`` columns are converted to ``float32``.
    * ``int64`` / ``int32`` columns are converted to ``int16`` when every value
      fits in that type. Columns whose values do not fit are converted to
      ``int32`` if possible and otherwise left unchanged, so values are never
      silently wrapped around by the cast.

    Columns of any other dtype are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        values = df[col]
        # Try the narrowest type first and stop at the first one that can hold
        # the column's full value range.
        for target in (np.int16, np.int32):
            bounds = np.iinfo(target)
            if values.empty or (values.min() >= bounds.min and values.max() <= bounds.max):
                df[col] = values.astype(target)
                break
    return df

# Downcast the training frame and show the info report to confirm the new dtypes
# and memory usage.
sales_train = downcast_dtypes(sales_train)
# info() prints the report itself and returns None; print() around it would add
# a stray "None" line.
sales_train.info()

In [None]:
# Plot a 50-bin histogram for the columns of sales_train on a 20x15 inch figure.
sales_train.hist(figsize=(20,15), bins=50)

Definition of the target range (second piece of advice from the course)

In [None]:
# Lower and upper bound of the target range, as [min, max].
target_range = [0, 20]

In [None]:
# Attach the item attributes to every sales record, then discard the item name
# column, which is not needed from here on.
sales = sales_train.merge(items, how='left', on='item_id').drop(columns='item_name')
sales.head(10)

In [None]:
# Key columns identifying one row of the grid built below: shop, item and month block.
index_cols = ['shop_id', 'item_id', 'date_block_num']
# product() is used in the next cell to enumerate the shop/item combinations.
from itertools import product

In [None]:
# Build the grid: for every month block, pair each shop seen in that month with
# each item seen in that month.
monthly_grids = []
for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    combos = list(product(cur_shops, cur_items, [block_num]))
    monthly_grids.append(np.array(combos, dtype='int32'))

# Stack the per-month arrays into one frame keyed by index_cols.
grid = pd.DataFrame(np.vstack(monthly_grids), columns=index_cols, dtype=np.int32)

In [None]:
# Preview the first rows of the shop/item/month grid.
grid.head()

Mean encodings

In [None]:
# Aggregate per (month, shop, item): units sold are summed and the price is
# averaged. The aggregations are given by name because passing NumPy callables
# such as np.mean to .agg() is deprecated in recent pandas releases.
mean_sales = (
    sales
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
    .reset_index()
)

In [None]:
# Put the monthly aggregates onto the full grid; grid rows without a matching
# aggregate get 0 in the aggregated columns.
grid_keys = ['date_block_num', 'shop_id', 'item_id']
mean_sales = grid.merge(mean_sales, how='left', on=grid_keys).fillna(0)

# Add the item attributes from the items table.
mean_sales = mean_sales.merge(items, how='left', on='item_id')

Additional mean-encoding features

In [None]:
# Monthly mean encodings: for each grouping key (item, shop, category) compute
# the average price, the total units and the average units per month, and merge
# each one onto mean_sales as a new feature column.
for type_id in ['item_id', 'shop_id', 'item_category_id']:
    for column_id, aggregator, aggtype in [('item_price', 'mean', 'avg'), ('item_cnt_day', 'sum', 'sum'), ('item_cnt_day', 'mean', 'avg')]:
        feature_name = type_id+'_'+aggtype+'_'+column_id
        # Select the target column BEFORE aggregating. Aggregating the whole
        # frame would also apply the function to unrelated columns (including
        # the datetime `date` column, which cannot be summed) and is wasteful.
        mean_df = (
            sales
            .groupby([type_id, 'date_block_num'])[column_id]
            .agg(aggregator)
            .rename(feature_name)
            .reset_index()
        )
        mean_sales = pd.merge(mean_sales, mean_df, on=['date_block_num', type_id], how='left')

In [None]:
# Preview the grid with the mean-encoding columns attached.
mean_sales.head()

Lag variables

In [None]:
# Columns to lag: everything from position 7 onwards in mean_sales plus the
# target itself.
# NOTE(review): the slice assumes the first seven columns are the keys and base
# columns and that this cell runs exactly once after the mean-encoding cell —
# confirm before reordering cells.
lag_variables = list(mean_sales.columns[7:])+['item_cnt_day']
lags = [1, 2, 3, 6]
# tqdm.auto picks the notebook widget when available; tqdm_notebook is deprecated.
from tqdm.auto import tqdm
lag_keys = ['date_block_num', 'shop_id', 'item_id']
for lag in tqdm(lags):
    # Copy only the key and lagged columns instead of the whole (growing) frame.
    sales_new_df = mean_sales[lag_keys + lag_variables].copy()
    # Shifting the month forward by `lag` makes month m's values line up with
    # month m + lag when merged back on the keys.
    sales_new_df['date_block_num'] += lag
    sales_new_df.columns = lag_keys + [lag_feat+ '_lag_'+str(lag) for lag_feat in lag_variables]
    mean_sales = pd.merge(mean_sales, sales_new_df, on=lag_keys, how='left')
    

In [None]:
# Preview the frame after the lag columns have been merged in.
mean_sales.head(10)

Drop the earliest months and fill missing values (zeros for counts, the column median for prices)

In [None]:
# Keep only rows whose month block is greater than 12.
# NOTE(review): presumably this drops early months whose lag features are
# incomplete; the largest lag used is 6, so confirm that 12 is the intended cutoff.
keep_rows = mean_sales['date_block_num'].gt(12)
mean_sales = mean_sales[keep_rows]

In [None]:
# Fill missing values column by column: count-based columns get 0, price-based
# columns get their own median. A name containing 'item_cnt' is always treated
# as a count column, even if it also contains 'item_price'.
count_feats = [c for c in mean_sales.columns if 'item_cnt' in c]
price_feats = [c for c in mean_sales.columns if 'item_cnt' not in c and 'item_price' in c]

for feat in count_feats:
    mean_sales[feat] = mean_sales[feat].fillna(0)

for feat in price_feats:
    mean_sales[feat] = mean_sales[feat].fillna(mean_sales[feat].median())


In [None]:
# Columns excluded from the training frame: the un-lagged encoded columns (every
# lag variable except the last one, the target) plus the raw price and item name.
cols_to_drop = [*lag_variables[:-1], 'item_price', 'item_name']

In [None]:
# Training frame: mean_sales without the columns listed in cols_to_drop.
training = mean_sales.drop(columns=cols_to_drop)

Train XGBoost model

In [None]:
# Split the training frame into the feature matrix and the target column and wrap
# both in a DMatrix. Plain arrays are passed, so features are identified by
# position rather than by name.
is_target = training.columns == 'item_cnt_day'
train_features = training.iloc[:, ~is_target].values
train_target = training.iloc[:, is_target].values
xgbtrain = xgb.DMatrix(train_features, train_target)

In [None]:
# Booster hyper-parameters passed to xgboost.
params = {
    'max_depth': 10,
    'subsample': 1,
    'min_child_weight': 0.5,
    'eta': 0.3,
    'seed': 1,
    # 'silent' is deprecated in current xgboost releases; 'verbosity' replaces it
    # (1 = warnings only).
    'verbosity': 1,
    'eval_metric': 'rmse'
}
# The number of boosting rounds is an argument of xgb.train, not a booster
# parameter: a 'num_round' key inside params is ignored and training silently
# uses the default of 10 rounds. It is therefore passed explicitly.
# NOTE(review): 1000 rounds at eta=0.3 with no validation set is slow and may
# overfit — consider holding out a month and using early stopping.
num_round = 1000
boost = xgb.train(params, xgbtrain, num_boost_round=num_round)

In [None]:
# Draw the feature-importance chart of the trained booster and enlarge its
# figure to 10x20 inches so the feature labels stay readable.
x = xgb.plot_importance(boost)
importance_fig = x.get_figure()
importance_fig.set_size_inches((10, 20))

In [None]:
# Feature column names in training order, i.e. every column except the target.
cols = training.columns.tolist()
cols.remove('item_cnt_day')

In [None]:
# Look up the names of the feature columns at positions 2, 0, 1, 3 and 5.
# NOTE(review): presumably these indices were read off the importance plot, where
# features appear by position because the model was trained on unnamed arrays — confirm.
[cols[x] for x in [2, 0, 1, 3, 5]]

In [None]:
# List the columns of the training frame (features plus the item_cnt_day target).
training.columns

In [None]:
# Reload the test set from disk so the cells below start from the unmodified file contents.
test = pd.read_csv(path+'test.csv')

In [None]:
# Textual overview of the freshly loaded test set: sample rows, info report and columns.
print("--------------------Test-Info------------------")
print(test.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
test.info()
print(test.columns)

In [None]:
# Tag every test row with month block 34 and attach the item attributes.
# NOTE(review): 34 is presumably the month right after the last training block — confirm.
test = test.assign(date_block_num=34).merge(items, how='left', on='item_id')

In [None]:
# Build the same lag features for the test rows by shifting the historical
# values in mean_sales forward by each lag and merging them on the key columns.
# tqdm.auto picks the notebook widget when available; tqdm_notebook is deprecated.
from tqdm.auto import tqdm
lag_keys = ['date_block_num', 'shop_id', 'item_id']
for lag in tqdm(lags):
    # Copy only the key and lagged columns; copying all of mean_sales (which
    # already carries every training lag column) on each pass is needlessly heavy.
    sales_new_df = mean_sales[lag_keys + lag_variables].copy()
    sales_new_df['date_block_num'] += lag
    sales_new_df.columns = lag_keys + [lag_feat+ '_lag_'+str(lag) for lag_feat in lag_variables]
    test = pd.merge(test, sales_new_df, on=lag_keys, how='left')
    

In [None]:
# Feature names available at prediction time and at training time.
df_test = set(test.drop(columns=['ID', 'item_name']).columns)
df_training = set(training.drop(columns='item_cnt_day').columns)

# Every test feature must exist in training, and every training feature in test.
assert df_test.issubset(df_training)
assert df_training.issubset(df_test)

In [None]:
# The two feature-name sets must be identical. This compares sets only, so it
# does not verify column ORDER, which matters because the model is trained and
# scored on positional `.values` arrays.
assert df_training == df_test

In [None]:
# Remove the columns that are not model features.
test = test.drop(columns=['ID', 'item_name'])

# Fill missing values: count-based columns get 0, price-based columns get the
# median of that column within the test frame; other columns are left alone.
for feat in test.columns:
    if 'item_cnt' in feat:
        fill_value = 0
    elif 'item_price' in feat:
        fill_value = test[feat].median()
    else:
        continue
    test[feat] = test[feat].fillna(fill_value)

In [None]:
# Spot-check the lagged target (lags 1-3) for the first test rows.
test[['shop_id', 'item_id']+['item_cnt_day_lag_'+str(x) for x in[1,2,3]]].head()

Forecasting monthly sales

In [None]:
# The booster was trained on a plain array, so features are matched by POSITION.
# Select the test columns in exactly the training feature order instead of
# relying on both frames happening to share the same column layout; a missing
# feature raises a KeyError here rather than producing silently wrong predictions.
feature_cols = [c for c in training.columns if c != 'item_cnt_day']
xgbpredict = xgb.DMatrix(test[feature_cols].values)
pred = boost.predict(xgbpredict)
# Summary statistics of the raw predictions.
pd.Series(pred).describe()

In [None]:
# Clip the predictions to the target range defined earlier in the notebook
# rather than repeating the bounds as literals, so the two cannot drift apart.
lower_bound, upper_bound = target_range
pred = pred.clip(lower_bound, upper_bound)
# Summary statistics of the clipped predictions.
pd.Series(pred).describe()

In [None]:
# Assemble the submission: the row index of `test` is used as the ID and the
# predictions as the monthly item count.
# NOTE(review): this assumes the row index of `test` equals the original ID
# column that was dropped earlier — confirm.
submission_df = pd.DataFrame({'ID': test.index})
submission_df['item_cnt_month'] = pred
submission_df.head(10)

In [None]:
# Write the submission file to the working directory, without the DataFrame index.
submission_df.to_csv('submission_xgboost.csv', index=False)