In [38]:
import numpy as np
import pandas as pd
from scipy.stats import describe
import warnings
warnings.simplefilter('ignore')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
rcParams['figure.figsize'] = 10,5
import pandas as pd

# Load Data

In [39]:
# Read the raw training data (long format) from train.csv in the working directory.
df_train = pd.read_csv('train.csv')
# Preview the first rows to check the columns that were loaded.
df_train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


# Converting strings to dates

In [40]:
# Parse the date strings and promote them to the index in a single chained step;
# set_index removes the 'date' column from the data columns.
df_train = (
    df_train
    .assign(date=pd.to_datetime(df_train['date']))
    .set_index('date')
)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 913000 entries, 2013-01-01 to 2017-12-31
Data columns (total 3 columns):
store    913000 non-null int64
item     913000 non-null int64
sales    913000 non-null int64
dtypes: int64(3)
memory usage: 27.9 MB


# Sales for each store, item

In [41]:
from itertools import product, starmap


def storeitems(n_items=50, n_stores=10):
    """
    Iterate over every (item, store) id pair.

    Despite the function name, each yielded tuple is ordered (item, store):
    the first range enumerates items and the second enumerates stores, with
    the store id varying fastest. Ids start at 1.

    n_items, n_stores: number of distinct items / stores; the defaults
    reproduce the 50 items x 10 stores layout used throughout this notebook.

    Returns a one-shot itertools.product iterator.
    """
    return product(range(1, n_items + 1), range(1, n_stores + 1))


def storeitems_column_names():
    """
    Return the wide-format sales column names as a list.

    One name per pair yielded by storeitems(), in the same order, formatted
    as 'item_<item>_store_<store>_sales'.
    """
    return [
        'item_{}_store_{}_sales'.format(item, store)
        for item, store in storeitems()
    ]


def sales_by_storeitem(df):
    """
    Pivot long-format sales into one column per (item, store) pair.

    df must be indexed by date and have 'item', 'store' and 'sales' columns.
    The result is indexed by the unique dates of df (in order of first
    appearance) and has one 'item_<i>_store_<s>_sales' column for every pair
    produced by storeitems(), in that order.

    Values are aligned to the dates by index label rather than by row
    position, so the row order inside each group does not matter.

    Raises ValueError when a pair does not have exactly one row per date.
    """
    dates = df.index.unique()
    # Group once instead of building a full-length boolean mask per pair.
    groups = {key: grp for key, grp in df.groupby(['item', 'store'], sort=False)['sales']}
    empty = df['sales'].iloc[:0]

    columns = {}
    for i, s in storeitems():
        series = groups.get((i, s), empty)
        if len(series) != len(dates) or not series.index.is_unique:
            raise ValueError(
                f'item {i} / store {s}: expected exactly one sales row per date '
                f'({len(dates)} dates), found {len(series)} rows'
            )
        columns['item_{}_store_{}_sales'.format(i, s)] = series.reindex(dates).values

    # Build the frame in one go; inserting hundreds of columns one by one is slow.
    return pd.DataFrame(columns, index=dates, columns=list(columns))

In [42]:
# Pivot the long training frame into one sales column per (item, store) pair.
# NOTE: df_train is rebound to the wide frame, so this cell cannot be re-run on its own —
# the wide frame no longer has the 'item', 'store' and 'sales' columns the helper reads.
df_train = sales_by_storeitem(df_train)
df_train.head()

Unnamed: 0_level_0,item_1_store_1_sales,item_1_store_2_sales,item_1_store_3_sales,item_1_store_4_sales,item_1_store_5_sales,item_1_store_6_sales,item_1_store_7_sales,item_1_store_8_sales,item_1_store_9_sales,item_1_store_10_sales,...,item_50_store_1_sales,item_50_store_2_sales,item_50_store_3_sales,item_50_store_4_sales,item_50_store_5_sales,item_50_store_6_sales,item_50_store_7_sales,item_50_store_8_sales,item_50_store_9_sales,item_50_store_10_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,13,12,19,10,11,20,7,16,11,14,...,30,43,45,38,19,20,21,45,36,33
2013-01-02,11,16,8,12,9,6,4,10,9,14,...,32,57,47,35,25,23,30,54,44,37
2013-01-03,14,16,10,8,12,11,8,12,20,16,...,25,45,39,32,28,38,20,54,29,46
2013-01-04,13,20,15,15,8,7,7,21,22,17,...,32,45,48,36,27,33,27,52,43,51
2013-01-05,10,16,22,19,13,12,8,15,22,12,...,35,55,46,47,31,33,18,48,53,41


In [43]:
# Check the wide layout: one row per date, one column per (item, store) pair.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1826 entries, 2013-01-01 to 2017-12-31
Columns: 500 entries, item_1_store_1_sales to item_50_store_10_sales
dtypes: int64(500)
memory usage: 7.0 MB


# Test data
Loading the test data as well lets us scale the entire set consistently and apply every transformation to the combined data.

In [44]:
# Read the test data from test.csv in the working directory; unlike the training file
# it carries an 'id' column and no 'sales' column.
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [45]:
# Strings to dates: parse the 'date' column and make it the index in one chained step;
# set_index removes 'date' from the data columns.
df_test = (
    df_test
    .assign(date=pd.to_datetime(df_test['date']))
    .set_index('date')
)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 45000 entries, 2018-01-01 to 2018-03-31
Data columns (total 3 columns):
id       45000 non-null int64
store    45000 non-null int64
item     45000 non-null int64
dtypes: int64(3)
memory usage: 1.4 MB


In [46]:
# Mock sales so the test frame can go through the same pivot as df_train.
# The zeros are placeholders, not observations. np.zeros produces floats, so the
# pivoted test columns are float64 while the training sales columns are int64.
df_test['sales'] = np.zeros(df_test.shape[0])
# Rebinding df_test to the wide frame makes this cell non-re-runnable on its own.
df_test = sales_by_storeitem(df_test)
df_test.head()

Unnamed: 0_level_0,item_1_store_1_sales,item_1_store_2_sales,item_1_store_3_sales,item_1_store_4_sales,item_1_store_5_sales,item_1_store_6_sales,item_1_store_7_sales,item_1_store_8_sales,item_1_store_9_sales,item_1_store_10_sales,...,item_50_store_1_sales,item_50_store_2_sales,item_50_store_3_sales,item_50_store_4_sales,item_50_store_5_sales,item_50_store_6_sales,item_50_store_7_sales,item_50_store_8_sales,item_50_store_9_sales,item_50_store_10_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Check that the pivoted test frame has the same column layout as the training frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 90 entries, 2018-01-01 to 2018-03-31
Columns: 500 entries, item_1_store_1_sales to item_50_store_10_sales
dtypes: float64(500)
memory usage: 352.3 KB


# Combine test and train dataset

In [57]:
# Make sure all column names are the same and in the same order.
# zip() stops at the shorter input, so compare the column counts explicitly first;
# otherwise a frame with missing or extra trailing columns would pass unnoticed.
assert len(df_test.columns) == len(df_train.columns), (
    'column count differs: test has {}, train has {}'.format(
        len(df_test.columns), len(df_train.columns)))
col_names = list(zip(df_test.columns, df_train.columns))
for test_col, train_col in col_names:
    assert test_col == train_col, 'column mismatch: {!r} != {!r}'.format(test_col, train_col)

In [58]:
# Tag every row with its origin so the combined frame can be split apart again later.
# A scalar assignment broadcasts the boolean flag to all rows.
df_test['is_test'] = True
df_train['is_test'] = False
# Stack the training rows first, then the test rows.
df_total = pd.concat([df_train, df_test])
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1916 entries, 2013-01-01 to 2018-03-31
Columns: 501 entries, item_1_store_1_sales to is_test
dtypes: bool(1), float64(500)
memory usage: 7.3 MB


# One-hot encoding weekdays and months

In [60]:
# One indicator column per day of the week (pandas numbers weekdays 0=Monday .. 6=Sunday).
weekday_df = pd.get_dummies(df_total.index.weekday, prefix='weekday')
# Re-attach the date index so the dummies line up with df_total when concatenated column-wise.
weekday_df.index = df_total.index
weekday_df.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013-01-01,0,1,0,0,0,0,0
2013-01-02,0,0,1,0,0,0,0
2013-01-03,0,0,0,1,0,0,0
2013-01-04,0,0,0,0,1,0,0
2013-01-05,0,0,0,0,0,1,0


In [62]:
# One indicator column per calendar month (month_1 .. month_12).
month_df = pd.get_dummies(df_total.index.month, prefix='month')
# Re-attach the date index so the dummies line up with df_total when concatenated column-wise.
month_df.index = df_total.index
month_df.head()

Unnamed: 0_level_0,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013-01-01,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-02,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-03,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-04,1,0,0,0,0,0,0,0,0,0,0,0
2013-01-05,1,0,0,0,0,0,0,0,0,0,0,0


In [63]:
# Put the calendar indicators in front of the sales columns, matching rows by date index.
# NOTE: df_total is rebound, so re-running this cell adds the indicator columns a second time.
df_total = pd.concat([weekday_df, month_df, df_total], axis=1)
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1916 entries, 2013-01-01 to 2018-03-31
Columns: 520 entries, weekday_0 to is_test
dtypes: bool(1), float64(500), uint8(19)
memory usage: 7.4 MB


In [64]:
# The combined frame must not contain missing values before the sales are shifted.
assert not df_total.isna().any().any(), 'df_total contains missing values'

# Shift sales
Doing this on the combined dataset means the first shifted value for the test set is already calculated from the last training day.

In [66]:
def shift_series(series, days):
    """
    Return a copy of series with its values shifted by `days` rows.

    The index is left untouched; positions that have no source value become
    NaN. A positive `days` moves values towards later rows, a negative one
    towards earlier rows. The shift counts rows, so it equals a shift in
    days only when the index has exactly one row per day.
    """
    # Call shift directly: transform() with a lambda first tries the function
    # on each element, fails, and only then falls back to the whole series.
    return series.shift(days)


def shift_series_in_df(df, series_names=None, days_delta=90):
    """
    Shift columns in df with names in series_names by days_delta rows.

    Negative days_delta will place future values on the current date,
    positive days_delta will place past values on the current date.

    Each result column is named '<name>_future_<n>' or '<name>_past_<n>',
    where n is abs(days_delta); a days_delta of 0 is labelled 'past'.

    Returns a new DataFrame that carries a copy of df's index and contains
    only the shifted columns (no columns when series_names is empty or None).
    df itself is not modified.
    """
    # None instead of a mutable [] default; repeated names collapse to one column.
    names = [] if series_names is None else list(dict.fromkeys(series_names))
    str_sgn = 'future' if days_delta < 0 else 'past'

    # Shift all requested columns in one vectorized call instead of inserting
    # them into the result one at a time.
    ret = df[names].shift(days_delta)
    ret.columns = [f'{sn}_{str_sgn}_{abs(days_delta)}' for sn in names]
    ret.index = df.index.copy()
    return ret

    
def stack_shifted_sales(df, days_delta=90):
    """
    Return df with shifted copies of every sales column appended.

    The sign is inverted before it reaches shift_series_in_df:
      * days_delta > 0 appends shifts of -1 .. -days_delta ('future' columns);
      * days_delta < 0 appends shifts of -days_delta .. +1 ('past' columns);
      * days_delta == 0 appends nothing.

    The shifted columns are those named by storeitems_column_names(). The
    result is a column-wise concatenation of a copy of df followed by one
    shifted frame per offset.
    """
    sales_names = storeitems_column_names()

    if days_delta > 0:
        offsets = range(1, days_delta + 1)
    else:
        offsets = range(days_delta, 0)

    frames = [df.copy()]
    frames.extend(
        shift_series_in_df(df, series_names=sales_names, days_delta=-offset)
        for offset in offsets
    )
    return pd.concat(frames, axis=1, copy=False)

In [67]:
# days_delta=-1 makes stack_shifted_sales shift by +1 row, so every sales column gains a
# '<name>_past_1' companion holding the previous row's value.
# NOTE: df_total is rebound, so re-running this cell appends the shifted columns again.
df_total = stack_shifted_sales(df_total, days_delta=-1)

In [76]:
# The one-row shift leaves NaNs only in the first row, so dropna() must not remove more
# than that row. Check it instead of trusting the comment; "<= 1" keeps the cell re-runnable.
rows_before = len(df_total)
df_total = df_total.dropna()  # this should ONLY remove 1st row
rows_dropped = rows_before - len(df_total)
assert rows_dropped <= 1, 'dropna() removed {} rows, expected at most 1'.format(rows_dropped)
df_total.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1915 entries, 2013-01-02 to 2018-03-31
Freq: D
Columns: 1020 entries, weekday_0 to item_50_store_10_sales_past_1
dtypes: bool(1), float64(1000), uint8(19)
memory usage: 14.7 MB


In [77]:
# Make sure stacked and standard sales columns appear in the same order.
# With the single shift used here, each stacked name is its base name plus one fixed
# suffix, so sorting both groups lexicographically gives them the same relative order.
all_cols = list(df_total.columns)
sales_cols = sorted(col for col in all_cols if '_sales' in col and '_sales_' not in col)
stacked_sales_cols = sorted(col for col in all_cols if '_sales_' in col)

# Build the lookup set once instead of rebuilding two sets for every column.
sales_like_cols = set(sales_cols) | set(stacked_sales_cols)
other_cols = [col for col in all_cols if col not in sales_like_cols]

# Final layout: calendar/flag columns, then shifted sales, then the sales themselves.
new_cols = other_cols + stacked_sales_cols + sales_cols

In [78]:
# Reorder the columns to the layout computed in new_cols (no columns are added or removed,
# since new_cols is a permutation of the existing column names).
df_total = df_total.reindex(columns=new_cols)

In [79]:
# Preview the reordered frame.
df_total.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,month_1,month_2,month_3,...,item_9_store_10_sales,item_9_store_1_sales,item_9_store_2_sales,item_9_store_3_sales,item_9_store_4_sales,item_9_store_5_sales,item_9_store_6_sales,item_9_store_7_sales,item_9_store_8_sales,item_9_store_9_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,0,0,1,0,0,0,0,1,0,0,...,23.0,23.0,21.0,16.0,21.0,20.0,17.0,20.0,28.0,36.0
2013-01-03,0,0,0,1,0,0,0,1,0,0,...,37.0,25.0,29.0,32.0,25.0,15.0,28.0,18.0,31.0,25.0
2013-01-04,0,0,0,0,1,0,0,1,0,0,...,28.0,22.0,32.0,40.0,37.0,20.0,33.0,24.0,46.0,31.0
2013-01-05,0,0,0,0,0,1,0,1,0,0,...,35.0,29.0,45.0,44.0,37.0,23.0,27.0,14.0,35.0,30.0
2013-01-06,0,0,0,0,0,0,1,1,0,0,...,37.0,35.0,41.0,34.0,37.0,29.0,20.0,24.0,34.0,35.0


# Scaling
With combined datasets and shifted sales, we can now correctly min-max scale all data.

In [81]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [82]:
# Scale every column except the weekday/month one-hot indicators.
# NOTE(review): the 'is_test' flag contains neither substring, so it is selected for
# scaling as well — confirm that is intended.
cols_to_scale = [col for col in df_total.columns if 'weekday' not in col and 'month' not in col]

In [83]:
# Fit a single MinMaxScaler on the combined train + test frame and overwrite the selected
# columns with their scaled values.
# NOTE(review): the fit sees the test rows, whose sales are placeholder zeros, so those
# zeros take part in each column's minimum — confirm this is intended before relying on
# the fitted ranges.
scaler = MinMaxScaler(feature_range=(0,1))
scaled_cols = scaler.fit_transform(df_total[cols_to_scale])
df_total[cols_to_scale] = scaled_cols
df_total.head()

Unnamed: 0_level_0,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,month_1,month_2,month_3,...,item_9_store_10_sales,item_9_store_1_sales,item_9_store_2_sales,item_9_store_3_sales,item_9_store_4_sales,item_9_store_5_sales,item_9_store_6_sales,item_9_store_7_sales,item_9_store_8_sales,item_9_store_9_sales
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,0,0,1,0,0,0,0,1,0,0,...,0.191667,0.244681,0.165354,0.148148,0.189189,0.238095,0.209877,0.25641,0.208955,0.327273
2013-01-03,0,0,0,1,0,0,0,1,0,0,...,0.308333,0.265957,0.228346,0.296296,0.225225,0.178571,0.345679,0.230769,0.231343,0.227273
2013-01-04,0,0,0,0,1,0,0,1,0,0,...,0.233333,0.234043,0.251969,0.37037,0.333333,0.238095,0.407407,0.307692,0.343284,0.281818
2013-01-05,0,0,0,0,0,1,0,1,0,0,...,0.291667,0.308511,0.354331,0.407407,0.333333,0.27381,0.333333,0.179487,0.261194,0.272727
2013-01-06,0,0,0,0,0,0,1,1,0,0,...,0.308333,0.37234,0.322835,0.314815,0.333333,0.345238,0.246914,0.307692,0.253731,0.318182


# Split back to train and test set
df_total remains available; it will be needed later to reverse the scaling on the output data (the sales predictions from the model).

In [84]:
# Split the combined frame on the origin flag and discard the flag afterwards.
# The flag is compared with True/False rather than negated directly, so the split also
# works when the flag column holds 0.0/1.0 floats.
train_rows = df_total['is_test'] == False
test_rows = df_total['is_test'] == True
df_train = df_total[train_rows].drop('is_test', axis=1)
df_test = df_total[test_rows].drop('is_test', axis=1)

# Split to X and y (for training)

In [85]:
# Model inputs: the shifted ('_past_') sales columns followed by the calendar indicators.
calendar_tags = ('weekday_', 'month_')
X_cols_stacked = [name for name in df_train.columns if '_past_' in name]
X_cols_caldata = [
    name for name in df_train.columns
    if any(tag in name for tag in calendar_tags)
]
X_cols = X_cols_stacked + X_cols_caldata

X = df_train[X_cols]

In [88]:
# Targets are all columns of df_train that are not used as inputs, kept in frame order.
X_colset = set(X_cols)  # set for constant-time membership checks
y_cols = [col for col in df_train.columns if col not in X_colset]

y = df_train[y_cols]

In [91]:
# Inspect the input frame: shifted sales columns plus calendar indicators.
X.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1825 entries, 2013-01-02 to 2017-12-31
Freq: D
Columns: 519 entries, item_10_store_10_sales_past_1 to month_12
dtypes: float64(500), uint8(19)
memory usage: 7.0 MB


In [92]:
# Inspect the target frame: the unshifted sales columns.
y.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1825 entries, 2013-01-02 to 2017-12-31
Freq: D
Columns: 500 entries, item_10_store_10_sales to item_9_store_9_sales
dtypes: float64(500)
memory usage: 7.0 MB


# Shape for Keras LSTM

In [93]:
# Split into training and validation parts. shuffle=False keeps the row order, so the
# validation part is the last 20% of rows rather than a random sample.
# X and y are passed as DataFrames here; convert to NumPy arrays before any reshaping.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, shuffle=False)

In [95]:
# (rows, target columns) of the full target frame, before the train/validation split.
y.shape

(1825, 500)