In [1]:
## Load library 
import gc
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from datetime import datetime, timedelta

## For LSTM, import Keras libraries and packages
from keras.models import load_model, Model, Sequential
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from keras.optimizers import Adam

# Load data
sales = pd.read_csv('data/sales_train_evaluation.csv')
calendar = pd.read_csv('data/calendar.csv')

# Adding sales for test data: d_1942-d_1969
# for d in range(1942, 1970):
#     col = 'd_' + str(d)
#     sales[col] = 0
#     sales[col] = sales[col].astype(np.int16)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
## Function to downcast data
def downcast(df):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
                
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
                
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df 

In [3]:
%%time

sales = downcast(sales)
calendar = downcast(calendar)

Wall time: 2min 44s


**Sales data**

In [4]:
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


* Transpose data so that rows are days and columns are 30490 unique items.
* Remove the first 6 rows which are categorical variables and the first 350 days. 
* Reset index so that a new column is created denoting the rows. 
* Rename that column to `d` (easier to merge with calendar data later. 

In [13]:
%%time 

startday = 350
sales = sales.T[6+startday:].reset_index().rename(columns={'index':'d'})

sales.head()

Wall time: 3.73 s


Unnamed: 0,d,0,1,2,3,4,5,6,7,8,...,30480,30481,30482,30483,30484,30485,30486,30487,30488,30489
0,d_351,0,0,0,2,0,0,0,24,3,...,0,9,1,0,11,0,0,1,0,0
1,d_352,0,0,0,0,0,0,0,9,0,...,0,5,4,0,8,0,1,2,0,0
2,d_353,0,0,0,4,2,0,0,2,1,...,0,15,2,0,3,0,1,2,0,0
3,d_354,0,1,0,2,0,0,0,7,1,...,0,5,1,0,3,0,0,0,0,0
4,d_355,0,0,0,1,2,0,0,0,0,...,0,7,1,0,1,0,1,1,0,0


**Calendar dataset**

In [5]:
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


* `date` variable already converted to `datetime` during downcast.
* Use this to create time variables such as day of the week, week of the year, month , quarter, year and day of the month will help capture some of the seasonality present in the data. 

In [17]:
## Function to create time variables
def create_timevars(data):
    # Create time variables
    date_feats = {'wday':'weekday', 
                  'week':'weekofyear', 
                  'month':'month', 
                  'quarter':'quarter',
                  'year':'year',
                  'days':'day'}

    for date_name,date_attr in date_feats.items():
        if date_name in data.columns:
            data[date_name] = data[date_name].astype('int16')
        else:
            data[date_name] = getattr(data['date'].dt, date_attr).astype('int16') # note .dt changes date to something thats not datetime64
            # returns series indexed like original series and extracts datetime attribute
            
    data = downcast(data)
            
    return data

In [58]:
%%time

calendar = create_timevars(calendar)

calendar.head()

Wall time: 7.01 ms


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,week,quarter,days
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0,4,1,29
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0,4,1,30
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0,5,1,31
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0,5,1,1
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1,5,1,2


In [22]:
calendar = calendar.drop(['date', 'wm_yr_wk', 'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI'], axis=1)

calendar.head()

Unnamed: 0,wday,month,year,d,week,quarter,days
0,1,1,2011,d_1,4,1,29
1,2,1,2011,d_2,4,1,30
2,3,1,2011,d_3,5,1,31
3,4,2,2011,d_4,5,1,1
4,5,2,2011,d_5,5,1,2


Save days 1942-1969 as test data.

In [72]:
calendar_test = calendar.iloc[1941:1969].drop(['date', 'wm_yr_wk', 'weekday', 'd', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI'], axis=1)

In [73]:
j=14

In [78]:
calendar_test[0][1941]

KeyError: 0

**Combine Sales and Calendar datasets**

In [28]:
%%time

df = pd.merge(sales, calendar, on='d')
df = df.drop(['d'], axis=1)

df.head()

Wall time: 1.17 s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30486,30487,30488,30489,wday,month,year,week,quarter,days
0,0,0,0,2,0,0,0,24,3,2,...,0,1,0,0,1,1,2012,2,1,14
1,0,0,0,0,0,0,0,9,0,2,...,1,2,0,0,2,1,2012,2,1,15
2,0,0,0,4,2,0,0,2,1,1,...,1,2,0,0,3,1,2012,3,1,16
3,0,1,0,2,0,0,0,7,1,0,...,0,0,0,0,4,1,2012,3,1,17
4,0,0,0,1,2,0,0,0,0,0,...,1,1,0,0,5,1,2012,3,1,18


In [37]:
%%time

## Feature scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0,1))
df_scaled = sc.fit_transform(df)

Wall time: 4.44 s


**Create Training datasets**

* The shape of `X_train` is $(m,T_x,n_f)$, where $m$ is the number of training examples, $T_x$ is the timesteps and $n_f$ is the number of features. 
* $T_x=14$ is the timestep. This means past 14 day sales will be used for prediction. 
* $m=1941-350-14$, since we discarded the first 350 days and use 14 as the timesteps. 
* $n_f=30496$ as there are 30490 items and 6 additional time variables are also used as features. 
* `y_train` is $(T_y,m,n_f-6)$, where $T_y=1$ as only 1 day of sales is being predicted and the 6 time variables do not exist for this dataset.

In [38]:
%%time

X_train = []
y_train = []
timestep = 14

for i in range(timestep, 1941-startday):
    X_train.append(df_scaled[i-timestep:i])
    y_train.append(df_scaled[i][0:30490])

Wall time: 69.1 ms


In [39]:
%%time

X_train = np.array(X_train)
y_train = np.array(y_train)

print('Shape of X:', X_train.shape)
print('Shape of y:', y_train.shape)

Shape of X: (1577, 14, 30496)
Shape of y: (1577, 30490)
Wall time: 3.39 s


In [44]:
del df_scaled
del calendar
del sales

**LSTM Model**

In [48]:
# Initialize RNN model
model = Sequential()

# LSTM layer 1 with dropout regularization
layer_1_units = 50
model.add(LSTM(units=layer_1_units, return_sequences=True, input_shape=(X_train.shape[1],X_train.shape[2])))
model.add(Dropout(0.2))

# LSTM layer 2 with dropout regularization
layer_2_units=400
model.add(LSTM(units=layer_2_units, return_sequences=True))
model.add(Dropout(0.2))

# LSTM layer 3 with dropout
layer_3_units=400
model.add(LSTM(units=layer_3_units))
model.add(Dropout(0.2))

# Dense layer for output
model.add(Dense(units=30490))

# Compile RNN model
model.compile(optimizer='adam', loss='mean_squared_error')

# Fit RNN model
n_epoch = 32
n_batch = 44
model.fit(X_train, y_train, epochs=n_epoch, batch_size=n_batch)


Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<keras.callbacks.callbacks.History at 0x15ba11abc88>

**Prediction**

* Last 14 days will be used to make prediction for the 15th day.
* So for day 1942, days 1928-1941 (14 days) will be used. This is from the training dataset. 
* Moving to day 1943, we have prediction for day 1942 and we need last 13 days from training dataset. 
* Similarly, for day 1944, we have prediction for day 1942 and 1943 and we use last 12 days from training dataset. 
* And so on...

In [50]:
pred_df = df[-timestep:]
pred_df = sc.transform(pred_df)

In [51]:
X_test = []
X_test.append(pred_df[0:timestep])
X_test = np.array(X_test)

print('Shape of X test:', X_test.shape)

Shape of X test: (1, 14, 30496)


In [120]:
%%time

predictions = []

for j in range(timestep, timestep+28):
    predicted_sales = model.predict(X_test[0,j-timestep:j].reshape(1,timestep,30496))
    test_input = np.column_stack((np.array(predicted_sales), np.array(calendar_test[j-timestep:j-timestep+1])))
    X_test = np.append(X_test, test_input).reshape(1, j+1, 30496)
    predicted_sales = sc.inverse_transform(test_input)[:,0:30490]
    predictions.append(predicted_sales)


Wall time: 349 ms


In [121]:
np.array(predictions).shape

(28, 1, 30490)

**Submission**

In [132]:
submission = pd.DataFrame(np.array(predictions).reshape(28,30490))
submission = submission.T
submission = pd.concat((submission, submission), ignore_index=True)

In [133]:
sample_submission = pd.read_csv('data/sample_submission.csv')

# Add id's from sample submission provided
submission[['id']] = sample_submission[['id']]

In [136]:
# Reorganize the columns since id column is the last column now
# Rename forecast days columns
cols = list(submission.columns)
cols = cols[-1:] + cols[:-1]
submission = submission[cols]
submission.columns = ['id'] + [f'F{i}' for i in range(1,29)]

In [138]:
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,1.003818,0.993,1.027685,1.095407,1.275451,1.308772,1.062468,0.975875,0.980926,...,0.876543,0.872891,0.857538,0.843735,0.836405,0.835358,0.851546,0.868296,0.867287,0.866462
1,HOBBIES_1_002_CA_1_validation,0.299119,0.250813,0.213037,0.222291,0.299348,0.361891,0.327362,0.289735,0.239913,...,0.007339,0.013693,0.017372,0.01851,0.01825,0.016686,0.024883,0.034966,0.038179,0.041196
2,HOBBIES_1_003_CA_1_validation,0.412555,0.459211,0.542744,0.643202,0.836725,0.901668,0.634962,0.40547,0.430802,...,0.5912,0.592522,0.584458,0.575195,0.570143,0.571775,0.580643,0.589828,0.58968,0.588684
3,HOBBIES_1_004_CA_1_validation,1.974074,1.534765,1.411934,1.349055,1.624742,2.863577,2.416484,1.240286,0.783681,...,1.073959,1.100335,1.092743,1.064321,1.029363,1.001394,1.003758,1.016354,1.009427,0.999412
4,HOBBIES_1_005_CA_1_validation,1.257914,1.280309,1.274262,1.235707,1.328084,1.698083,1.511122,1.184125,1.085186,...,0.695504,0.704096,0.691735,0.675009,0.663045,0.659511,0.68977,0.72159,0.720754,0.720456


In [139]:
submission.to_csv('data/simple_lstm.csv', index=False)