In [1]:
import pandas as pd
train = pd.read_csv('train.csv', parse_dates=['date'])
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


# Feature Engineering

In [2]:
train['dayOfWeek'] = train.date.dt.dayofweek
# train['weekOfYear'] = train.date.dt.weekofyear
# train['dayOfYear'] = train.date.dt.dayofyear
train['month'] = train.date.dt.month
train['year'] = train.date.dt.year - train.date.dt.year.min()
# train['median_store'] = train.groupby(["item", "store"])["sales"].transform("median")
# train["median_store_item_month"] = train.groupby(['month', "item", "store"])["sales"].transform("median")
# train["mean_store_item_week"] = train.groupby(["item", "store",'weekOfYear'])["sales"].transform("mean")

target = train['sales']
train.drop(columns=['date', 'sales'], axis = 1, inplace=True)
train

Unnamed: 0,store,item,dayOfWeek,month,year
0,1,1,1,1,0
1,1,1,2,1,0
2,1,1,3,1,0
3,1,1,4,1,0
4,1,1,5,1,0
5,1,1,6,1,0
6,1,1,0,1,0
7,1,1,1,1,0
8,1,1,2,1,0
9,1,1,3,1,0


# Build dataset

In [3]:
categorical = list(train.columns)

categorical.remove('year')

cont = ['year']

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_dev, y_train, y_dev = train_test_split(train, target, test_size=.1, random_state=0, shuffle = True)

In [5]:
X_train = []
X_dev = []

X_train.append(y_train.astype('float32').values)
X_dev.append(y_dev.astype('float32').values)

for cat in categorical:
    X_train.append(x_train[cat].values)
    X_dev.append(x_dev[cat].values)

In [6]:
import keras.backend as K

def custom_smape(x, x_):
    return K.mean(2*K.abs(x-x_)/(K.abs(x)+K.abs(x_)))

Using TensorFlow backend.


# Embeddings

In [7]:
cat_sizes = {}
cat_embsizes = {}
for cat in categorical:
    cat_sizes[cat] = train[cat].nunique()
    cat_embsizes[cat] = min(cat_sizes[cat]//2, int(cat_sizes[cat]**0.5))

In [8]:
cat_sizes, cat_embsizes

({'store': 10, 'item': 50, 'dayOfWeek': 7, 'month': 12},
 {'store': 3, 'item': 7, 'dayOfWeek': 2, 'month': 3})

In [9]:
categorical, cont

(['store', 'item', 'dayOfWeek', 'month'], ['year'])

# Model

In [10]:
from keras.layers import Dense, Dropout, Embedding, Input, Reshape, Concatenate
from keras.models import Model
y = Input((len(cont),), name='cont')
y

<tf.Tensor 'cont:0' shape=(?, 1) dtype=float32>

In [11]:
inputs = [y]
concat = [y]
for cat in categorical:
    x = Input((1,), name=cat)
    inputs.append(x)
    
    x = Embedding(cat_sizes[cat]+1, cat_embsizes[cat], input_length=1)(x)
    x = Reshape((cat_embsizes[cat],))(x)
    
    concat.append(x)
    
inputs, concat

([<tf.Tensor 'cont:0' shape=(?, 1) dtype=float32>,
  <tf.Tensor 'store:0' shape=(?, 1) dtype=float32>,
  <tf.Tensor 'item:0' shape=(?, 1) dtype=float32>,
  <tf.Tensor 'dayOfWeek:0' shape=(?, 1) dtype=float32>,
  <tf.Tensor 'month:0' shape=(?, 1) dtype=float32>],
 [<tf.Tensor 'cont:0' shape=(?, 1) dtype=float32>,
  <tf.Tensor 'reshape_1/Reshape:0' shape=(?, 3) dtype=float32>,
  <tf.Tensor 'reshape_2/Reshape:0' shape=(?, 7) dtype=float32>,
  <tf.Tensor 'reshape_3/Reshape:0' shape=(?, 2) dtype=float32>,
  <tf.Tensor 'reshape_4/Reshape:0' shape=(?, 3) dtype=float32>])

In [12]:
y = Concatenate()(concat)
y = Dense(100, activation= 'relu')(y)
y = Dense(1)(y)

In [13]:
model = Model(inputs=inputs, outputs=y)

model.compile(loss=custom_smape, optimizer='adam')

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
store (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
dayOfWeek (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
month (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_

In [14]:
model.fit([train[cont], train.store, train.item, train.dayOfWeek, train.month], 
          target, batch_size=128, epochs=2)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 5 array(s), but instead got the following list of 4 arrays: [array([[0],
       [0],
       [0],
       ...,
       [4],
       [4],
       [4]]), array([[ 1],
       [ 1],
       [ 1],
       ...,
       [10],
       [10],
       [10]]), array([[ 1],
       [...

In [None]:
train[cont]

# Test

In [None]:
train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv("test.csv", parse_dates=['date'])

test['year'] = test.date.dt.year-train.date.dt.year.min()
test['month'] = test.date.dt.month
test['day'] = test.date.dt.day
test['dayOfWeek'] = test.date.dt.dayofweek
# test['weekOfYear'] = test.date.dt.weekofyear
# test['dayOfYear'] = test.date.dt.dayofyear

In [None]:
X_test = []
X_test.append(test['year'].astype('float32').values)
for cat in categorical:
    X_test.append(test[cat].values)

In [None]:
test

In [None]:
preds = model.predict(X_test)
# sample_data = pd.read_csv("sample_submission.csv", index_col=0)
# sample_data['sales'] = preds
# sample_data.to_csv('preds.csv')
preds