In [46]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K

In [47]:
# read model data as numpy format
input_folder = "../../Data/Model_data/train/"

X_train_id = np.load(input_folder + "X_id.npy")
X_train_name = np.load(input_folder + "X_name.npy")
X_train_category = np.load(input_folder + "X_category.npy")
X_train_description = np.load(input_folder + "X_description.npy")
X_train_shipping =np.load(input_folder + "X_shipping.npy")
X_train_brand = np.load(input_folder + "X_brand.npy")
X_train_condition = np.load(input_folder + "X_condition.npy")
y_log_price = np.load(input_folder + "y_price.npy")

In [48]:
#padd the descirption and name seq to be 0
MAX_NAME_LEN = None
MAX_DES_LEN = 100

X_train_name = pad_sequences(X_train_name, maxlen=MAX_NAME_LEN)
X_train_description = pad_sequences(X_train_description, maxlen=MAX_DES_LEN)

In [49]:
#construct model
def rmsle_cust(y_true, y_pred):
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.sqrt(K.mean(K.square(first_log - second_log), axis=-1))

In [50]:
MAX_TEXT = np.max([np.max(X_train_description), np.max(X_train_name)]) + 1
MAX_BRAND = np.max(X_train_brand) + 1
MAX_CATEGORY = np.max(X_train_category) + 1
MAX_CONDITION = np.max(X_train_condition) + 1

In [51]:
MAX_BRAND

3003

In [52]:
X_train_description.shape

(1482535, 100)

Basic GRU model

In [62]:
#params
dr_r = 0.5
    
#Inputs
name = Input(shape=[X_train_name.shape[1]], name = "name")
item_desc = Input(shape=[X_train_description.shape[1]], name="item_desc")
brand_name = Input(shape=[1], name="brand_name")
category_name = Input(shape=[1], name="category_name")
item_condition = Input(shape=[1], name="item_condition")
shipping = Input(shape=[1], name="shipping")
    
#Embeddings layers
word_emb = Embedding(MAX_TEXT, 50, mask_zero = True)

emb_name = word_emb(name)
emb_item_desc = word_emb(item_desc)

emb_brand_name = Embedding(MAX_BRAND, 8)(brand_name)
emb_category_name = Embedding(MAX_CATEGORY, 8)(category_name)
emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
#rnn layer
rnn_layer1 = GRU(16) (emb_item_desc)
rnn_layer2 = GRU(12) (emb_name)
    
#main layer
main_l = concatenate([
        Flatten() (emb_brand_name)
        , Flatten() (emb_category_name)
        , Flatten() (emb_item_condition)
        , rnn_layer1
        , rnn_layer2
        , shipping
    ])
main_l = Dropout(dr_r) (Dense(128) (main_l))
main_l = Dropout(dr_r) (Dense(64) (main_l))
    
#output
output = Dense(1, activation="linear") (main_l)
    
#model
model = Model([name, item_desc, brand_name
                   , category_name, item_condition, shipping], output)
model.compile(loss='mse', optimizer="adam", metrics=["mae"])

In [64]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
brand_name (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
category_name (InputLayer)       (None, 1)             0                                            
____________________________________________________________________________________________________
item_condition (InputLayer)      (None, 1)             0                                            
____________________________________________________________________________________________________
item_desc (InputLayer)           (None, 100)           0                                            
___________________________________________________________________________________________

In [65]:
#split train and validation data
train_percent = 0.9
shuffled_idx = np.arange(X_train_id.shape[0])
np.random.shuffle(shuffled_idx)

train_idx = shuffled_idx[:int(X_train_id.shape[0] * 0.9)]
valid_idx = shuffled_idx[int(X_train_id.shape[0] * 0.9):]

X = [X_train_name,  X_train_description, X_train_brand, X_train_category, \
      X_train_condition, X_train_shipping]

X_train = [arr[train_idx] for arr in X]
X_valid = [arr[valid_idx] for arr in X]

y_train = y_log_price[train_idx]
y_valid = y_log_price[valid_idx]

In [66]:
BATCH_SIZE = 20000
epochs = 10

history = model.fit(X_train, y_train, epochs=epochs, shuffle = True, batch_size=BATCH_SIZE
          , validation_data=(X_valid, y_valid)
          , verbose=1)

Train on 1334281 samples, validate on 148254 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [67]:
#save trained model
out_folder = "../Saved_models/"
model.save(out_folder + "model1.h5")

  str(node.arguments) + '. They will not be included '
  str(node.arguments) + '. They will not be included '


In [71]:
val_preds = model.predict(X_valid)
val_preds = np.exp(val_preds)-1

In [73]:
y_true = np.exp(y_valid) - 1
y_pred = val_preds[:,0]

import math
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5

v_rmsle = rmsle(y_true, y_pred)
print(" RMSLE error on dev test: "+str(v_rmsle))

 RMSLE error on dev test: 0.476860351667
