In [1]:
import os
import numpy as np

from keras.preprocessing import sequence
import keras.models as km
import keras.layers as kl
import keras.constraints as kc
from keras.datasets import imdb
from keras import losses
from keras.models import load_model

import matplotlib.pyplot as plt
%matplotlib inline

import ipywidgets as ipw

import data_preparation as mdp
from data_preparation import MercariConfig

Using TensorFlow backend.


In [2]:
# --- Hyper-parameters shared by the cells below ---
batch_size = 200
embedding_dims = 64  # width used for both word embeddings (name and item description)

# --- Item name ---
# Vocabulary size handed to the name embedding.
# NOTE(review): WORD_I is presumably the offset for reserved word indices - confirm in MercariConfig.
num_words_name = MercariConfig.MAX_WORDS_FROM_INDEX_4_NAME + MercariConfig.WORD_I
# One extra position because every sequence starts with the <START> token.
max_seq_len_name = MercariConfig.MAX_WORDS_IN_NAME + 1

# --- Item description ---
# Vocabulary size handed to the item-description embedding (same construction as for names).
num_words_item_desc = MercariConfig.MAX_WORDS_FROM_INDEX_4_ITEM_DESC + MercariConfig.WORD_I
# One extra position because every sequence starts with the <START> token.
max_seq_len_item_desc = MercariConfig.MAX_WORDS_IN_ITEM_DESC + 1

In [3]:
%%time

# Load the prepared training table and the pre-computed word-index sequences
# for the item name and the item description.
train_data = mdp.load_data(MercariConfig.TRAINING_SET_PREP_FILE, sep=',')
name_seq_train = mdp.load_index_sequence(
    MercariConfig.TRAINING_NAME_INDEX_FILE, MercariConfig.MAX_WORDS_IN_NAME)
item_desc_seq_train = mdp.load_index_sequence(
    MercariConfig.TRAINING_ITEM_DESC_INDEX_FILE, MercariConfig.MAX_WORDS_IN_ITEM_DESC)

# Convert everything to plain NumPy arrays for Keras.
# `.values` is used instead of `.as_matrix()`: the latter was deprecated in
# pandas 0.23 and removed in pandas 1.0, while `.values` works on old and new
# pandas alike and returns the same array.
X_name_seq_train = name_seq_train.values
X_item_desc_seq_train = item_desc_seq_train.values
x_cat_train = train_data['category_id'].values
x_brand_train = train_data['brand_id'].values
x_f_train = train_data[['item_condition_id', 'shipping']].values
y_train = train_data['price'].values

2018-01-28 19:25:27,686 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_train_name_index.csv ...
2018-01-28 19:25:28,341 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_train_name_index.csv done.
2018-01-28 19:25:28,342 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_train_item_desc_index.csv ...
2018-01-28 19:25:44,188 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_train_item_desc_index.csv done.


CPU times: user 15.7 s, sys: 1.98 s, total: 17.7 s
Wall time: 17.7 s


In [9]:
# Show the shape of every training array; the first dimension should be the
# same for all of them.
print(*(arr.shape for arr in (
    X_name_seq_train, X_item_desc_seq_train,
    x_cat_train, x_brand_train, x_f_train, y_train)))

(296507, 21) (296507, 501) (296507,) (296507,) (296507, 2) (296507,)


In [7]:
%%time

# Load the prepared validation table and the pre-computed word-index sequences
# for the item name and the item description.
val_data = mdp.load_data(MercariConfig.VALIDATION_SET_PREP_FILE, sep=',')
name_seq_val = mdp.load_index_sequence(
    MercariConfig.VALIDATION_NAME_INDEX_FILE, MercariConfig.MAX_WORDS_IN_NAME)
item_desc_seq_val = mdp.load_index_sequence(
    MercariConfig.VALIDATION_ITEM_DESC_INDEX_FILE, MercariConfig.MAX_WORDS_IN_ITEM_DESC)

# Convert everything to plain NumPy arrays for Keras.
# `.values` is used instead of `.as_matrix()`: the latter was deprecated in
# pandas 0.23 and removed in pandas 1.0, while `.values` works on old and new
# pandas alike and returns the same array.
X_name_seq_val = name_seq_val.values
X_item_desc_seq_val = item_desc_seq_val.values
x_cat_val = val_data['category_id'].values
x_brand_val = val_data['brand_id'].values
x_f_val = val_data[['item_condition_id', 'shipping']].values
y_val = val_data['price'].values

2018-01-28 19:27:53,258 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_val_name_index.csv ...
2018-01-28 19:27:53,319 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_val_name_index.csv done.
2018-01-28 19:27:53,320 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_val_item_desc_index.csv ...
2018-01-28 19:27:55,045 - MerL.data_preparation - INFO - Loading index_sequence_data from mercari_val_item_desc_index.csv done.


CPU times: user 1.89 s, sys: 16 ms, total: 1.91 s
Wall time: 1.9 s


In [10]:
# Show the shape of every validation array; the first dimension should be the
# same for all of them.
print(*(arr.shape for arr in (
    X_name_seq_val, X_item_desc_seq_val,
    x_cat_val, x_brand_val, x_f_val, y_val)))

(29651, 21) (29651, 501) (29651,) (29651,) (29651, 2) (29651,)


In [11]:
%%time

# Bring every index sequence to the fixed length expected by the model inputs.
# Both padding and truncation happen at the end of a sequence ('post'), so the
# leading <START> token always stays in position 0.

# Item names
X_name_seq_train = sequence.pad_sequences(
    X_name_seq_train,
    maxlen=max_seq_len_name,
    padding='post',
    truncating='post',
)
X_name_seq_val = sequence.pad_sequences(
    X_name_seq_val,
    maxlen=max_seq_len_name,
    padding='post',
    truncating='post',
)

# Item descriptions
X_item_desc_seq_train = sequence.pad_sequences(
    X_item_desc_seq_train,
    maxlen=max_seq_len_item_desc,
    padding='post',
    truncating='post',
)
X_item_desc_seq_val = sequence.pad_sequences(
    X_item_desc_seq_val,
    maxlen=max_seq_len_item_desc,
    padding='post',
    truncating='post',
)

CPU times: user 4.45 s, sys: 352 ms, total: 4.8 s
Wall time: 4.8 s


In [None]:
# Multi-input price regression model: two stacked-LSTM text branches (item name,
# item description), two embedding branches (category id, brand id) and two raw
# numeric features are concatenated and fed through a dense head.

# --- Inputs ---
# feature_input carries two values per sample (the training cell feeds
# 'item_condition_id' and 'shipping'); category/brand are single integer ids;
# the text inputs are fixed-length word-index sequences.
feature_input = kl.Input(shape=(2,), name='feature_input')
category_input = kl.Input(shape=(1,), name='category_input')
brand_input = kl.Input(shape=(1,), name='brand_input')
item_desc_input = kl.Input(shape=(max_seq_len_item_desc,), name='item_desc_input')
name_input = kl.Input(shape=(max_seq_len_name,), name='name_input')

# --- Item-description branch layers ---
# SpatialDropout1D drops whole embedding channels instead of single values.
# CuDNNLSTM is the cuDNN-backed LSTM; per the Keras docs it only runs on a GPU
# with the TensorFlow backend. The first LSTM returns the full sequence so the
# second one can consume it; the second returns only its final output.
item_desc_embedding = kl.Embedding(num_words_item_desc, embedding_dims, name='item_desc_embedding')
item_desc_embedding_dropout = kl.SpatialDropout1D(0.5, name='item_desc_embedding_dropout')
item_desc_lstm_1 = kl.CuDNNLSTM(units=200, name='item_desc_lstm_1', return_sequences=True)
item_desc_lstm_2 = kl.CuDNNLSTM(units=200, name='item_desc_lstm_2')
item_desc_lstm_dropout = kl.Dropout(0.5, name='item_desc_lstm_dropout')

# --- Item-name branch layers (same architecture as the description branch) ---
name_embedding = kl.Embedding(num_words_name, embedding_dims, name='name_embedding')
name_embedding_dropout = kl.SpatialDropout1D(0.5, name='name_embedding_dropout')
name_lstm_1 = kl.CuDNNLSTM(units=200, name='name_lstm_1', return_sequences=True)
name_lstm_2 = kl.CuDNNLSTM(units=200, name='name_lstm_2')
name_lstm_dropout = kl.Dropout(0.5, name='name_lstm_dropout')

# --- Category / brand embeddings ---
# NOTE(review): 1105 and 2774 are presumably the number of distinct category
# and brand ids in the prepared data - confirm, and consider deriving them
# from the data or MercariConfig instead of hard-coding.
# The embedding of a length-1 input has shape (1, 32); Reshape flattens it to (32,).
category_embedding = kl.Embedding(1105, 32, name='category_embedding')
category_reshape = kl.Reshape(target_shape=(32,), name='category_reshape')

brand_embedding = kl.Embedding(2774, 32, name='brand_embedding')
brand_reshape = kl.Reshape(target_shape=(32,), name='brand_reshape')

# --- Fusion head ---
# The concatenated vector has 200 + 200 + 32 + 32 + 2 = 466 features.
# NOTE(review): the single output unit uses ReLU, so predictions are clamped
# to be non-negative - confirm this is intended for the price target.
input_fusion = kl.Concatenate(axis=1, name='input_fusion')
fusion_dense_1 = kl.Dense(400, activation='relu', name='fusion_dense_1')
fusion_dense_2 = kl.Dense(200, activation='relu', name='fusion_dense_2')
fusion_dense_3 = kl.Dense(1, activation='relu', name='fusion_dense_3')

# --- Wire the item-description branch ---
item_desc_output = item_desc_embedding(item_desc_input)
item_desc_output = item_desc_embedding_dropout(item_desc_output)
item_desc_output = item_desc_lstm_1(item_desc_output)
item_desc_output = item_desc_lstm_2(item_desc_output)
item_desc_output = item_desc_lstm_dropout(item_desc_output)

# --- Wire the item-name branch ---
name_output = name_embedding(name_input)
name_output = name_embedding_dropout(name_output)
name_output = name_lstm_1(name_output)
name_output = name_lstm_2(name_output)
name_output = name_lstm_dropout(name_output)

# --- Wire the category and brand branches ---
category_output = category_embedding(category_input)
category_output = category_reshape(category_output)

brand_output = brand_embedding(brand_input)
brand_output = brand_reshape(brand_output)

# --- Fuse all branches and run the dense head ---
output = input_fusion([name_output, item_desc_output, category_output, brand_output, feature_input])
output = fusion_dense_1(output)
output = fusion_dense_2(output)
prediction = fusion_dense_3(output)

# The order of `inputs` here is the order in which arrays must be passed to
# fit / evaluate / predict (features, category, brand, name, item description).
# It differs from the concatenation order above.
model = km.Model(inputs=[feature_input, category_input, brand_input, name_input, item_desc_input], outputs=prediction)

In [None]:
# Render the model architecture as an inline SVG graph.
# model_to_dot relies on pydot and a Graphviz installation providing the
# 'dot' program.
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts) of `model`.
model.summary()

In [13]:
from keras import backend as K
import keras

def root_mean_squared_logarithmic_error(y_true, y_pred):
    """Square root of Keras' mean squared logarithmic error.

    Args:
        y_true: tensor with the target values.
        y_pred: tensor with the predicted values.

    Returns:
        ``K.sqrt`` applied to the result of
        ``losses.mean_squared_logarithmic_error(y_true, y_pred)``.

    NOTE(review): the Keras loss averages over the last axis only, so the
    root is taken per sample rather than over the whole batch. For a model
    with a single output unit this looks equivalent to a mean absolute
    logarithmic error - confirm that this is the intended metric.
    """
    return K.sqrt(losses.mean_squared_logarithmic_error(y_true, y_pred))

def root_mean_squared_error(y_true, y_pred):
    """Square root of Keras' mean squared error.

    Args:
        y_true: tensor with the target values.
        y_pred: tensor with the predicted values.

    Returns:
        ``K.sqrt`` applied to the result of
        ``losses.mean_squared_error(y_true, y_pred)``.

    NOTE(review): the Keras loss averages over the last axis only, so the
    root is taken per sample rather than over the whole batch. For a model
    with a single output unit this looks equivalent to a mean absolute
    error - confirm that this is the intended loss.
    """
    return K.sqrt(losses.mean_squared_error(y_true, y_pred))

In [None]:
# Alternative configuration kept for reference: optimise RMSLE and report RMSE.
#model.compile(optimizer='adam', loss=root_mean_squared_logarithmic_error, metrics=[root_mean_squared_error])
# Active configuration: optimise the custom RMSE loss with Adam and report the
# custom RMSLE as an additional metric.
model.compile(optimizer='adam', loss=root_mean_squared_error, metrics=[root_mean_squared_logarithmic_error])

In [14]:
# Restore a previously saved model. The custom loss / metric functions have to
# be passed by name so Keras can resolve them while deserialising.
# NOTE(review): this rebinds `model`, replacing any model built and compiled in
# earlier cells, and the file name is resolved relative to the current working
# directory.
model = load_model(
    'merl_model-v3_10.h5',
    custom_objects={
        'root_mean_squared_logarithmic_error': root_mean_squared_logarithmic_error,
        'root_mean_squared_error': root_mean_squared_error,
    },
)

In [None]:
# Directory for this run's TensorBoard event files.
# NOTE(review): get_new_tf_log_dir presumably returns a fresh directory per call - confirm.
tf_log_dir = MercariConfig.get_new_tf_log_dir()

# TensorBoard logging of the scalar metrics and the graph; histograms,
# gradients, images and embeddings are switched off.
tb_callback = keras.callbacks.TensorBoard(
    log_dir=tf_log_dir, histogram_freq=0, batch_size=batch_size,
    write_graph=True, write_grads=False, write_images=False,
    embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)

# NOTE(review): a ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3,
# min_lr=0.001) callback was sketched here but never enabled. With min_lr equal
# to Adam's default learning rate (0.001) it could never lower the rate, so
# pick a smaller min_lr before adding it to `callbacks`.

# Train for 5 epochs. The input arrays are passed in the order of the model's
# inputs: features, category, brand, name, item description.
history_simple = model.fit(
    [x_f_train, x_cat_train, x_brand_train, X_name_seq_train, X_item_desc_seq_train], y_train,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    # Previously the callback was created but never handed to fit(), so
    # nothing was written to tf_log_dir.
    callbacks=[tb_callback],
    shuffle=True,
    initial_epoch=0,
    # Keras documents validation_data as a tuple (x_val, y_val); a list can be
    # misread as "inputs only" by newer Keras versions.
    validation_data=([x_f_val, x_cat_val, x_brand_val, X_name_seq_val, X_item_desc_seq_val], y_val))

Train on 296507 samples, validate on 29651 samples
Epoch 1/5
 34600/296507 [==>...........................] - ETA: 10:17 - loss: 34.9703 - root_mean_squared_logarithmic_error: 2.6240

In [None]:
# Create the target directory first so that a finished (and expensive) training
# run is not lost because the folder does not exist yet.
if MercariConfig.MODEL_DIR:
    os.makedirs(MercariConfig.MODEL_DIR, exist_ok=True)
model.save(os.path.join(MercariConfig.MODEL_DIR, 'my_model.h5'))

In [None]:
# Score the model on the validation split. The inputs follow the order of the
# model's inputs; the cell displays whatever evaluate() returns.
model.evaluate(
    x=[x_f_val, x_cat_val, x_brand_val, X_name_seq_val, X_item_desc_seq_val],
    y=y_val,
    batch_size=batch_size,
    verbose=1,
    sample_weight=None,
    steps=None,
)

In [None]:
# Predict prices for the validation split; the inputs follow the order of the
# model's inputs.
y_pred = model.predict(
    x=[x_f_val, x_cat_val, x_brand_val, X_name_seq_val, X_item_desc_seq_val],
    batch_size=batch_size,
    verbose=1,
    steps=None,
)

In [None]:
import pandas as pd

# Human-readable columns of the validation set, side by side with the prediction.
# .copy() makes this an independent frame: adding a column to a bare slice of
# `val_data` triggers pandas' SettingWithCopyWarning.
df = val_data[['name', 'item_condition_id', 'category_name', 'brand_name', 'shipping', 'price']].copy()
# predict() is expected to return one column per output unit, i.e. shape (n, 1)
# here; flatten it to 1-D so it is stored as an ordinary column.
df['price_pre'] = y_pred.ravel()

In [None]:
# Display the validation rows whose true price is above 100, to compare
# `price` against `price_pre` for expensive items.
df[df.price > 100]