In [None]:
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np 
import pandas as pd 
import os, re, pickle, collections
import tensorflow as tf
import random, math
from gensim.models import Word2Vec
from time import time

In [None]:
# some functions to do cleaning of text fields, and splitting out general category from sub categories
def prepare_data(df):
    """Clean the text columns of a listings frame and derive text features.

    The frame is modified in place and is also returned.

    Processing steps:
      * ``name``, ``brand_name``, ``item_description`` and
        ``item_condition_id`` are cast to ``str``, stripped of every character
        that is neither a word character nor whitespace, and lower-cased.
      * ``category_name`` is cast to ``str`` and lower-cased, then split on
        "/" into ``general_cat``, ``subcat_1`` and ``subcat_2``. A value with
        fewer than three levels yields "No Label" in all three columns.
      * ``desc_len`` is the whitespace-token count of ``item_description``.
      * ``other_info`` is name, the three category levels, brand and condition,
        each followed by one space; ``other_info_len`` is its token count
        (counted after punctuation is stripped).

    Missing values are not special-cased: they go through ``astype(str)`` like
    any other value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``name``, ``brand_name``, ``item_description``,
        ``item_condition_id`` and ``category_name``. May be empty.

    Returns
    -------
    pandas.DataFrame
        The same frame with the cleaned and derived columns.
    """
    punctuation = re.compile(r'[^\w\s]')
    text_columns = ['name', 'brand_name', 'item_description', 'item_condition_id']
    category_columns = ['general_cat', 'subcat_1', 'subcat_2']
    info_columns = ['name', 'general_cat', 'subcat_1', 'subcat_2', 'brand_name', 'item_condition_id']

    def _normalize(text):
        # strip punctuation first, then lower-case
        return punctuation.sub('', text).lower()

    def _split_cat(text):
        levels = text.split("/")
        if len(levels) >= 3:
            return levels[0:3]
        return ["No Label", "No Label", "No Label"]

    def _token_count(text):
        return len(_normalize(text).split())

    for column in text_columns:
        df[column] = df[column].astype(str).apply(_normalize)
    df['category_name'] = df['category_name'].astype(str).apply(str.lower)

    # general category and the two sub-categories; building the columns from a
    # list (rather than unpacking zip(*...)) also works for an empty frame
    split_levels = [_split_cat(category) for category in df['category_name']]
    for position, column in enumerate(category_columns):
        df[column] = [levels[position] for levels in split_levels]

    # length of the item description in tokens
    df['desc_len'] = df['item_description'].apply(_token_count)

    # string together the remaining text fields column-wise (every field is
    # followed by a single space, including the last one), then get the length
    other_info = df[info_columns[0]] + " "
    for column in info_columns[1:]:
        other_info = other_info + df[column] + " "
    df['other_info'] = other_info
    df['other_info_len'] = df['other_info'].apply(_token_count)
    return df

Read in data and do some cleaning

In [None]:
# Load the full training set (tab-separated). For a quick trial run, read only
# the first rows instead:
# df = pd.read_csv("../input/train.tsv", sep='\t', nrows=1000)
df = pd.read_csv("../input/train.tsv", sep='\t')
df.head()
# prepare_data cleans the text columns and adds the derived feature columns
df = prepare_data(df)

Build the corpus; a skip-gram model is then trained on this corpus using the gensim implementation

In [None]:
# Build the corpus: one token list per text field per row, in the order
# other_info, item_description for each row. Iterating over the column values
# avoids a label lookup per cell and does not depend on the frame's index.
corpus = []
for other_info, description in zip(df['other_info'], df['item_description']):
    corpus.append(other_info.split())
    corpus.append(description.split())

# get word2vec embeddings using gensim (sg=1 selects skip-gram)
embedding_size = 50
vocab_size = 50000
w2v_model = Word2Vec(corpus, size=embedding_size, min_count=0, max_vocab_size=vocab_size, workers=8, negative=5, sg=1, iter=1,
                 batch_words=10000, sorted_vocab=1)

# dictionary maps word -> row of word_embeddings; reverse_dictionary maps back
words = list(w2v_model.wv.vocab)
dictionary = {word: index for index, word in enumerate(words)}
word_embeddings = np.zeros((len(words), embedding_size), dtype=float)
for word, index in dictionary.items():
    word_embeddings[index, :] = w2v_model.wv.get_vector(word)
reverse_dictionary = {index: word for word, index in dictionary.items()}

# persist the embedding matrix; the with-block closes the file handle
with open("word_embeddings.p", "wb") as embeddings_file:
    pickle.dump(word_embeddings, embeddings_file)
del w2v_model, words, corpus

In [None]:
# The embedding matrix was built as float64; cast it to float32 so its dtype
# conforms with the tensorflow graph built later on.
word_embeddings = word_embeddings.astype(np.float32)

Prepare data for RNN-GRU model

Convert strings in data to integer indices according to the dictionary mapping

Only the following need to be kept: log price, item_description and its length, and other_info and its length

The model extracts information from item_description, as well as from the other fields (name, brand, category, and item condition), to predict prices. Two RNNs are used: one for item_description and one for the other information

In [None]:
max_D, max_O = 75, 20
def to_array(df, vocab=None, max_desc=None, max_other=None):
    """Encode the text columns of ``df`` as one numeric array.

    Each text is split on whitespace and every token found in the vocabulary
    is replaced by its integer id; tokens not in the vocabulary are skipped.
    At most ``max_length`` ids are kept per text and unused slots stay 0, so
    the accompanying length column is what tells padding from real ids.

    Rows are read positionally, so the frame does not need a default
    ``RangeIndex``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs string columns ``item_description`` and ``other_info``; an
        optional ``price`` column adds the target.
    vocab : mapping of str to int, optional
        Token-to-id mapping. Defaults to the notebook-level ``dictionary``.
    max_desc, max_other : int, optional
        Maximum ids kept for ``item_description`` / ``other_info``. Default to
        the notebook-level ``max_D`` / ``max_O``.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(df), max_desc + max_other + 2)``, laid out as
        description ids, other-info ids, description length, other-info
        length. When ``price`` is present a final column holds
        ``log(price + 1)``.
    """
    vocab = dictionary if vocab is None else vocab
    max_desc = max_D if max_desc is None else max_desc
    max_other = max_O if max_other is None else max_other

    def _encode(texts, max_length):
        # convert each whitespace-separated text to a row of vocabulary ids,
        # and record how many ids were written per row
        indices = np.zeros((len(texts), max_length))
        lengths = np.zeros((len(texts), 1))
        for row, text in enumerate(texts):
            kept = 0
            for token in text.split():
                # checked first so that max_length == 0 never writes out of bounds
                if kept == max_length:
                    break
                if token in vocab:
                    indices[row, kept] = vocab[token]
                    kept += 1
            lengths[row] = kept
        return indices, lengths

    desc_ind, desc_len = _encode(df['item_description'], max_desc)
    other_ind, other_len = _encode(df['other_info'], max_other)
    data = np.hstack((desc_ind, other_ind, desc_len, other_len))
    if 'price' in df.columns:
        log_price = np.log(df['price'] + 1).values.reshape((len(df), 1)).astype('float32')
        data = np.hstack((data, log_price))
    return data

# Encode the cleaned frame as one numeric array, then drop the DataFrame name
# (it is not used again below)
data = to_array(df)
del df
data.shape

In [None]:
def train_valid_split(array):
    """Split the rows of ``array`` 2:1 into training and validation sets.

    Rows whose position is 0 or 1 modulo 3 go to training (all the
    position-0 rows first, then all the position-1 rows); rows whose position
    is 2 modulo 3 go to validation. Every row lands in exactly one of the two
    outputs, including the last ``len(array) % 3`` rows.

    Parameters
    ----------
    array : numpy.ndarray
        Two-dimensional array, one sample per row.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train, valid)`` row subsets of ``array``.
    """
    n_rows = len(array)
    # strided ranges run to the end of the array, so no trailing rows are lost
    train_id = np.hstack((np.arange(0, n_rows, 3), np.arange(1, n_rows, 3)))
    valid_id = np.arange(2, n_rows, 3)
    return array[train_id, :], array[valid_id, :]
# Split the encoded rows into training and validation arrays
train, valid = train_valid_split(data)
train.shape, valid.shape
# the combined array is not used again below
del data

In [None]:
# helper function to get a batch of data for training
def get_batch(array, start, size):
    """Return rows ``start`` to ``start + size`` of ``array`` and the next offset.

    The slice is clipped at the end of ``array``, so the last batch may hold
    fewer than ``size`` rows. When the batch reaches the final row the next
    offset wraps around to 0; otherwise it is ``start + size``.
    """
    total = len(array)
    stop = min(start + size, total)
    next_start = 0 if stop == total else start + size
    return array[start:stop, :], next_start

In [None]:
# Model hyperparameters. word_dim must equal the word2vec embedding size
# because the embedding matrix is used directly as the embedding variable.
word_dim = 50
assert word_dim == embedding_size
num_hidden = 50 # dimensionality of the hidden state of the RNN cell (a GRU cell in the graph below)
learning_rate = 0.001 # consider experimenting with the learning rate

In [None]:
# Build the TensorFlow graph: embed both token sequences, run each through a
# GRU, and combine the two final states linearly to predict log(price + 1).
tf.reset_default_graph()
graph = tf.Graph()
with graph.as_default():
    # ops are pinned to the first GPU; the sessions below are created with
    # allow_soft_placement=True
    with tf.device('/gpu:0'):
        # inputs: padded token-index matrices, per-row sequence lengths, and the target
        desc_indices = tf.placeholder(tf.int32, shape=[None, max_D])
        info_indices = tf.placeholder(tf.int32, shape=[None, max_O])
        # int8 is wide enough here because max_D and max_O are both below 128
        desc_seq_length = tf.placeholder(tf.int8, [None,])
        info_seq_length = tf.placeholder(tf.int8, [None,])
        log_price = tf.placeholder(tf.float32, [None,1])
        # embedding matrix initialised from the word2vec vectors
        embeddings = tf.Variable(word_embeddings)

        x_desc = tf.nn.embedding_lookup(embeddings, desc_indices) # this is of shape batch_size, max_D, word_dim
        x_desc = tf.unstack(x_desc, max_D, 1) # split along the time axis: one tensor per step
        x_info = tf.nn.embedding_lookup(embeddings, info_indices) # this is of shape batch_size, max_O, word_dim
        x_info = tf.unstack(x_info, max_O, 1) # split along the time axis: one tensor per step

        # NOTE(review): the same cell object is passed to both static_rnn calls,
        # which presumably means the two sequences share one set of GRU weights.
        # Confirm this is intended, since the notes describe two RNNs.
        gru_cell = tf.nn.rnn_cell.GRUCell(num_hidden)
        outputs_desc, state_desc = tf.nn.static_rnn(gru_cell, x_desc, sequence_length=desc_seq_length, dtype=tf.float32)
        outputs_info, state_info = tf.nn.static_rnn(gru_cell, x_info, sequence_length=info_seq_length, dtype=tf.float32)

        # linear read-out: one weight vector per final state plus a shared bias
        weights_desc = tf.Variable(tf.random_normal([num_hidden,1]))
        weights_info = tf.Variable(tf.random_normal([num_hidden,1]))
        biases = tf.Variable(tf.random_normal([1,1]))

        pred = tf.matmul(state_desc, weights_desc) + tf.matmul(state_info, weights_info) + biases
        error = log_price - pred
        loss = tf.sqrt(tf.reduce_mean(tf.square(error))) # root mean square log error
        train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
        
        init = tf.global_variables_initializer()
    # created inside the graph context, after all variables above exist
    saver = tf.train.Saver()
print("flow graph constructed")

In [None]:
def get_feed_dict(batch):
    """Map a batch in the ``to_array`` column layout onto the graph placeholders.

    Expected columns of ``batch``:
      * ``[0, max_D)``               description token ids
      * ``[max_D, max_D + max_O)``   other-info token ids
      * ``max_D + max_O``            description length
      * ``max_D + max_O + 1``        other-info length
      * ``max_D + max_O + 2``        log price (labelled data only)

    The offsets are derived from ``max_D`` and ``max_O`` so they stay in step
    with the placeholder shapes if those constants change. ``log_price`` is
    fed only when the batch has the extra target column.

    Returns
    -------
    dict
        Placeholder -> array slice, ready to pass as ``feed_dict``.
    """
    desc_end = max_D
    info_end = max_D + max_O
    feed_dict = {
        desc_indices: batch[:, 0:desc_end],
        info_indices: batch[:, desc_end:info_end],
        desc_seq_length: batch[:, info_end],
        info_seq_length: batch[:, info_end + 1],
    }
    # labelled batches carry one extra column after the two length columns
    if batch.shape[1] == info_end + 3:
        feed_dict[log_price] = batch[:, info_end + 2:info_end + 3]
    return feed_dict

In [None]:
def predict(session, data, batch_size):
    """Run the ``pred`` op over ``data`` in batches and stack the results.

    Parameters
    ----------
    session : tf.Session
        Session in which the graph's variables are initialised or restored.
        This argument is the session actually used; no module-level session
        is consulted.
    data : numpy.ndarray
        Rows in the layout expected by ``get_feed_dict``.
    batch_size : int
        Maximum number of rows per ``session.run`` call.

    Returns
    -------
    numpy.ndarray
        One prediction row per row of ``data``, in the same order.
    """
    # ceil(len(data) / batch_size): exactly enough batches to cover every row
    # once, with no extra wrap-around batch when the length is an exact
    # multiple of batch_size
    n_batches = -(-len(data) // batch_size)
    chunks = []
    start = 0
    for _ in range(n_batches):
        batch, start = get_batch(data, start, batch_size)
        chunks.append(session.run(pred, feed_dict=get_feed_dict(batch)))
    if not chunks:
        # no rows to predict; pred has a single output column
        return np.zeros((0, 1), dtype=np.float32)
    # stack once at the end instead of re-copying the result on every step
    pred_all = np.vstack(chunks)
    assert len(pred_all) == len(data)
    return pred_all

In [None]:
# Training configuration and bookkeeping.
# NOTE(review): valid_start is assigned here but never read in the code shown.
train_start, valid_start, time_initial = 0, 0, time()
batch_size, epochs = 500, 2
# number of get_batch calls per epoch; the last batch of an epoch may be shorter
no_batches = len(train)//batch_size + 1
avg_loss, time_0 = 0, time()
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True), graph=graph) as sess:
    # initialise all graph variables in this session
    init.run()
    for epoch in range(epochs):
        for step in range(no_batches):
            if step % 1000 == 0: print(step)
            # get_batch also returns the next offset, which wraps to 0 at the end of train
            batch, train_start = get_batch(train, train_start, batch_size)
            _, loss_step = sess.run([train_op, loss], feed_dict=get_feed_dict(batch))
            avg_loss += loss_step
        # mean of the per-batch losses for this epoch
        avg_loss /= no_batches
        print("epoch: %2d, average loss: %.3f, time: %2.2f" 
              %(epoch, avg_loss, time()-time_0))
        # reset the accumulators for the next epoch
        avg_loss, time_0 = 0, time()
        # calculate validation statistics
        valid_pred = predict(sess, valid, batch_size)
        # column 97 holds the log price (the slice get_feed_dict feeds as the target)
        valid_log_price = valid[:, 97:98]
        valid_rsme = np.sqrt(np.mean((valid_pred - valid_log_price)**2))
        print("validation rsme: %.4f" %valid_rsme)
        
        # checkpoint after every epoch; each save writes to the same path
        save_path = saver.save(sess,"./params.ckpt")
        print("saved parameters to: ", save_path)
        
    print("done training")
    print("total training time is: %4d" %(time() - time_initial))

In [None]:
# The training and validation arrays are not used past this point.
del train, valid
# For a quick trial run, read only part of the file:
# test = pd.read_csv("../input/test_stg2.tsv", sep='\t', nrows=12345)
test_frame = pd.read_csv("../input/test_stg2.tsv", sep='\t')
test_frame = prepare_data(test_frame)
# encode with the same vocabulary and layout as the training data
test = to_array(test_frame)
del test_frame
print(test.shape)

In [None]:
# Restore the saved parameters and predict log prices for the test set.
session_config = tf.ConfigProto(allow_soft_placement=True)
with tf.Session(config=session_config, graph=graph) as sess:
    saver.restore(sess, "./params.ckpt")
    test_pred = predict(sess, test, batch_size)
assert len(test_pred) == len(test)
# invert the log(price + 1) transform applied to the training target
prices = np.exp(test_pred) - 1
# one row per test item: the row position becomes test_id
out = pd.DataFrame(prices, columns=["price"])
out.index.name = "test_id"
out = out.reset_index()

In [None]:
# preview the first rows of the output frame (test_id, price)
out.head()

In [None]:
# write the predictions without the DataFrame index column
out.to_csv("./out.csv", index=False)
print("output saved to csv")