In [1]:
# Import Libraries
import os
import numpy as np
import pipeline as pl
import tensorflow as tf
import pandas as pd
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sn

# Data Structure Modification

* Given that the dataset has continuous relevance score, neural network model have treated as a regression problem. However, due to the high rmse results, this is an attempt to restructure the dataset into a classification problem and see if neural network model works better. 
* The dataset has been restructured into the following:

Label | Original Value
------------- |:-------------
0 | 1
1 | 1.25, 1.3300000000000001
2 | 1.5, 1.6699999999999999
3 | 1.75, 2.0
4 | 2.25, 2.3300000000000001
5 | 2.5, 2.6699999999999999
6 | 2.75, 3.0

In [2]:
def balanced(train):
    downsize = train[train['relevance'] == 2.3300000000000001].sample(n = 1760, random_state = 2017)
    downsize = pd.concat([downsize, train[train['relevance'] == 2.6699999999999999].sample(n = 1760,\
                                                                                           random_state = 2017),\
                          train[train['relevance'] == 3.0].sample(n = 1760,random_state = 2017),\
                          train[train['relevance'] == 2.0].sample(n = 1760,random_state = 2017),\
                         train[train['relevance'] == 1.6699999999999999].sample(n = 1760,random_state = 2017),
                         train[train['relevance'] == 1.3300000000000001].sample(n = 1760,random_state = 2017)]\
                         ,axis = 0)
    rest_ = train.loc[train['relevance'] != 2.3300000000000001]
    rest_1 = rest_.loc[rest_['relevance'] != 1.3300000000000001]
    rest_1_ = rest_1.loc[rest_1['relevance'] != 1.6699999999999999]
    rest_2 = rest_1_.loc[rest_1_['relevance'] != 2.6699999999999999]
    rest_3 = rest_2.loc[rest_2['relevance'] != 3.0]
    rest_data = rest_3.loc[rest_3['relevance'] != 2.0]
    down_train = pd.concat([rest_data, downsize], 0)
    return down_train

def classified(data):
    zero = data[data['relevance'] == 1].copy()
    zero['relevance'] = 0
    one = data[(data['relevance'] == 1.25) | (data['relevance'] == 1.3300000000000001)].copy()
    one['relevance'] = 1
    two = data[(data['relevance'] == 1.5) | (data['relevance'] == 1.6699999999999999)].copy()
    two['relevance'] = 2
    three = data[(data['relevance'] == 1.75) | (data['relevance'] == 2.0)].copy()
    three['relevance'] = 3
    four = data[(data['relevance'] == 2.25) | (data['relevance'] == 2.3300000000000001)].copy()
    four['relevance'] = 4
    five = data[(data['relevance'] == 2.5) | (data['relevance'] == 2.6699999999999999)].copy()
    five['relevance'] = 5
    six = data[(data['relevance'] == 2.75) | (data['relevance'] == 3.0)].copy()
    six['relevance'] = 6
    classified = pd.concat([zero,one,two,three,four,five,six], 0)
    return classified

## Preprocessing Data

In [16]:
def corpus(filename='temp/classification/train.csv'):
    import csv
    reader = csv.reader(open(filename))
    data = []
    for row in reader:
        i_d,product_uid,product_title,search_term,relevance,name,value = row
        dic = {'search_term':search_term, 'product_title': product_title,'relevance': relevance, 'attr_1':value}
        data.append(dic)
    del data[0]
    return data

def tokenise(input):
    return input.split(' ')

def pipeline(data,vocab=None, max_title_len_=None, max_query_len_=None,max_attr_len_=None):
    
    # Vocab
    exist_vocab = True
    if vocab is None:
        exist_vocab = False
        vocab = {'<PAD>': 0, '<OOV>':1}
    
    ## Placeholder ## 
    
    # Product Title
    max_title_len = -1
    data_title = []
    # Search Term
    max_query_len = -1
    data_query = []
    # Attr Term
    max_attr_len = -1
    data_attr = []
    
    # Relevance
    data_relevance = []
    
    ## Processing ## 
    for instance in data:
        # Product Title Processing
        title = instance['product_title']
        t_tokenised = tokenise(title)
        each_title = []
        for token in t_tokenised:
            if not exist_vocab and token not in vocab:
                vocab[token] = len(vocab)
            if token not in vocab:
                token_id = vocab['<OOV>']
            else:
                token_id = vocab[token]
            each_title.append(token_id)
        # Product Title Length
        if len(each_title) > max_title_len:
            max_title_len = len(each_title)
        data_title.append(each_title)
            
        # Search Term Processing
        query = instance['search_term']
        q_tokenised = tokenise(query)
        each_query = []
        for qtoken in q_tokenised:
            if not exist_vocab and qtoken not in vocab:
                vocab[qtoken] = len(vocab)
            if qtoken not in vocab:
                qtoken_id = vocab['<OOV>']
            else:
                qtoken_id = vocab[qtoken]
            each_query.append(qtoken_id) 
        # Search Terms Length
        if len(each_query) > max_query_len:
            max_query_len = len(each_query)
        data_query.append(each_query)
        

        # Attr Processing
        attr = instance['attr_1']
        a_tokenised = tokenise(attr)
        each_attr = []
        for atoken in a_tokenised:
            if not exist_vocab and atoken not in vocab:
                vocab[atoken] = len(vocab)
            if atoken not in vocab:
                atoken_id = vocab['<OOV>']
            else:
                atoken_id = vocab[atoken]
            each_attr.append(atoken_id) 
        # Search Terms Length
        if len(each_attr) > max_attr_len:
            max_attr_len = len(each_attr)
        data_attr.append(each_attr)
            
        # Relevance 
        data_relevance.append(instance['relevance'])

    if max_title_len_ is not None:
        max_title_len = max_title_len_
    out_title = np.full([len(data_title), max_title_len], vocab['<PAD>'], dtype=np.int32)
    
    for index, item in enumerate(data_title):
        if len(item) <= out_title.shape[1]:
            out_title[index, 0:len(item)] = item
    
    if max_query_len_ is not None:
        max_query_len = max_query_len_
    out_query = np.full([len(data_query), max_query_len], vocab['<PAD>'], dtype=np.int32)
    
    for index, q in enumerate(data_query):
        out_query[index, 0:len(q)] = q
        
    if max_attr_len_ is not None:
        max_attr_len = max_attr_len_
    out_attr = np.full([len(data_attr), max_attr_len], vocab['<PAD>'], dtype=np.int32)
    
    for index, item in enumerate(data_attr):
        if len(item) <= out_title.shape[1]:
            out_attr[index, 0:len(item)] = item
        
    out_relevance = np.array(data_relevance, dtype=np.float64)
    
    return out_title, out_query, out_relevance, out_attr , vocab

In [17]:
# Loading the corpus

train = corpus()
test = corpus(filename='temp/classification/dev.csv')

In [18]:
# convert train set to integer IDs
train_title, train_query, train_relevance,train_attr, vocab = pipeline(train)

In [19]:
# get the length of the longest title
max_title_len = train_title.shape[1]

# get the length of the longest query
max_query_len = train_query.shape[1]

# get the length of the longest query
max_attr_len = train_attr.shape[1]

# convert dev set to integer IDs, based on the train's information
dev_title, dev_query, dev_relevance,dev_attr,_ = pipeline(test,vocab=vocab,max_query_len_= max_query_len, \
                                                                max_title_len_ = max_title_len\
                                                               ,max_attr_len_ = max_attr_len)

## Model

In [20]:
### MODEL PARAMETERS ###
target_size = 1
vocab_size = len(vocab)
#qvocab_size = len(qvocab)
#avocab_size = len(avocab)
input_size = 10
hidden_size = 30
output_size = 1
num_of_layers = 3
prob_keep = 0.6

In [21]:
# PLACEHOLDERS

title = tf.placeholder(tf.int64, [None, None], "title")        # [batch_size x max_title_length]
query = tf.placeholder(tf.int64, [None, None], "query")        # [batch_size x max_query_length]
attr = tf.placeholder(tf.int64, [None, None], "attr")       # [batch_size x max_attr_length]
relevance = tf.placeholder(tf.int64, [None], "relevance")      # [batch_size]

#batch_size = tf.shape(title)[0]

# WORD EMBEDDINGS

initializer = tf.random_uniform_initializer(-0.1, 0.1)
embeddings = tf.get_variable("E", [vocab_size, input_size], initializer=initializer)
title_embedded = tf.nn.embedding_lookup(embeddings, title)     # [batch_size x max_title_length x input_size]
query_embedded = tf.nn.embedding_lookup(embeddings, query)     # [batch_size x max_query_length x input_size]
attr_embedded = tf.nn.embedding_lookup(embeddings, attr)       # [batch_size x max_attr_length x input_size]

# MODEL CONSTRUCTION

with tf.variable_scope("query") as varscope:
    cell_ = tf.contrib.rnn.LSTMCell(hidden_size,state_is_tuple=True)
    cell = tf.contrib.rnn.DropoutWrapper(cell=cell_, output_keep_prob=prob_keep)
    mcell = tf.contrib.rnn.MultiRNNCell([cell]*num_of_layers, state_is_tuple=True)
    output, final_state_q = tf.nn.dynamic_rnn(mcell, query_embedded, dtype=tf.float32)

with tf.variable_scope("title") as varscope:
    cell_t = tf.contrib.rnn.LSTMCell(hidden_size,state_is_tuple=True)
    cellt = tf.contrib.rnn.DropoutWrapper(cell=cell_t, output_keep_prob=prob_keep)
    mcellt = tf.contrib.rnn.MultiRNNCell([cellt]*num_of_layers, state_is_tuple=True)
    output, final_state_t = tf.nn.dynamic_rnn(mcellt, query_embedded,initial_state = final_state_q, dtype=tf.float32)

with tf.variable_scope("attr") as varscope:
    cell_a = tf.contrib.rnn.LSTMCell(hidden_size,state_is_tuple=True)
    cella = tf.contrib.rnn.DropoutWrapper(cell=cell_a, output_keep_prob=prob_keep)
    mcella = tf.contrib.rnn.MultiRNNCell([cella]*num_of_layers, state_is_tuple=True)
    output, final_state_a = tf.nn.dynamic_rnn(mcella, query_embedded,initial_state = final_state_t, dtype=tf.float32)
    joint_h = final_state_a[num_of_layers - 1].h
    
# LOSS FUNCTION 
logits_flat = tf.contrib.layers.linear(joint_h, 7) # 7 is the number of classes 
logits = tf.reshape(logits_flat, [-1, 7])          # [BATCH_SIZE x 7]

loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = logits, labels = relevance))

# Prediction function
softmax = tf.nn.softmax(logits)
predict = tf.arg_max(softmax,1)

# OPTIMIZER
opt_op = tf.train.AdamOptimizer().minimize(loss)

In [22]:
def calculate_accuracy(true_relevance,predicted_relevance):
    num_correct = np.sum(true_relevance == predicted_relevance)
    num_total =  true_relevance.shape[0]
    return num_correct / num_total

In [42]:
# TRAINING

BATCH_SIZE = 100
EPOCH = 50
train_loss_list = []
train_rmse_list = []
train_pred_list = []
dev_rmse_list = []
dev_pred_list = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    n = train_title.shape[0]
    
    # index to draw random batch of train data
    idx = np.arange(n)

    for epoch in range(EPOCH):
        print('----- Epoch', epoch, '-----')
        total_loss = 0
        for i in range(n // BATCH_SIZE):
            inst_title = train_title[idx[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]]
            inst_query = train_query[idx[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]]
            inst_attr = train_attr[idx[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]]
            inst_relevance = train_relevance[idx[i * BATCH_SIZE: (i + 1) * BATCH_SIZE]]
            feed_dict = {title: inst_title, relevance: inst_relevance, query: inst_query,attr: inst_attr}
            _, current_loss = sess.run([opt_op, loss], feed_dict=feed_dict)
            total_loss += current_loss
        train_loss_list.append(total_loss/n)
        print(' Train loss: ', total_loss / n)

        train_feed_dict = {title: train_title, relevance: train_relevance, query: train_query,attr: train_attr}
        train_predicted = sess.run(predict, feed_dict=train_feed_dict)
        train_pred_list.append(train_predicted)
        train_accuracy = calculate_accuracy(train_relevance, train_predicted)
        train_rmse_list.append(train_accuracy)
        print(' Train Accuracy: ', train_accuracy)
        
        dev_feed_dict = {title: dev_title, relevance: dev_relevance, query:dev_query,attr:dev_query}
        dev_predicted = sess.run(predict, feed_dict=dev_feed_dict)
        dev_pred_list.append(dev_predicted)
        dev_accuracy = calculate_accuracy(dev_relevance, dev_predicted)
        dev_rmse_list.append(dev_accuracy)
        print(' Dev Accuracy: ', dev_accuracy)

    
    #save_model(sess)

----- Epoch 0 -----
 Train loss:  0.0174351349863
 Train Accuracy:  0.143099547511
 Dev Accuracy:  0.218943661972
----- Epoch 1 -----
 Train loss:  0.0151850038702
 Train Accuracy:  0.143099547511
 Dev Accuracy:  0.218943661972
----- Epoch 2 -----
 Train loss:  0.0132748848616
 Train Accuracy:  0.143261150614
 Dev Accuracy:  0.219072769953
----- Epoch 3 -----
 Train loss:  0.012887986696
 Train Accuracy:  0.145685197156
 Dev Accuracy:  0.223943661972
----- Epoch 4 -----
 Train loss:  0.0126389788094
 Train Accuracy:  0.143745959922
 Dev Accuracy:  0.226737089202
----- Epoch 5 -----
 Train loss:  0.0117607764375
 Train Accuracy:  0.14067550097
 Dev Accuracy:  0.220950704225
----- Epoch 6 -----
 Train loss:  0.00915638474192
 Train Accuracy:  0.142533936652
 Dev Accuracy:  0.215845070423
----- Epoch 7 -----
 Train loss:  0.00839146358125
 Train Accuracy:  0.145281189399
 Dev Accuracy:  0.197429577465
----- Epoch 8 -----
 Train loss:  0.0081925446164
 Train Accuracy:  0.144957983193
 Dev 

KeyboardInterrupt: 