In [1]:
import numpy as np
import tensorflow as tf
import pickle
import datetime
import TACNN
import sys

tf.flags.DEFINE_string("word2vec", None, "Word2vec file with pre-trained embeddings (default: None)")
tf.flags.DEFINE_string("valid_data","../data/music.test", " Data for validation")
tf.flags.DEFINE_string("para_data", "../data/music.para", "Data parameters")
tf.flags.DEFINE_string("train_data", "../data/music.train", "Data for training")
tf.app.flags.DEFINE_string('f', '', 'kernel')
# ==================================================

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 50, "Dimensionality of character embedding ")
tf.flags.DEFINE_string("filter_sizes", "3", "Comma-separated filter sizes ")
tf.flags.DEFINE_integer("num_filters", 100, "Number of filters per filter size")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability ")
tf.flags.DEFINE_float("l2_reg_lambda", 0.001, "L2 regularizaion lambda")
# Training parameters
tf.flags.DEFINE_integer("batch_size", 100, "Batch Size ")
tf.flags.DEFINE_integer("num_epochs", 40, "Number of training epochs ")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

In [2]:
def train_step(u_batch, i_batch, uid, iid, reuid, reiid, y_batch,batch_num):
    """
    A single training step
    """
    feed_dict = {
        deep.input_u: u_batch,
        deep.input_i: i_batch,
        deep.input_uid: uid,
        deep.input_iid: iid,
        deep.input_y: y_batch,
        deep.input_reuid: reuid,
        deep.input_reiid: reiid,
        deep.drop0: 0.8,

        deep.dropout_keep_prob: FLAGS.dropout_keep_prob
    }
    _, step, loss, accuracy, mae, u_a, i_a, fm = sess.run(
        [train_op, global_step, deep.loss, deep.accuracy, deep.mae, deep.u_a, deep.i_a, deep.score],
        feed_dict)
    time_str = datetime.datetime.now().isoformat()
    #print("{}: step {}, loss {:g}, rmse {:g},mae {:g}".format(time_str, batch_num, loss, accuracy, mae))
    return accuracy, mae, u_a, i_a, fm


def dev_step(u_batch, i_batch, uid, iid, reuid, reiid, y_batch, writer=None):
    """
    Evaluates model on a dev set

    """
    feed_dict = {
        deep.input_u: u_batch,
        deep.input_i: i_batch,
        deep.input_y: y_batch,
        deep.input_uid: uid,
        deep.input_iid: iid,
        deep.input_reuid: reuid,
        deep.input_reiid: reiid,
        deep.drop0: 1.0,
        deep.dropout_keep_prob: 1.0
    }
    step, loss, accuracy, mae = sess.run(
        [global_step, deep.loss, deep.accuracy, deep.mae],
        feed_dict)
    time_str = datetime.datetime.now().isoformat()
    # print("{}: step{}, loss {:g}, rmse {:g},mae {:g}".format(time_str, step, loss, accuracy, mae))

    return [loss, accuracy, mae]

In [3]:
def extract_word2vec(vocabulary):
    # initial matrix with random uniform
    u = 0
    initW = np.random.uniform(-1.0, 1.0, (len(vocabulary), FLAGS.embedding_dim))
    # load any vectors from the word2vec
    print("Load word2vec u file {}\n".format(FLAGS.word2vec))
    with open(FLAGS.word2vec, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in range(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            idx = 0

            if word in vocabulary:
                u = u + 1
                idx = vocabulary[word]
                initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return initW

In [4]:
def extract_glove(vocabulary, glove_file, vocab_size=400000):
    """
    Use pre-trained GloVe embedding as alternative to word2vec.
    Load in file with specified vocab size and dimensional embedding size.
    If word exists in user/item vocabulary, replace the row with the values.
    """
    # initial matrix with random uniform
    u = 0
    initW = np.random.uniform(-1.0, 1.0, (len(vocabulary), FLAGS.embedding_dim))
    
    print("Load glove u file {}\n".format(glove_file))
    with open(glove_file, "r", encoding="utf-8") as f:
        for i in range(vocab_size):
            line = f.readline()
            word_embed = line.split()
            word = word_embed[0]
            embed = [float(x) for x in word_embed[1:]]
            idx = 0

            if word in vocabulary:
                u = u + 1
                idx = vocabulary[word]
                initW[idx] = embed
    return initW

Trying GloVe: https://nlp.stanford.edu/projects/glove/

In [5]:
glove_file = "../data/glove.6B.50d.txt" 
use_glove = True # turn on if use glove instead of word2vec

In [6]:
num_epochs = 40

In [7]:
if __name__ == '__main__':
    FLAGS = tf.flags.FLAGS
    FLAGS(sys.argv)
    print("\nParameters:")
    for attr, value in sorted(FLAGS.__flags.items()):
        print("{}={}".format(attr.upper(), value))
    print("")

    print("Loading data...")
    pkl_file = open(FLAGS.para_data, 'rb')

    para = pickle.load(pkl_file)
    user_num = para['user_num']
    item_num = para['item_num']
    review_num_u = para['review_num_u']
    review_num_i = para['review_num_i']
    review_len_u = para['review_len_u']
    review_len_i = para['review_len_i']
    vocabulary_user = para['user_vocab']
    vocabulary_item = para['item_vocab']
    train_length = para['train_length']
    test_length = para['test_length']
    u_text = para['u_text']
    i_text = para['i_text']
    item_review = para['item_text']

    np.random.seed(2020)
    random_seed = 2020
    print("num users: " + str(user_num))
    print("num items: " + str(item_num))
    print(review_num_u)
    print(review_len_u)
    print(review_num_i)
    print(review_len_i)
    
    with tf.Graph().as_default():

        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        session_conf.gpu_options.allow_growth = True
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            deep = TACNN.TACNN(
                review_num_u=review_num_u,
                review_num_i=review_num_i,
                review_len_u=review_len_u,
                review_len_i=review_len_i,
                user_num=user_num,
                item_num=item_num,
                num_classes=1,
                user_vocab_size=len(vocabulary_user),
                item_vocab_size=len(vocabulary_item),
                embedding_size=FLAGS.embedding_dim,
                embedding_id=32,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                attention_size=32,
                n_latent=32)
            tf.set_random_seed(random_seed)
            print (user_num)
            print (item_num)
            global_step = tf.Variable(0, name="global_step", trainable=False)

            # optimizer = tf.train.AdagradOptimizer(learning_rate=0.01, initial_accumulator_value=1e-8).minimize(deep.loss)
            optimizer = tf.train.AdamOptimizer(0.002, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(deep.loss)
            
            train_op = optimizer  # .apply_gradients(grads_and_vars, global_step=global_step)


            sess.run(tf.initialize_all_variables())

            saver = tf.train.Saver()

            if FLAGS.word2vec:
                initW = extract_word2vec(vocabulary_user) # extract for user vocab
                sess.run(deep.W1.assign(initW))
                
                initW = extract_word2vec(vocabulary_item) # extract for item vocab
                sess.run(deep.W2.assign(initW))
                # word2vec code
                
            elif use_glove:
                initW = extract_glove(vocabulary_user, glove_file)
                sess.run(deep.W1.assign(initW))
                
                initW = extract_glove(vocabulary_item, glove_file)
                sess.run(deep.W2.assign(initW))

            epoch = 1
            best_mae = 5
            best_rmse = 5
            train_mae = 0
            train_rmse = 0

            pkl_file = open(FLAGS.train_data, 'rb')

            train_data = pickle.load(pkl_file)

            train_data = np.array(train_data)
            pkl_file.close()

            pkl_file = open(FLAGS.valid_data, 'rb')

            test_data = pickle.load(pkl_file)
            test_data = np.array(test_data)
            pkl_file.close()

            data_size_train = len(train_data)
            data_size_test = len(test_data)
            batch_size = FLAGS.batch_size
            ll = int(len(train_data) / batch_size)
            print('ll')
            print(ll)
            for epoch in range(num_epochs):
                # Shuffle the data at each epoch
                shuffle_indices = np.random.permutation(np.arange(data_size_train))
                #shuffled_data = train_data[shuffle_indices]
                shuffled_data = train_data
                for batch_num in range(ll):

                    start_index = batch_num * batch_size
                    end_index = min((batch_num + 1) * batch_size, data_size_train)
                    data_train = shuffled_data[start_index:end_index]

                    uid, iid, reuid, reiid, y_batch = zip(*data_train)
                    u_batch = []
                    i_batch = []
                    item_id = np.zeros(100)
                    for i in range(len(uid)):
                        u_batch.append(u_text[uid[i][0]])
                        i_batch.append(i_text[iid[i][0]])
                        item_id[i] = iid[i]
                    u_batch = np.array(u_batch)
                    i_batch = np.array(i_batch)

                    t_rmse, t_mae, u_a, i_a, fm = train_step(u_batch, i_batch, uid, iid, reuid, reiid, y_batch,batch_num)
                    i_attention = np.zeros((100,16))
                    i_attention = i_a
                    current_step = tf.train.global_step(sess, global_step)
                    train_rmse += t_rmse
                    train_mae += t_mae
                    if batch_num % 500 == 0 and batch_num > 1:
                        print("\nEvaluation:")
                        print (batch_num)

                        loss_s = 0
                        accuracy_s = 0
                        mae_s = 0

                        ll_test = int(len(test_data) / batch_size) + 1
                        for batch_num in range(ll_test):
                            start_index = batch_num * batch_size
                            end_index = min((batch_num + 1) * batch_size, data_size_test)
                            data_test = test_data[start_index:end_index]

                            userid_valid, itemid_valid, reuid, reiid, y_valid = zip(*data_test)
                            u_valid = []
                            i_valid = []
                            for i in range(len(userid_valid)):
                                u_valid.append(u_text[userid_valid[i][0]])
                                i_valid.append(i_text[itemid_valid[i][0]])
                            u_valid = np.array(u_valid)
                            i_valid = np.array(i_valid)

                            loss, accuracy, mae = dev_step(u_valid, i_valid, userid_valid, itemid_valid, reuid, reiid,
                                                           y_valid)
                            loss_s = loss_s + len(u_valid) * loss
                            accuracy_s = accuracy_s + len(u_valid) * np.square(accuracy)
                            mae_s = mae_s + len(u_valid) * mae
                        print ("loss_valid {:g}, rmse_valid {:g}, mae_valid {:g}".format(loss_s / test_length,
                                                                                         np.sqrt(
                                                                                             accuracy_s / test_length),
                                                                                         mae_s / test_length))
                        rmse = np.sqrt(accuracy_s / test_length)
                        mae = mae_s / test_length
                        if best_rmse > rmse:
                            best_rmse = rmse
                        if best_mae > mae:
                            best_mae = mae
                        print("")

                print(str(epoch) + ':\n')
                print("\nEvaluation:")
                print ("train:rmse,mae:", train_rmse / ll, train_mae / ll)
                u_a = np.reshape(u_a[0], (1, -1))
                i_a = np.reshape(i_a[0], (1, -1))

                train_rmse = 0
                train_mae = 0

                loss_s = 0
                accuracy_s = 0
                mae_s = 0

                ll_test = int(len(test_data) / batch_size) + 1
                for batch_num in range(ll_test):
                    start_index = batch_num * batch_size
                    end_index = min((batch_num + 1) * batch_size, data_size_test)
                    data_test = test_data[start_index:end_index]

                    userid_valid, itemid_valid, reuid, reiid, y_valid = zip(*data_test)
                    u_valid = []
                    i_valid = []
                    for i in range(len(userid_valid)):
                        u_valid.append(u_text[userid_valid[i][0]])
                        i_valid.append(i_text[itemid_valid[i][0]])
                    u_valid = np.array(u_valid)
                    i_valid = np.array(i_valid)

                    loss, accuracy, mae = dev_step(u_valid, i_valid, userid_valid, itemid_valid, reuid, reiid, y_valid)
                    loss_s = loss_s + len(u_valid) * loss
                    accuracy_s = accuracy_s + len(u_valid) * np.square(accuracy)
                    mae_s = mae_s + len(u_valid) * mae
                print ("loss_valid {:g}, rmse_valid {:g}, mae_valid {:g}".format(loss_s / test_length,
                                                                                 np.sqrt(accuracy_s / test_length),
                                                                                 mae_s / test_length))
                rmse = np.sqrt(accuracy_s / test_length)
                mae = mae_s / test_length
                if best_rmse > rmse:
                    best_rmse = rmse
                if best_mae > mae:
                    best_mae = mae
                print("")
            print ('best rmse:', best_rmse)
            print ('best mae:', best_mae)


Parameters:
ALLOW_SOFT_PLACEMENT=<absl.flags._flag.BooleanFlag object at 0x000001ECDDD4B208>
BATCH_SIZE=<absl.flags._flag.Flag object at 0x000001ECDDD4B0B8>
DROPOUT_KEEP_PROB=<absl.flags._flag.Flag object at 0x000001ECDDD48F60>
EMBEDDING_DIM=<absl.flags._flag.Flag object at 0x000001ECDDD48D30>
F=<absl.flags._flag.Flag object at 0x000001ECDDD48BA8>
FILTER_SIZES=<absl.flags._flag.Flag object at 0x000001ECDDD48CF8>
L2_REG_LAMBDA=<absl.flags._flag.Flag object at 0x000001ECDDD48EB8>
LOG_DEVICE_PLACEMENT=<absl.flags._flag.BooleanFlag object at 0x000001ECDDD4B2B0>
NUM_EPOCHS=<absl.flags._flag.Flag object at 0x000001ECDDD4B080>
NUM_FILTERS=<absl.flags._flag.Flag object at 0x000001ECDDD48DA0>
PARA_DATA=<absl.flags._flag.Flag object at 0x000001ECDDD48B70>
TRAIN_DATA=<absl.flags._flag.Flag object at 0x000001ECDDD48B38>
VALID_DATA=<absl.flags._flag.Flag object at 0x000001ECD57F0B00>
WORD2VEC=<absl.flags._flag.Flag object at 0x000001ECD57F0B70>

Loading data...
num users: 1429
num items: 900
9
198

In [16]:
random_id = 1
real_id = item_id[random_id]
print("The attention weights of item {} are" .format(real_id))
print(i_attention[random_id].T)

The attention weights of item 413.0 are
[[0.09183503 0.07736229 0.04312806 0.09026127 0.03081937 0.08218352
  0.14646076 0.02864431 0.04825962 0.04910137 0.01644995 0.12264312
  0.04437253 0.12691875 0.00078002 0.00078002]]


In [17]:
max_id = np.argmax(i_attention[random_id])
min_id = np.argmin(i_attention[random_id][0:len(item_review[real_id])])
print("The review No. {} achieves highest attention score." .format(max_id))
print("The review No. {} achieves lowest attention score." .format(min_id))

The review No. 6 achieves highest attention score.
The review No. 10 achieves lowest attention score.


In [18]:
print("The content of the review with highest attention is ")
print(item_review[real_id][max_id])

print("\n")
print("The content of the review lowest attention is ")
print(item_review[real_id][min_id])

The content of the review with highest attention is 
['if', 'you', 'are', 'fan', 'of', 'the', 'jazz', 'iii', 'you', 'will', 'definitely', 'like', 'the', 'ultex', 'version', 'from', 'my', 'experience', 'the', 'ultex', 'has', 'the', 'sound', 'and', 'feel', 'of', 'tortex', 'but', 'without', 'the', 'memory', 'aka', 'wear', 'the', 'ultex', 'is', 'thicker', 'than', 'the', 'standard', 'nylong', 'jazz', 'iii', 's', 'and', 'only', 'when', 'directly', 'compared', 'will', 'you', 'notice', 'that', 'the', 'ultex', 'version', 'has', 'a', 'slightly', 'larger', 'gripping', 'surface', 'five', 'stars', 'for', 'being', 'a', 'better', 'jazz', 'iii', 'however', 'the', 'prices', 'for', 'these', 'on', 'amazon', 'are', 'absurd', 'i', 'got', 'mine', 'through', 'musiciansfriend', 'for', 'around', 'shipped']


The content of the review lowest attention is 
['this', 'is', 'a', 'nice', 'pick', 'thicker', 'and', 'easy', 'to', 'hold', 'on', 'to', 'i', 'like', 'the', 'size', 'smaller', 'than', 'your', 'regular', 'pic

In [23]:
random_id = 3
real_id = item_id[random_id]
print("The attention weights of item {} are" .format(real_id))
print(i_attention[random_id].T)

The attention weights of item 309.0 are
[[0.08092023 0.06187459 0.23819509 0.12926787 0.19424197 0.12346549
  0.14697674 0.00278422 0.00278422 0.00278422 0.00278422 0.00278422
  0.00278422 0.00278422 0.00278422 0.00278422]]


In [24]:
max_id = np.argmax(i_attention[random_id])
min_id = np.argmin(i_attention[random_id][0:len(item_review[real_id])])
print("The review No. {} achieves highest attention score." .format(max_id))
print("The review No. {} achieves lowest attention score." .format(min_id))

The review No. 2 achieves highest attention score.
The review No. 1 achieves lowest attention score.


In [25]:
print("The content of the review with highest attention is ")
print(item_review[real_id][max_id])

print("\n")
print("The content of the review lowest attention is ")
print(item_review[real_id][min_id])

The content of the review with highest attention is 
['i', 'don', 't', 'know', 'who', 'would', 'need', 'better', 'unless', 'you', 'traveled', 'a', 'lot', 'where', 'you', 'might', 'need', 'the', 'hard', 'case', 'this', 'one', 'surpassed', 'my', 'expectations', 'months', 'into', 'using', 'the', 'gig', 'bag', 'and', 'not', 'a', 'single', 'problem', 'the', 'pockets', 'are', 'packed', 'with', 'books', 'tools', 'and', 'strings', 'the', 'shoulder', 'straps', 'make', 'carrying', 'the', 'bag', 'across', 'town', 'a', 'breeze']


The content of the review lowest attention is 
['this', 'is', 'a', 'fine', 'bag', 'for', 'a', 'guitar', 'i', 'got', 'it', 'for', 'my', 'brother', 'i', 'm', 'not', 'a', 'music', 'person', 'but', 'its', 'good']


In [41]:
random_id = 2
real_id = item_id[random_id]
print("The attention weights of item {} are" .format(real_id))
print(i_attention[random_id].T)

The attention weights of item 321.0 are
[[0.10858686 0.03728418 0.18509732 0.05699949 0.07346902 0.05040829
  0.00362795 0.04157391 0.0597245  0.17576022 0.05685295 0.04113961
  0.1063227  0.001051   0.001051   0.001051  ]]


In [42]:
max_id = np.argmax(i_attention[random_id])
min_id = np.argmin(i_attention[random_id][0:len(item_review[real_id])])
print("The review No. {} achieves highest attention score." .format(max_id))
print("The review No. {} achieves lowest attention score." .format(min_id))

The review No. 2 achieves highest attention score.
The review No. 6 achieves lowest attention score.


In [43]:
print("The content of the review with highest attention is ")
print(item_review[real_id][max_id])

print("\n")
print("The content of the review lowest attention is ")
print(item_review[real_id][min_id])

The content of the review with highest attention is 
['it', 's', 'a', 'great', 'versatile', 'foot', 'rest', 'that', 'any', 'guitar', 'player', 'can', 'benefit', 'from', 'very', 'good', 'value', 'at', 'only', 'not', 'counting', 'shipping', 'very', 'satisfied']


The content of the review lowest attention is 
['']
