In [1]:
import argparse
import logging
import numpy as np
from time import time
import import_ipynb
import utils as U
import codecs

# Root logging configuration for the notebook: timestamp, level, message.
LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s'

logging.basicConfig(
    # filename='out.log',   # uncomment to log to a file instead of the stream
    format=LOG_FORMAT,
    level=logging.INFO)

# Module-level logger used by the training cells below.
logger = logging.getLogger(__name__)

importing Jupyter notebook from utils.ipynb


In [2]:
# Command-line style configuration for the training run.  The options are
# declared in the same order as before so the generated --help is unchanged.
parser = argparse.ArgumentParser()

# Output location (the only required option).
parser.add_argument(
    "-o", "--out-dir",
    dest="out_dir_path",
    type=str,
    metavar='<str>',
    required=True,
    help="The path to the output directory",
)

# Model / data sizes.
parser.add_argument(
    "-e", "--embdim",
    dest="emb_dim",
    type=int,
    metavar='<int>',
    default=200,
    help="Embeddings dimension (default=200)",
)
parser.add_argument(
    "-b", "--batch-size",
    dest="batch_size",
    type=int,
    metavar='<int>',
    default=8,
    help="Batch size (default=8)",
)
parser.add_argument(
    "-v", "--vocab-size",
    dest="vocab_size",
    type=int,
    metavar='<int>',
    default=9000,
    help="Vocab size. '0' means no limit (default=9000)",
)
parser.add_argument(
    "-as", "--aspect-size",
    dest="aspect_size",
    type=int,
    metavar='<int>',
    default=14,
    help="The number of aspects specified by users (default=14)",
)

# Pre-trained embeddings (optional; defaults to None when omitted).
parser.add_argument(
    "--emb",
    dest="emb_path",
    type=str,
    metavar='<str>',
    help="The path to the word embeddings file",
)

# Training schedule and sampling.
parser.add_argument(
    "--epochs",
    dest="epochs",
    type=int,
    metavar='<int>',
    default=10,
    help="Number of epochs (default=10)",
)
parser.add_argument(
    "-n", "--neg-size",
    dest="neg_size",
    type=int,
    metavar='<int>',
    default=4,
    help="Number of negative instances (default=4)",
)
parser.add_argument(
    "--maxlen",
    dest="maxlen",
    type=int,
    metavar='<int>',
    default=0,
    help="Maximum allowed number of words during training. '0' means no limit (default=0)",
)
parser.add_argument(
    "--seed",
    dest="seed",
    type=int,
    metavar='<int>',
    default=1234,
    help="Random seed (default=1234)",
)

# Optimisation.
parser.add_argument(
    "-a", "--algorithm",
    dest="algorithm",
    type=str,
    metavar='<str>',
    default='adam',
    help="Optimization algorithm (rmsprop|sgd|adagrad|adadelta|adam|adamax) (default=adam)",
)
parser.add_argument(
    "--ortho-reg",
    dest="ortho_reg",
    type=float,
    metavar='<float>',
    default=0.1,
    help="The weight of orthogonol regularizaiton (default=0.1)",
)

_StoreAction(option_strings=['--ortho-reg'], dest='ortho_reg', nargs=None, const=None, default=0.1, type=<class 'float'>, choices=None, help='The weight of orthogonol regularizaiton (default=0.1)', metavar='<float>')

In [3]:
# Notebook stand-in for a real command line: the option string is split on
# whitespace and handed to the parser exactly as sys.argv[1:] would be.
cli_line = "--emb w2v_embedding --aspect-size 18 -o output_dir --epochs 50 --batch-size 512 --neg-size 1 --algorithm adam --vocab-size 0"
args = parser.parse_args(cli_line.split())

# Create the output directory, then log the effective configuration.
out_dir = args.out_dir_path
U.mkdir_p(out_dir)
U.print_args(args)

2020-10-27 16:20:21,257 INFO Arguments:
2020-10-27 16:20:21,259 INFO   algorithm: adam
2020-10-27 16:20:21,259 INFO   aspect_size: 18
2020-10-27 16:20:21,260 INFO   batch_size: 512
2020-10-27 16:20:21,261 INFO   command: /mnt/disks/user/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py -f /root/.local/share/jupyter/runtime/kernel-099f09d8-d224-4058-9041-37eaae76d129.json
2020-10-27 16:20:21,262 INFO   emb_dim: 200
2020-10-27 16:20:21,263 INFO   emb_path: w2v_embedding
2020-10-27 16:20:21,263 INFO   epochs: 50
2020-10-27 16:20:21,264 INFO   maxlen: 0
2020-10-27 16:20:21,265 INFO   neg_size: 1
2020-10-27 16:20:21,266 INFO   ortho_reg: 0.1
2020-10-27 16:20:21,267 INFO   out_dir_path: output_dir
2020-10-27 16:20:21,268 INFO   seed: 1234
2020-10-27 16:20:21,268 INFO   vocab_size: 0


In [4]:
# Seed NumPy's global RNG from --seed so that the batch shuffling and the
# negative sampling in this notebook (both driven by np.random) are repeatable
# across kernel restarts.  Previously --seed was parsed but never applied.
# NOTE(review): this does not seed the Keras/TensorFlow backend — confirm
# whether model initialisation needs its own seed for full reproducibility.
np.random.seed(args.seed)

In [5]:
from keras.preprocessing import sequence
import reader as dataset

Using TensorFlow backend.


importing Jupyter notebook from reader.ipynb


In [6]:
# Read the corpus; vocabulary size and per-sentence length cap come from args.
vocab, train_x, test_x, overall_maxlen = dataset.fetch_data(
    maxlen=args.maxlen,
    vocab_size=args.vocab_size,
)

# Bring every sequence in both splits to the common length overall_maxlen.
train_x, test_x = (
    sequence.pad_sequences(split, maxlen=overall_maxlen)
    for split in (train_x, test_x)
)

 Reader :: Reading data from training set
 Reader ::  Creating vocab ...
 Reader ::    909857 total words, 29160 unique words
 Reader ::  Reading dataset ...
 Reader ::   train set
 Reader ::    <num> hit rate: 0.00%, <unk> hit rate: 0.00%
 Reader ::   test set
 Reader ::    <num> hit rate: 0.00%, <unk> hit rate: 0.00%
175
90


In [7]:
# Quick sanity report on the loaded data.
for label, collection in (('Number of training examples: ', train_x),
                          ('Length of vocab: ', vocab)):
    print(label, len(collection))

Number of training examples:  76111
Length of vocab:  29163


In [8]:
def sent_batch_generator(data, batch_size):
    """Yield consecutive ``batch_size``-sized slices of ``data`` forever.

    ``data`` is shuffled IN PLACE with ``np.random.shuffle`` before the first
    batch and again each time a full pass has been served, so the caller's
    object is reordered as a side effect.  A pass consists of
    ``len(data) // batch_size`` batches; any remainder at the tail is not
    served during that pass.

    :param data: sliceable, shufflable sequence (e.g. a NumPy array or list).
    :param batch_size: number of items per yielded slice.
    :return: an endless generator of slices ``data[k*batch_size:(k+1)*batch_size]``.
    """
    batches_per_pass = len(data) // batch_size
    np.random.shuffle(data)
    cursor = 0

    while True:
        # Pass exhausted: reshuffle and start again from the front.
        if cursor >= batches_per_pass:
            np.random.shuffle(data)
            cursor = 0

        start = cursor * batch_size
        cursor += 1
        yield data[start:start + batch_size]

In [9]:
def neg_batch_generator(data, batch_size, neg_size):
    """Yield endless batches of randomly drawn rows to serve as negatives.

    Each batch draws ``batch_size * neg_size`` row indices uniformly at random
    (``np.random.choice`` default, i.e. with replacement) and arranges the
    selected rows as ``(batch_size, neg_size, data.shape[1])``.

    :param data: 2-D NumPy array; rows are the candidate samples.
    :param batch_size: number of positive examples each batch pairs with.
    :param neg_size: number of negatives per positive example.
    :return: an endless generator of 3-D arrays.
    """
    n_rows, row_width = data.shape[0], data.shape[1]
    draws_per_batch = batch_size * neg_size
    out_shape = (batch_size, neg_size, row_width)

    while True:
        picked = np.random.choice(n_rows, draws_per_batch)
        yield data[picked].reshape(out_shape)

In [10]:
from optimizers import get_optimizer

importing Jupyter notebook from optimizers.ipynb


In [11]:
# Build the optimizer from the parsed arguments (get_optimizer comes from
# optimizers.ipynb; presumably it dispatches on args.algorithm — verify there).
optimizer = get_optimizer(args)

In [12]:
from model import create_model
import keras.backend as K

# Mark the start of model construction in the log.
logger.info('  Building model')

2020-10-27 16:20:37,732 INFO   Building model


importing Jupyter notebook from model.ipynb
importing Jupyter notebook from custom_layers.ipynb
importing Jupyter notebook from w2v_emb_reader.ipynb


In [13]:
def max_margin_loss(y_true, y_pred):
    """Keras loss that is simply the mean of the model output.

    ``y_true`` is accepted only to satisfy the Keras loss signature and is
    never read; the value being minimised is the mean of ``y_pred``.

    :param y_true: ignored.
    :param y_pred: tensor produced by the model.
    :return: scalar tensor, ``K.mean(y_pred)``.
    """
    batch_mean = K.mean(y_pred)
    return batch_mean

In [14]:
# Instantiate the network (create_model comes from model.ipynb); it receives
# the parsed arguments, the padded sequence length and the vocabulary.
model = create_model(args, overall_maxlen, vocab)

2020-10-27 16:20:37,914 INFO Loading embeddings from: w2v_embedding
2020-10-27 16:20:37,916 INFO loading Word2Vec object from w2v_embedding
2020-10-27 16:20:37,996 INFO loading wv recursively from w2v_embedding.wv.* with mmap=None
2020-10-27 16:20:37,997 INFO setting ignored attribute vectors_norm to None
2020-10-27 16:20:37,998 INFO loading vocabulary recursively from w2v_embedding.vocabulary.* with mmap=None
2020-10-27 16:20:37,999 INFO loading trainables recursively from w2v_embedding.trainables.* with mmap=None
2020-10-27 16:20:38,000 INFO setting ignored attribute cum_table to None
2020-10-27 16:20:38,001 INFO loaded w2v_embedding
2020-10-27 16:20:38,322 INFO   #vectors: 5475, #dimensions: 200
2020-10-27 16:20:38,393 INFO Initializing word embedding matrix
2020-10-27 16:20:38,523 INFO 5475/29163 word vectors initialized (hit rate: 18.77%)
2020-10-27 16:20:38,557 INFO Initializing aspect embedding matrix as centroid of kmean clusters


In [15]:
# Freeze the word-embedding layer before compiling so it is not updated
# during training.
word_emb_layer = model.get_layer('word_emb')
word_emb_layer.trainable = False

# The same function serves as both the training loss and the reported metric.
model.compile(
    loss=max_margin_loss,
    metrics=[max_margin_loss],
    optimizer=optimizer,
)

<b>  Training </b>

In [16]:
from tqdm import tqdm

In [17]:
# Reverse lookup: vocabulary index -> word (used when dumping aspect words).
vocab_inv = {index: word for word, index in vocab.items()}

# Endless batch sources over the training set: sentence batches, and randomly
# sampled negatives.
sen_gen = sent_batch_generator(train_x, args.batch_size)
neg_gen = neg_batch_generator(train_x, args.batch_size, args.neg_size)

# One epoch = as many full batches as fit into the training set.
batches_per_epoch = len(train_x) // args.batch_size
# batches_per_epoch = 32   # smaller value for quick experiments

In [18]:
# Show how many training steps make up one epoch.
print("Batches per epoch", batches_per_epoch)

Batches per epoch 148


In [19]:
# Best (lowest) epoch loss seen so far; starts at +inf so the first epoch
# always counts as an improvement.
min_loss = float('inf')

In [20]:
# Training loop.
#
# Runs args.epochs epochs of batches_per_epoch steps each.  After every epoch
# the averaged losses are logged and appended to the history lists; whenever
# the epoch loss improves on min_loss, the weights are checkpointed and the
# 30 words most similar to each aspect embedding are written to
# aspect_adamlr1.log (with similarity scores) and aspect_adamlr1.txt.
batch_var = 0                 # running epoch counter (1-based once appended)
batch_no = []                 # epoch numbers, parallel to the loss histories
Total_loss = []               # per-epoch mean total loss
Total_max_margin_loss = []    # per-epoch mean of the max_margin_loss metric
final_aspect_list = []

# The targets passed to train_on_batch are constant, so build them once.
dummy_targets = np.ones((args.batch_size, 1))

for ii in range(args.epochs):
    t0 = time()
    # The metric accumulator is deliberately NOT named `max_margin_loss`:
    # that name is the loss function given to model.compile, and rebinding it
    # to a float would break any later use of the function.
    loss, epoch_max_margin_loss = 0., 0.

    for b in tqdm(range(batches_per_epoch)):
        sen_input = next(sen_gen)
        neg_input = next(neg_gen)

        try:
            batch_loss, batch_max_margin_loss = model.train_on_batch([sen_input, neg_input], dummy_targets)
        except Exception as e:
            # Dump the offending batch, then re-raise so the traceback is
            # shown and the kernel (with all its state) stays alive.
            print(e)
            print(sen_input.shape, sen_input)
            print(neg_input.shape, neg_input)

            print()
            raise

        loss += batch_loss / batches_per_epoch
        epoch_max_margin_loss += batch_max_margin_loss / batches_per_epoch

    tr_time = time() - t0

    if loss < min_loss:
        min_loss = loss

        # L2-normalise both embedding matrices so the dot product below is a
        # cosine similarity.
        word_emb = K.get_value(model.get_layer('word_emb').embeddings)
        aspect_emb = K.get_value(model.get_layer('aspect_emb').W)
        word_emb = word_emb / np.linalg.norm(word_emb, axis=-1, keepdims=True)
        aspect_emb = aspect_emb / np.linalg.norm(aspect_emb, axis=-1, keepdims=True)

        model.save_weights(out_dir + '/model_param_adamlr1')

        # Context managers guarantee both files are flushed and closed even
        # if writing fails part-way through.
        with codecs.open(out_dir + '/aspect_adamlr1.log', 'w', 'utf-8') as aspect_file, \
                codecs.open(out_dir + '/aspect_adamlr1.txt', 'w', 'utf-8') as aspect_txt:
            for ind in range(len(aspect_emb)):
                aspect_txt.write('[')
                desc = aspect_emb[ind]
                sims = word_emb.dot(desc.T)
                # Word indices ordered from most to least similar.
                ordered_words = np.argsort(sims)[::-1]
                desc_list = [vocab_inv[w] + ":" + str(sims[w]) for w in ordered_words[:30]]
                asp_list = ["'" + vocab_inv[w] + "'," for w in ordered_words[:30]]
                aspect_txt.write(''.join(asp_list))
                aspect_file.write('Aspect %d:\n' % ind)
                aspect_file.write(' '.join(desc_list) + '\n\n')
                aspect_txt.write('],')

    logger.info('Epoch %d, train: %is' % (ii, tr_time))
    # The value logged as ortho_reg is the total loss minus the max-margin metric.
    logger.info(
        'Total loss: %.4f, max_margin_loss: %.4f, ortho_reg: %.4f' % (
            loss, epoch_max_margin_loss, loss - epoch_max_margin_loss))
    batch_var = batch_var + 1
    batch_no.append(batch_var)
    Total_loss.append(loss)
    Total_max_margin_loss.append(epoch_max_margin_loss)

100%|██████████| 148/148 [00:34<00:00,  4.29it/s]
2020-10-27 16:21:20,103 INFO Epoch 0, train: 34s
2020-10-27 16:21:20,104 INFO Total loss: 0.8283, max_margin_loss: 0.7407, ortho_reg: 0.0876
100%|██████████| 148/148 [00:33<00:00,  4.45it/s]
2020-10-27 16:21:53,615 INFO Epoch 1, train: 33s
2020-10-27 16:21:53,616 INFO Total loss: 0.5703, max_margin_loss: 0.5684, ortho_reg: 0.0019
100%|██████████| 148/148 [00:32<00:00,  4.55it/s]
2020-10-27 16:22:26,393 INFO Epoch 2, train: 32s
2020-10-27 16:22:26,394 INFO Total loss: 0.5031, max_margin_loss: 0.5014, ortho_reg: 0.0017
100%|██████████| 148/148 [00:32<00:00,  4.51it/s]
2020-10-27 16:22:59,504 INFO Epoch 3, train: 32s
2020-10-27 16:22:59,505 INFO Total loss: 0.4869, max_margin_loss: 0.4853, ortho_reg: 0.0016
100%|██████████| 148/148 [00:32<00:00,  4.62it/s]
2020-10-27 16:23:31,792 INFO Epoch 4, train: 32s
2020-10-27 16:23:31,794 INFO Total loss: 0.4791, max_margin_loss: 0.4775, ortho_reg: 0.0016
100%|██████████| 148/148 [00:32<00:00,  4.60i

100%|██████████| 148/148 [00:32<00:00,  4.60it/s]
2020-10-27 16:45:04,176 INFO Epoch 43, train: 32s
2020-10-27 16:45:04,177 INFO Total loss: 0.4583, max_margin_loss: 0.4569, ortho_reg: 0.0014
100%|██████████| 148/148 [00:32<00:00,  4.59it/s]
2020-10-27 16:45:36,429 INFO Epoch 44, train: 32s
2020-10-27 16:45:36,430 INFO Total loss: 0.4582, max_margin_loss: 0.4568, ortho_reg: 0.0014
100%|██████████| 148/148 [00:32<00:00,  4.49it/s]
2020-10-27 16:46:09,616 INFO Epoch 45, train: 32s
2020-10-27 16:46:09,617 INFO Total loss: 0.4579, max_margin_loss: 0.4564, ortho_reg: 0.0014
100%|██████████| 148/148 [00:33<00:00,  4.47it/s]
2020-10-27 16:46:42,995 INFO Epoch 46, train: 33s
2020-10-27 16:46:42,997 INFO Total loss: 0.4579, max_margin_loss: 0.4564, ortho_reg: 0.0014
100%|██████████| 148/148 [00:33<00:00,  4.48it/s]
2020-10-27 16:47:16,262 INFO Epoch 47, train: 33s
2020-10-27 16:47:16,263 INFO Total loss: 0.4576, max_margin_loss: 0.4562, ortho_reg: 0.0014
100%|██████████| 148/148 [00:32<00:00,  

In [21]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
sentence_input (InputLayer)     (None, 175)          0                                            
__________________________________________________________________________________________________
word_emb (Embedding)            multiple             5832600     sentence_input[0][0]             
                                                                 neg_input[0][0]                  
__________________________________________________________________________________________________
average__layer_1 (Average_Layer (None, 200)          0           word_emb[0][0]                   
__________________________________________________________________________________________________
att_weights (Attention_Layer)   (None, 175)          40001       word_emb[0][0]             

In [1]:
# print("epochs = ")
# print(batch_no)
# print ("\n")
# print("Total Loss = ")
# print(Total_loss)
# print ("\n")
# print("Max Margin Loss = " )
# print(Total_max_margin_loss)

In [23]:
# print(Total_max_margin_loss)

In [24]:
# Save the full model next to the other artefacts, in the directory chosen
# via --out-dir, rather than a hard-coded "output_dir" path.
model.save(out_dir + '/train_model_adamlr1.h5')