In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import xlnet
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import model_utils




In [3]:
from prepro_utils import preprocess_text, encode_ids

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [4]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet-base/sp10m.cased.v9.model')

True

In [5]:
import json
import glob

left, right, label = [], [], []
for file in glob.glob('../Malaya-Dataset/text-similarity/quora/*.json'):
    with open(file) as fopen:
        x = json.load(fopen)
    for i in x:
        splitted = i[0].split(' <> ')
        if len(splitted) != 2:
            continue
        left.append(splitted[0])
        right.append(splitted[1])
        label.append(i[1])

In [8]:
l = {'contradiction': 0, 'entailment': 1}

snli = glob.glob('../malaya-dataset/Malaya-Dataset/text-similarity/snli/*.json')
for file in snli:
    with open(file) as fopen:
        x = json.load(fopen)
    for i in x:
        splitted = i[1].split(' <> ')
        if len(splitted) != 2:
            continue
        if not l.get(i[0]):
            continue
        left.append(splitted[0])
        right.append(splitted[1])
        try:
            label.append(l[i[0]])
        except:
            print(splitted, i[0])

In [9]:
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]
MAX_SEQ_LENGTH = 100

In [10]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
              tokens_a.pop()
        else:
              tokens_b.pop()
                
def get_inputs(left, right):

    input_ids, input_mask, all_seg_ids = [], [], []
    
    for i in tqdm(range(len(left))):
        tokens = tokenize_fn(left[i])
        tokens_right = tokenize_fn(right[i])
        _truncate_seq_pair(tokens, tokens_right, MAX_SEQ_LENGTH - 3)
        
        segment_ids = [SEG_ID_A] * len(tokens)
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_A)

        tokens.extend(tokens_right)
        segment_ids.extend([SEG_ID_B] * len(tokens_right))
        tokens.append(SEP_ID)
        segment_ids.append(SEG_ID_B)

        tokens.append(CLS_ID)
        segment_ids.append(SEG_ID_CLS)
        
        cur_input_ids = tokens
        cur_input_mask = [0] * len(cur_input_ids)
        input_ids.append(tokens)
        input_mask.append(cur_input_mask)
        all_seg_ids.append(segment_ids)
        
    return input_ids, input_mask, all_seg_ids

In [11]:
input_ids, input_masks, segment_ids = get_inputs(left, right)

100%|██████████| 593596/593596 [01:34<00:00, 6279.85it/s]


In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

input_ids = pad_sequences(input_ids,padding='post')
input_masks = pad_sequences(input_masks,padding='post', value = 1)
segment_ids = pad_sequences(segment_ids,padding='post', value = 4)

In [13]:
input_ids.shape, input_masks.shape, segment_ids.shape, len(label)

((593596, 100), (593596, 100), (593596, 100), 593596)

In [14]:
kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')




In [15]:
epoch = 20
batch_size = 60
warmup_proportion = 0.1
num_train_steps = int(len(input_ids) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)
learning_rate = 2e-5

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = learning_rate,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clip = 1.0,
      clamp_len=-1,)

197865 19786


In [16]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [17]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.X, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        
        summary = xlnet_model.get_pooled_out("last", True)
        print(summary)
        
        self.logits = tf.layers.dense(summary, dimension_output)
        self.logits = tf.identity(self.logits, name = 'logits')
        
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        
        self.optimizer, self.learning_rate, _ = model_utils.get_train_op(training_parameters, self.cost)
        
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [18]:
dimension_output = 2

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `layer.__call__` method instead.
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.Dense instead.
Tensor("model_1/sequnece_summary/dropout/dropout/mul_1:0", shape=(?, 768), dtype=float32)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



In [19]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [20]:
tvars = tf.trainable_variables()
checkpoint = 'xlnet-base/model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [21]:
from sklearn.model_selection import train_test_split

train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(
    input_ids, input_masks, segment_ids, label, test_size = 0.2)

In [22]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 1, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = [], [], [], []
    pbar = tqdm(
        range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_input_ids))
        batch_x = train_input_ids[i: index]
        batch_masks = train_input_masks[i: index]
        batch_segment = train_segment_ids[i: index]
        batch_y = train_Y[i: index]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        assert not np.isnan(cost)
        train_loss.append(cost)
        train_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        index = min(i + batch_size, len(test_input_ids))
        batch_x = test_input_ids[i: index]
        batch_masks = test_input_masks[i: index]
        batch_segment = test_segment_ids[i: index]
        batch_y = test_Y[i: index]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss = np.mean(train_loss)
    train_acc = np.mean(train_acc)
    test_loss = np.mean(test_loss)
    test_acc = np.mean(test_acc)

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
        
    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 7915/7915 [1:19:15<00:00,  1.66it/s, accuracy=0.75, cost=0.416] 
test minibatch loop: 100%|██████████| 1979/1979 [07:05<00:00,  4.65it/s, accuracy=0.65, cost=0.607] 
train minibatch loop:   0%|          | 0/7915 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.788917
time taken: 5181.4275097846985
epoch: 0, training loss: 0.465134, training acc: 0.753531, valid loss: 0.396933, valid acc: 0.788917



train minibatch loop: 100%|██████████| 7915/7915 [1:19:17<00:00,  1.66it/s, accuracy=0.833, cost=0.415]
test minibatch loop: 100%|██████████| 1979/1979 [07:04<00:00,  4.66it/s, accuracy=0.65, cost=0.508] 
train minibatch loop:   0%|          | 0/7915 [00:00<?, ?it/s]

epoch: 1, pass acc: 0.788917, current acc: 0.803798
time taken: 5182.63897895813
epoch: 1, training loss: 0.369613, training acc: 0.816349, valid loss: 0.369464, valid acc: 0.803798



train minibatch loop: 100%|██████████| 7915/7915 [1:19:08<00:00,  1.67it/s, accuracy=0.833, cost=0.349]
test minibatch loop: 100%|██████████| 1979/1979 [07:03<00:00,  4.68it/s, accuracy=0.725, cost=0.438]
train minibatch loop:   0%|          | 0/7915 [00:00<?, ?it/s]

epoch: 2, pass acc: 0.803798, current acc: 0.834997
time taken: 5171.688594818115
epoch: 2, training loss: 0.336626, training acc: 0.838042, valid loss: 0.340342, valid acc: 0.834997



train minibatch loop: 100%|██████████| 7915/7915 [1:19:06<00:00,  1.67it/s, accuracy=0.861, cost=0.401]
test minibatch loop: 100%|██████████| 1979/1979 [07:03<00:00,  4.68it/s, accuracy=0.825, cost=0.322]
train minibatch loop:   0%|          | 0/7915 [00:00<?, ?it/s]

epoch: 3, pass acc: 0.834997, current acc: 0.849196
time taken: 5169.649661540985
epoch: 3, training loss: 0.305056, training acc: 0.856933, valid loss: 0.320067, valid acc: 0.849196



train minibatch loop: 100%|██████████| 7915/7915 [1:19:10<00:00,  1.67it/s, accuracy=0.861, cost=0.325] 
test minibatch loop: 100%|██████████| 1979/1979 [07:06<00:00,  4.64it/s, accuracy=0.8, cost=0.359]  
train minibatch loop:   0%|          | 0/7915 [00:00<?, ?it/s]

epoch: 4, pass acc: 0.849196, current acc: 0.854792
time taken: 5177.5598912239075
epoch: 4, training loss: 0.278160, training acc: 0.873619, valid loss: 0.314210, valid acc: 0.854792



train minibatch loop: 100%|██████████| 7915/7915 [1:19:10<00:00,  1.67it/s, accuracy=0.944, cost=0.147] 
test minibatch loop: 100%|██████████| 1979/1979 [07:07<00:00,  4.63it/s, accuracy=0.825, cost=0.317] 

time taken: 5177.579535007477
epoch: 5, training loss: 0.254044, training acc: 0.886641, valid loss: 0.322444, valid acc: 0.854704

break epoch:6






In [23]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'xlnet-base-similarity/model.ckpt')

'xlnet-base-similarity/model.ckpt'

In [24]:
kwargs = dict(
      is_training=False,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet-base/config.json')

In [25]:
dimension_output = 2
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())

INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>




Tensor("model_1/sequnece_summary/dropout/Identity:0", shape=(?, 768), dtype=float32)


In [26]:
saver = tf.train.Saver(tf.trainable_variables())
saver.restore(sess, 'xlnet-base-similarity/model.ckpt')

INFO:tensorflow:Restoring parameters from xlnet-base-similarity/model.ckpt


In [27]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_input_ids), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    index = min(i + batch_size, len(test_input_ids))
    batch_x = test_input_ids[i: index]
    batch_masks = test_input_masks[i: index]
    batch_segment = test_segment_ids[i: index]
    batch_y = test_Y[i: index]
    predict_Y += np.argmax(sess.run(model.logits,
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x,
                model.segment_ids: batch_segment,
                model.input_masks: batch_masks
            },
    ), 1, ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 1979/1979 [06:04<00:00,  5.42it/s]


In [28]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, predict_Y, target_names = ['not similar', 'similar'],
        digits = 5
    )
)

              precision    recall  f1-score   support

 not similar    0.77288   0.94355   0.84973     51004
     similar    0.94900   0.79116   0.86292     67716

    accuracy                        0.85663    118720
   macro avg    0.86094   0.86736   0.85633    118720
weighted avg    0.87334   0.85663   0.85725    118720



In [29]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name
        or 'self/Softmax' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'global_step' not in n.name
    ]
)
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Placeholder_2',
 'Placeholder_3',
 'model/transformer/r_w_bias',
 'model/transformer/r_r_bias',
 'model/transformer/word_embedding/lookup_table',
 'model/transformer/r_s_bias',
 'model/transformer/seg_embed',
 'model/transformer/layer_0/rel_attn/q/kernel',
 'model/transformer/layer_0/rel_attn/k/kernel',
 'model/transformer/layer_0/rel_attn/v/kernel',
 'model/transformer/layer_0/rel_attn/r/kernel',
 'model/transformer/layer_0/rel_attn/o/kernel',
 'model/transformer/layer_0/rel_attn/LayerNorm/gamma',
 'model/transformer/layer_0/ff/layer_1/kernel',
 'model/transformer/layer_0/ff/layer_1/bias',
 'model/transformer/layer_0/ff/layer_2/kernel',
 'model/transformer/layer_0/ff/layer_2/bias',
 'model/transformer/layer_0/ff/LayerNorm/gamma',
 'model/transformer/layer_1/rel_attn/q/kernel',
 'model/transformer/layer_1/rel_attn/k/kernel',
 'model/transformer/layer_1/rel_attn/v/kernel',
 'model/transformer/layer_1/rel_attn/r/kernel',
 'model/transformer/layer_1/rel

In [30]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [31]:
freeze_graph('xlnet-base-similarity', strings)

INFO:tensorflow:Restoring parameters from xlnet-base-similarity/model.ckpt
Instructions for updating:
Use `tf.compat.v1.graph_util.convert_variables_to_constants`
Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`
INFO:tensorflow:Froze 165 variables.
INFO:tensorflow:Converted 165 variables to const ops.
7669 ops in the final graph.


In [32]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

g = load_graph('xlnet-base-similarity/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
seg = g.get_tensor_by_name('import/Placeholder_1:0')
m = g.get_tensor_by_name('import/Placeholder_2:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)



In [33]:
test_sess.run(logits, feed_dict = {x: batch_x,
                             seg: batch_segment,
                             m: batch_masks})

array([[ 1.5434387 , -2.7328088 ],
       [-4.7541265 ,  5.163885  ],
       [-5.470607  ,  4.3164625 ],
       [-4.553798  ,  5.1717024 ],
       [-1.9533242 ,  0.44452572],
       [-4.439697  ,  5.793971  ],
       [-0.2911216 , -0.0261218 ],
       [-4.7054543 ,  4.7711077 ],
       [ 0.28674954, -1.8945521 ],
       [-1.6190811 ,  0.5553683 ],
       [ 0.13827887, -3.5472271 ],
       [ 0.5268437 , -2.6319551 ],
       [-0.9359468 ,  1.0296911 ],
       [ 2.0339055 , -2.1630902 ],
       [ 0.9112381 ,  0.58463794],
       [-1.0450312 ,  0.10713197],
       [-5.4646316 ,  4.3009014 ],
       [ 1.4644347 , -2.5155988 ],
       [ 1.329641  , -2.7189822 ],
       [-0.62557185, -0.08425825],
       [ 2.237051  , -1.6611747 ],
       [-0.09839378, -0.66356647],
       [-1.615195  ,  1.8174384 ],
       [ 1.3267742 , -2.6401014 ],
       [ 1.2640498 ,  0.7178159 ],
       [ 1.6135963 , -2.1411867 ],
       [ 2.2238655 , -2.0346007 ],
       [-4.353797  ,  4.63577   ],
       [-5.0723886 ,

In [34]:
import boto3

bucketName = 'huseinhouse-storage'
Key = 'xlnet-base-similarity/frozen_model.pb'
outPutname = "v34/similarity/xlnet-base-similarity.pb"

s3 = boto3.client('s3')

s3.upload_file(Key,bucketName,outPutname)