In [1]:
# !wget https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu
# !wget https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu
# !wget https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-test.conllu
# !wget https://storage.googleapis.com/xlnet/released_models/cased_L-12_H-768_A-12.zip -O xlnet.zip
# !unzip xlnet.zip

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
tag2idx = {'PAD': 0, 'X': 1}
tag_idx = 2

In [4]:
import sentencepiece as spm
from prepro_utils import preprocess_text, encode_ids

sp_model = spm.SentencePieceProcessor()
sp_model.Load('xlnet_cased_L-12_H-768_A-12/spiece.model')

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

In [5]:
SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

In [6]:
def process_corpus(corpus, until = None):
    global word2idx, tag2idx, char2idx, word_idx, tag_idx, char_idx
    sentences, words, depends, labels, pos, sequences = [], [], [], [], [], []
    temp_sentence, temp_word, temp_depend, temp_label, temp_pos = [], [], [], [], []
    segments, masks = [], []
    first_time = True
    for sentence in corpus:
        try:
            if len(sentence):
                if sentence[0] == '#':
                    continue
                if first_time:
                    print(sentence)
                    first_time = False
                sentence = sentence.split('\t')
                if sentence[7] not in tag2idx:
                    tag2idx[sentence[7]] = tag_idx
                    tag_idx += 1
                temp_word.append(sentence[1])
                temp_depend.append(int(sentence[6]) + 1)
                temp_label.append(tag2idx[sentence[7]])
                temp_sentence.append(sentence[1])
                temp_pos.append(sentence[3])
            else:
                if len(temp_sentence) < 2 or len(temp_word) != len(temp_label):
                    temp_word = []
                    temp_depend = []
                    temp_label = []
                    temp_sentence = []
                    temp_pos = []
                    continue
                bert_tokens = []
                labels_ = []
                depends_ = []
                seq_ = []
                for no, orig_token in enumerate(temp_word):
                    t = tokenize_fn(orig_token)
                    labels_.append(temp_label[no])
                    depends_.append(temp_depend[no])
                    bert_tokens.extend(t)
                    labels_.extend([1] * (len(t) - 1))
                    depends_.extend([0] * (len(t) - 1))
                    seq_.append(no + 1)
                bert_tokens.extend([4, 3])
                labels_.extend([0, 0])
                depends_.extend([0, 0])
                segment = [0] * (len(bert_tokens) - 1) + [SEG_ID_CLS]
                input_mask = [0] * len(segment)
                words.append(bert_tokens)
                depends.append(depends_)
                labels.append(labels_)
                sentences.append(temp_sentence)
                pos.append(temp_pos)
                sequences.append(seq_)
                segments.append(segment)
                masks.append(input_mask)
                temp_word = []
                temp_depend = []
                temp_label = []
                temp_sentence = []
                temp_pos = []
        except Exception as e:
            print(e, sentence)
    return sentences[:-1], words[:-1], depends[:-1], labels[:-1], pos[:-1], sequences[:-1], segments[:-1], masks[:-1]

In [7]:
with open('en_ewt-ud-dev.conllu') as fopen:
    dev = fopen.read().split('\n')

sentences_dev, words_dev, depends_dev, labels_dev, _, seq_dev, segments_dev, masks_dev = process_corpus(dev)

1	From	from	ADP	IN	_	3	case	3:case	_
invalid literal for int() with base 10: '_' ['10.1', 'has', 'have', 'VERB', 'VBZ', '_', '_', '_', '8:parataxis', 'CopyOf=-1']
invalid literal for int() with base 10: '_' ['21.1', 'has', 'have', 'VERB', 'VBZ', '_', '_', '_', '16:conj:and', 'CopyOf=-1']


In [8]:
with open('en_ewt-ud-test.conllu') as fopen:
    test = fopen.read().split('\n')

sentences_test, words_test, depends_test, labels_test, _, seq_test, segments_test, masks_test = process_corpus(test)
sentences_test.extend(sentences_dev)
words_test.extend(words_dev)
depends_test.extend(depends_dev)
labels_test.extend(labels_dev)
seq_test.extend(seq_dev)
segments_test.extend(segments_dev)
masks_test.extend(masks_dev)

1	What	what	PRON	WP	PronType=Int	0	root	0:root	_
invalid literal for int() with base 10: '_' ['24.1', 'left', 'left', 'VERB', 'VBN', 'Tense=Past|VerbForm=Part', '_', '_', '6:parataxis', 'CopyOf=6']


In [9]:
with open('en_ewt-ud-train.conllu') as fopen:
    train = fopen.read().split('\n')

sentences_train, words_train, depends_train, labels_train, _, _, segments_train, masks_train = process_corpus(train)

1	Al	Al	PROPN	NNP	Number=Sing	0	root	0:root	SpaceAfter=No
invalid literal for int() with base 10: '_' ['8.1', 'reported', 'report', 'VERB', 'VBN', 'Tense=Past|VerbForm=Part|Voice=Pass', '_', '_', '5:conj:and', 'CopyOf=5']
invalid literal for int() with base 10: '_' ['22.1', 'used', 'use', 'VERB', 'VBN', 'Tense=Past|VerbForm=Part', '_', '_', '13:advcl:with|17:conj:and', 'CopyOf=17']
invalid literal for int() with base 10: '_' ['22.1', 'used', 'use', 'VERB', 'VBN', 'Tense=Past|VerbForm=Part', '_', '_', '13:advcl:with|17:conj:and', 'CopyOf=17']
invalid literal for int() with base 10: '_' ['11.1', 'called', 'call', 'VERB', 'VBN', 'Tense=Past|VerbForm=Part|Voice=Pass', '_', '_', '3:conj:and', 'CopyOf=3']
invalid literal for int() with base 10: '_' ['14.1', 'is', 'be', 'VERB', 'VBZ', '_', '_', '_', '1:conj:and', 'CopyOf=1']
invalid literal for int() with base 10: '_' ['20.1', 'reflect', 'reflect', 'VERB', 'VBP', 'Mood=Ind|Tense=Pres|VerbForm=Fin', '_', '_', '7:acl:relcl|9:conj', 'CopyOf=9']


In [10]:
len(sentences_train), len(sentences_test)

(12000, 3824)

In [11]:
idx2tag = {v:k for k, v in tag2idx.items()}

In [12]:
train_X = words_train
train_Y = labels_train
train_depends = depends_train

test_X = words_test
test_Y = labels_test
test_depends = depends_test

In [13]:
import xlnet
import model_utils
import tensorflow as tf
import numpy as np

kwargs = dict(
      is_training=True,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.1,
      dropatt=0.1,
      init='normal',
      init_range=0.1,
      init_std=0.05,
      clamp_len=-1)

xlnet_parameters = xlnet.RunConfig(**kwargs)
xlnet_config = xlnet.XLNetConfig(json_path='xlnet_cased_L-12_H-768_A-12/xlnet_config.json')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])






  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [14]:
epoch = 15
batch_size = 32
warmup_proportion = 0.1
num_train_steps = int(len(train_X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
print(num_train_steps, num_warmup_steps)

training_parameters = dict(
      decay_method = 'poly',
      train_steps = num_train_steps,
      learning_rate = 2e-5,
      warmup_steps = num_warmup_steps,
      min_lr_ratio = 0.0,
      weight_decay = 0.00,
      adam_epsilon = 1e-8,
      num_core_per_host = 1,
      lr_layer_decay_rate = 1,
      use_tpu=False,
      use_bfloat16=False,
      dropout=0.0,
      dropatt=0.0,
      init='normal',
      init_range=0.1,
      init_std=0.02,
      clip = 1.0,
      clamp_len=-1,)

5625 562


In [15]:
class Parameter:
    def __init__(self, decay_method, warmup_steps, weight_decay, adam_epsilon, 
                num_core_per_host, lr_layer_decay_rate, use_tpu, learning_rate, train_steps,
                min_lr_ratio, clip, **kwargs):
        self.decay_method = decay_method
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.adam_epsilon = adam_epsilon
        self.num_core_per_host = num_core_per_host
        self.lr_layer_decay_rate = lr_layer_decay_rate
        self.use_tpu = use_tpu
        self.learning_rate = learning_rate
        self.train_steps = train_steps
        self.min_lr_ratio = min_lr_ratio
        self.clip = clip
        
training_parameters = Parameter(**training_parameters)

In [16]:
class BiAAttention:
    def __init__(self, input_size_encoder, input_size_decoder, num_labels):
        self.input_size_encoder = input_size_encoder
        self.input_size_decoder = input_size_decoder
        self.num_labels = num_labels
        
        self.W_d = tf.get_variable("W_d", shape=[self.num_labels, self.input_size_decoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_e = tf.get_variable("W_e", shape=[self.num_labels, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        self.U = tf.get_variable("U", shape=[self.num_labels, self.input_size_decoder, self.input_size_encoder],
           initializer=tf.contrib.layers.xavier_initializer())
        
    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
        batch = tf.shape(input_d)[0]
        length_decoder = tf.shape(input_d)[1]
        length_encoder = tf.shape(input_e)[1]
        out_d = tf.expand_dims(tf.matmul(self.W_d, tf.transpose(input_d, [0, 2, 1])), 3)
        out_e = tf.expand_dims(tf.matmul(self.W_e, tf.transpose(input_e, [0, 2, 1])), 2)
        output = tf.matmul(tf.expand_dims(input_d, 1), self.U)
        output = tf.matmul(output, tf.transpose(tf.expand_dims(input_e, 1), [0, 1, 3, 2]))
        
        output = output + out_d + out_e
        
        if mask_d is not None:
            d = tf.expand_dims(tf.expand_dims(mask_d, 1), 3)
            e = tf.expand_dims(tf.expand_dims(mask_e, 1), 2)
            output = output * d * e
            
        return output
    
class BiLinear:
    def __init__(self, left_features, right_features, out_features):
        self.left_features = left_features
        self.right_features = right_features
        self.out_features = out_features
        
        self.U = tf.get_variable("U-bi", shape=[out_features, left_features, right_features],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_l = tf.get_variable("Wl", shape=[out_features, left_features],
           initializer=tf.contrib.layers.xavier_initializer())
        self.W_r = tf.get_variable("Wr", shape=[out_features, right_features],
           initializer=tf.contrib.layers.xavier_initializer())
    
    def forward(self, input_left, input_right):
        left_size = tf.shape(input_left)
        output_shape = tf.concat([left_size[:-1], [self.out_features]], axis = 0)
        batch = tf.cast(tf.reduce_prod(left_size[:-1]), tf.int32)
        input_left = tf.reshape(input_left, (batch, self.left_features))
        input_right = tf.reshape(input_right, (batch, self.right_features))
        tiled = tf.tile(tf.expand_dims(input_left, axis = 0), (self.out_features,1,1))
        output = tf.transpose(tf.reduce_sum(tf.matmul(tiled, self.U), axis = 2))
        output = output + tf.matmul(input_left, tf.transpose(self.W_l))\
        + tf.matmul(input_right, tf.transpose(self.W_r))
        
        return tf.reshape(output, output_shape)

class Attention:
    def __init__(self, word_dim, num_words, char_dim, num_chars, num_filters, kernel_size,
                 hidden_size, encoder_layers, num_labels, arc_space, type_space):
        
        def cells(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size,
                                           initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.word_embedd = tf.Variable(tf.random_uniform([num_words, word_dim], -1, 1))
        self.char_embedd = tf.Variable(tf.random_uniform([num_chars, char_dim], -1, 1))
        self.conv1d = tf.layers.Conv1D(num_filters, kernel_size, 1, padding='VALID')
        self.num_labels = num_labels
        self.encoder = tf.nn.rnn_cell.MultiRNNCell([cells(hidden_size) for _ in range(encoder_layers)])

        
        
    def encode(self, input_word, input_char):
        word = tf.nn.embedding_lookup(self.word_embedd, input_word)
        char = tf.nn.embedding_lookup(self.char_embedd, input_char)
        b = tf.shape(char)[0]
        wl = tf.shape(char)[1]
        cl = tf.shape(char)[2]
        d = char.shape[3]
        char = tf.reshape(char, [b * wl, cl, d])
        char = tf.reduce_max(self.conv1d(char), axis = 1)
        char = tf.nn.tanh(char)
        d = char.shape[-1]
        char = tf.reshape(char, [b, wl, d])
        
        src_encoding = tf.concat([word, char], axis=2)
        output, hn = tf.nn.dynamic_rnn(self.encoder, src_encoding, dtype = tf.float32,
                                      scope = 'encoder')
        arc_h = tf.nn.elu(self.arc_h(output))
        arc_c = tf.nn.elu(self.arc_c(output))
        
        type_h = tf.nn.elu(self.type_h(output))
        type_c = tf.nn.elu(self.type_c(output))
        
        return (arc_h, arc_c), (type_h, type_c), hn
    
    def forward(self, input_word, input_char, mask):
        arcs, types, _ = self.encode(input_word, input_char)
        
        out_arc = tf.squeeze(self.attention.forward(arcs[0], arcs[1], mask_d=mask, mask_e=mask), axis = 1)
        return out_arc, types, mask
    
    def loss(self, input_word, input_char, mask, heads, types):
        out_arc, out_type, _ = self.forward(input_word, input_char, mask)
        type_h, type_c = out_type
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        batch_index = tf.range(0, batch)
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        minus_inf = -1e8
        minus_mask = (1 - mask) * minus_inf
        out_arc = out_arc + tf.expand_dims(minus_mask, axis = 2) + tf.expand_dims(minus_mask, axis = 1)
        loss_arc = tf.nn.log_softmax(out_arc, dim=1)
        loss_type = tf.nn.log_softmax(out_type, dim=2)
        loss_arc = loss_arc * tf.expand_dims(mask, axis = 2) * tf.expand_dims(mask, axis = 1)
        loss_type = loss_type * tf.expand_dims(mask, axis = 2)
        num = tf.reduce_sum(mask) - tf.cast(batch, tf.float32)
        child_index = tf.tile(tf.expand_dims(tf.range(0, max_len), 1), [1, batch])
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(t, axis = 0),
                                               tf.expand_dims(child_index, axis = 0)], axis = 0))
        loss_arc = tf.gather_nd(loss_arc, concatenated)
        loss_arc = tf.transpose(loss_arc, [1, 0])
        
        t = tf.transpose(types)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(child_index, axis = 0),
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        loss_type = tf.gather_nd(loss_type, concatenated)
        loss_type = tf.transpose(loss_type, [1, 0])
        return tf.reduce_sum(-loss_arc) / num, tf.reduce_sum(-loss_type) / num
    
    def decode(self, input_word, input_char, mask, leading_symbolic=0):
        out_arc, out_type, _ = self.forward(input_word, input_char, mask)
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        sec_max_len = tf.shape(out_arc)[2]
        out_arc = out_arc + tf.linalg.diag(tf.fill([max_len], -np.inf))
        minus_mask = tf.expand_dims(tf.cast(1 - mask, tf.bool), axis = 2)
        minus_mask = tf.tile(minus_mask, [1, 1, sec_max_len])
        out_arc = tf.where(minus_mask, tf.fill(tf.shape(out_arc), -np.inf), out_arc)
        heads = tf.argmax(out_arc, axis = 1)
        type_h, type_c = out_type
        batch = tf.shape(type_h)[0]
        max_len = tf.shape(type_h)[1]
        batch_index = tf.range(0, batch)
        t = tf.cast(tf.transpose(heads), tf.int32)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        out_type = out_type[:, :, leading_symbolic:]
        types = tf.argmax(out_type, axis = 2)
        return heads, types
    
class Model:
    def __init__(
        self,
        learning_rate,
        hidden_size_word,
        cov = 0.0):
        
        self.words = tf.placeholder(tf.int32, (None, None))
        self.segment_ids = tf.placeholder(tf.int32, [None, None])
        self.input_masks = tf.placeholder(tf.float32, [None, None])
        self.heads = tf.placeholder(tf.int32, (None, None))
        self.types = tf.placeholder(tf.int32, (None, None))
        self.mask = tf.cast(tf.math.not_equal(self.words, 0), tf.float32)
        self.maxlen = tf.shape(self.words)[1]
        self.lengths = tf.count_nonzero(self.words, 1)
        mask = self.mask
        heads = self.heads
        types = self.types
        
        self.arc_h = tf.layers.Dense(hidden_size_word)
        self.arc_c = tf.layers.Dense(hidden_size_word)
        self.attention = BiAAttention(hidden_size_word, hidden_size_word, 1)

        self.type_h = tf.layers.Dense(hidden_size_word)
        self.type_c = tf.layers.Dense(hidden_size_word)
        self.bilinear = BiLinear(hidden_size_word, hidden_size_word, len(tag2idx))
        
        xlnet_model = xlnet.XLNetModel(
            xlnet_config=xlnet_config,
            run_config=xlnet_parameters,
            input_ids=tf.transpose(self.words, [1, 0]),
            seg_ids=tf.transpose(self.segment_ids, [1, 0]),
            input_mask=tf.transpose(self.input_masks, [1, 0]))
        output_layer = xlnet_model.get_sequence_output()
        output_layer = tf.transpose(output_layer, [1, 0, 2])
        
        arc_h = tf.nn.elu(self.arc_h(output_layer))
        arc_c = tf.nn.elu(self.arc_c(output_layer))
        
        type_h = tf.nn.elu(self.type_h(output_layer))
        type_c = tf.nn.elu(self.type_c(output_layer))
        
        out_arc = tf.squeeze(self.attention.forward(arc_h, arc_h, mask_d=self.mask, 
                                                    mask_e=self.mask), axis = 1)
        
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        sec_max_len = tf.shape(out_arc)[2]
        batch_index = tf.range(0, batch)
        
        decode_arc = out_arc + tf.linalg.diag(tf.fill([max_len], -np.inf))
        minus_mask = tf.expand_dims(tf.cast(1 - mask, tf.bool), axis = 2)
        minus_mask = tf.tile(minus_mask, [1, 1, sec_max_len])
        decode_arc = tf.where(minus_mask, tf.fill(tf.shape(decode_arc), -np.inf), decode_arc)
        self.heads_seq = tf.argmax(decode_arc, axis = 1)
        
        t = tf.cast(tf.transpose(self.heads_seq), tf.int32)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        self.tags_seq = tf.argmax(out_type, axis = 2)
        
        batch = tf.shape(out_arc)[0]
        max_len = tf.shape(out_arc)[1]
        batch_index = tf.range(0, batch)
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0), 
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        type_h = tf.gather_nd(type_h, concatenated)
        out_type = self.bilinear.forward(type_h, type_c)
        minus_inf = -1e8
        minus_mask = (1 - mask) * minus_inf
        out_arc = out_arc + tf.expand_dims(minus_mask, axis = 2) + tf.expand_dims(minus_mask, axis = 1)
        loss_arc = tf.nn.log_softmax(out_arc, dim=1)
        loss_type = tf.nn.log_softmax(out_type, dim=2)
        loss_arc = loss_arc * tf.expand_dims(mask, axis = 2) * tf.expand_dims(mask, axis = 1)
        loss_type = loss_type * tf.expand_dims(mask, axis = 2)
        num = tf.reduce_sum(mask) - tf.cast(batch, tf.float32)
        child_index = tf.tile(tf.expand_dims(tf.range(0, max_len), 1), [1, batch])
        t = tf.transpose(heads)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(t, axis = 0),
                                               tf.expand_dims(child_index, axis = 0)], axis = 0))
        loss_arc = tf.gather_nd(loss_arc, concatenated)
        loss_arc = tf.transpose(loss_arc, [1, 0])
        
        t = tf.transpose(types)
        broadcasted = tf.broadcast_to(batch_index, tf.shape(t))
        concatenated = tf.transpose(tf.concat([tf.expand_dims(broadcasted, axis = 0),
                                               tf.expand_dims(child_index, axis = 0),
                                               tf.expand_dims(t, axis = 0)], axis = 0))
        loss_type = tf.gather_nd(loss_type, concatenated)
        loss_type = tf.transpose(loss_type, [1, 0])
        self.cost = (tf.reduce_sum(-loss_arc) / num) + (tf.reduce_sum(-loss_type) / num)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(self.types, mask)
        correct_pred = tf.equal(tf.cast(self.prediction, tf.int32), mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        self.prediction = tf.cast(tf.boolean_mask(self.heads_seq, mask), tf.int32)
        mask_label = tf.boolean_mask(self.heads, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy_depends = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

learning_rate = 2e-5
hidden_size_word = 128

model = Model(learning_rate, hidden_size_word)
sess.run(tf.global_variables_initializer())

Instructions for updating:
reduction_indices is deprecated, use axis instead
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>

Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
dim is deprecated, use axis instead


In [18]:
import collections
import re

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Compute the union of the current variables and checkpoint variables."""
    assignment_map = {}
    initialized_variable_names = {}

    name_to_variable = collections.OrderedDict()
    for var in tvars:
        name = var.name
        m = re.match('^(.*):\\d+$', name)
        if m is not None:
            name = m.group(1)
        name_to_variable[name] = var

    init_vars = tf.train.list_variables(init_checkpoint)

    assignment_map = collections.OrderedDict()
    for x in init_vars:
        (name, var) = (x[0], x[1])
        if name not in name_to_variable:
            continue
        assignment_map[name] = name_to_variable[name]
        initialized_variable_names[name] = 1
        initialized_variable_names[name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [19]:
tvars = tf.trainable_variables()
checkpoint = 'xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt'
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [20]:
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from xlnet_cased_L-12_H-768_A-12/xlnet_model.ckpt


In [21]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

batch_x = train_X[:5]
batch_x = pad_sequences(batch_x,padding='post')
batch_y = train_Y[:5]
batch_y = pad_sequences(batch_y,padding='post')
batch_depends = train_depends[:5]
batch_depends = pad_sequences(batch_depends,padding='post')
batch_segments = segments_train[:5]
batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
batch_masks = masks_train[:5]
batch_masks = pad_sequences(batch_masks, padding='post', value = 1)

In [22]:
sess.run([model.accuracy, model.accuracy_depends, model.cost],
        feed_dict = {model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks})

[0.0, 0.0397351, 242.8986]

In [23]:
tags_seq, heads = sess.run(
    [model.tags_seq, model.heads_seq],
    feed_dict = {
        model.words: batch_x,
        model.segment_ids: batch_segments,
        model.input_masks: batch_masks
    },
)
tags_seq[0], heads[0], batch_depends[0]

(array([30, 30, 30, 26, 30, 30, 26, 30, 30, 30, 30, 26, 30, 30, 30, 30, 30,
        43, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 26, 30, 30, 30, 30, 26,
        30, 20, 30, 30, 26, 43, 43, 30, 30, 30, 30, 30]),
 array([16, 16, 16, 16, 17, 16, 16, 22, 17, 16, 16, 16, 16, 37, 16, 16, 32,
        16, 16, 16, 16, 11,  9, 13, 16, 22, 40, 17, 16, 16, 16, 16, 16, 16,
        16, 23, 22, 16, 16, 16, 16,  0,  0,  0,  0,  0]),
 array([ 1,  2,  0,  2,  0,  2,  0,  7,  8,  2,  8,  0,  9,  9,  9,  0,  9,
         0,  9,  0, 16,  9, 19, 19,  8, 22, 22, 19, 24, 22,  0, 22,  0, 29,
        29, 29, 22,  2,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32))

In [24]:
from tqdm import tqdm

for e in range(epoch):
    train_acc, train_loss = [], []
    test_acc, test_loss = [], []
    train_acc_depends, test_acc_depends = [], []
    
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(train_X))
        batch_x = train_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = train_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = train_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        batch_segments = segments_train[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = masks_train[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc_depends, acc, cost, _ = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks
            },
        )
        train_loss.append(cost)
        train_acc.append(acc)
        train_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
        
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        index = min(i + batch_size, len(test_X))
        batch_x = test_X[i: index]
        batch_x = pad_sequences(batch_x,padding='post')
        batch_y = test_Y[i: index]
        batch_y = pad_sequences(batch_y,padding='post')
        batch_depends = test_depends[i: index]
        batch_depends = pad_sequences(batch_depends,padding='post')
        batch_segments = segments_test[i: index]
        batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
        batch_masks = masks_test[i: index]
        batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
        
        acc_depends, acc, cost = sess.run(
            [model.accuracy_depends, model.accuracy, model.cost],
            feed_dict = {
                model.words: batch_x,
                model.types: batch_y,
                model.heads: batch_depends,
                model.segment_ids: batch_segments,
                model.input_masks: batch_masks
            },
        )
        test_loss.append(cost)
        test_acc.append(acc)
        test_acc_depends.append(acc_depends)
        pbar.set_postfix(cost = cost, accuracy = acc, accuracy_depends = acc_depends)
    
    
    print(
    'epoch: %d, training loss: %f, training acc: %f, training depends: %f, valid loss: %f, valid acc: %f, valid depends: %f\n'
    % (e, np.mean(train_loss), 
       np.mean(train_acc), 
       np.mean(train_acc_depends), 
       np.mean(test_loss), 
       np.mean(test_acc), 
       np.mean(test_acc_depends)
    ))

train minibatch loop: 100%|██████████| 375/375 [01:38<00:00,  3.81it/s, accuracy=0.142, accuracy_depends=0.0446, cost=7.04]
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.27it/s, accuracy=0.132, accuracy_depends=0.0337, cost=6.79]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 0, training loss: 9.437640, training acc: 0.159448, training depends: 0.047825, valid loss: 6.539594, valid acc: 0.213945, valid depends: 0.069883



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.205, accuracy_depends=0.146, cost=6.44] 
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.21it/s, accuracy=0.206, accuracy_depends=0.175, cost=5.97]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 1, training loss: 6.450710, training acc: 0.215757, training depends: 0.092876, valid loss: 5.756104, valid acc: 0.287649, valid depends: 0.188275



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.34, accuracy_depends=0.249, cost=5.41] 
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.26it/s, accuracy=0.35, accuracy_depends=0.27, cost=5.15]  
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 2, training loss: 5.495392, training acc: 0.314352, training depends: 0.242720, valid loss: 4.609732, valid acc: 0.421680, valid depends: 0.348417



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.501, accuracy_depends=0.27, cost=4.6]  
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.25it/s, accuracy=0.509, accuracy_depends=0.328, cost=4.11]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 3, training loss: 4.508115, training acc: 0.465066, training depends: 0.328635, valid loss: 3.790256, valid acc: 0.575229, valid depends: 0.390596



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.665, accuracy_depends=0.302, cost=3.73]
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.24it/s, accuracy=0.69, accuracy_depends=0.313, cost=3.47] 
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 4, training loss: 3.713588, training acc: 0.623716, training depends: 0.354390, valid loss: 3.090042, valid acc: 0.714599, valid depends: 0.415153



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.762, accuracy_depends=0.309, cost=3.24]
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.27it/s, accuracy=0.798, accuracy_depends=0.331, cost=2.92]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 5, training loss: 3.111908, training acc: 0.735516, training depends: 0.375809, valid loss: 2.655969, valid acc: 0.785030, valid depends: 0.438616



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.83, accuracy_depends=0.348, cost=2.86] 
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.28it/s, accuracy=0.847, accuracy_depends=0.368, cost=2.51]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 6, training loss: 2.734684, training acc: 0.794773, training depends: 0.398003, valid loss: 2.349284, valid acc: 0.825458, valid depends: 0.465114



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.841, accuracy_depends=0.386, cost=2.55]
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.32it/s, accuracy=0.883, accuracy_depends=0.433, cost=2.06]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 7, training loss: 2.434094, training acc: 0.833364, training depends: 0.429542, valid loss: 2.104939, valid acc: 0.852945, valid depends: 0.503956



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.872, accuracy_depends=0.455, cost=2.22]
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.28it/s, accuracy=0.89, accuracy_depends=0.497, cost=1.84] 
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 8, training loss: 2.165653, training acc: 0.859650, training depends: 0.474487, valid loss: 1.857052, valid acc: 0.872633, valid depends: 0.551248



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.883, accuracy_depends=0.477, cost=2.02]
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.28it/s, accuracy=0.923, accuracy_depends=0.506, cost=1.6] 
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 9, training loss: 1.916253, training acc: 0.878971, training depends: 0.522010, valid loss: 1.647684, valid acc: 0.887113, valid depends: 0.594111



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.05it/s, accuracy=0.897, accuracy_depends=0.53, cost=1.83] 
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.29it/s, accuracy=0.942, accuracy_depends=0.61, cost=1.26]  
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 10, training loss: 1.702642, training acc: 0.895447, training depends: 0.565976, valid loss: 1.482933, valid acc: 0.899495, valid depends: 0.632607



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.04it/s, accuracy=0.922, accuracy_depends=0.58, cost=1.6]   
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.28it/s, accuracy=0.936, accuracy_depends=0.663, cost=1.09] 
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 11, training loss: 1.512414, training acc: 0.908186, training depends: 0.606668, valid loss: 1.335665, valid acc: 0.909348, valid depends: 0.665817



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.04it/s, accuracy=0.917, accuracy_depends=0.657, cost=1.3]  
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.26it/s, accuracy=0.936, accuracy_depends=0.736, cost=0.889]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 12, training loss: 1.348060, training acc: 0.917312, training depends: 0.644409, valid loss: 1.201391, valid acc: 0.916411, valid depends: 0.699593



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.04it/s, accuracy=0.925, accuracy_depends=0.679, cost=1.23] 
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.24it/s, accuracy=0.942, accuracy_depends=0.742, cost=0.861]
train minibatch loop:   0%|          | 0/375 [00:00<?, ?it/s]

epoch: 13, training loss: 1.205230, training acc: 0.925562, training depends: 0.678194, valid loss: 1.100796, valid acc: 0.923252, valid depends: 0.722325



train minibatch loop: 100%|██████████| 375/375 [01:32<00:00,  4.04it/s, accuracy=0.932, accuracy_depends=0.718, cost=1.09] 
test minibatch loop: 100%|██████████| 120/120 [00:09<00:00, 12.29it/s, accuracy=0.96, accuracy_depends=0.761, cost=0.696] 

epoch: 14, training loss: 1.077971, training acc: 0.932818, training depends: 0.706353, valid loss: 1.007649, valid acc: 0.927428, valid depends: 0.746163






In [25]:
tags_seq, heads = sess.run(
    [model.tags_seq, model.heads_seq],
    feed_dict = {
        model.words: batch_x,
        model.segment_ids: batch_segments,
        model.input_masks: batch_masks
    },
)
tags_seq[0], heads[0] - 1, batch_depends[0] - 1

(array([22,  6, 22, 26, 23, 18, 16,  1,  1,  1,  5,  3, 13, 10, 11,  6, 12,
        15, 10, 16,  7,  1,  0,  0,  7,  7,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1]),
 array([ 8,  8,  5,  5,  2,  8,  8, -1, -1, -1,  0, 11, 10,  8, 14, 13,  8,
        17, 14, 14,  8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1]),
 array([ 2,  8,  5,  5,  2,  8,  8, -1, -1, -1,  0, 11, 11,  8, 14, 14,  8,
        16, 14, 14,  8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1], dtype=int32))

In [26]:
def evaluate(heads_pred, types_pred, heads, types, lengths,
             symbolic_root=False, symbolic_end=False):
    batch_size, _ = heads_pred.shape
    ucorr = 0.
    lcorr = 0.
    total = 0.
    ucomplete_match = 0.
    lcomplete_match = 0.

    corr_root = 0.
    total_root = 0.
    start = 1 if symbolic_root else 0
    end = 1 if symbolic_end else 0
    for i in range(batch_size):
        ucm = 1.
        lcm = 1.
        for j in range(start, lengths[i] - end):

            total += 1
            if heads[i, j] == heads_pred[i, j]:
                ucorr += 1
                if types[i, j] == types_pred[i, j]:
                    lcorr += 1
                else:
                    lcm = 0
            else:
                ucm = 0
                lcm = 0

            if heads[i, j] == 0:
                total_root += 1
                corr_root += 1 if heads_pred[i, j] == 0 else 0

        ucomplete_match += ucm
        lcomplete_match += lcm
    
    return ucorr / total, lcorr / total, corr_root / total_root

In [27]:
arcs, types, roots = [], [], []

for i in range(0, len(test_X), batch_size):
    index = min(i + batch_size, len(test_X))
    batch_x = test_X[i: index]
    batch_x = pad_sequences(batch_x,padding='post')
    batch_y = test_Y[i: index]
    batch_y = pad_sequences(batch_y,padding='post')
    batch_depends = test_depends[i: index]
    batch_depends = pad_sequences(batch_depends,padding='post')
    batch_segments = segments_test[i: index]
    batch_segments = pad_sequences(batch_segments, padding='post', value = 4)
    batch_masks = masks_test[i: index]
    batch_masks = pad_sequences(batch_masks, padding='post', value = 1)
    
    tags_seq, heads = sess.run(
        [model.tags_seq, model.heads_seq],
        feed_dict = {
            model.words: batch_x,
            model.segment_ids: batch_segments,
            model.input_masks: batch_masks
        },
    )
    
    arc_accuracy, type_accuracy, root_accuracy = evaluate(heads - 1, tags_seq, batch_depends - 1, batch_y, 
            np.count_nonzero(batch_x, axis = 1))
    arcs.append(arc_accuracy)
    types.append(type_accuracy)
    roots.append(root_accuracy)


In [28]:
print('arc accuracy:', np.mean(arcs))
print('types accuracy:', np.mean(types))
print('root accuracy:', np.mean(roots))

arc accuracy: 0.7441355787352213
types accuracy: 0.7137185640516154
root accuracy: 0.7317708333333334
