In [89]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS
from autopager.parserutils import (TagParser, MyHTMLParser, draw_scaled_page, position_check, compare_tag, get_first_tag)
parser = MyHTMLParser()
tagParser = TagParser()

In [90]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [91]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus)!=0:
  # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
        print(e)
else:
    print("No GPUs visible")

1 Physical GPUs, 1 Logical GPU


In [4]:
storage = Storage()
urls = [rec['Page URL'] for rec in storage.iter_records(contain_button = True, file_type='T')]
groups = [get_domain(url) for url in urls]
train_groups_set = set(groups)

In [5]:
X_raw, y, page_positions = storage.get_Xy(contain_button = True, file_type='T')

Finish: Get Page 1 (Encoding: UTF-8)records ... (len: 303)
Finish: Get Page 2 (Encoding: UTF-8)records ... (len: 243)
Finish: Get Page 3 (Encoding: UTF-8)records ... (len: 119)
Finish: Get Page 4 (Encoding: UTF-8)records ... (len: 944)
Finish: Get Page 5 (Encoding: UTF-8)records ... (len: 93)
Finish: Get Page 6 (Encoding: UTF-8)records ... (len: 994)
Finish: Get Page 7 (Encoding: UTF-8)records ... (len: 1014)
Finish: Get Page 8 (Encoding: UTF-8)records ... (len: 7)
Finish: Get Page 9 (Encoding: UTF-8)records ... (len: 288)
Finish: Get Page 10 (Encoding: UTF-8)records ... (len: 678)
Finish: Get Page 11 (Encoding: UTF-8)records ... (len: 789)
Finish: Get Page 12 (Encoding: UTF-8)records ... (len: 814)
Finish: Get Page 13 (Encoding: UTF-8)records ... (len: 814)
Finish: Get Page 14 (Encoding: UTF-8)records ... (len: 171)
Finish: Get Page 15 (Encoding: UTF-8)records ... (len: 168)
Finish: Get Page 16 (Encoding: UTF-8)records ... (len: 91)
Finish: Get Page 17 (Encoding: UTF-8)records ... (le

Finish: Get Page 150 (Encoding: UTF-8)records ... (len: 368)
Finish: Get Page 151 (Encoding: UTF-8)records ... (len: 369)
Finish: Get Page 152 (Encoding: cp1252)records ... (len: 294)
Finish: Get Page 153 (Encoding: UTF-8)records ... (len: 271)
Finish: Get Page 154 (Encoding: UTF-8)records ... (len: 300)
Finish: Get Page 155 (Encoding: UTF-8)records ... (len: 314)
Finish: Get Page 156 (Encoding: UTF-8)records ... (len: 278)
Finish: Get Page 157 (Encoding: UTF-8)records ... (len: 288)
Finish: Get Page 158 (Encoding: UTF-8)records ... (len: 178)
Finish: Get Page 159 (Encoding: UTF-8)records ... (len: 108)
Finish: Get Page 160 (Encoding: UTF-8)records ... (len: 98)
Finish: Get Page 161 (Encoding: UTF-8)records ... (len: 101)
Finish: Get Page 162 (Encoding: UTF-8)records ... (len: 308)
Finish: Get Page 163 (Encoding: UTF-8)records ... (len: 298)
Finish: Get Page 164 (Encoding: UTF-8)records ... (len: 285)
Finish: Get Page 165 (Encoding: UTF-8)records ... (len: 221)
Finish: Get Page 166 (En

Finish: Get Page 292 (Encoding: UTF-8)records ... (len: 280)
Finish: Get Page 293 (Encoding: UTF-8)records ... (len: 74)
Finish: Get Page 294 (Encoding: UTF-8)records ... (len: 63)
Finish: Get Page 295 (Encoding: UTF-8)records ... (len: 65)
Finish: Get Page 296 (Encoding: UTF-8)records ... (len: 20)
Finish: Get Page 297 (Encoding: UTF-8)records ... (len: 367)
Finish: Get Page 298 (Encoding: UTF-8)records ... (len: 371)
Finish: Get Page 299 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 300 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 301 (Encoding: UTF-8)records ... (len: 364)
Finish: Get Page 302 (Encoding: UTF-8)records ... (len: 170)
Finish: Get Page 303 (Encoding: UTF-8)records ... (len: 154)
Finish: Get Page 304 (Encoding: cp1252)records ... (len: 117)
Finish: Get Page 305 (Encoding: UTF-8)records ... (len: 1987)
Finish: Get Page 306 (Encoding: UTF-8)records ... (len: 59)
Finish: Get Page 307 (Encoding: UTF-8)records ... (len: 60)
Finish: Get Page 308 (Encodi

In [6]:
max_page_seq = 512

## Slice data into chunks

In [7]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [8]:
def get_chunks_data(x, y, p):
    new_tmp_x_array = []
    new_tmp_y_array = []
    new_tmp_p_array = []
    for tmp_x, tmp_y, tmp_p in zip(x, y, p):
        new_tmp_x_array.extend(chunks(tmp_x, max_page_seq))
        new_tmp_y_array.extend(chunks(tmp_y, max_page_seq))
        new_tmp_p_array.extend(chunks(tmp_p, max_page_seq))
    return new_tmp_x_array, new_tmp_y_array, new_tmp_p_array

In [9]:
chunks_x, chunks_y, chunk_positions = get_chunks_data(X_raw, y, page_positions)

In [10]:
len(chunks_x)

356

## Load Pre-trained Bert model

In [11]:
from tensorflow.keras import Model
from tensorflow.data import Dataset

In [12]:
from BertModel import BertModel

1 Physical GPUs, 1 Logical GPU


In [13]:
bert_short_model = BertModel(128)

In [14]:
bert_long_model = BertModel(256)

In [14]:
pbert = bert_short_model

# Feature extraction

In [92]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    """
    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    return list(generator if limit is None else islice(generator, 0, limit))

def feat_to_tokens(feat, tokenizer):
    if type(feat) == type([]):
        feat = ' '.join(feat)
    tokens = tokenizer.tokenize(feat)
    return tokens

def link_to_features(link):
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    parent = link.xpath('..').extract()
    parent = get_first_tag(parser, parent[0])
    query_parsed = parse_qsl(p.query) #parse query string from path
    query_param_names = [k.lower() for k, v in query_parsed]
    query_param_names_ngrams = _as_list(ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    ))

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(_as_list(link.xpath(".//@class").extract(), 5))
    parent_classes = ' '.join(_as_list(link.xpath('../@class').extract(), 5))
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)
#     print(css_classes)
    token_feature = {
        'text-before': '',
        'text-exact': replace_digits(text.strip()[:40].strip()),
        'text-after': '',
        'class': css_classes,
        'query': _as_list(query_param_names, 10),
        'parent-tag': parent,
    }
    tag_feature = {
        'isdigit': 1 if text.isdigit() is True else 0,
        'isalpha': 1 if text.isalpha() is True else 0,
        'has-href': 0 if href is "" else 1,
        'path-has-page': 1 if 'page' in p.path.lower() else 0,
        'path-has-pageXX': 1 if re.search(r'[/-](?:p|page\w?)/?\d+', p.path.lower()) is not None else 0,
        'path-has-number': 1 if any(part.isdigit() for part in p.path.split('/')) else 0,
        'href-has-year': 1 if re.search('20\d\d', href) is not None else 0,
        'class-has-disabled': 1 if 'disabled' in css_classes else 0,
    }
    tag_feature = [v for k,v in tag_feature.items()]
#     attribute_feature = elem_rel + elem_target
    non_token_feature = tag_feature #+ attribute_feature
    return [token_feature, non_token_feature]


def page_to_features(xseq):
    feat_list = [link_to_features(a) for a in xseq]
    around = get_text_around_selector_list(xseq, max_length=15)
    
    # weight is less than 1 because there is a lot of duplicate information
    # in these ngrams and so we want to regularize them stronger
    # (as if they are a single feature, not many features)
    k = 0.2
    for feat, (before, after) in zip(feat_list, around):
        feat[0]['text-before'] = normalize(before)
        feat[0]['text-after'] = normalize(after)
        
    return feat_list

CPU times: user 6 µs, sys: 4 µs, total: 10 µs
Wall time: 20.3 µs


In [93]:
def get_token_tag_features_from_chunks(chunks):
    token_features = []
    tag_features = []
    for page in chunks:
        feat_list = page_to_features(page)
        token_features.append([node[0] for node in feat_list])
        tag_features.append([node[1] for node in feat_list])
    return token_features, tag_features

In [94]:
def extract_tokens_from_token_features(token_features):
    train_tag_feature_token_list = []
    for page in token_features:
        tmp_page_list = []
        for node in page: 
            tmp_list = []
            for k, v in node.items():
                if k == 'text-exact':
                    continue
                else:
                    tmp_list.extend(v)
            tmp_page_list.append(tmp_list)
        train_tag_feature_token_list.append(tmp_page_list)
    return train_tag_feature_token_list

In [97]:
def page_to_two_bert_embeddings(token_features, tokenizer):
    text_first_segs = []
    text_second_segs = []
    for page in token_features:
        page_one_features = []
        page_two_features = []
        for node in page:
            text_before = tokenizer.tokenize(node["text-before"])
            text_exact = tokenizer.tokenize(node["text-exact"])
            text_after = tokenizer.tokenize(node["text-after"])
            page_one_features.append(["[CLS]"]+text_before+["[SEP]"]+text_exact+["[SEP]"])
            page_two_features.append(["[CLS]"]+text_exact+["[SEP]"]+text_after+["[SEP]"])
        text_first_segs.append(page_one_features)
        text_second_segs.append(page_two_features)
    print("Start encode first seg embeddings")
    first_emb = pbert.page_list_to_bert_embedding_list(text_first_segs, Token=True)
    print("Start encode second seg embeddings")
    second_emb = pbert.page_list_to_bert_embedding_list(text_second_segs, Token=True)
    full_text_emb = [np.concatenate([first_emb[page], second_emb[page]], axis = 1) for page in range(len(token_features))]
    return first_emb, second_emb, full_text_emb

In [96]:
token_features, tag_features = get_token_tag_features_from_chunks(chunks_x)
# train_tag_feature_token_list = extract_tokens_from_token_features(token_features)

In [99]:
first_emb, second_emb, full_text_emb = page_to_two_bert_embeddings(token_features, pbert.get_tokenizer())

Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=356)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=356)

In [112]:
np.save('embedding/train/first.npy', first_emb)
np.save('embedding/train/second.npy', second_emb)
np.save('embedding/train/full_text.npy', full_text_emb)

  return array(a, dtype, copy=False, order=order, subok=True)


In [114]:
first_emb = np.load('embedding/train/first.npy', allow_pickle=True)
second_emb = np.load('embedding/train/second.npy', allow_pickle=True)
full_text_emb = np.load('embedding/train/full_text.npy', allow_pickle=True)

In [118]:
print(f"First emb:{first_emb[0].shape}")
print(f"Second emb:{second_emb[0].shape}")
print(f"Full_text emb:{full_text_emb[0].shape}")

First emb:(303, 768)
Second emb:(303, 768)
Full_text emb:(303, 1536)


In [137]:
max_node = -1
page_sum = 0
for page in train_token_features:
    sum = 0
    for node in page:
        sum+=len(node)
        if len(node) > max_node:
            max_node = len(node)
    page_sum+=sum/len(page)
print("Average: ", page_sum/len(train_token_features))
print("Max_node: ",max_node)

Average:  10.835698937066166
Max_node:  63


In [23]:
train_tag_info_list = tag_features #features which only have tag true/false information

In [602]:
# Extract text-exact feature in token_features
train_text_list = [[ data['text-exact'] for data in x] for x in token_features]
train_text_before_list = [[ data['text-before'] for data in x] for x in token_features]
train_text_after_list = [[ data['text-after'] for data in x] for x in token_features]

In [457]:
def concat_text(before, mid, after):
    res = ""
    if before != "":
        res+=before + ","
    if mid != "":
        res+=mid + ","
    if after == "":
        res = res[:-1]
    else:
        res+=after
    return res
# Extract text-exact feature in token_features
train_full_text_list = [[ concat_text(data['text-before'],data['text-exact'],data['text-after']) for data in x] for x in token_features]

In [24]:
pbert.max_seq_length

128

## Feature List
    * train_tag_feature_token_list => Tag Attributes tokens
    * train_tag_info_list => Tag information
    * train_text_emb => Only Text node => Bert Text embedding
    * train_tag_emb => Text-before Text Text-after [SEP] Other Attributes => Bert Text embedding

In [25]:
# chunks_text_emb = train_text_emb # text
# chunks_text_emb = train_tag_emb # tag
chunks_text_emb = full_text_emb # full text embedding (two bert)

chunks_tag_infos = train_tag_info_list
chunks_filtered_y = chunks_y

## Padding to fixed size

In [26]:
def feature_pad_to_npdata(embedding):
    dataset = Dataset.from_generator(lambda: iter(embedding), tf.float32)
    dataset = dataset.padded_batch(1, padded_shapes= (max_page_seq, len(embedding[0][0])), padding_values=-1.,drop_remainder=False)
    after_pad = np.array([ data[0] for data in list(dataset.as_numpy_iterator())])
    return after_pad

In [27]:
# train_tag_token = feature_pad_to_npdata(chunks_tag_tokens)
train_text_emb_x = feature_pad_to_npdata(chunks_text_emb)

In [28]:
train_tag_x = feature_pad_to_npdata(chunks_tag_infos)

In [29]:
train_positions_x = feature_pad_to_npdata(chunk_positions)

In [30]:
train_tag_x = np.concatenate([train_tag_x, train_positions_x], axis = 2)

In [31]:
train_info_x = np.concatenate([train_text_emb_x, train_tag_x], axis = 2)

In [32]:
labels = ["O", "PREV", "PAGE", "NEXT", "[PAD]"]
tag2idx = { label:idx for idx,label in enumerate(labels)}
idx2tag = { idx:label for idx,label in enumerate(labels)}
num_tags = len(labels)

In [33]:
train_y = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_filtered_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [34]:
print("Current Shape:")
print(f"train_text_emb_x: {train_text_emb_x.shape}")
print(f"train_tag_x: {train_tag_x.shape}")
print(f"train_info_x: {train_info_x.shape}")
print(f"train_y: {train_y.shape}")

Current Shape:
train_text_emb_x: (356, 512, 1536)
train_tag_x: (356, 512, 10)
train_info_x: (356, 512, 1546)
train_y: (356, 512)


In [35]:
# train_x = train_text_emb_x
train_x = train_info_x
# train_x = train_tag_x

In [36]:
train_x.shape

(356, 512, 1546)

## Build BERT-BiLSTM-CRF Model

In [56]:
from tensorflow_addons.layers.crf import CRF

In [57]:
TIME_STAMPS = max_page_seq
HIDDEN_UNITS = 200
DROPOUT_RATE = 0.1
# NUM_CLASS = 5
NUM_CLASS = num_tags
print(f"TIME_STAMP: {TIME_STAMPS}")
print(f"HIDDEN_UNITS: {HIDDEN_UNITS}")
print(f"DROPOUT_RATE: {DROPOUT_RATE}")
print(f"NUM_CLASS: {NUM_CLASS}")

TIME_STAMP: 512
HIDDEN_UNITS: 200
DROPOUT_RATE: 0.1
NUM_CLASS: 5


In [58]:
def get_BERT_BILSTM_CRF(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=HIDDEN_UNITS, return_sequences=True)))
    crf=CRF(numtags,name='crf_layer')
    model.add(crf)
    model.compile('adam',loss={'crf_layer': crf.get_loss})
    model.summary()
    return model

In [61]:
def get_BERT_FFN_CRF(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
    model.add(tf.keras.layers.Dense(units = 768, activation = 'relu'))
    model.add(tf.keras.layers.Dense(units = 324, activation = 'relu'))
    model.add(tf.keras.layers.Dense(units = 162, activation = 'relu'))
#     model.add(tf.keras.layers.Dense(units = 81, activation = 'relu'))
    crf=CRF(numtags,name='crf_layer')
    model.add(crf)
    model.compile('adam',loss={'crf_layer': crf.get_loss})
    model.summary()
    return model

In [62]:
def get_BERT_BILSTM_SOFTMAX(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
    model.add(tf.keras.layers.Masking(input_shape=SHAPE, mask_value=-1.))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=HIDDEN_UNITS, return_sequences=True)))
    model.add(tf.keras.layers.Dense(units = numtags, activation='softmax'))
    model.compile('adam',loss=tf.keras.losses.SparseCategoricalCrossentropy())
    model.summary()
    return model

In [71]:
FFN_CRF = get_BERT_FFN_CRF(train_x.shape[1:], num_tags)

SHAPE: (512, 1546)
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 512, 768)          1188096   
_________________________________________________________________
dense_5 (Dense)              (None, 512, 324)          249156    
_________________________________________________________________
dense_6 (Dense)              (None, 512, 162)          52650     
_________________________________________________________________
crf_layer (CRF)              (None, 512)               850       
Total params: 1,490,752
Trainable params: 1,490,752
Non-trainable params: 0
_________________________________________________________________


In [76]:
CRF_model = get_BERT_BILSTM_CRF(train_x.shape[1:], num_tags)

Softmax_model = get_BERT_BILSTM_SOFTMAX(train_x.shape[1:], num_tags)

SHAPE: (512, 1546)
Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_4 (Bidirection (None, 512, 400)          2795200   
_________________________________________________________________
crf_layer (CRF)              (None, 512)               2040      
Total params: 2,797,240
Trainable params: 2,797,240
Non-trainable params: 0
_________________________________________________________________
SHAPE: (512, 1546)
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_2 (Masking)          (None, 512, 1546)         0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 512, 400)          2795200   
_________________________________________________________________
dense_8 (Dense)              (None, 512, 5)       

In [77]:
train_x.shape

(356, 512, 1546)

In [78]:
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [67]:
FFN_CRF.fit(train_x, train_y, batch_size=128, epochs=1000, validation_split=0.1, verbose=1, callbacks=[earlyStopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x7fa38f1b47f0>

In [68]:
CRF_model.fit(train_x, train_y, batch_size=128, epochs=1000, validation_split=0.1, verbose=1, callbacks=[earlyStopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<tensorflow.python.keras.callbacks.History at 0x7fa38e8e1320>

In [79]:
Softmax_model.fit(train_x, train_y, batch_size=128, epochs=1000, validation_split=0.1, verbose=1, callbacks=[earlyStopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000


<tensorflow.python.keras.callbacks.History at 0x7fa382b6a710>

In [69]:
TEST_MODELS = [FFN_CRF, CRF_model]

In [80]:
TEST_MODELS = [Softmax_model]

## Test on val set

In [37]:
def get_test_data(type=None, scaled_page='normal'):
    if type is None:
        print("Please assign type of test_data")
        return
    if type != 'EVENT_SOURCE':
        storage.test_file = 'NORMAL'
        test_urls = [rec['Page URL'] for rec in storage.iter_test_records()]
        test_X_one, test_y_one, test_page_positions_one = storage.get_test_Xy(validate=False, scaled_page=scaled_page)
        print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
        if type == 'NORMAL':
            return test_X_one, test_y_one, test_page_positions_one
    if type != 'NORMAL':
        storage.test_file = 'EVENT_SOURCE'
        test_urls = [rec['Page URL'] for rec in storage.iter_test_records()]
        test_X_two, test_y_two, test_page_positions_two = storage.get_test_Xy(validate=False, scaled_page=scaled_page)
        print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
        if type == 'EVENT_SOURCE':
            return test_X_two, test_y_two, test_page_positions_two
    test_X_raw = test_X_one + test_X_two
    test_y = test_y_one + test_y_two
    test_positions = test_page_positions_one + test_page_positions_two
    return test_X_raw, test_y, test_positions

In [38]:
def label_distribution_to_label(predict_y):
    if len(predict_y.shape) != 3:
        return predict_y
    label_y = list()
    for page in predict_y:
        tmp = list()
        for lab in page:
            lab = lab.tolist()
            tmp.append(lab.index(max(lab)))
        label_y.append(tmp)
    return label_y

In [39]:
def recursive_predict_and_evaluate(models, x_test, y_test, evaluate_labels):
    for idx, model in enumerate(models):
        print(f"Start predict model {idx}")
        print(model.summary())
        print("--------------------------")
        predict_y = model.predict(x_test)
        predict_y = label_distribution_to_label(predict_y)
        predict_y = np.asarray([[idx2tag.get(lab) for lab in page] for page in predict_y])
        print(flat_classification_report(y_test, predict_y, labels=evaluate_labels, digits=len(evaluate_labels)))
        print("--------------------------")

In [40]:
# test_X_raw, test_y, test_page_positions = get_test_data('EVENT_SOURCE')
test_X_raw, test_y, test_page_positions = get_test_data('NORMAL')

pages: 100  domains: 58


In [41]:
test_urls = [rec['Page URL'] for rec in storage.iter_test_records()]
test_groups = set([get_domain(url) for url in test_urls])

In [42]:
chunks_test_x, chunks_test_y, chunks_test_positions = get_chunks_data(test_X_raw, test_y, test_page_positions)

In [43]:
test_token_features, test_tag_features = get_token_tag_features_from_chunks(chunks_test_x)

In [100]:
test_first_emb, test_second_emb, test_full_text_emb = page_to_two_bert_embeddings(test_token_features, pbert.get_tokenizer())

Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=125)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=125)

In [119]:
np.save('embedding/test/first.npy', test_first_emb)
np.save('embedding/test/second.npy', test_second_emb)
np.save('embedding/test/full_text.npy', test_full_text_emb)

  return array(a, dtype, copy=False, order=order, subok=True)


In [120]:
test_first_emb = np.load('embedding/test/first.npy', allow_pickle=True)
test_second_emb = np.load('embedding/test/second.npy', allow_pickle=True)
test_full_text_emb = np.load('embedding/test/full_text.npy', allow_pickle=True)

In [122]:
print(f"First emb:{test_first_emb[0].shape}")
print(f"Second emb:{test_second_emb[0].shape}")
print(f"Full_text emb:{test_full_text_emb[0].shape}")

First emb:(185, 768)
Second emb:(185, 768)
Full_text emb:(185, 1536)


In [123]:
test_tag_info_list = test_tag_features

In [489]:
max_node = -1
page_sum = 0
for page in test_tag_emb_features:
    sum = 0
    for node in page:
        sum+=len(node)
        if len(node) > max_node:
            max_node = len(node)
    page_sum+=sum/len(page)
print("Average: ", page_sum/len(train_token_features))
print("Max_node: ",max_node)

Average:  1.0638181757827387
Max_node:  59


In [124]:
test_text_emb_x = feature_pad_to_npdata(test_full_text_emb) # full text emb / two-bert emb
# test_text_emb_x = feature_pad_to_npdata(test_text_emb) # text emb

In [125]:
test_tag_x = feature_pad_to_npdata(test_tag_info_list)

In [126]:
test_positions_x = feature_pad_to_npdata(chunks_test_positions)
test_tag_x = np.concatenate([test_tag_x, test_positions_x], axis = 2)

In [127]:
test_info_x = np.concatenate([test_text_emb_x, test_tag_x], axis = 2)

In [128]:
x_test = test_info_x
# x_test = test_text_emb_x
# x_test = test_tag_x

In [129]:
x_test.shape

(125, 512, 1546)

In [130]:
y_test = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_test_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [131]:
y_test = [[idx2tag.get(lab) for lab in page] for page in y_test]

In [132]:
y_test = np.asarray(y_test)

In [133]:
# evaluate_labels = ['PREV', 'PAGE', 'NEXT', '[PAD]', 'O']
evaluate_labels = ['PAGE', 'NEXT']

In [134]:
recursive_predict_and_evaluate(TEST_MODELS, x_test, y_test, evaluate_labels)

Start predict model 0
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_2 (Masking)          (None, 512, 1546)         0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 512, 400)          2795200   
_________________________________________________________________
dense_8 (Dense)              (None, 512, 5)            2005      
Total params: 2,797,205
Trainable params: 2,797,205
Non-trainable params: 0
_________________________________________________________________
None
--------------------------




              precision    recall  f1-score   support

        PAGE       0.68      0.74      0.71       279
        NEXT       0.67      0.04      0.08        49

   micro avg       0.68      0.63      0.66       328
   macro avg       0.68      0.39      0.39       328
weighted avg       0.68      0.63      0.62       328

--------------------------
