In [10]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS
from autopager.parserutils import (TagParser, MyHTMLParser, draw_scaled_page, position_check, compare_tag)


In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus)!=0:
  # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
        print(e)
else:
    print("No GPUs visible")

1 Physical GPUs, 1 Logical GPU


In [5]:
storage = Storage()
urls = [rec['Page URL'] for rec in storage.iter_records(contain_button = True, file_type='T')]
groups = [get_domain(url) for url in urls]
train_groups_set = set(groups)

In [6]:
X_raw, y, page_positions = storage.get_Xy(contain_button = True, file_type='T')

Finish: Get Page 1 (Encoding: UTF-8)records ... (len: 303)
Finish: Get Page 2 (Encoding: UTF-8)records ... (len: 243)
Finish: Get Page 3 (Encoding: UTF-8)records ... (len: 119)
Finish: Get Page 4 (Encoding: UTF-8)records ... (len: 944)
Finish: Get Page 5 (Encoding: UTF-8)records ... (len: 93)
Finish: Get Page 6 (Encoding: UTF-8)records ... (len: 994)
Finish: Get Page 7 (Encoding: UTF-8)records ... (len: 1014)
Finish: Get Page 8 (Encoding: UTF-8)records ... (len: 7)
Finish: Get Page 9 (Encoding: UTF-8)records ... (len: 288)
Finish: Get Page 10 (Encoding: UTF-8)records ... (len: 678)
Finish: Get Page 11 (Encoding: UTF-8)records ... (len: 789)
Finish: Get Page 12 (Encoding: UTF-8)records ... (len: 814)
Finish: Get Page 13 (Encoding: UTF-8)records ... (len: 814)
Finish: Get Page 14 (Encoding: UTF-8)records ... (len: 171)
Finish: Get Page 15 (Encoding: UTF-8)records ... (len: 168)
Finish: Get Page 16 (Encoding: UTF-8)records ... (len: 91)
Finish: Get Page 17 (Encoding: UTF-8)records ... (le

Finish: Get Page 151 (Encoding: UTF-8)records ... (len: 369)
Finish: Get Page 152 (Encoding: cp1252)records ... (len: 294)
Finish: Get Page 153 (Encoding: UTF-8)records ... (len: 271)
Finish: Get Page 154 (Encoding: UTF-8)records ... (len: 300)
Finish: Get Page 155 (Encoding: UTF-8)records ... (len: 314)
Finish: Get Page 156 (Encoding: UTF-8)records ... (len: 278)
Finish: Get Page 157 (Encoding: UTF-8)records ... (len: 288)
Finish: Get Page 158 (Encoding: UTF-8)records ... (len: 178)
Finish: Get Page 159 (Encoding: UTF-8)records ... (len: 108)
Finish: Get Page 160 (Encoding: UTF-8)records ... (len: 98)
Finish: Get Page 161 (Encoding: UTF-8)records ... (len: 101)
Finish: Get Page 162 (Encoding: UTF-8)records ... (len: 308)
Finish: Get Page 163 (Encoding: UTF-8)records ... (len: 298)
Finish: Get Page 164 (Encoding: UTF-8)records ... (len: 285)
Finish: Get Page 165 (Encoding: UTF-8)records ... (len: 221)
Finish: Get Page 166 (Encoding: UTF-8)records ... (len: 21)
Finish: Get Page 167 (Enc

Finish: Get Page 297 (Encoding: UTF-8)records ... (len: 367)
Finish: Get Page 298 (Encoding: UTF-8)records ... (len: 371)
Finish: Get Page 299 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 300 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 301 (Encoding: UTF-8)records ... (len: 364)
Finish: Get Page 302 (Encoding: UTF-8)records ... (len: 170)
Finish: Get Page 303 (Encoding: UTF-8)records ... (len: 154)
Finish: Get Page 304 (Encoding: cp1252)records ... (len: 117)
Finish: Get Page 305 (Encoding: UTF-8)records ... (len: 1987)
Finish: Get Page 306 (Encoding: UTF-8)records ... (len: 59)
Finish: Get Page 307 (Encoding: UTF-8)records ... (len: 60)
Finish: Get Page 308 (Encoding: UTF-8)records ... (len: 60)
Finish: Get Page 309 (Encoding: UTF-8)records ... (len: 145)
Finish: Get Page 310 (Encoding: UTF-8)records ... (len: 116)
Finish: Get Page 311 (Encoding: UTF-8)records ... (len: 119)
Finish: Get Page 312 (Encoding: cp1252)records ... (len: 136)
Finish: Get Page 313 (En

In [7]:
max_page_seq = 512

## Slice data into chunks

In [8]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [22]:
def get_chunks_data(x, y, p):
    new_tmp_x_array = []
    new_tmp_y_array = []
    new_tmp_p_array = []
    for tmp_x, tmp_y, tmp_p in zip(x, y, p):
        new_tmp_x_array.extend(chunks(tmp_x, max_page_seq))
        new_tmp_y_array.extend(chunks(tmp_y, max_page_seq))
        new_tmp_p_array.extend(chunks(tmp_p, max_page_seq))
    return new_tmp_x_array, new_tmp_y_array, new_tmp_p_array

In [23]:
chunks_x, chunks_y, chunk_positions = get_chunks_data(X_raw, y, page_positions)

In [24]:
len(chunks_x)

355

## Load Pre-trained Bert model

In [11]:
from tensorflow.keras import Model
from tensorflow.data import Dataset

In [12]:
from BertModel import BertModel

1 Physical GPUs, 1 Logical GPU


In [13]:
bert_short_model = BertModel(128)

In [14]:
bert_long_model = BertModel(256)

In [14]:
pbert = bert_short_model

# Feature extraction

In [44]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    """
    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    return list(generator if limit is None else islice(generator, 0, limit))

def feat_to_tokens(feat, tokenizer):
    if type(feat) == type([]):
        feat = ' '.join(feat)
    tokens = tokenizer.tokenize(feat)
    return tokens

def link_to_features(link):
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    query_parsed = parse_qsl(p.query) #parse query string from path
    query_param_names = [k.lower() for k, v in query_parsed]
    query_param_names_ngrams = _as_list(ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    ))

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(_as_list(link.xpath(".//@class").extract(), 5))
    parent_classes = ' '.join(_as_list(link.xpath('../@class').extract(), 5))
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)
#     print(css_classes)
    token_feature = {
        'text-before': '',
        'text-exact': replace_digits(text.strip()[:40].strip()),
        'text-after': '',
        'class': css_classes,
        'query': _as_list(query_param_names, 10)
    }
    tag_feature = {
        'isdigit': 1 if text.isdigit() is True else 0,
        'isalpha': 1 if text.isalpha() is True else 0,
        'has-href': 0 if href is "" else 1,
        'path-has-page': 1 if 'page' in p.path.lower() else 0,
        'path-has-pageXX': 1 if re.search(r'[/-](?:p|page\w?)/?\d+', p.path.lower()) is not None else 0,
        'path-has-number': 1 if any(part.isdigit() for part in p.path.split('/')) else 0,
        'href-has-year': 1 if re.search('20\d\d', href) is not None else 0,
        'class-has-disabled': 1 if 'disabled' in css_classes else 0,
    }
    tag_feature = [v for k,v in tag_feature.items()]
#     attribute_feature = elem_rel + elem_target
    non_token_feature = tag_feature #+ attribute_feature
    return [token_feature, non_token_feature]


def page_to_features(xseq):
    feat_list = [link_to_features(a) for a in xseq]
    around = get_text_around_selector_list(xseq, max_length=15)
    
    # weight is less than 1 because there is a lot of duplicate information
    # in these ngrams and so we want to regularize them stronger
    # (as if they are a single feature, not many features)
    k = 0.2
    for feat, (before, after) in zip(feat_list, around):
        feat[0]['text-before'] = normalize(before)
        feat[0]['text-after'] = normalize(after)
        
    return feat_list

CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 24.3 µs


In [42]:
def get_token_tag_features_from_chunks(chunks):
    token_features = []
    tag_features = []
    for page in chunks:
        feat_list = page_to_features(page)
        token_features.append([node[0] for node in feat_list])
        tag_features.append([node[1] for node in feat_list])
    return token_features, tag_features

In [26]:
def extract_tokens_from_token_features(token_features):
    train_tag_feature_token_list = []
    for page in token_features:
        tmp_page_list = []
        for node in page: 
            tmp_list = []
            for k, v in node.items():
                if k == 'text-exact':
                    continue
                else:
                    tmp_list.extend(v)
            tmp_page_list.append(tmp_list)
        train_tag_feature_token_list.append(tmp_page_list)
    return train_tag_feature_token_list

In [27]:
def token_features_bert_preprocessing(token_features, tokenizer, type = None, addNone = False, only_text = False):
    train_token_features = []
    if type == None or type not in ['single','multi','multi-two']:
        print("Must Given a type of pre-processing")
        return
    for page in token_features:
        page_features = []
        for node in page:
            node_features = ["[CLS]"]
            sep_two = False
            for k,v in node.items():
                value_tokens = feat_to_tokens(v, tokenizer)
                if addNone == True and len(value_tokens) == 0:
                    value_tokens = ["None"]
                if type == 'single':
                    node_features = node_features + value_tokens
                    if k == 'text-after':
                        node_features = node_features + ["[SEP]"]
                elif type == 'multi':
                    if k == 'text-after' or sep_two is True:
                        sep_two = True
                        node_features = node_features + value_tokens + ["[SEP]"]
                    else:
                        node_features = node_features + value_tokens
                elif type == 'multi-two':
                    node_features = node_features + value_tokens + ["[SEP]"]
                    if k == 'text-after' and only_text is True:
                        break
            if 'multi' in type:
                node_features = node_features[:-1]
            page_features.append(node_features)
        train_token_features.append(page_features)
    return train_token_features

In [28]:
def page_to_two_bert_embeddings(token_features, tokenizer):
    text_first_segs = []
    text_second_segs = []
    for page in token_features:
        page_one_features = []
        page_two_features = []
        for node in page:
            text_before = tokenizer.tokenize(node["text-before"])
            text_exact = tokenizer.tokenize(node["text-exact"])
            text_after = tokenizer.tokenize(node["text-after"])
            page_one_features.append(["[CLS]"]+text_before+["[SEP]"]+text_exact+["[SEP]"])
            page_two_features.append(["[CLS]"]+text_exact+["[SEP]"]+text_after+["[SEP]"])
        text_first_segs.append(page_one_features)
        text_second_segs.append(page_two_features)
    print("Start encode first seg embeddings")
    first_emb = pbert.page_list_to_bert_embedding_list(text_first_segs, Token=True)
    print("Start encode second seg embeddings")
    second_emb = pbert.page_list_to_bert_embedding_list(text_second_segs, Token=True)
    full_text_emb = [np.concatenate([first_emb[page], second_emb[page]], axis = 1) for page in range(len(token_features))]
    return full_text_emb

In [45]:
token_features, tag_features = get_token_tag_features_from_chunks(chunks_x)
# train_tag_feature_token_list = extract_tokens_from_token_features(token_features)

In [118]:
full_text_emb = page_to_two_bert_embeddings(token_features, pbert.get_tokenizer())

Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=355)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=355)

In [119]:
full_text_emb[0].shape

(303, 1536)

In [136]:
# train_token_features = token_features_bert_preprocessing(token_features,type = 'single')
# train_token_features = token_features_bert_preprocessing(token_features,type = 'multi')
train_token_features = token_features_bert_preprocessing(token_features, pbert.get_tokenizer(), type = 'multi-two', only_text = True)

In [137]:
max_node = -1
page_sum = 0
for page in train_token_features:
    sum = 0
    for node in page:
        sum+=len(node)
        if len(node) > max_node:
            max_node = len(node)
    page_sum+=sum/len(page)
print("Average: ", page_sum/len(train_token_features))
print("Max_node: ",max_node)

Average:  10.835698937066166
Max_node:  63


In [48]:
train_tag_info_list = tag_features #features which only have tag true/false information

In [602]:
# Extract text-exact feature in token_features
train_text_list = [[ data['text-exact'] for data in x] for x in token_features]
train_text_before_list = [[ data['text-before'] for data in x] for x in token_features]
train_text_after_list = [[ data['text-after'] for data in x] for x in token_features]

In [457]:
def concat_text(before, mid, after):
    res = ""
    if before != "":
        res+=before + ","
    if mid != "":
        res+=mid + ","
    if after == "":
        res = res[:-1]
    else:
        res+=after
    return res
# Extract text-exact feature in token_features
train_full_text_list = [[ concat_text(data['text-before'],data['text-exact'],data['text-after']) for data in x] for x in token_features]

In [29]:
pbert.max_seq_length

128

In [335]:
train_full_text_emb = pbert.page_list_to_bert_embedding_list(train_full_text_list, Token=False)

Use custom Token: False


IntProgress(value=0, description='(Init)', max=353)

In [139]:
train_tag_emb = pbert.page_list_to_bert_embedding_list(train_token_features, Token=True)

Use custom Token: True


IntProgress(value=0, description='(Init)', max=353)

In [94]:
len(train_tag_emb)

353

In [95]:
train_tag_emb[0].shape

(303, 768)

In [96]:
train_text_emb[0].shape

NameError: name 'train_text_emb' is not defined

## Split chunks into three type of training data
    (1) Every chunks
    (2) Chunks only have tag informations + Chunks only have Other

### Feature List
    * train_tag_feature_token_list => Tag Attributes tokens
    * train_tag_info_list => Tag information
    * train_text_emb => Only Text node => Bert Text embedding
    * train_tag_emb => Text-before Text Text-after [SEP] Other Attributes => Bert Text embedding

In [30]:
def onlyHavaOther(y):
    for tag in y:
        if tag != 'O':
            return False
    return True

In [31]:
chunks_only_data_idx = [idx for idx, row_y in enumerate(chunks_y) if not onlyHavaOther(row_y)]
chunks_only_other_idx = [idx for idx, row_y in enumerate(chunks_y) if onlyHavaOther(row_y)]

In [36]:
print("Tags data: ", len(chunks_only_data_idx))
print("Other data: ", len(chunks_only_other_idx))

Tags data:  228
Other data:  125


In [37]:
def getFilterChunks(chunks, filterIdx):
    # chunks: pages
    # filterIdx: Idx of list which indicate the return data
    return [page for idx, page in enumerate(chunks) if idx in filterIdx]

In [38]:
def getTrainingData(types = None):
    if types == None:
        if len(train_text_emb) != len(train_tag_info_list) or len(train_tag_info_list) != len(chunks_y) or len(train_tag_info_list) != len(train_tag_feature_token_list):
            raise Exception('Every chunks should have equal size')
        print(f"return {len(chunks_y)} data.")
        return (train_text_emb, train_tag_feature_token_list, train_tag_info_list, chunks_y)
    if types == 'Tags':
        chunks_text_x = getFilterChunks(train_text_emb, chunks_only_data_idx)
        chunks_token_x = getFilterChunks(train_tag_feature_token_list, chunks_only_data_idx)
        chunks_tag_x = getFilterChunks(train_tag_info_list, chunks_only_data_idx)
        chunks_filtered_y = getFilterChunks(chunks_y, chunks_only_data_idx)
        if len(chunks_text_x) != len(chunks_tag_x) or len(chunks_tag_x) != len(chunks_filtered_y) or len(chunks_token_x) != len(chunks_tag_x):
            raise Exception('Every chunks should have equal size')
        print(f"return {len(chunks_filtered_y)} data.")
        return (chunks_text_x, chunks_token_x, chunks_tag_x, chunks_filtered_y)

In [120]:
# chunks_text_emb = train_concat_text_emb # text concat
# chunks_text_emb = train_text_emb # text
# chunks_text_emb = train_full_text_emb # full text
# chunks_text_emb = train_tag_emb # tag
chunks_text_emb = full_text_emb # full text embedding (two bert)

chunks_tag_infos = train_tag_info_list
chunks_filtered_y = chunks_y

## Padding to fixed size

In [50]:
def feature_pad_to_npdata(embedding):
    dataset = Dataset.from_generator(lambda: iter(embedding), tf.float32)
    dataset = dataset.padded_batch(1, padded_shapes= (max_page_seq, len(embedding[0][0])), padding_values=-1.,drop_remainder=False)
    after_pad = np.array([ data[0] for data in list(dataset.as_numpy_iterator())])
    return after_pad

In [143]:
# train_tag_token = feature_pad_to_npdata(chunks_tag_tokens)
train_text_emb_x = feature_pad_to_npdata(chunks_text_emb)

In [144]:
train_tag_x = feature_pad_to_npdata(chunks_tag_infos)

In [145]:
train_positions_x = feature_pad_to_npdata(chunk_positions)

In [146]:
train_tag_x = np.concatenate([train_tag_x, train_positions_x], axis = 2)

In [147]:
train_info_x = np.concatenate([train_text_emb_x, train_tag_x], axis = 2)

In [148]:
labels = ["O", "PREV", "PAGE", "NEXT", "[PAD]"]
tag2idx = { label:idx for idx,label in enumerate(labels)}
idx2tag = { idx:label for idx,label in enumerate(labels)}
num_tags = len(labels)

In [149]:
train_y = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_filtered_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [150]:
print("Current Shape:")
print(f"train_text_emb_x: {train_text_emb_x.shape}")
print(f"train_tag_x: {train_tag_x.shape}")
print(f"train_info_x: {train_info_x.shape}")
print(f"train_y: {train_y.shape}")

Current Shape:
train_text_emb_x: (355, 512, 1536)
train_tag_x: (355, 512, 10)
train_info_x: (355, 512, 1546)
train_y: (355, 512)


In [151]:
# train_x = train_text_emb_x
train_x = train_info_x
# train_x = train_tag_x

In [152]:
train_x.shape

(355, 512, 1546)

## Build BERT-BiLSTM-CRF Model

In [60]:
from tensorflow_addons.layers.crf import CRF

In [61]:
TIME_STAMPS = max_page_seq
HIDDEN_UNITS = 200
DROPOUT_RATE = 0.1
# NUM_CLASS = 5
NUM_CLASS = num_tags
print(f"TIME_STAMP: {TIME_STAMPS}")
print(f"HIDDEN_UNITS: {HIDDEN_UNITS}")
print(f"DROPOUT_RATE: {DROPOUT_RATE}")
print(f"NUM_CLASS: {NUM_CLASS}")

TIME_STAMP: 512
HIDDEN_UNITS: 200
DROPOUT_RATE: 0.1
NUM_CLASS: 5


In [62]:
def get_CRF(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
    crf=CRF(numtags,name='crf_layer')
    model.add(crf)
    model.compile('adam',loss={'crf_layer': crf.get_loss})
    model.summary()
    return model

In [63]:
def get_BERT_BILSTM_CRF(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=HIDDEN_UNITS, return_sequences=True)))
    crf=CRF(numtags,name='crf_layer')
    model.add(crf)
    model.compile('adam',loss={'crf_layer': crf.get_loss})
    model.summary()
    return model

In [64]:
def get_BERT_BILSTM_SOFTMAX(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
#     model.add(tf.keras.layers.Masking(input_shape=SHAPE, mask_value=-1.))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=HIDDEN_UNITS, return_sequences=True)))
    model.add(tf.keras.layers.Dense(units = numtags, activation='softmax'))
    model.compile('adam',loss=tf.keras.losses.SparseCategoricalCrossentropy())
    model.summary()
    return model

In [153]:
# N_CRF = get_CRF(train_x.shape[1:], num_tags)

CRF_model = get_BERT_BILSTM_CRF(train_x.shape[1:], num_tags)

Softmax_model = get_BERT_BILSTM_SOFTMAX(train_x.shape[1:], num_tags)

SHAPE: (512, 1546)
Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_6 (Bidirection (None, 512, 400)          2795200   
_________________________________________________________________
crf_layer (CRF)              (None, 512)               2040      
Total params: 2,797,240
Trainable params: 2,797,240
Non-trainable params: 0
_________________________________________________________________
SHAPE: (512, 1546)
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_7 (Bidirection (None, 512, 400)          2795200   
_________________________________________________________________
dense_3 (Dense)              (None, 512, 5)            2005      
Total params: 2,797,205
Trainable params: 2,797,205
Non-trainable params: 0
________________________________________

In [154]:
train_x.shape

(355, 512, 1546)

In [155]:
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [156]:
CRF_history = CRF_model.fit(train_x, train_y, batch_size=128, epochs=1000, validation_split=0.2, verbose=1, callbacks=[earlyStopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000


In [157]:
Softmax_history = Softmax_model.fit(train_x, train_y, batch_size=128, epochs=1000, validation_split=0.2, verbose=1, callbacks=[earlyStopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

Epoch 83/1000


## GroupKfold

In [131]:
%%time
groups = [get_domain(url) for url in urls]
N_SPLITs = 6

CPU times: user 2.36 ms, sys: 0 ns, total: 2.36 ms
Wall time: 2.37 ms


In [207]:
def make_group_dataset(X_data,y_data, groups, n_splits):

    def gen():
        for train_index, test_index in GroupKFold(n_splits).split(X_data, y_data, groups):
            X_train, X_test = X_data[train_index], X_data[test_index]
            y_train, y_test = y_data[train_index], y_data[test_index]
            yield X_train,y_train,X_test,y_test

    return tf.data.Dataset.from_generator(gen, (tf.float64,tf.float64,tf.float64,tf.float64))

In [208]:
dataset = make_group_dataset(train_x, train_y, groups, N_SPLITs)

In [213]:
def tf_cross_val(dataset):
    count = 1
    for X_train,y_train,X_test,y_test in dataset:
        print(f"Start fold {count}")
        earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        model = get_BILSTM_SOFTMAX(X_train.shape[1:], num_tags)
        history = model.fit(X_train, y_train, batch_size=32, epochs=500, validation_split=0.2, verbose=0, callbacks=[earlyStopping])
        predict_y = model.predict(X_test)
        predict_y = label_distribution_to_label(predict_y)
        predict_y = [[idx2tag.get(lab) for lab in page] for page in predict_y]
        y_test = [[idx2tag.get(lab) for lab in page] for page in y_test.numpy()]
        evaluate_labels = ['PREV', 'PAGE', 'NEXT']
        print(flat_classification_report(y_test, predict_y, labels=evaluate_labels, digits=len(evaluate_labels)))
        count+=1
    return

## Test on val set

In [71]:
def get_test_data(type=None):
    if type is None:
        print("Please assign type of test_data")
        return
    if type != 'EVENT_SOURCE':
        storage.test_file = 'NORMAL'
        test_urls = [rec['Page URL'] for rec in storage.iter_test_records()]
        test_X_one, test_y_one, test_page_positions_one = storage.get_test_Xy(validate=False)
        print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
        if type == 'NORMAL':
            return test_X_one, test_y_one, test_page_positions_one
    if type != 'NORMAL':
        storage.test_file = 'EVENT_SOURCE'
        test_urls = [rec['Page URL'] for rec in storage.iter_test_records()]
        test_X_two, test_y_two, test_page_positions_two = storage.get_test_Xy(validate=False)
        print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
        if type == 'EVENT_SOURCE':
            return test_X_two, test_y_two, test_page_positions_two
    test_X_raw = test_X_one + test_X_two
    test_y = test_y_one + test_y_two
    test_positions = test_page_positions_one + test_page_positions_two
    return test_X_raw, test_y, test_positions

In [72]:
def label_distribution_to_label(predict_y):
    if len(predict_y.shape) != 3:
        return predict_y
    label_y = list()
    for page in predict_y:
        tmp = list()
        for lab in page:
            lab = lab.tolist()
            tmp.append(lab.index(max(lab)))
        label_y.append(tmp)
    return label_y

In [73]:
# test_X_raw, test_y, test_page_positions = get_test_data('EVENT_SOURCE')
test_X_raw, test_y, test_page_positions = get_test_data('NORMAL')

pages: 100  domains: 54


In [74]:
test_urls = [rec['Page URL'] for rec in storage.iter_test_records()]
test_groups = set([get_domain(url) for url in test_urls])

In [308]:
for group in test_groups:
    if group in train_groups_set:
        print(f"Groups exist: {group}")

Groups exist: musicarts
Groups exist: mobile01


In [75]:
chunks_test_x, chunks_test_y, chunks_test_positions = get_chunks_data(test_X_raw, test_y, test_page_positions)

In [76]:
test_token_features, test_tag_features = get_token_tag_features_from_chunks(chunks_test_x)

In [112]:
test_full_text_emb = page_to_two_bert_embeddings(test_token_features, pbert.get_tokenizer())

Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=127)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=127)

In [149]:
# test_tag_emb_features = token_features_bert_preprocessing(test_token_features, 'single')
# test_tag_emb_features = token_features_bert_preprocessing(test_token_features, 'multi')
# test_tag_emb_features = token_features_bert_preprocessing(test_token_features, 'multi-two')
test_tag_emb_features = token_features_bert_preprocessing(test_token_features, pbert.get_tokenizer(), type = 'multi-two', only_text = True)


In [77]:
test_tag_info_list = test_tag_features

In [489]:
max_node = -1
page_sum = 0
for page in test_tag_emb_features:
    sum = 0
    for node in page:
        sum+=len(node)
        if len(node) > max_node:
            max_node = len(node)
    page_sum+=sum/len(page)
print("Average: ", page_sum/len(train_token_features))
print("Max_node: ",max_node)

Average:  1.0638181757827387
Max_node:  59


In [130]:
test_text_list = [[data["text-exact"] for data in page ] for page in test_token_features]
test_text_emb = pbert.page_list_to_bert_embedding_list(test_text_list, Token=False)

Use custom Token: False


IntProgress(value=0, description='(Init)', max=42)

In [151]:
test_tag_emb = pbert.page_list_to_bert_embedding_list(test_tag_emb_features, Token=True)

Use custom Token: True


IntProgress(value=0, description='(Init)', max=42)

In [158]:
# test_text_emb_x = feature_pad_to_npdata(test_concat_text_emb) # full text concat emb
test_text_emb_x = feature_pad_to_npdata(test_full_text_emb) # full text emb / two-bert emb
# test_text_emb_x = feature_pad_to_npdata(test_text_emb) # text emb
# test_text_emb_x = feature_pad_to_npdata(test_tag_emb) # tag emb

In [159]:
test_tag_x = feature_pad_to_npdata(test_tag_info_list)

In [160]:
test_positions_x = feature_pad_to_npdata(chunks_test_positions)
test_tag_x = np.concatenate([test_tag_x, test_positions_x], axis = 2)

In [161]:
test_info_x = np.concatenate([test_text_emb_x, test_tag_x], axis = 2)

In [162]:
x_test = test_info_x
# x_test = test_text_emb_x
# x_test = test_tag_x

In [163]:
x_test.shape

(127, 512, 1546)

In [164]:
y_test = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_test_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [165]:
predict_crf_y = CRF_model.predict(x_test)
predict_softmax_y = Softmax_model.predict(x_test)

In [166]:
predict_crf_y = label_distribution_to_label(predict_crf_y)
predict_softmax_y = label_distribution_to_label(predict_softmax_y)

In [167]:
predict_crf_y = np.asarray([[idx2tag.get(lab) for lab in page] for page in predict_crf_y])
predict_softmax_y = np.asarray([[idx2tag.get(lab) for lab in page] for page in predict_softmax_y])

In [168]:
y_test = [[idx2tag.get(lab) for lab in page] for page in y_test]

In [169]:
y_test = np.asarray(y_test)

In [170]:
# evaluate_labels = ['PREV', 'PAGE', 'NEXT', '[PAD]', 'O']
evaluate_labels = ['PREV', 'PAGE', 'NEXT']

In [171]:
print(flat_classification_report(y_test, predict_crf_y, labels=evaluate_labels, digits=len(evaluate_labels)))

              precision    recall  f1-score   support

        PREV      0.000     0.000     0.000        27
        PAGE      0.740     0.750     0.745       300
        NEXT      0.750     0.115     0.200        52

   micro avg      0.729     0.609     0.664       379
   macro avg      0.497     0.288     0.315       379
weighted avg      0.689     0.609     0.617       379



In [172]:
print(flat_classification_report(y_test, predict_softmax_y, labels=evaluate_labels, digits=len(evaluate_labels)))

              precision    recall  f1-score   support

        PREV      0.000     0.000     0.000        27
        PAGE      0.755     0.780     0.767       300
        NEXT      1.000     0.038     0.074        52

   micro avg      0.756     0.623     0.683       379
   macro avg      0.585     0.273     0.280       379
weighted avg      0.735     0.623     0.617       379

