In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS
storage = Storage()

../autopager/data


In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
from ipywidgets import IntProgress
from IPython.display import display

In [4]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
    # Visible devices must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPU


In [5]:
urls = [rec['Page URL'] for rec in storage.iter_records(contain_button = True, file_type='T')]
groups = [get_domain(url) for url in urls]
train_groups_set = set(groups)

In [6]:
X_raw, y = storage.get_Xy(contain_button = True, file_type='T')

Finish: Get Page 1 (Encoding: UTF-8)records ... (len: 303)
Finish: Get Page 2 (Encoding: UTF-8)records ... (len: 243)
Finish: Get Page 3 (Encoding: UTF-8)records ... (len: 119)
Finish: Get Page 4 (Encoding: UTF-8)records ... (len: 944)
Finish: Get Page 5 (Encoding: UTF-8)records ... (len: 93)
Finish: Get Page 6 (Encoding: UTF-8)records ... (len: 994)
Finish: Get Page 7 (Encoding: UTF-8)records ... (len: 1014)
Finish: Get Page 8 (Encoding: UTF-8)records ... (len: 7)
Finish: Get Page 9 (Encoding: UTF-8)records ... (len: 288)
Finish: Get Page 10 (Encoding: UTF-8)records ... (len: 678)
Finish: Get Page 11 (Encoding: UTF-8)records ... (len: 789)
Finish: Get Page 12 (Encoding: UTF-8)records ... (len: 814)
Finish: Get Page 13 (Encoding: UTF-8)records ... (len: 814)
Finish: Get Page 14 (Encoding: UTF-8)records ... (len: 171)
Finish: Get Page 15 (Encoding: UTF-8)records ... (len: 168)
Finish: Get Page 16 (Encoding: UTF-8)records ... (len: 91)
Finish: Get Page 17 (Encoding: UTF-8)records ... (le

Finish: Get Page 162 (Encoding: UTF-8)records ... (len: 279)
Finish: Get Page 163 (Encoding: UTF-8)records ... (len: 269)
Finish: Get Page 164 (Encoding: UTF-8)records ... (len: 259)
Finish: Get Page 165 (Encoding: UTF-8)records ... (len: 133)
Finish: Get Page 166 (Encoding: UTF-8)records ... (len: 181)
Finish: Get Page 167 (Encoding: UTF-8)records ... (len: 94)
Finish: Get Page 168 (Encoding: UTF-8)records ... (len: 99)
Finish: Get Page 169 (Encoding: UTF-8)records ... (len: 103)
Finish: Get Page 170 (Encoding: UTF-8)records ... (len: 210)
Finish: Get Page 171 (Encoding: UTF-8)records ... (len: 208)
Finish: Get Page 172 (Encoding: UTF-8)records ... (len: 179)
Finish: Get Page 173 (Encoding: UTF-8)records ... (len: 461)
Finish: Get Page 174 (Encoding: UTF-8)records ... (len: 340)
Finish: Get Page 175 (Encoding: UTF-8)records ... (len: 188)
Finish: Get Page 176 (Encoding: UTF-8)records ... (len: 195)
Finish: Get Page 177 (Encoding: UTF-8)records ... (len: 40)
Finish: Get Page 178 (Encod

Finish: Get Page 307 (Encoding: UTF-8)records ... (len: 60)
Finish: Get Page 308 (Encoding: UTF-8)records ... (len: 60)
Finish: Get Page 309 (Encoding: UTF-8)records ... (len: 145)
Finish: Get Page 310 (Encoding: UTF-8)records ... (len: 116)
Finish: Get Page 311 (Encoding: UTF-8)records ... (len: 119)
Finish: Get Page 312 (Encoding: cp1252)records ... (len: 136)
Finish: Get Page 313 (Encoding: UTF-8)records ... (len: 383)
Finish: Get Page 314 (Encoding: UTF-8)records ... (len: 317)
Finish: Get Page 315 (Encoding: cp1252)records ... (len: 314)
Finish: Get Page 316 (Encoding: cp1252)records ... (len: 357)
Finish: Get Page 317 (Encoding: cp1252)records ... (len: 370)
Finish: Get Page 318 (Encoding: UTF-8)records ... (len: 137)
Finish: Get Page 319 (Encoding: UTF-8)records ... (len: 281)
Finish: Get Page 320 (Encoding: UTF-8)records ... (len: 281)
Finish: Get Page 321 (Encoding: UTF-8)records ... (len: 247)
Finish: Get Page 322 (Encoding: UTF-8)records ... (len: 248)


In [7]:
max_page_seq = 512

## Slice data into chunks

In [8]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [9]:
def get_chunks_data(x, y):
    new_tmp_x_array = []
    new_tmp_y_array = []
    for tmp_x, tmp_y in zip(x, y):
        new_tmp_x_array.extend(chunks(tmp_x, max_page_seq))
        new_tmp_y_array.extend(chunks(tmp_y, max_page_seq))
    return new_tmp_x_array, new_tmp_y_array

In [10]:
chunks_x, chunks_y = get_chunks_data(X_raw, y)

In [11]:
len(chunks_x)

353

## Load Pre-trained Bert model

In [12]:
import tensorflow_hub as hub
import numpy as np
import bert
from bert import tokenization
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
from tensorflow.keras import Model
from tensorflow.data import Dataset

In [307]:
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError(f"Token length more than max seq length! {len(tokens)} > {max_seq_length}")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id += 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

def get_bert_inputs_from_sequences(seqs, tokenizer, max_seq_length, Token):
    if Token is False:
        tokens_list = [tokenizer.tokenize(seq) for seq in seqs]
    else:
        tokens_list = seqs
    ids = [ np.array(get_ids(tokens, tokenizer, max_seq_length)) for tokens in tokens_list ]
    masks = [ np.array(get_masks(tokens, max_seq_length))  for tokens in tokens_list ]
    segments = [ np.array(get_segments(tokens, max_seq_length))  for tokens in tokens_list ]
    return np.array(ids), np.array(masks), np.array(segments)

def page_list_to_bert_embedding_list(page_list, model, tokenizer, max_seq_length, Token = False):
    print(f"Use custom Token: {Token}")
    p = IntProgress(max=len(page_list))
    p.description = '(Init)'
    p.value = 0
    display(p)
    seq_list = []
    for idx, page in enumerate(page_list):
        p.description = f"Task: {idx+1}"
        p.value = idx+1
        page_idx, page_mask, page_seg = get_bert_inputs_from_sequences(page, tokenizer, max_seq_length, Token)
        pooled_emb, _ = model.predict([ page_idx, page_mask, page_seg ])
        seq_list.append(pooled_emb)
    p.description = '(Done)'
    return seq_list

In [63]:
max_seq_length = 256  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/1",
                            trainable=False)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

In [64]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [65]:
emb_model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])

# Feature extraction

In [123]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    """
    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    return list(generator if limit is None else islice(generator, 0, limit))

def feat_to_tokens(feat, tokenizer):
    if type(feat) == type([]):
        feat = ' '.join(feat)
    tokens = tokenizer.tokenize(feat)
    return tokens

def link_to_features(link, tokenizer):
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    query_parsed = parse_qsl(p.query) #parse query string from path
    query_param_names = [k.lower() for k, v in query_parsed]
    query_param_names_ngrams = _as_list(ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    ))

    elem = get_selector_root(link)
    elem_target = _elem_attr(elem, 'target')
    elem_rel = _elem_attr(elem, 'rel')
    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(_as_list(link.xpath(".//@class").extract(), 5))
    parent_classes = ' '.join(_as_list(link.xpath('../@class').extract(), 5))
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)
    token_feature = {
        'text-before': '',
        'text-exact': replace_digits(text.strip()[:40].strip()),
        'text-after': '',
        'elem-target': elem_target,
        'elem-rel': elem_rel,
        'class': css_classes,
        'query': _as_list(query_param_names, 10)
    }
    tag_feature = {
        'isdigit': 1 if text.isdigit() is True else 0,
        'isalpha': 1 if text.isalpha() is True else 0,
        'path-has-page': 1 if 'page' in p.path.lower() else 0,
        'path-has-pageXX': 1 if re.search(r'[/-](?:p|page\w?)/?\d+', p.path.lower()) is not None else 0,
        'path-has-number': 1 if any(part.isdigit() for part in p.path.split('/')) else 0,
        'href-has-year': 1 if re.search('20\d\d', href) is not None else 0
    }
    return [token_feature, tag_feature]


def page_to_features(xseq, tokenizer):
    feat_list = [link_to_features(a, tokenizer) for a in xseq]
    around = get_text_around_selector_list(xseq, max_length=15)
    
    # weight is less than 1 because there is a lot of duplicate information
    # in these ngrams and so we want to regularize them stronger
    # (as if they are a single feature, not many features)
    k = 0.2
    for feat, (before, after) in zip(feat_list, around):
        feat[0]['text-before'] = normalize(before)
        feat[0]['text-after'] = normalize(after)
    return feat_list

CPU times: user 15 µs, sys: 0 ns, total: 15 µs
Wall time: 18.8 µs


In [124]:
def get_token_tag_features_from_chunks(chunks):
    token_features = []
    tag_features = []
    for page in chunks:
        feat_list = page_to_features(page, tokenizer)
        token_features.append([node[0] for node in feat_list])
        tag_features.append([node[1] for node in feat_list])
    return token_features, tag_features

In [125]:
def extract_tokens_from_token_features(token_features):
    train_tag_feature_token_list = []
    for page in token_features:
        tmp_page_list = []
        for node in page: 
            tmp_list = []
            for k, v in node.items():
                if k == 'text-exact':
                    continue
                else:
                    tmp_list.extend(v)
            tmp_page_list.append(tmp_list)
        train_tag_feature_token_list.append(tmp_page_list)
    return train_tag_feature_token_list

In [634]:
def token_features_bert_preprocessing(token_features, type = None, addNone = False):
    train_token_features = []
    if type == None or type not in ['single','multi','multi-two']:
        print("Must Given a type of pre-processing")
        return
    for page in token_features:
        page_features = []
        for node in page:
            node_features = ["[CLS]"]
            sep_two = False
            for k,v in node.items():
                value_tokens = feat_to_tokens(v, tokenizer)
                if addNone == True and len(value_tokens) == 0:
                    value_tokens = ["None"]
                if type == 'single':
                    node_features = node_features + value_tokens
                    if k == 'text-after':
                        node_features = node_features + ["[SEP]"]
                elif type == 'multi':
                    if k == 'text-after' or sep_two is True:
                        sep_two = True
                        node_features = node_features + value_tokens + ["[SEP]"]
                    else:
                        node_features = node_features + value_tokens
                elif type == 'multi-two':
                    node_features = node_features + value_tokens + ["[SEP]"]
            if 'multi' in type:
                node_features = node_features[:-1]
            page_features.append(node_features)
        train_token_features.append(page_features)
    return train_token_features

In [424]:
def flatten_dicts_to_values(pages):
    return [[[v for k,v in node.items()] for node in p ] for p in pages]

In [425]:
token_features, tag_features = get_token_tag_features_from_chunks(chunks_x)
# train_tag_feature_token_list = extract_tokens_from_token_features(token_features)

In [788]:
# train_token_features = token_features_bert_preprocessing(token_features,type = 'single')
# train_token_features = token_features_bert_preprocessing(token_features,type = 'multi')
train_token_features = token_features_bert_preprocessing(token_features,type = 'multi-two')

In [789]:
train_tag_info_list = flatten_dicts_to_values(tag_features) #features which only have tag true/false information

In [790]:
train_token_features[0][1]

['[CLS]',
 '[SEP]',
 'one',
 '##plus',
 'X',
 '##t',
 '[SEP]',
 '[SEP]',
 '[SEP]',
 '[SEP]',
 'page',
 '[SEP]']

In [791]:
max_node = -1
page_sum = 0
for page in train_token_features:
    sum = 0
    for node in page:
        sum+=len(node)
        if len(node) > max_node:
            max_node = len(node)
    page_sum+=sum/len(page)
print("Average: ", page_sum/len(train_token_features))
print("Max_node: ",max_node)

Average:  24.858043613697152
Max_node:  215


In [792]:
# Extract text-exact feature in token_features
train_text_list = [[ data['text-exact'] for data in x] for x in token_features]

In [61]:
train_text_emb = page_list_to_bert_embedding_list(train_text_list, emb_model, tokenizer, max_seq_length)

IntProgress(value=0, description='(Init)', max=353)

In [793]:
train_tag_emb = page_list_to_bert_embedding_list(train_token_features, emb_model, tokenizer, max_seq_length, Token=True)

Use custom Token: True


IntProgress(value=0, description='(Init)', max=353)

In [794]:
len(train_tag_emb)

353

In [795]:
train_tag_emb[0].shape

(303, 768)

## Split chunks into three type of training data
    (1) Every chunks
    (2) Chunks only have tag informations + Chunks only have Other

### Feature List
    * train_tag_feature_token_list => Tag Attributes tokens
    * train_tag_info_list => Tag information
    * train_text_emb => Only Text node => Bert Text embedding
    * train_tag_emb => Text-before Text Text-after [SEP] Other Attributes => Bert Text embedding

In [796]:
def onlyHavaOther(y):
    for tag in y:
        if tag != 'O':
            return False
    return True

In [797]:
chunks_only_data_idx = [idx for idx, row_y in enumerate(chunks_y) if not onlyHavaOther(row_y)]
chunks_only_other_idx = [idx for idx, row_y in enumerate(chunks_y) if onlyHavaOther(row_y)]

In [798]:
print("Tags data: ", len(chunks_only_data_idx))
print("Other data: ", len(chunks_only_other_idx))

Tags data:  228
Other data:  125


In [799]:
def getFilterChunks(chunks, filterIdx):
    # chunks: pages
    # filterIdx: Idx of list which indicate the return data
    return [page for idx, page in enumerate(chunks) if idx in filterIdx]

In [800]:
def getTrainingData(types = None):
    if types == None:
        if len(train_text_emb) != len(train_tag_info_list) or len(train_tag_info_list) != len(chunks_y) or len(train_tag_info_list) != len(train_tag_feature_token_list):
            raise Exception('Every chunks should have equal size')
        print(f"return {len(chunks_y)} data.")
        return (train_text_emb, train_tag_feature_token_list, train_tag_info_list, chunks_y)
    if types == 'Tags':
        chunks_text_x = getFilterChunks(train_text_emb, chunks_only_data_idx)
        chunks_token_x = getFilterChunks(train_tag_feature_token_list, chunks_only_data_idx)
        chunks_tag_x = getFilterChunks(train_tag_info_list, chunks_only_data_idx)
        chunks_filtered_y = getFilterChunks(chunks_y, chunks_only_data_idx)
        if len(chunks_text_x) != len(chunks_tag_x) or len(chunks_tag_x) != len(chunks_filtered_y) or len(chunks_token_x) != len(chunks_tag_x):
            raise Exception('Every chunks should have equal size')
        print(f"return {len(chunks_filtered_y)} data.")
        return (chunks_text_x, chunks_token_x, chunks_tag_x, chunks_filtered_y)

In [801]:
chunks_text_emb = train_tag_emb
chunks_tag_infos = train_tag_info_list
chunks_filtered_y = chunks_y

In [844]:
chunks_text_emb = getFilterChunks(train_tag_emb, chunks_only_data_idx)
chunks_tag_infos = getFilterChunks(train_tag_info_list, chunks_only_data_idx)
chunks_filtered_y = getFilterChunks(chunks_y, chunks_only_data_idx)

## Padding to fixed size

In [845]:
def feature_pad_to_npdata(embedding):
    dataset = Dataset.from_generator(lambda: iter(embedding), tf.float32)
    dataset = dataset.padded_batch(1, padded_shapes= (max_page_seq, len(embedding[0][0])), padding_values=-1.,drop_remainder=False)
    after_pad = np.array([ data[0] for data in list(dataset.as_numpy_iterator())])
    return after_pad

In [846]:
# train_tag_token = feature_pad_to_npdata(chunks_tag_tokens)
train_tag_x = feature_pad_to_npdata(chunks_tag_infos)
train_text_emb_x = feature_pad_to_npdata(chunks_text_emb)
train_info_x = np.concatenate([train_text_emb_x, train_tag_x], axis = 2)

In [847]:
labels = ["O", "PREV", "PAGE", "NEXT", "[PAD]"]
tag2idx = { label:idx for idx,label in enumerate(labels)}
idx2tag = { idx:label for idx,label in enumerate(labels)}
num_tags = len(labels)

In [848]:
train_y = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_filtered_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [849]:
print("Current Shape:")
print(f"train_text_emb_x: {train_text_emb_x.shape}")
print(f"train_tag_x: {train_tag_x.shape}")
print(f"train_info_x: {train_info_x.shape}")
print(f"train_y: {train_y.shape}")

Current Shape:
train_text_emb_x: (228, 512, 768)
train_tag_x: (228, 512, 6)
train_info_x: (228, 512, 774)
train_y: (228, 512)


In [850]:
# train_x = train_text_emb_x
train_x = train_info_x

## Build BERT-BiLSTM-CRF Model

In [808]:
from tensorflow_addons.layers.crf import CRF

In [809]:
TIME_STAMPS = max_page_seq
HIDDEN_UNITS = 200
DROPOUT_RATE = 0.1
# NUM_CLASS = 5
NUM_CLASS = num_tags
print(f"TIME_STAMP: {TIME_STAMPS}")
print(f"HIDDEN_UNITS: {HIDDEN_UNITS}")
print(f"DROPOUT_RATE: {DROPOUT_RATE}")
print(f"NUM_CLASS: {NUM_CLASS}")

TIME_STAMP: 512
HIDDEN_UNITS: 200
DROPOUT_RATE: 0.1
NUM_CLASS: 5


In [810]:
def get_BERT_BILSTM_CRF(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=HIDDEN_UNITS, return_sequences=True)))
    crf=CRF(numtags,name='crf_layer')
    model.add(crf)
    model.compile('adam',loss={'crf_layer': crf.get_loss})
    model.summary()
    return model

In [811]:
def get_BERT_BILSTM_SOFTMAX(SHAPE, numtags):
    print(f"SHAPE: {SHAPE}")
    model=tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=SHAPE))
#     model.add(tf.keras.layers.Masking(input_shape=SHAPE, mask_value=-1.))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=HIDDEN_UNITS, return_sequences=True)))
    model.add(tf.keras.layers.Dense(units = numtags, activation='softmax'))
    model.compile('adam',loss=tf.keras.losses.SparseCategoricalCrossentropy())
    model.summary()
    return model

In [851]:
CRF_model = get_BERT_BILSTM_CRF(train_x.shape[1:], num_tags)

SHAPE: (512, 774)
Model: "sequential_47"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_47 (Bidirectio (None, 512, 400)          1560000   
_________________________________________________________________
crf_layer (CRF)              (None, 512)               2040      
Total params: 1,562,040
Trainable params: 1,562,040
Non-trainable params: 0
_________________________________________________________________


In [852]:
Softmax_model = get_BERT_BILSTM_SOFTMAX(train_x.shape[1:], num_tags)

SHAPE: (512, 774)
Model: "sequential_48"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_48 (Bidirectio (None, 512, 400)          1560000   
_________________________________________________________________
dense_23 (Dense)             (None, 512, 5)            2005      
Total params: 1,562,005
Trainable params: 1,562,005
Non-trainable params: 0
_________________________________________________________________


In [853]:
train_x.shape

(228, 512, 774)

In [854]:
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

In [855]:
CRF_history = CRF_model.fit(train_x, train_y, batch_size=128, epochs=500, validation_split=0.2, verbose=1, callbacks=[earlyStopping])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [856]:
Softmax_history = Softmax_model.fit(train_x, train_y, batch_size=128, epochs=500, validation_split=0.2, verbose=1, callbacks=[earlyStopping])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

## GroupKfold

In [131]:
%%time
groups = [get_domain(url) for url in urls]
N_SPLITs = 6

CPU times: user 2.36 ms, sys: 0 ns, total: 2.36 ms
Wall time: 2.37 ms


In [207]:
def make_group_dataset(X_data,y_data, groups, n_splits):

    def gen():
        for train_index, test_index in GroupKFold(n_splits).split(X_data, y_data, groups):
            X_train, X_test = X_data[train_index], X_data[test_index]
            y_train, y_test = y_data[train_index], y_data[test_index]
            yield X_train,y_train,X_test,y_test

    return tf.data.Dataset.from_generator(gen, (tf.float64,tf.float64,tf.float64,tf.float64))

In [208]:
dataset = make_group_dataset(train_x, train_y, groups, N_SPLITs)

In [213]:
def tf_cross_val(dataset):
    count = 1
    for X_train,y_train,X_test,y_test in dataset:
        print(f"Start fold {count}")
        earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        model = get_BILSTM_SOFTMAX(X_train.shape[1:], num_tags)
        history = model.fit(X_train, y_train, batch_size=32, epochs=500, validation_split=0.2, verbose=0, callbacks=[earlyStopping])
        predict_y = model.predict(X_test)
        predict_y = label_distribution_to_label(predict_y)
        predict_y = [[idx2tag.get(lab) for lab in page] for page in predict_y]
        y_test = [[idx2tag.get(lab) for lab in page] for page in y_test.numpy()]
        evaluate_labels = ['PREV', 'PAGE', 'NEXT']
        print(flat_classification_report(y_test, predict_y, labels=evaluate_labels, digits=len(evaluate_labels)))
        count+=1
    return

## Test on split train

## Test on val set

In [95]:
def label_distribution_to_label(predict_y):
    if len(predict_y.shape) != 3:
        return predict_y
    label_y = list()
    for page in predict_y:
        tmp = list()
        for lab in page:
            lab = lab.tolist()
            tmp.append(lab.index(max(lab)))
        label_y.append(tmp)
    return label_y

In [96]:
test_X_raw, test_y = storage.get_test_Xy()

In [97]:
test_urls = [rec['url'] for rec in storage.iter_test_records()]
test_groups = set([get_domain(url) for url in test_urls])

In [98]:
for group in test_groups:
    if group in train_groups_set:
        print(f"Groups exist: {group}")

Groups exist: mobile01
Groups exist: musicarts


In [468]:
chunks_test_x, chunks_test_y = get_chunks_data(test_X_raw, test_y)

In [818]:
test_token_features, test_tag_features = get_token_tag_features_from_chunks(chunks_test_x)
# test_tag_emb_features = token_features_bert_preprocessing(test_token_features, 'single')
# test_tag_emb_features = token_features_bert_preprocessing(test_token_features, 'multi')
test_tag_emb_features = token_features_bert_preprocessing(test_token_features, 'multi-two')
test_tag_info_list = flatten_dicts_to_values(test_tag_features)

In [95]:
test_text_list = [[data["text-exact"] for data in page ] for page in test_token_features]
test_text_emb = page_list_to_bert_embedding_list(test_text_list, emb_model, tokenizer, max_seq_length)

IntProgress(value=0, description='(Init)', max=42)

In [819]:
test_tag_emb = page_list_to_bert_embedding_list(test_tag_emb_features, emb_model, tokenizer, max_seq_length, Token=True)

Use custom Token: True


IntProgress(value=0, description='(Init)', max=42)

In [820]:
max_node = -1
page_sum = 0
for page in test_tag_emb_features:
    sum = 0
    for node in page:
        sum+=len(node)
        if len(node) > max_node:
            max_node = len(node)
    page_sum+=sum/len(page)
print("Average: ", page_sum/len(train_token_features))
print("Max_node: ",max_node)

Average:  3.170013315689047
Max_node:  134


In [821]:
# test_tag_token = feature_pad_to_npdata(test_tag_feature_token_list)
test_tag_x = feature_pad_to_npdata(test_tag_info_list)
test_text_emb_x = feature_pad_to_npdata(test_tag_emb)
test_info_x = np.concatenate([test_text_emb_x, test_tag_x], axis = 2)

In [822]:
x_test = test_info_x
# x_test = test_text_emb_x

In [823]:
x_test.shape

(42, 512, 774)

In [857]:
y_test = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_test_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [858]:
predict_crf_y = CRF_model.predict(x_test)
predict_softmax_y = Softmax_model.predict(x_test)

In [859]:
predict_crf_y = label_distribution_to_label(predict_crf_y)
predict_softmax_y = label_distribution_to_label(predict_softmax_y)

In [860]:
predict_crf_y = np.asarray([[idx2tag.get(lab) for lab in page] for page in predict_crf_y])
predict_softmax_y = np.asarray([[idx2tag.get(lab) for lab in page] for page in predict_softmax_y])

In [861]:
y_test = [[idx2tag.get(lab) for lab in page] for page in y_test]

In [862]:
y_test = np.asarray(y_test)

In [863]:
# evaluate_labels = ['PREV', 'PAGE', 'NEXT', '[PAD]', 'O']
evaluate_labels = ['PREV', 'PAGE', 'NEXT']

In [864]:
print(flat_classification_report(y_test, predict_crf_y, labels=evaluate_labels, digits=len(evaluate_labels)))

              precision    recall  f1-score   support

        PREV      0.000     0.000     0.000        13
        PAGE      0.778     0.724     0.750       145
        NEXT      0.667     0.074     0.133        27

   micro avg      0.775     0.578     0.663       185
   macro avg      0.481     0.266     0.294       185
weighted avg      0.707     0.578     0.607       185



In [865]:
print(flat_classification_report(y_test, predict_softmax_y, labels=evaluate_labels, digits=len(evaluate_labels)))

              precision    recall  f1-score   support

        PREV      0.000     0.000     0.000        13
        PAGE      0.792     0.786     0.789       145
        NEXT      1.000     0.074     0.138        27

   micro avg      0.789     0.627     0.699       185
   macro avg      0.597     0.287     0.309       185
weighted avg      0.766     0.627     0.638       185

