In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.model import _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS
from autopager.parserutils import (TagParser, MyHTMLParser, draw_scaled_page, position_check, compare_tag, get_first_tag)
parser = MyHTMLParser()
tagParser = TagParser()

In [93]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)

In [3]:
storage = Storage()

Current test file:  ['en', 'zh', 'ko', 'ja', 'de', 'ru']


In [4]:
# Configure GPU visibility and memory growth before any tensors are created.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("No GPUs visible")
else:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPU


In [5]:
%%time
urls = [rec['Page URL'] for rec in storage.iter_records(language='en',contain_button = True, file_type='T')]
X_raw, y, page_positions = storage.get_Xy(language='en',contain_button = True,  contain_position=True,file_type='T', scaled_page='normal')
print("pages: {}  domains: {}".format(len(urls), len({get_domain(url) for url in urls})))

Contain position: True
Finish: Get Page 1 (Encoding: UTF-8)records ... (len: 303)
Finish: Get Page 2 (Encoding: UTF-8)records ... (len: 243)
Finish: Get Page 3 (Encoding: UTF-8)records ... (len: 119)
Finish: Get Page 4 (Encoding: UTF-8)records ... (len: 944)
Finish: Get Page 5 (Encoding: UTF-8)records ... (len: 93)
Finish: Get Page 6 (Encoding: UTF-8)records ... (len: 994)
Finish: Get Page 7 (Encoding: UTF-8)records ... (len: 1014)
Finish: Get Page 8 (Encoding: UTF-8)records ... (len: 7)
Finish: Get Page 21 (Encoding: UTF-8)records ... (len: 158)
Finish: Get Page 22 (Encoding: UTF-8)records ... (len: 171)
Finish: Get Page 23 (Encoding: UTF-8)records ... (len: 181)
Finish: Get Page 24 (Encoding: UTF-8)records ... (len: 10)
Finish: Get Page 25 (Encoding: UTF-8)records ... (len: 165)
Finish: Get Page 26 (Encoding: UTF-8)records ... (len: 147)
Finish: Get Page 28 (Encoding: UTF-8)records ... (len: 268)
Finish: Get Page 33 (Encoding: UTF-8)records ... (len: 108)
Finish: Get Page 34 (Encodin

Finish: Get Page 265 (Encoding: UTF-8)records ... (len: 199)
Finish: Get Page 266 (Encoding: UTF-8)records ... (len: 86)
Finish: Get Page 267 (Encoding: UTF-8)records ... (len: 131)
Finish: Get Page 284 (Encoding: cp1252)records ... (len: 130)
Finish: Get Page 287 (Encoding: UTF-8)records ... (len: 82)
Finish: Get Page 288 (Encoding: UTF-8)records ... (len: 140)
Finish: Get Page 289 (Encoding: UTF-8)records ... (len: 44)
Finish: Get Page 293 (Encoding: UTF-8)records ... (len: 74)
Finish: Get Page 294 (Encoding: UTF-8)records ... (len: 63)
Finish: Get Page 295 (Encoding: UTF-8)records ... (len: 65)
Finish: Get Page 296 (Encoding: UTF-8)records ... (len: 20)
Finish: Get Page 299 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 300 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 301 (Encoding: UTF-8)records ... (len: 364)
Finish: Get Page 302 (Encoding: UTF-8)records ... (len: 170)
Finish: Get Page 303 (Encoding: UTF-8)records ... (len: 154)
Finish: Get Page 304 (Encoding

In [6]:
max_page_seq = 512

## Slice data into chunks

In [7]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every yielded slice holds ``n`` items except possibly the last one, which
    holds whatever remains. ``lst`` must support ``len()`` and slicing.
    """
    start_offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in start_offsets)

In [8]:
def get_chunks_data(x, y, p):
    """Cut every page into pieces of at most ``max_page_seq`` items.

    ``x``, ``y`` and ``p`` are parallel per-page sequences; each page of each
    sequence is split with ``chunks`` and the pieces are collected in page
    order. Iteration stops at the shortest of the three inputs (``zip``).

    Returns a tuple ``(x_pieces, y_pieces, p_pieces)`` of flat lists.
    """
    x_pieces, y_pieces, p_pieces = [], [], []
    for page_x, page_y, page_p in zip(x, y, p):
        for sink, page_seq in ((x_pieces, page_x), (y_pieces, page_y), (p_pieces, page_p)):
            sink.extend(chunks(page_seq, max_page_seq))
    return x_pieces, y_pieces, p_pieces

In [9]:
chunks_x, chunks_y, chunk_positions = get_chunks_data(X_raw, y, page_positions)

In [10]:
len(chunks_x)

183

## Load Pre-trained BERT model

In [11]:
from tensorflow.keras import Model
from tensorflow.data import Dataset

In [12]:
from BertModel import BertModel

1 Physical GPUs, 1 Logical GPU


In [13]:
bert_short_model = BertModel(128)

In [14]:
bert_long_model = BertModel(512)

In [15]:
pbert = bert_short_model

## Load Pre-trained FastText model

In [16]:
from FastTextModel import FastTextModel

1 Physical GPUs, 1 Logical GPU


In [17]:
ft = FastTextModel()



Current dimension:  100


# Feature extraction

In [216]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    """Materialize an iterable into a list, keeping at most ``limit`` items.

    With ``limit=None`` the whole iterable is consumed.

    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    if limit is None:
        return list(generator)
    return list(islice(generator, 0, limit))

def feat_to_tokens(feat, tokenizer):
    """Tokenize one feature value with ``tokenizer.tokenize``.

    Args:
        feat: either a string, or a list/tuple of strings which is first
            joined with single spaces.
        tokenizer: any object exposing ``tokenize(str)``.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the (joined) text.
    """
    # isinstance instead of an exact type comparison, so list subclasses and
    # tuples of strings are joined as well instead of being passed through raw.
    if isinstance(feat, (list, tuple)):
        feat = ' '.join(feat)
    return tokenizer.tokenize(feat)

def link_to_features(link):
    """Build the feature pair for a single link selector.

    Args:
        link: a selector for one link element; it must support ``xpath`` and
            be accepted by ``get_link_text`` / ``get_link_href``.

    Returns:
        ``[token_feature, non_token_feature]`` where

        * ``token_feature`` is a dict of textual attributes with the keys
          'text-before', 'text-exact', 'text-after', 'class', 'query' and
          'parent-tag'. 'text-before' and 'text-after' are left empty here.
        * ``non_token_feature`` is a list of integer flags/counts, in the
          insertion order of the ``tag_feature`` dict below.
    """
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    path_lower = p.path.lower()

    parent = link.xpath('..').extract()
    parent = get_first_tag(parser, parent[0])

    # Names (not values) of the query-string parameters of the href.
    query_param_names = [k.lower() for k, v in parse_qsl(p.query)]

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(_as_list(link.xpath(".//@class").extract(), 5))
    parent_classes = ' '.join(_as_list(link.xpath('../@class').extract(), 5))
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)

    token_feature = {
        'text-before': '',
        'text-exact': replace_digits(text.strip()[:40].strip()),
        'text-after': '',
        'class': css_classes,
        'query': _as_list(query_param_names, 10),
        'parent-tag': parent,
    }
    tag_feature = {
        'isdigit': int(text.isdigit()),
        'isalpha': int(text.isalpha()),
        # Compare by value: `href is ""` only works by accident of string
        # interning and triggers a SyntaxWarning on Python 3.8+.
        'has-href': 1 if href else 0,
        'path-has-page': int('page' in path_lower),
        'path-has-pageXX': int(re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None),
        'path-has-number': int(any(part.isdigit() for part in p.path.split('/'))),
        # Raw string: '\d' in a normal string literal is an invalid escape.
        'href-has-year': int(re.search(r'20\d\d', href) is not None),
        'class-has-disabled': int('disabled' in css_classes),
        'num-tokens': _num_tokens_feature(text),
    }
    non_token_feature = list(tag_feature.values())
    return [token_feature, non_token_feature]


def page_to_features(xseq):
    """Compute features for every link of one page (or page chunk).

    Each link is converted with ``link_to_features``; the text surrounding
    each link (from ``get_text_around_selector_list`` with ``max_length=15``)
    is then normalized and written into the 'text-before' / 'text-after'
    entries of that link's token-feature dict.

    Returns:
        A list with one ``[token_feature, non_token_feature]`` pair per link.
    """
    feat_list = [link_to_features(a) for a in xseq]
    around = get_text_around_selector_list(xseq, max_length=15)

    for feat, (before, after) in zip(feat_list, around):
        feat[0]['text-before'] = normalize(before)
        feat[0]['text-after'] = normalize(after)

    return feat_list

CPU times: user 11 µs, sys: 4 µs, total: 15 µs
Wall time: 21 µs


In [217]:
def get_token_tag_features_from_chunks(chunks):
    """Split the per-link feature pairs of every chunk into two parallel lists.

    Args:
        chunks: iterable of page chunks, each accepted by ``page_to_features``.

    Returns:
        ``(token_features, tag_features)``: for every chunk, the list of
        token-feature entries (element 0 of each pair) and the list of
        non-token feature entries (element 1 of each pair).

    Raises:
        Exception: if feature extraction fails for a chunk. The message names
            the chunk index and the original error is attached as
            ``__cause__``.
    """
    token_features = []
    tag_features = []
    for idx, page in enumerate(chunks):
        try:
            feat_list = page_to_features(page)
        except Exception as exc:
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; `from exc` keeps the root cause visible.
            raise Exception(f"Error occured on {idx}") from exc
        token_features.append([node[0] for node in feat_list])
        tag_features.append([node[1] for node in feat_list])
    return token_features, tag_features

In [218]:
def extract_tokens_from_token_features(token_features, tokenizer):
    """Turn the attribute part of every node into one '[CLS] ... [SEP]' token list.

    For each node dict, the keys are walked in insertion order; everything
    before the 'class' key is ignored, and 'class' plus every key after it is
    tokenized with ``feat_to_tokens`` and terminated by '[SEP]'. A node
    without a 'class' key yields just ``['[CLS]']``.

    Returns:
        A list (pages) of lists (nodes) of token lists.
    """
    all_pages = []
    for page in token_features:
        page_sequences = []
        for node in page:
            keys = list(node)
            first_used = keys.index('class') if 'class' in keys else len(keys)
            sequence = ['[CLS]']
            for key in keys[first_used:]:
                sequence = sequence + feat_to_tokens(node[key], tokenizer) + ['[SEP]']
            page_sequences.append(sequence)
        all_pages.append(page_sequences)
    return all_pages

In [219]:
def page_to_two_bert_embeddings(token_features, tokenizer):
    """Encode each node twice with the global ``pbert`` and concatenate the results.

    Two token sequences are built per node:

    * first:  [CLS] text-before [SEP] text-exact [SEP]
    * second: [CLS] text-exact  [SEP] text-after [SEP]

    Both collections are passed to ``pbert.page_list_to_bert_embedding_list``
    (with ``Token=True``), and the per-page results are concatenated along
    axis 1.

    Returns:
        ``(first_emb, second_emb, full_text_emb)``.
    """
    def _pair(left_tokens, right_tokens):
        return ["[CLS]"] + left_tokens + ["[SEP]"] + right_tokens + ["[SEP]"]

    before_exact_pages = []
    exact_after_pages = []
    for page in token_features:
        before_exact_nodes = []
        exact_after_nodes = []
        for node in page:
            before_toks = tokenizer.tokenize(node["text-before"])
            exact_toks = tokenizer.tokenize(node["text-exact"])
            after_toks = tokenizer.tokenize(node["text-after"])
            before_exact_nodes.append(_pair(before_toks, exact_toks))
            exact_after_nodes.append(_pair(exact_toks, after_toks))
        before_exact_pages.append(before_exact_nodes)
        exact_after_pages.append(exact_after_nodes)

    print("Start encode first seg embeddings")
    first_emb = pbert.page_list_to_bert_embedding_list(before_exact_pages, Token=True)
    print("Start encode second seg embeddings")
    second_emb = pbert.page_list_to_bert_embedding_list(exact_after_pages, Token=True)

    full_text_emb = [
        np.concatenate([first_emb[page_idx], second_emb[page_idx]], axis = 1)
        for page_idx in range(len(token_features))
    ]
    return first_emb, second_emb, full_text_emb

In [220]:
def word_to_vector(ft, word_list):
    """Embed a word, or a list of words, with the ``ft`` wrapper.

    * anything that is not a plain list -> ``ft.getWordVector(word_list)``
    * empty list -> zero vector of length ``ft.getModel().get_dimension()``
    * non-empty list -> element-wise mean of the vectors of its words
    """
    if type(word_list) is not list:
        return ft.getWordVector(word_list)
    if not word_list:
        return np.zeros(ft.getModel().get_dimension())
    word_vectors = [ft.getWordVector(word) for word in word_list]
    return np.mean(word_vectors, axis = 0)

In [221]:
def pages_to_word_vector(ft, token_features):
    """Embed the 'class', 'query' and 'parent-tag' entries of every node.

    Each of the three entries is embedded with ``word_to_vector`` and the
    three vectors are concatenated (in that order) into one vector per node.

    Returns:
        A list with one ``np.array`` of node vectors per page.
    """
    attribute_keys = ('class', 'query', 'parent-tag')
    return [
        np.array([
            np.concatenate(
                [word_to_vector(ft, node[key]) for key in attribute_keys],
                axis = 0,
            )
            for node in page
        ])
        for page in token_features
    ]

In [222]:
def pages_to_word_vector_from_keylist(ft, token_features, word_to_vec_list = None):
    """Embed a chosen subset of each node's entries and concatenate them.

    Args:
        ft: embedding wrapper forwarded to ``word_to_vector``.
        token_features: list of pages, each a list of node dicts.
        word_to_vec_list: keys to embed. When omitted, the keys of the first
            node of the first page are used (i.e. every entry is embedded).
            The default is resolved at call time, so this definition no
            longer depends on a global that is only created in a later cell.

    Returns:
        A list with one ``np.array`` of node vectors per page. Within a node,
        vectors are concatenated in the node's own key order.
    """
    if word_to_vec_list is None:
        has_first_node = len(token_features) > 0 and len(token_features[0]) > 0
        word_to_vec_list = list(token_features[0][0].keys()) if has_first_node else []
    print(f"Transform key {word_to_vec_list} to word_vector ... ")
    pages_vector = []
    for page in token_features:
        page_vectors = []
        for node in page:
            full_vector_list = [
                word_to_vector(ft, v) for k, v in node.items() if k in word_to_vec_list
            ]
            page_vectors.append(np.concatenate(full_vector_list, axis=0))
        pages_vector.append(np.array(page_vectors))
    return pages_vector

In [223]:
token_features, tag_features = get_token_tag_features_from_chunks(chunks_x)
# train_tag_feature_token_list = extract_tokens_from_token_features(token_features)

In [224]:
token_features[0][5]

{'text-before': '',
 'text-exact': 'community',
 'text-after': '',
 'class': 'page active',
 'query': [],
 'parent-tag': 'li',
 'text': ['co',
  'om',
  'mm',
  'mu',
  'un',
  'ni',
  'it',
  'ty',
  'com',
  'omm',
  'mmu',
  'mun',
  'uni',
  'nit',
  'ity',
  'comm',
  'ommu',
  'mmun',
  'muni',
  'unit',
  'nity',
  'commu',
  'ommun',
  'mmuni',
  'munit',
  'unity']}

In [155]:
token_feature_list = list(token_features[0][0].keys())

In [156]:
# Use ft to encode all token_features
ft_full_tokens_emb = pages_to_word_vector_from_keylist(ft, token_features, token_feature_list)

Transform key ['text-before', 'text-exact', 'text-after', 'class', 'query', 'parent-tag'] to word_vector ... 


In [157]:
ft_full_tokens_emb[0].shape

(303, 600)

In [38]:
ft_tokens_emb = pages_to_word_vector(ft, token_features)

In [39]:
ft_tokens_emb[0].shape

(303, 300)

In [40]:
first_emb, second_emb, full_text_emb = page_to_two_bert_embeddings(token_features, pbert.get_tokenizer())

Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=183)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=183)

In [41]:
print(f"First emb:{first_emb[0].shape}")
print(f"Second emb:{second_emb[0].shape}")
print(f"Full_text emb:{full_text_emb[0].shape}")

First emb:(303, 768)
Second emb:(303, 768)
Full_text emb:(303, 1536)


In [42]:
train_tag_info_list = tag_features #features which only have tag true/false information

## Feature List
    * train_tag_feature_token_list => Tag Attributes tokens
    * train_tag_info_list => Tag information
    * train_text_emb => Only Text node => Bert Text embedding
    * train_tag_emb => Text-before Text Text-after [SEP] Other Attributes => Bert Text embedding

In [190]:
# chunks_text_emb = train_text_emb # text
# chunks_text_emb = train_tag_emb # tag
chunks_text_emb = full_text_emb # full text embedding (two bert)
# chunks_text_emb = second_emb

chunks_tag_infos = train_tag_info_list

## Padding to fixed size

In [44]:
def prepare_input_tokens(page_tokens, tokenizer, max_len):
    """Convert the 'class', 'query' and 'parent-tag' entries of every node to id lists.

    Each entry is tokenized with ``tokenizer.tokenize``, mapped to ids with
    ``tokenizer.convert_tokens_to_ids``, right-padded with 0 and cut so that
    every id list has exactly ``max_len`` items. 'query' is a list of strings
    and is joined with spaces before tokenizing.

    Args:
        page_tokens: list of pages, each a list of node dicts.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: fixed length of every returned id list.

    Returns:
        ``(pages_class, pages_query, pages_parent_tag)``, each a list (pages)
        of lists (nodes) of id lists.
    """
    def _encode(text):
        ids = list(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)))
        ids = ids + [0] * (max_len - len(ids))
        # Truncate to max_len (previously a hard-coded 256, which produced
        # wrong-length rows whenever max_len != 256).
        return ids[:max_len]

    pages_class = []
    pages_query = []
    pages_parent_tag = []
    for page in page_tokens:
        class_page = []
        query_page = []
        parent_tag_page = []
        for node in page:
            class_page.append(_encode(node['class']))
            query_page.append(_encode(' '.join(node['query'])))
            parent_tag_page.append(_encode(node['parent-tag']))
        pages_class.append(class_page)
        pages_query.append(query_page)
        pages_parent_tag.append(parent_tag_page)
    return pages_class, pages_query, pages_parent_tag

In [45]:
def feature_pad_to_npdata(embedding):
    """Pad every page of float features to ``max_page_seq`` rows and stack them.

    Pages are streamed through a ``tf.data`` dataset (as float32) and padded
    with -1 to shape ``(max_page_seq, feature_dim)``, where ``feature_dim`` is
    the length of the first node of the first page.
    NOTE(review): presumably every page already has at most ``max_page_seq``
    rows (callers chunk pages first) - confirm, padding cannot shrink a page.

    Returns:
        One ``np.array`` stacking all padded pages.
    """
    pages = Dataset.from_generator(lambda: iter(embedding), tf.float32)
    feature_dim = len(embedding[0][0])
    padded_pages = pages.padded_batch(
        1,
        padded_shapes=(max_page_seq, feature_dim),
        padding_values=-1.,
        drop_remainder=False,
    )
    # Batch size is 1, so element 0 of every batch is the padded page itself.
    return np.array([batch[0] for batch in padded_pages.as_numpy_iterator()])

In [46]:
def token_pad_to_npdata(embedding):
    """Pad every page of integer ids to ``max_page_seq`` rows and stack them.

    Same pipeline as the float variant, but elements are read as int32 and
    padded with 0 to shape ``(max_page_seq, row_len)``, where ``row_len`` is
    the length of the first node of the first page.

    Returns:
        One ``np.array`` stacking all padded pages.
    """
    pages = Dataset.from_generator(lambda: iter(embedding), tf.int32)
    row_len = len(embedding[0][0])
    padded_pages = pages.padded_batch(
        1,
        padded_shapes=(max_page_seq, row_len),
        padding_values=0,
        drop_remainder=False,
    )
    # Batch size is 1, so element 0 of every batch is the padded page itself.
    return np.array([batch[0] for batch in padded_pages.as_numpy_iterator()])

In [47]:
max_len = 256

In [193]:
# train_tag_token = feature_pad_to_npdata(chunks_tag_tokens)
train_text_emb_x = feature_pad_to_npdata(chunks_text_emb)

In [194]:
train_tag_x = feature_pad_to_npdata(chunks_tag_infos)

In [195]:
train_attr_x = feature_pad_to_npdata(ft_tokens_emb)

In [51]:
train_info_x = np.concatenate([train_text_emb_x, train_tag_x], axis = 2)

In [52]:
# Label vocabulary; the list position of each label is its integer id.
labels = ["O", "PREV", "PAGE", "NEXT", "[PAD]"]
idx2tag = dict(enumerate(labels))
tag2idx = {name: position for position, name in idx2tag.items()}
num_tags = len(labels)

In [53]:
train_y = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [55]:
print("Current Shape:")
print(f"train_text_emb_x: {train_text_emb_x.shape}")
print(f"train_tag_x: {train_tag_x.shape}")
# print(f"train_info_x: {train_info_x.shape}")
print(f"train_ft_x: {train_attr_x.shape}")
print(f"train_y: {train_y.shape}")

Current Shape:
train_text_emb_x: (183, 512, 1536)
train_tag_x: (183, 512, 8)
train_ft_x: (183, 512, 300)
train_y: (183, 512)


In [37]:
# train_x = train_text_emb_x
train_x = train_info_x
# train_x = train_tag_x

In [38]:
train_x.shape

(356, 512, 1544)

In [196]:
train_composite_input = [train_text_emb_x, train_attr_x, train_tag_x]

In [197]:
for inputs in train_composite_input:
    print(inputs.shape)

(183, 512, 1536)
(183, 512, 300)
(183, 512, 8)


## Input: Bert text + Ft tag + tag information

In [198]:
# train_tag_token = feature_pad_to_npdata(chunks_tag_tokens)
train_text_emb_x = feature_pad_to_npdata(chunks_text_emb)

In [199]:
train_tag_x = feature_pad_to_npdata(chunks_tag_infos)

In [200]:
train_attr_x = feature_pad_to_npdata(ft_tokens_emb)

In [201]:
train_composite_input = [train_text_emb_x, train_attr_x, train_tag_x]

In [202]:
for inputs in train_composite_input:
    print(inputs.shape)

(183, 512, 1536)
(183, 512, 300)
(183, 512, 8)


## Input: FastText All + Tag information

In [158]:
train_text_emb_x = feature_pad_to_npdata(ft_full_tokens_emb)

In [159]:
train_tag_x = feature_pad_to_npdata(train_tag_info_list)

In [170]:
train_composite_input = [train_text_emb_x, train_tag_x] #Ft, taginfo

In [171]:
for inputs in train_composite_input:
    print(inputs.shape)

(183, 512, 600)
(183, 512, 8)


## Build BERT-BiLSTM-CRF Model

In [58]:
from tensorflow_addons.layers.crf import CRF

In [59]:
from tensorflow.keras.layers import (Dense, Input, Bidirectional, LSTM, Embedding, Masking, Concatenate,
                                    AveragePooling2D, GlobalAveragePooling2D, Reshape)

In [125]:
class BestWeightCallback(keras.callbacks.Callback):
    """Remember the weights of the epoch with the lowest ``val_loss``.

    When training ends, the best epoch and loss are printed and the recorded
    weights are loaded back into the model. Epochs whose logs carry no
    ``val_loss`` (e.g. training without validation data) are ignored, and in
    that case the model keeps its final weights.
    """

    def __init__(self):
        # Let the Keras base class initialize its own bookkeeping attributes.
        super().__init__()
        self.best_weights = None
        self.best = np.inf
        self.best_epoch = np.inf

    def on_train_begin(self, logs=None):
        # Reset everything, including the stored weights: the same instance may
        # be passed to fit() of a different model, whose weights are not
        # interchangeable with the previous one's.
        self.best_weights = None
        self.best = np.inf  # np.Inf alias was removed in NumPy 2.0
        self.best_epoch = np.inf

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get("val_loss")
        if current is None:
            # Nothing to compare against for this epoch.
            return
        if np.less(current, self.best):
            self.best_weights = self.model.get_weights()
            self.best = current
            # Keras epochs are 0-based; report them 1-based like the progress log.
            self.best_epoch = epoch + 1

    def on_train_end(self, logs=None):
        print(f"Training Finish, Best epoch: {self.best_epoch}, Best Val_loss: {self.best}")
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

In [60]:
TIME_STAMPS = max_page_seq
HIDDEN_UNITS = 200
DROPOUT_RATE = 0.1
# NUM_CLASS = 5
NUM_CLASS = num_tags
print(f"TIME_STAMP: {TIME_STAMPS}")
print(f"HIDDEN_UNITS: {HIDDEN_UNITS}")
print(f"DROPOUT_RATE: {DROPOUT_RATE}")
print(f"NUM_CLASS: {NUM_CLASS}")

TIME_STAMP: 512
HIDDEN_UNITS: 200
DROPOUT_RATE: 0.1
NUM_CLASS: 5


In [126]:
bwCallback = BestWeightCallback()

In [78]:
earlyStopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [203]:
#For ft_bert_bilstm
bert_shape = train_text_emb_x[0].shape
tag_info_shape = train_tag_x[0].shape
ft_emb_shape = train_attr_x[0].shape
# tag_emb_shape = train_tag_feature[0].shape
def get_ft_bert_bilstm_model_simple(use_crf = True):
    """Build and compile the single-branch FFN -> BiLSTM tagger.

    The three inputs (BERT embeddings, FastText embeddings, tag information)
    are concatenated and fed through two shared Dense layers and a
    bidirectional LSTM. Input shapes are read from the module-level
    ``bert_shape``, ``tag_info_shape`` and ``ft_emb_shape``.

    Args:
        use_crf: when True the output layer is a CRF trained with its own
            loss; otherwise the concatenated input is masked at -1 and the
            output is a softmax Dense layer trained with sparse categorical
            cross-entropy.

    Returns:
        The compiled Keras model, taking ``[bert, fasttext, tag_info]`` inputs.
    """
    bert_in = Input(shape=bert_shape, name="input_bert_embeddings")
    tag_info_in = Input(shape=tag_info_shape, name="input_tag_information")
    ft_in = Input(shape=ft_emb_shape, name="input_ft_embeddings")

    features = Concatenate()([bert_in, ft_in, tag_info_in])
    if not use_crf:
        features = Masking(mask_value=-1.)(features)
    features = Dense(units = 768, activation = 'relu')(features)
    features = Dense(units = 324, activation = 'relu', name="tags_FFN_out")(features)

    recurrent = Bidirectional(LSTM(units=HIDDEN_UNITS//2, return_sequences=True))(features)
    if use_crf:
        crf = CRF(NUM_CLASS, name='crf_layer')
        outputs = crf(recurrent)
        loss = {'crf_layer': crf.get_loss}
    else:
        outputs = Dense(units = NUM_CLASS, activation='softmax')(recurrent)
        loss = tf.keras.losses.SparseCategoricalCrossentropy()

    network = Model([bert_in, ft_in, tag_info_in], outputs)
    network.compile('adam', loss=loss)
    return network
def get_ft_bert_bilstm_model(use_crf = True):
    """Build and compile the two-branch FFN -> BiLSTM tagger.

    FastText embeddings go through a 150 -> 75 Dense branch, BERT embeddings
    through a 768 -> 324 -> 162 Dense branch; both outputs are concatenated
    with the raw tag information and fed to a bidirectional LSTM. Input
    shapes are read from the module-level ``bert_shape``, ``tag_info_shape``
    and ``ft_emb_shape``.

    Args:
        use_crf: when True the output layer is a CRF trained with its own
            loss; otherwise a softmax Dense layer trained with sparse
            categorical cross-entropy.

    Returns:
        The compiled Keras model, taking ``[bert, fasttext, tag_info]`` inputs.
    """
    bert_in = Input(shape=bert_shape, name="input_bert_embeddings")
    tag_info_in = Input(shape=tag_info_shape, name="input_tag_information")
    ft_in = Input(shape=ft_emb_shape, name="input_ft_embeddings")

    # FastText branch (created first so auto-generated layer names keep their order).
    ft_branch = Dense(units = 150, activation = 'relu')(ft_in)
    ft_branch = Dense(units = 75, activation = 'relu', name="tags_FFN_out")(ft_branch)

    # BERT branch.
    bert_branch = Dense(units = 768, activation = 'relu')(bert_in)
    bert_branch = Dense(units = 324, activation = 'relu')(bert_branch)
    bert_branch = Dense(units = 162, activation = 'relu', name="bert_FFN_out")(bert_branch)

    merged = Concatenate()([bert_branch, ft_branch, tag_info_in])
    recurrent = Bidirectional(LSTM(units=HIDDEN_UNITS//2, return_sequences=True))(merged)
    if use_crf:
        crf = CRF(NUM_CLASS, name='crf_layer')
        outputs = crf(recurrent)
        loss = {'crf_layer': crf.get_loss}
    else:
        outputs = Dense(units = NUM_CLASS, activation='softmax')(recurrent)
        loss = tf.keras.losses.SparseCategoricalCrossentropy()

    network = Model([bert_in, ft_in, tag_info_in], outputs)
    network.compile('adam', loss=loss)
    return network

#### Get ft-bert models and train

In [212]:
model = get_ft_bert_bilstm_model(use_crf=False)

In [140]:
model = get_ft_bert_bilstm_model_simple(use_crf=False)

In [213]:
model.summary()

Model: "model_22"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_bert_embeddings (InputLay [(None, 512, 1536)]  0                                            
__________________________________________________________________________________________________
dense_59 (Dense)                (None, 512, 768)     1180416     input_bert_embeddings[0][0]      
__________________________________________________________________________________________________
input_ft_embeddings (InputLayer [(None, 512, 300)]   0                                            
__________________________________________________________________________________________________
dense_60 (Dense)                (None, 512, 324)     249156      dense_59[0][0]                   
___________________________________________________________________________________________

In [214]:
model.fit(train_composite_input, train_y, batch_size=64, epochs=100, validation_split=0.1, verbose=1, callbacks=[bwCallback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Training Finish, Best epoch: 50, Best Val_loss: 0.04418157413601875


<tensorflow.python.keras.callbacks.History at 0x7f1a3e8b17b8>

#### Get full ft model and train

In [166]:
TIME_STAMPS = max_page_seq
HIDDEN_UNITS = 100
DROPOUT_RATE = 0.1
# NUM_CLASS = 5
NUM_CLASS = num_tags
print(f"TIME_STAMP: {TIME_STAMPS}")
print(f"HIDDEN_UNITS: {HIDDEN_UNITS}")
print(f"DROPOUT_RATE: {DROPOUT_RATE}")
print(f"NUM_CLASS: {NUM_CLASS}")
tag_info_shape = train_tag_x[0].shape
ft_emb_shape = train_text_emb_x[0].shape
def get_ft_bilstm_model(use_crf = True):
    """Build and compile the FastText-only FFN -> BiLSTM tagger.

    FastText embeddings go through a 300 -> 150 Dense branch, are concatenated
    with the raw tag information and fed to a bidirectional LSTM. Input shapes
    are read from the module-level ``tag_info_shape`` and ``ft_emb_shape``.

    Args:
        use_crf: when True the output layer is a CRF trained with its own
            loss; otherwise a softmax Dense layer trained with sparse
            categorical cross-entropy.

    Returns:
        The compiled Keras model, taking ``[fasttext, tag_info]`` inputs.
    """
    tag_info_in = Input(shape=tag_info_shape, name="input_tag_information")
    ft_in = Input(shape=ft_emb_shape, name="input_ft_embeddings")

    ft_branch = Dense(units = 300, activation = 'relu')(ft_in)
    ft_branch = Dense(units = 150, activation = 'relu', name="tags_FFN_out")(ft_branch)

    merged = Concatenate()([tag_info_in, ft_branch])
    recurrent = Bidirectional(LSTM(units=HIDDEN_UNITS//2, return_sequences=True))(merged)
    if use_crf:
        crf = CRF(NUM_CLASS, name='crf_layer')
        outputs = crf(recurrent)
        loss = {'crf_layer': crf.get_loss}
    else:
        outputs = Dense(units = NUM_CLASS, activation='softmax')(recurrent)
        loss = tf.keras.losses.SparseCategoricalCrossentropy()

    network = Model([ft_in, tag_info_in], outputs)
    network.compile('adam', loss=loss)
    return network

TIME_STAMP: 512
HIDDEN_UNITS: 100
DROPOUT_RATE: 0.1
NUM_CLASS: 5


In [186]:
model = get_ft_bilstm_model(use_crf = False)

In [187]:
model.summary()

Model: "model_19"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ft_embeddings (InputLayer [(None, 512, 600)]   0                                            
__________________________________________________________________________________________________
dense_50 (Dense)                (None, 512, 300)     180300      input_ft_embeddings[0][0]        
__________________________________________________________________________________________________
input_tag_information (InputLay [(None, 512, 8)]     0                                            
__________________________________________________________________________________________________
tags_FFN_out (Dense)            (None, 512, 150)     45150       dense_50[0][0]                   
___________________________________________________________________________________________

In [188]:
model.fit(train_composite_input, train_y, batch_size=64, epochs=50, validation_split=0.1, verbose=1, callbacks=[bwCallback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training Finish, Best epoch: 14, Best Val_loss: 0.049879852682352066


<tensorflow.python.keras.callbacks.History at 0x7f1a669e4550>

#### Get bert and train

In [239]:
TEST_MODELS = [FFN_CRF, CRF_model, Softmax_model]

In [149]:
TEST_MODELS = [FFN_CRF, CRF_model]

In [159]:
TEST_MODELS = [Softmax_model]

## Test on val set

In [32]:
def get_test_data(type=None, scaled_page='normal'):
    """Load test pages from the global ``storage``.

    Args:
        type: 'NORMAL' or 'EVENT_SOURCE' loads only that test file; any other
            non-None value loads both (NORMAL first) and concatenates them.
            With None a hint is printed and None is returned.
        scaled_page: forwarded to ``storage.get_test_Xy``.

    Returns:
        ``(X, y, page_positions)`` for the selected test file(s), or None.

    Side effects: sets ``storage.test_file`` and prints page/domain counts
    for every file that is loaded.
    """
    if type is None:
        print("Please assign type of test_data")
        return

    if type == 'NORMAL':
        wanted_files = ('NORMAL',)
    elif type == 'EVENT_SOURCE':
        wanted_files = ('EVENT_SOURCE',)
    else:
        wanted_files = ('NORMAL', 'EVENT_SOURCE')

    loaded = []
    for file_name in wanted_files:
        storage.test_file = file_name
        test_urls = [rec['Page URL'] for rec in storage.iter_test_records()]
        pages_x, pages_y, pages_pos = storage.get_test_Xy(validate=False, scaled_page=scaled_page, contain_position = True)
        print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
        loaded.append((pages_x, pages_y, pages_pos))

    if len(loaded) == 1:
        return loaded[0]
    (x_one, y_one, pos_one), (x_two, y_two, pos_two) = loaded
    return x_one + x_two, y_one + y_two, pos_one + pos_two

In [31]:
def label_distribution_to_label(predict_y):
    """Collapse per-class scores into the index of the highest score.

    A 3-dimensional array is reduced over its last axis: every innermost row
    is replaced by the position of its first maximum. Input with any other
    number of dimensions is returned unchanged.

    Returns:
        A list (pages) of lists of int indices, or ``predict_y`` itself.
    """
    if len(predict_y.shape) != 3:
        return predict_y
    return [
        [
            max(range(len(scores)), key=scores.__getitem__)
            for scores in (row.tolist() for row in page)
        ]
        for page in predict_y
    ]

In [30]:
def recursive_predict_and_evaluate(models, x_test, y_test, evaluate_labels):
    """Predict with each model in turn and print a classification report.

    Args:
        models: Iterable of models exposing ``predict``.
        x_test: Input passed unchanged to every ``model.predict`` call.
        y_test: Reference tag sequences handed to the report.
        evaluate_labels: Labels to include in the report.

    Each model's predictions are converted to label indices, mapped to tag
    names through ``idx2tag`` and scored with ``flat_classification_report``.
    Nothing is returned; the results are printed.
    """
    separator = "--------------------------"
    for model_no, current_model in enumerate(models):
        print(f"Start predict model {model_no}")
        print(separator)
        label_ids = label_distribution_to_label(current_model.predict(x_test))
        predicted_tags = np.asarray(
            [[idx2tag.get(label_id) for label_id in page_ids] for page_ids in label_ids]
        )
        report = flat_classification_report(
            y_test, predicted_tags, labels=evaluate_labels, digits=3)
        print(report)
        print(separator)

In [49]:
# Load the 'NORMAL' test file; the commented-out call selects 'EVENT_SOURCE' instead.
# test_X_raw, test_y, test_page_positions = get_test_data('EVENT_SOURCE')
test_X_raw, test_y, test_page_positions = get_test_data('NORMAL')

Contain position: True
pages: 100  domains: 58


In [28]:
def get_chunks_data_wo_position(x, y):
    """Split every page's inputs and labels into chunks, without positions.

    Args:
        x: Per-page input sequences.
        y: Per-page label sequences, paired with ``x`` by index.

    Returns:
        ``(chunked_x, chunked_y)``: flat lists holding, page after page, the
        pieces produced by ``chunks(..., max_page_seq)``.
    """
    chunked_x, chunked_y = [], []
    for page_x, page_y in zip(x, y):
        chunked_x += chunks(page_x, max_page_seq)
        chunked_y += chunks(page_y, max_page_seq)
    return chunked_x, chunked_y

In [210]:
def prepare_for_testing(test_X_raw, test_y_raw): #ft-bert
    """Build model inputs and padded tag targets for the ft + BERT setup.

    Args:
        test_X_raw: Raw per-page test features.
        test_y_raw: Raw per-page test labels.

    Returns:
        ``(inputs, y_test)`` where ``inputs`` is the list
        ``[text embedding, ft embedding, tag features]`` (each passed through
        ``feature_pad_to_npdata``) and ``y_test`` is an array of tag names
        padded/truncated to ``max_page_seq`` with ``"[PAD]"``.
    """
    chunk_x, chunk_y = get_chunks_data_wo_position(test_X_raw, test_y_raw)
    token_feats, tag_feats = get_token_tag_features_from_chunks(chunk_x)

    # Only the third value returned by page_to_two_bert_embeddings is used.
    _, _, full_text_emb = page_to_two_bert_embeddings(token_feats, pbert.get_tokenizer())
    ft_emb = pages_to_word_vector(ft, token_feats)

    ## X_test_input
    model_inputs = [
        feature_pad_to_npdata(full_text_emb),
        feature_pad_to_npdata(ft_emb),
        feature_pad_to_npdata(tag_feats),
    ]

    ## y_test_input
    # Encode tags as ids, pad to a fixed length, then decode back to tag
    # names so padded positions carry the "[PAD]" tag.
    label_ids = [[tag2idx.get(tag) for tag in page_tags] for page_tags in chunk_y]
    padded_ids = pad_sequences(
        label_ids,
        maxlen=max_page_seq,
        padding="post",
        truncating="post",
        value=tag2idx["[PAD]"],
    )
    y_test = np.asarray([[idx2tag.get(tag_id) for tag_id in row] for row in padded_ids])

    return model_inputs, y_test

In [50]:
# Page URLs of the records storage currently yields as test records, and
# the distinct domains they belong to.
test_urls = [record['Page URL'] for record in storage.iter_test_records()]
test_groups = {get_domain(page_url) for page_url in test_urls}

In [51]:
# Chunk the loaded test pages together with their labels and page positions
# (get_chunks_data is defined outside this section).
chunks_test_x, chunks_test_y, chunks_test_positions = get_chunks_data(test_X_raw, test_y, test_page_positions)

In [173]:
# Separate the chunked test inputs into token features and tag features.
test_token_features, test_tag_features = get_token_tag_features_from_chunks(chunks_test_x)

In [174]:
# Alias only: test_tag_info_list refers to the same object as test_tag_features.
test_tag_info_list = test_tag_features

In [58]:
# Pad the text embeddings into an array; the commented-out lines are
# alternative embedding sources.
# NOTE(review): test_full_text_emb is not assigned in the cells visible in
# this section - confirm it is produced earlier in the notebook.
test_text_emb_x = feature_pad_to_npdata(test_full_text_emb) # full text emb / two-bert emb
# test_text_emb_x = feature_pad_to_npdata(test_second_emb)
# test_text_emb_x = feature_pad_to_npdata(test_text_emb) # text emb

In [59]:
# Pad the tag features into an array.
test_tag_x = feature_pad_to_npdata(test_tag_info_list)

In [60]:
# Join text embeddings and tag features along axis 2 into a single input array.
test_info_x = np.concatenate([test_text_emb_x, test_tag_x], axis = 2)

### Input for full ft_emb

In [184]:
def prepare_for_testing_full_ft(test_X_raw, test_y_raw): #ft-full
    """Build model inputs and padded tag targets for the full-ft setup.

    Args:
        test_X_raw: Raw per-page test features.
        test_y_raw: Raw per-page test labels.

    Returns:
        ``(inputs, y_test)`` where ``inputs`` is the list
        ``[ft embedding, tag features]`` (each passed through
        ``feature_pad_to_npdata``) and ``y_test`` is an array of tag names
        padded/truncated to ``max_page_seq`` with ``"[PAD]"``.
    """
    chunk_x, chunk_y = get_chunks_data_wo_position(test_X_raw, test_y_raw)
    token_feats, tag_feats = get_token_tag_features_from_chunks(chunk_x)

    ft_full_emb = pages_to_word_vector_from_keylist(ft, token_feats)

    ## X_test_input
    model_inputs = [
        feature_pad_to_npdata(ft_full_emb),
        feature_pad_to_npdata(tag_feats),
    ]

    ## y_test_input
    # Encode tags as ids, pad to a fixed length, then decode back to tag
    # names so padded positions carry the "[PAD]" tag.
    label_ids = [[tag2idx.get(tag) for tag in page_tags] for page_tags in chunk_y]
    padded_ids = pad_sequences(
        label_ids,
        maxlen=max_page_seq,
        padding="post",
        truncating="post",
        value=tag2idx["[PAD]"],
    )
    y_test = np.asarray([[idx2tag.get(tag_id) for tag_id in row] for row in padded_ids])

    return model_inputs, y_test

In [176]:
# Turn the test token features into word vectors using `ft`.
ft_full_emb = pages_to_word_vector_from_keylist(ft, test_token_features)

Transform key ['text-before', 'text-exact', 'text-after', 'class', 'query', 'parent-tag'] to word_vector ... 


In [178]:
# Pad the full ft embeddings into an array.
test_full_text_x = feature_pad_to_npdata(ft_full_emb)

In [177]:
# Pad the tag features into an array (same statement as the earlier test_tag_x cell).
test_tag_x = feature_pad_to_npdata(test_tag_info_list)

In [179]:
# Model inputs for the full-ft setup: [ft embeddings, tag features].
test_composite_input = [test_full_text_x, test_tag_x]

In [183]:
# Sanity check: print the shape of each input array.
for inputs in test_composite_input:
    print(inputs.shape)

(68, 512, 600)
(68, 512, 8)


## Ready for testing

In [62]:
# Inputs for the custom-embedding model.
# NOTE(review): test_class, test_query and test_parent_tag are not assigned
# in the cells visible in this section - confirm they are built earlier.
test_composite_with_token = [test_text_emb_x, test_tag_x, test_class, test_query, test_parent_tag]

In [65]:
# Encode the chunked tags as ids, pad/truncate them to max_page_seq with the
# "[PAD]" id, then decode back to tag names so padding shows up as "[PAD]".
y_test = np.asarray([
    [idx2tag.get(lab) for lab in page]
    for page in pad_sequences(
        [[tag2idx.get(l) for l in lab] for lab in chunks_test_y],
        maxlen=max_page_seq,
        padding="post",
        truncating="post",
        value=tag2idx["[PAD]"],
    )
])

In [67]:
# Labels included in the classification reports; the commented-out line is
# the wider label list.
# evaluate_labels = ['PREV', 'PAGE', 'NEXT', '[PAD]', 'O']
evaluate_labels = ['PAGE', 'NEXT']

In [70]:
def test_custom_model(iter_train_x, iter_train_y, iter_test_x, iter_test_y, iters = 10):
    """Train a fresh custom-embedding model ``iters`` times and report each run.

    Args:
        iter_train_x: Training inputs passed to ``model.fit``.
        iter_train_y: Training targets passed to ``model.fit``.
        iter_test_x: Test inputs passed to ``model.predict``.
        iter_test_y: Reference tags for the classification report.
        iters: Number of independent train/evaluate rounds.

    Each round builds a new model with ``get_custom_emb_model(use_crf=False)``,
    fits it with the shared ``earlyStopping`` callback and prints a PAGE/NEXT
    report. Nothing is returned.
    """
    import gc  # local import: only needed for the per-iteration cleanup below

    for it in range(iters):
        print(f"iteration: {it}")
        model = get_custom_emb_model(use_crf=False)
        model.fit(iter_train_x, iter_train_y, batch_size=8, epochs=1000, validation_split=0.1, verbose=False, callbacks=[earlyStopping])
        recursive_predict_and_evaluate([model], iter_test_x, iter_test_y, ['PAGE','NEXT'])
        # Release this round's model before building the next one. Without
        # this, every round leaves its model and Keras global state behind,
        # which is the likely cause of the recorded
        # "InternalError: Dst tensor is not initialized" after a few rounds.
        # NOTE(review): clear_session() resets Keras' global state - confirm
        # that is acceptable for other models kept alive in the notebook.
        del model
        keras.backend.clear_session()
        gc.collect()
# Run the repeated train/evaluate experiment with the default number of rounds.
test_custom_model(train_composite_with_token, train_y, test_composite_with_token, y_test)

iteration: 0
Start predict model 0
--------------------------




              precision    recall  f1-score   support

        PAGE      0.665     0.860     0.750       279
        NEXT      0.786     0.449     0.571        49

   micro avg      0.674     0.799     0.731       328
   macro avg      0.725     0.655     0.661       328
weighted avg      0.683     0.799     0.723       328

--------------------------
iteration: 1
Start predict model 0
--------------------------


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        PAGE      0.624     0.839     0.716       279
        NEXT      0.000     0.000     0.000        49

   micro avg      0.624     0.713     0.666       328
   macro avg      0.312     0.419     0.358       328
weighted avg      0.531     0.713     0.609       328

--------------------------
iteration: 2
Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.682     0.878     0.768       279
        NEXT      0.000     0.000     0.000        49

   micro avg      0.682     0.747     0.713       328
   macro avg      0.341     0.439     0.384       328
weighted avg      0.580     0.747     0.653       328

--------------------------
iteration: 3
Start predict model 0
--------------------------


InternalError:  Dst tensor is not initialized.
	 [[{{node model_6/embedding_14/embedding_lookup/_30}}]] [Op:__inference_predict_function_84639]

Function call stack:
predict_function


In [181]:
# Languages to evaluate, as reported by storage; the commented-out line
# restricts the run to English.
test_languages = storage.get_all_test_languages()
# test_languages = ["en"]

In [215]:
# Evaluate `model` on every test language using the ft + BERT inputs built by
# prepare_for_testing.
# NOTE: the loop rebinds the notebook-level test_urls, test_X_raw and test_y,
# so after this cell they hold the data of the last language only.
TEST_MODELS = [model]
test_languages = storage.get_all_test_languages()
for language in test_languages:
    print("Testing language: ", language)
    test_urls = [rec['Page URL'] for rec in storage.iter_test_records_by_language(language=language)]
    test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
    # Report how many pages and distinct domains this language contributes.
    print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
    _test_x, _test_y = prepare_for_testing(test_X_raw, test_y)
    recursive_predict_and_evaluate(TEST_MODELS, _test_x, _test_y, ['PAGE','NEXT'])
    print("===================================")

Testing language:  en
pages: 49  domains: 34
Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=68)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=68)

Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.801     0.865     0.832       126
        NEXT      0.000     0.000     0.000        29

   micro avg      0.801     0.703     0.749       155
   macro avg      0.401     0.433     0.416       155
weighted avg      0.652     0.703     0.676       155

--------------------------
Testing language:  zh
pages: 44  domains: 19
Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=48)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=48)

Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.714     0.650     0.681       277
        NEXT      0.000     0.000     0.000        24

   micro avg      0.714     0.598     0.651       301
   macro avg      0.357     0.325     0.340       301
weighted avg      0.657     0.598     0.626       301

--------------------------
Testing language:  ko
pages: 24  domains: 13
Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=36)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=36)

Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.644     0.933     0.762        60
        NEXT      0.000     0.000     0.000         5

   micro avg      0.644     0.862     0.737        65
   macro avg      0.322     0.467     0.381        65
weighted avg      0.594     0.862     0.703        65

--------------------------
Testing language:  ja
pages: 23  domains: 9
Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=23)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=23)

Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.644     0.592     0.617        49
        NEXT      0.000     0.000     0.000        11

   micro avg      0.644     0.483     0.552        60
   macro avg      0.322     0.296     0.309        60
weighted avg      0.526     0.483     0.504        60

--------------------------
Testing language:  de
pages: 20  domains: 7
Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=25)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=25)

Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.892     0.770     0.826       139
        NEXT      0.000     0.000     0.000        22

   micro avg      0.892     0.665     0.762       161
   macro avg      0.446     0.385     0.413       161
weighted avg      0.770     0.665     0.713       161

--------------------------
Testing language:  ru
pages: 21  domains: 14
Start encode first seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=20)

Start encode second seg embeddings
Use custom Token: True


IntProgress(value=0, description='(Init)', max=20)

Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.692     0.730     0.711        37
        NEXT      0.000     0.000     0.000         7

   micro avg      0.692     0.614     0.651        44
   macro avg      0.346     0.365     0.355        44
weighted avg      0.582     0.614     0.597        44

--------------------------


In [189]:
# Evaluate `model` on every test language using the full-ft inputs built by
# prepare_for_testing_full_ft.
# NOTE: the loop rebinds the notebook-level test_urls, test_X_raw and test_y,
# so after this cell they hold the data of the last language only.
test_languages = storage.get_all_test_languages()
# Test for full ft emb
TEST_MODELS = [model]
for language in test_languages:
    print("Testing language: ", language)
    test_urls = [rec['Page URL'] for rec in storage.iter_test_records_by_language(language=language)]
    test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
    # Report how many pages and distinct domains this language contributes.
    print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
    _test_x, _test_y = prepare_for_testing_full_ft(test_X_raw, test_y)
    recursive_predict_and_evaluate(TEST_MODELS, _test_x, _test_y, ['PAGE','NEXT'])
    print("===================================")

Testing language:  en
pages: 49  domains: 34
Transform key ['text-before', 'text-exact', 'text-after', 'class', 'query', 'parent-tag'] to word_vector ... 
Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.843     0.556     0.670       126
        NEXT      0.000     0.000     0.000        29

   micro avg      0.843     0.452     0.588       155
   macro avg      0.422     0.278     0.335       155
weighted avg      0.686     0.452     0.545       155

--------------------------
Testing language:  zh
pages: 44  domains: 19
Transform key ['text-before', 'text-exact', 'text-after', 'class', 'query', 'parent-tag'] to word_vector ... 
Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.783     0.534     0.635       277
        NEXT      0.000     0.000     0.000        24

   micro avg      0.783     0.492     0.604       301
   macro avg      0.