In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.model import _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS
from autopager.parserutils import (TagParser, MyHTMLParser, draw_scaled_page, position_check, compare_tag, get_first_tag)
parser = MyHTMLParser()
tagParser = TagParser()

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)

In [3]:
storage = Storage()

Current test file:  ['en', 'zh', 'ko', 'ja', 'de', 'ru']


In [4]:
# Discover the physical GPUs TensorFlow can see and configure the first one.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("No GPUs visible")
else:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        # Turn on memory growth for every physical GPU that was found.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPU


In [5]:
%%time
# Both storage queries below select the same records, so the filter is spelled once.
_train_query = dict(language='en', contain_button=True, file_type='T')
urls = [record['Page URL'] for record in storage.iter_records(**_train_query)]
X_raw, y, page_positions = storage.get_Xy(contain_position=True, scaled_page='normal', **_train_query)
# Report how many pages were loaded and how many distinct domains they come from.
_train_domains = {get_domain(url) for url in urls}
print("pages: {}  domains: {}".format(len(urls), len(_train_domains)))

Contain position: True
Finish: Get Page 1 (Encoding: UTF-8)records ... (len: 303)
Finish: Get Page 2 (Encoding: UTF-8)records ... (len: 243)
Finish: Get Page 3 (Encoding: UTF-8)records ... (len: 119)
Finish: Get Page 4 (Encoding: UTF-8)records ... (len: 944)
Finish: Get Page 5 (Encoding: UTF-8)records ... (len: 93)
Finish: Get Page 6 (Encoding: UTF-8)records ... (len: 994)
Finish: Get Page 7 (Encoding: UTF-8)records ... (len: 1014)
Finish: Get Page 8 (Encoding: UTF-8)records ... (len: 7)
Finish: Get Page 21 (Encoding: UTF-8)records ... (len: 158)
Finish: Get Page 22 (Encoding: UTF-8)records ... (len: 171)
Finish: Get Page 23 (Encoding: UTF-8)records ... (len: 181)
Finish: Get Page 24 (Encoding: UTF-8)records ... (len: 10)
Finish: Get Page 25 (Encoding: UTF-8)records ... (len: 165)
Finish: Get Page 26 (Encoding: UTF-8)records ... (len: 147)
Finish: Get Page 28 (Encoding: UTF-8)records ... (len: 268)
Finish: Get Page 33 (Encoding: UTF-8)records ... (len: 108)
Finish: Get Page 34 (Encodin

Finish: Get Page 284 (Encoding: cp1252)records ... (len: 130)
Finish: Get Page 287 (Encoding: UTF-8)records ... (len: 82)
Finish: Get Page 288 (Encoding: UTF-8)records ... (len: 140)
Finish: Get Page 289 (Encoding: UTF-8)records ... (len: 44)
Finish: Get Page 293 (Encoding: UTF-8)records ... (len: 74)
Finish: Get Page 294 (Encoding: UTF-8)records ... (len: 63)
Finish: Get Page 295 (Encoding: UTF-8)records ... (len: 65)
Finish: Get Page 296 (Encoding: UTF-8)records ... (len: 20)
Finish: Get Page 299 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 300 (Encoding: UTF-8)records ... (len: 361)
Finish: Get Page 301 (Encoding: UTF-8)records ... (len: 364)
Finish: Get Page 302 (Encoding: UTF-8)records ... (len: 170)
Finish: Get Page 303 (Encoding: UTF-8)records ... (len: 154)
Finish: Get Page 304 (Encoding: cp1252)records ... (len: 117)
Finish: Get Page 305 (Encoding: UTF-8)records ... (len: 1987)
Finish: Get Page 312 (Encoding: cp1252)records ... (len: 136)
Finish: Get Page 313 (Enco

In [6]:
max_page_seq = 512

In [7]:
from tensorflow.keras import Model
from tensorflow.data import Dataset

## Slice data into chunks

In [8]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [9]:
def get_chunks_data(x, y, p):
    """Cut every page of ``x``, ``y`` and ``p`` into ``max_page_seq``-sized pieces.

    The three inputs are walked in parallel (one entry per page) and the
    pieces of all pages are collected into three flat lists.

    Returns
    -------
    tuple
        ``(x_pieces, y_pieces, p_pieces)``.
    """
    x_pieces, y_pieces, p_pieces = [], [], []
    for page_x, page_y, page_p in zip(x, y, p):
        x_pieces += chunks(page_x, max_page_seq)
        y_pieces += chunks(page_y, max_page_seq)
        p_pieces += chunks(page_p, max_page_seq)
    return x_pieces, y_pieces, p_pieces

In [10]:
chunks_x, chunks_y, chunk_positions = get_chunks_data(X_raw, y, page_positions)

In [11]:
len(chunks_x)

183

## Load pre-trained fastText model

In [12]:
from FastTextModel import FastTextModel

1 Physical GPUs, 1 Logical GPU


In [13]:
ft = FastTextModel()



Current dimension:  100


In [14]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a,b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    magnitude = norm(a) * norm(b)
    return dot(a, b) / magnitude

In [15]:
cos_sim(ft.getWordVector("last"), ft.getWordVector("next"))

0.80560714

# Feature extraction

In [164]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    """
    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    return list(generator if limit is None else islice(generator, 0, limit))

def feat_to_tokens(feat, tokenizer):
    """Tokenize a feature with ``tokenizer.tokenize``.

    A list feature is first joined into one space-separated string; any
    other value is handed to the tokenizer unchanged.
    """
    text = ' '.join(feat) if type(feat) is list else feat
    return tokenizer.tokenize(text)

def num_token_feature_to_class(number):
    """One-hot encode a token-count label into four slots.

    ``'=0'``, ``'=1'`` and ``'=2'`` select slots 0, 1 and 2; every other
    value selects the last slot.
    """
    one_hot = [0, 0, 0, 0]
    slot = 3  # catch-all bucket
    for position, bucket in enumerate(('=0', '=1', '=2')):
        if number == bucket:
            slot = position
            break
    one_hot[slot] = 1
    return one_hot

def link_to_features(link):
    """Extract the token-level and binary features of a single link.

    Parameters
    ----------
    link : selector
        Selector for one link element; it must support ``.xpath(...)`` and
        be accepted by ``get_link_text`` / ``get_link_href``.

    Returns
    -------
    list
        ``[token_feature, non_token_feature]``:

        * ``token_feature`` -- dict with the keys ``'text-exact'``,
          ``'query'``, ``'parent-tag'`` and ``'class'``.
        * ``non_token_feature`` -- flat list of 0/1 flags, in the insertion
          order of ``tag_feature`` below.
    """
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    path_lower = p.path.lower()

    parent_html = link.xpath('..').extract()
    # NOTE(review): presumably yields the tag of the parent element --
    # confirm against autopager.parserutils.get_first_tag.
    parent = get_first_tag(parser, parent_html[0])

    # Lower-cased names of the query-string parameters of the href.
    query_param_names = [k.lower() for k, v in parse_qsl(p.query)]

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)

    token_feature = {
        'text-exact': replace_digits(text.strip()[:100].strip()),
        'query': query_param_names,
        'parent-tag': parent,
        'class': css_classes.split()[:AUTOPAGER_LIMITS.max_css_features],
    }
    tag_feature = {
        'isdigit': int(text.isdigit()),
        'isalpha': int(text.isalpha()),
        # Compare by value: identity against a string literal is
        # implementation-dependent and raises a SyntaxWarning.
        'has-href': 0 if href == "" else 1,
        'path-has-page': 1 if 'page' in path_lower else 0,
        'path-has-pageXX': 1 if re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None else 0,
        'path-has-number': 1 if any(part.isdigit() for part in p.path.split('/')) else 0,
        # Raw string so that ``\d`` is a regex class, not an invalid str escape.
        'href-has-year': 1 if re.search(r'20\d\d', href) is not None else 0,
        'class-has-disabled': 1 if 'disabled' in css_classes else 0,
#         'num-tokens': num_token_feature_to_class(_num_tokens_feature(text)),
    }

    # Flatten: list-valued features (such as the disabled 'num-tokens'
    # one-hot above) are spliced in, scalar flags are appended as they are.
    non_token_feature = []
    for v in tag_feature.values():
        if isinstance(v, list):
            non_token_feature.extend(v)
        else:
            non_token_feature.append(v)

    return [token_feature, non_token_feature]


def page_to_features(xseq):
    """Return the ``link_to_features`` result for every link in ``xseq``.

    Parameters
    ----------
    xseq : iterable
        Link selectors of one page (or one chunk of a page).

    Returns
    -------
    list
        One ``[token_feature, non_token_feature]`` pair per link, in input
        order.
    """
    # The surrounding-text lookup that used to run here was never used in
    # the result, so it is no longer computed.
    return [link_to_features(a) for a in xseq]

CPU times: user 0 ns, sys: 24 µs, total: 24 µs
Wall time: 32.9 µs


In [165]:
def get_token_tag_features_from_chunks(chunks):
    """Split per-link features of every chunk into token and tag parts.

    Parameters
    ----------
    chunks : iterable
        Pages/chunks, each a sequence of link selectors accepted by
        ``page_to_features``.

    Returns
    -------
    tuple
        ``(token_features, tag_features)`` -- two lists with one entry per
        chunk; each entry holds, per link, element 0 respectively element 1
        of the pair produced by ``page_to_features``.

    Raises
    ------
    Exception
        When feature extraction fails for a chunk. The message names the
        chunk index and the original error is attached as ``__cause__``.
    """
    token_features = []
    tag_features = []
    for idx, page in enumerate(chunks):
        try:
            feat_list = page_to_features(page)
            token_features.append([node[0] for node in feat_list])
            tag_features.append([node[1] for node in feat_list])
        except Exception as exc:
            # Catch only real errors (not KeyboardInterrupt/SystemExit) and
            # chain explicitly so the root cause stays in the traceback.
            raise Exception(f"Error occured on {idx}") from exc
    return token_features, tag_features

In [166]:
def word_to_vector(ft, word_list):
    """Embed a word, or average the embeddings of a list of words.

    * non-list input: ``ft.getWordVector(word_list)`` is returned directly;
    * empty list: a zero vector of length ``ft.getModel().get_dimension()``;
    * non-empty list: the element-wise mean of the per-word vectors.
    """
    if type(word_list) is not list:
        return ft.getWordVector(word_list)
    if len(word_list) == 0:
        return np.zeros(ft.getModel().get_dimension())
    per_word = [ft.getWordVector(word) for word in word_list]
    return np.mean(per_word, axis = 0)

In [167]:
def pages_to_word_vector(ft, token_features):
    """Embed the ``'class'``, ``'query'`` and ``'parent-tag'`` fields of every node.

    For each node the three ``word_to_vector`` results are concatenated (in
    that order) into one vector; each page becomes one ``np.array`` of its
    node vectors.

    Returns
    -------
    list
        One array per page.
    """
    field_order = ('class', 'query', 'parent-tag')
    pages_vector = []
    for page in token_features:
        node_vectors = [
            np.concatenate([word_to_vector(ft, node[field]) for field in field_order], axis = 0)
            for node in page
        ]
        pages_vector.append(np.array(node_vectors))
    return pages_vector

In [168]:
token_features, tag_features = get_token_tag_features_from_chunks(chunks_x)
# train_tag_feature_token_list = extract_tokens_from_token_features(token_features)

In [169]:
token_feature_list = list(token_features[0][0].keys())

In [170]:
def pages_to_word_vector_from_keylist(ft, token_features, word_to_vec_list = token_feature_list):
    """Embed the selected token-feature keys of every node.

    For each node, the values whose key is in ``word_to_vec_list`` are
    embedded with ``word_to_vector`` and concatenated in the iteration
    order of the node dict (not the order of ``word_to_vec_list``).

    Returns
    -------
    list
        One ``np.array`` of node vectors per page.
    """
    print(f"Transform key {word_to_vec_list} to word_vector ... ")
    pages_vector = []
    for page in token_features:
        node_vectors = [
            np.concatenate(
                [word_to_vector(ft, value) for key, value in node.items() if key in word_to_vec_list],
                axis=0,
            )
            for node in page
        ]
        pages_vector.append(np.array(node_vectors))
    return pages_vector

In [171]:
# Count how often each parent tag occurs across all training nodes.
top_parent_tags = {}
for page in token_features:
    for node in page:
        p_tag = node['parent-tag']
        top_parent_tags[p_tag] = top_parent_tags.get(p_tag, 0) + 1
# (tag, count) pairs, most frequent first; ties keep first-seen order.
sorted_parent_tags = sorted(top_parent_tags.items(), key=lambda pair: pair[1], reverse=True)

In [172]:
data_map_for_ptag = sorted_parent_tags[:30]

In [173]:
def sparse_representation_with_map(tag, data_map = data_map_for_ptag):
    """One-hot encode ``tag`` against the first elements of ``data_map``.

    The result has one slot per ``data_map`` entry; the slot of the first
    entry whose element 0 equals ``tag`` is set to 1. A tag that matches
    no entry yields an all-zero vector.
    """
    one_hot = [0] * len(data_map)
    hit = next((pos for pos, entry in enumerate(data_map) if tag == entry[0]), None)
    if hit is not None:
        one_hot[hit] = 1
    return one_hot

In [174]:
def get_ptags_vector(token_features):
    """Encode the ``'parent-tag'`` of every node with ``sparse_representation_with_map``.

    Returns
    -------
    list
        One list per page, holding one encoded vector per node.
    """
    return [
        [sparse_representation_with_map(node['parent-tag']) for node in page]
        for page in token_features
    ]

In [175]:
ptags_vector = get_ptags_vector(token_features)

In [176]:
from collections import OrderedDict

In [177]:
class TagTokenizer:
    """Map string tokens to integer ids.

    Id 0 is reserved for ``'[PAD]'`` and id 1 for ``'[UNK]'``; entries of
    the optional ``myDict`` sequence receive ids from 2 upwards, keyed by
    element 0 of each entry.
    """

    def __init__(self, myDict = None):
        vocab = {'[PAD]': 0, '[UNK]': 1}
        if myDict is not None:
            for token_id, item in enumerate(myDict, start=2):
                vocab[item[0]] = token_id
        # Public token -> id table.
        self.map = vocab

    def _lookup(self, token):
        """Return the id of ``token``, or the ``'[UNK]'`` id when unknown."""
        return self.map[token] if token in self.map else self.map['[UNK]']

    def tokenize(self, word):
        """Convert one token to its id, or a list of tokens to a list of ids."""
        if type(word) is list:
            return [self._lookup(item) for item in word]
        return self._lookup(word)

    def get_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.map)

In [178]:
# Frequency tables of every CSS class and query-parameter name seen in the
# training nodes. Despite the names, no truncation to a fixed size happens here.
top_thousand_class = {}
top_thousand_query = {}
for page in token_features:
    for node in page:
        for _class in node['class']:
            top_thousand_class[_class] = top_thousand_class.get(_class, 0) + 1
        for _query in node['query']:
            top_thousand_query[_query] = top_thousand_query.get(_query, 0) + 1

# (token, count) pairs ordered by descending count.
sorted_class_map = sorted(top_thousand_class.items(), key=lambda pair: pair[1], reverse=True)
sorted_query_map = sorted(top_thousand_query.items(), key=lambda pair: pair[1], reverse=True)

# Vocabularies: more frequent tokens receive smaller ids.
class_tokenizer = TagTokenizer(sorted_class_map)
query_tokenizer = TagTokenizer(sorted_query_map)


In [179]:
# Use ft to encode all token_features
ft_full_tokens_emb = pages_to_word_vector_from_keylist(ft, token_features, ['text-exact'])

Transform key ['text-exact'] to word_vector ... 


In [180]:
ft_full_tokens_emb[0].shape

(303, 100)

In [181]:
train_tag_info_list = tag_features #features which only have tag true/false information

## Padding to fixed size

In [182]:
def prepare_input_ids(page_tokens, max_len):
    """Turn the ``'class'`` and ``'query'`` tokens of every node into fixed-length id lists.

    Tokens are mapped with ``class_tokenizer`` / ``query_tokenizer``; each id
    list is right-padded with 0 and cut to exactly ``max_len`` entries.

    Returns
    -------
    tuple
        ``(pages_class, pages_query)`` -- per page, one id list per node.
    """
    def _fit(ids):
        # Right-pad with zeros, then truncate to max_len.
        return (ids + [0] * (max_len - len(ids)))[:max_len]

    pages_class, pages_query = [], []
    for page in page_tokens:
        class_rows, query_rows = [], []
        for node in page:
            class_rows.append(_fit(class_tokenizer.tokenize(node['class'])))
            query_rows.append(_fit(query_tokenizer.tokenize(node['query'])))
        pages_class.append(class_rows)
        pages_query.append(query_rows)
    return pages_class, pages_query

In [183]:
def feature_pad_to_npdata(embedding):
    """Pad every page of float features to ``max_page_seq`` rows and stack them.

    Pages are streamed through a ``tf.data`` dataset of ``tf.float32``
    elements and padded with ``-1.`` to shape
    ``(max_page_seq, len(embedding[0][0]))``; the padded pages are returned
    as one NumPy array.
    """
    source = Dataset.from_generator(lambda: iter(embedding), tf.float32)
    # Feature width is read from the first node of the first page.
    target_shape = (max_page_seq, len(embedding[0][0]))
    batched = source.padded_batch(1, padded_shapes= target_shape, padding_values=-1., drop_remainder=False)
    # Batches have size 1, so element 0 of each batch is the padded page.
    return np.array([batch[0] for batch in batched.as_numpy_iterator()])

In [184]:
def token_pad_to_npdata(embedding):
    """Pad every page of integer ids to ``max_page_seq`` rows and stack them.

    Pages are streamed through a ``tf.data`` dataset of ``tf.int32``
    elements and padded with ``0`` to shape
    ``(max_page_seq, len(embedding[0][0]))``; the padded pages are returned
    as one NumPy array.
    """
    source = Dataset.from_generator(lambda: iter(embedding), tf.int32)
    # Row width is read from the first node of the first page.
    target_shape = (max_page_seq, len(embedding[0][0]))
    batched = source.padded_batch(1, padded_shapes= target_shape, padding_values=0, drop_remainder=False)
    # Batches have size 1, so element 0 of each batch is the padded page.
    return np.array([batch[0] for batch in batched.as_numpy_iterator()])

In [185]:
max_len = 256

In [186]:
pages_class, pages_query = prepare_input_ids(token_features, max_len)

In [187]:
train_class = token_pad_to_npdata(pages_class)
train_query = token_pad_to_npdata(pages_query)
print("Current Shape:")
print(f"train_class: {train_class.shape}")
print(f"train_query: {train_query.shape}")

Current Shape:
train_class: (183, 512, 256)
train_query: (183, 512, 256)


In [188]:
train_ptag = token_pad_to_npdata(ptags_vector)

In [189]:
train_tag_x = feature_pad_to_npdata(tag_features)

In [190]:
train_attr_x = feature_pad_to_npdata(ft_full_tokens_emb)

In [191]:
# Concate Tag Embedding + Tag information
train_tag_x = np.concatenate([train_tag_x, train_ptag], axis = 2)

In [192]:
# Tag inventory; "[PAD]" is the label assigned to padded positions.
labels = ["O", "PREV", "PAGE", "NEXT", "[PAD]"]
num_tags = len(labels)
# Index <-> label lookup tables following the order of `labels`.
idx2tag = dict(enumerate(labels))
tag2idx = {label: idx for idx, label in idx2tag.items()}

In [193]:
train_y = pad_sequences([[tag2idx.get(l) for l in lab] for lab in chunks_y], maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"])

In [194]:
print("Current Shape:")
print(f"train_tag_x: {train_tag_x.shape}")
print(f"train_ft_x: {train_attr_x.shape}")
print(f"train_y: {train_y.shape}")

Current Shape:
train_tag_x: (183, 512, 38)
train_ft_x: (183, 512, 100)
train_y: (183, 512)


In [195]:
train_composite_with_token = [train_attr_x, train_tag_x, train_class, train_query]

In [196]:
for inputs in train_composite_with_token:
    print(inputs.shape)

(183, 512, 100)
(183, 512, 38)
(183, 512, 256)
(183, 512, 256)


## Build BERT-BiLSTM-CRF Model

In [139]:
from tensorflow_addons.layers.crf import CRF

In [140]:
from tensorflow.keras.layers import (Dense, Input, Bidirectional, LSTM, Embedding, Masking, Concatenate,
                                    AveragePooling2D, MaxPooling2D, Reshape)

In [141]:
class BestWeightCallback(keras.callbacks.Callback):
    """Remember the weights of the epoch with the lowest ``val_loss``.

    At the end of training the model is reset to those weights. If no
    ``val_loss`` was ever reported (for example when ``fit`` ran without
    validation data) the model is left as it is.

    Attributes
    ----------
    best_weights : list or None
        Result of ``model.get_weights()`` at the best epoch so far.
    best : float
        Lowest ``val_loss`` seen in the current run (``np.inf`` initially).
    best_epoch : int or float
        1-based index of the best epoch (``np.inf`` until one is recorded).
    """

    def __init__(self):
        # Let the Keras base class initialise its own bookkeeping first.
        super().__init__()
        self.best_weights = None
        # ``np.inf`` rather than ``np.Inf``: the alias was removed in NumPy 2.0.
        self.best = np.inf
        self.best_epoch = np.inf

    def on_train_begin(self, logs=None):
        # The callback object may be reused for several ``fit`` calls, so
        # all tracking state is reset at the start of each run.
        self.best_weights = None
        self.best = np.inf
        self.best_epoch = np.inf

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get("val_loss")
        if current is None:
            # No validation metric available for this epoch; nothing to compare.
            return
        if np.less(current, self.best):
            self.best_weights = self.model.get_weights()
            self.best = current
            # Keras passes 0-based epoch indices; report them 1-based.
            self.best_epoch = epoch + 1

    def on_train_end(self, logs=None):
        print(f"Training Finish, Best epoch: {self.best_epoch}, Best Val_loss: {self.best}")
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

In [142]:
# Model hyper-parameters. HIDDEN_UNITS and NUM_CLASS are read by the
# model-building cell; TIME_STAMPS and DROPOUT_RATE are not referenced by it.
TIME_STAMPS = max_page_seq
HIDDEN_UNITS = 200
DROPOUT_RATE = 0.1
# NUM_CLASS = 5
NUM_CLASS = num_tags
print(
    f"TIME_STAMP: {TIME_STAMPS}",
    f"HIDDEN_UNITS: {HIDDEN_UNITS}",
    f"DROPOUT_RATE: {DROPOUT_RATE}",
    f"NUM_CLASS: {NUM_CLASS}",
    sep="\n",
)

TIME_STAMP: 512
HIDDEN_UNITS: 200
DROPOUT_RATE: 0.1
NUM_CLASS: 5


In [143]:
bwCallback = BestWeightCallback()

In [197]:
#For custom embedding
# Per-sample input shapes, taken from the padded training arrays (batch axis dropped).
ft_shape = train_attr_x[0].shape
tag_info_shape = train_tag_x[0].shape
tag_emb_shape = train_class[0].shape
# Width of each learned class/query token embedding.
embbed_output_shape = 32
# Shape after pooling: one embbed_output_shape-wide vector per link position.
page_embbed_shape = (train_class[0].shape[0], embbed_output_shape)
# Pooling window: the full per-link token axis (second axis of train_class) x 1.
pool_size = (train_class[0].shape[1], 1)
def get_custom_emb_model(use_crf = True):
    """Build and compile the four-input BiLSTM sequence tagger.

    The returned model takes, in this order, the inputs named
    ``input_ft_embeddings``, ``input_tag_information``, ``input_class`` and
    ``input_query``. Class and query ids are embedded and average-pooled per
    link, passed through a two-layer dense block, concatenated with a
    two-layer dense projection of the fastText features and with the raw tag
    information, and fed to a bidirectional LSTM.

    Parameters
    ----------
    use_crf : bool, default True
        When True the output layer is a ``CRF`` layer named ``crf_layer``
        and the loss is ``crf.get_loss``; otherwise the output is a softmax
        ``Dense`` layer trained with ``SparseCategoricalCrossentropy``.

    Returns
    -------
    Model
        Keras model compiled with the ``'adam'`` optimizer.
    """
    input_ft_embedding = Input(shape=(ft_shape), name="input_ft_embeddings")
    input_tag_information = Input(shape=(tag_info_shape), name="input_tag_information")
    input_class = Input(shape=(tag_emb_shape), name="input_class")
    input_query = Input(shape=(tag_emb_shape), name="input_query")

    #Embedding layers
    ## input_class
    # NOTE(review): the pooling layer below presumably does not honour the
    # mask produced by mask_zero=True, so padded ids (0) would still enter
    # the average -- confirm this is intended.
    class_emb = Embedding(input_dim = class_tokenizer.get_size(), output_dim = embbed_output_shape, input_length=max_page_seq, mask_zero = True)(input_class)
    # channels_first: the link-position axis acts as the channel axis, so the
    # pool_size window averages over the token axis only.
    class_emb = AveragePooling2D(pool_size, data_format = 'channels_first')(class_emb)
#     class_emb = MaxPooling2D(pool_size, data_format = 'channels_first')(class_emb)
    # Drop the pooled-away token axis.
    class_emb = Reshape(page_embbed_shape, name="class_emb_out")(class_emb)
    ## input_query
    # Same embed -> pool -> reshape pipeline as for the class ids above.
    query_emb = Embedding(input_dim = query_tokenizer.get_size(), output_dim = embbed_output_shape, input_length=max_page_seq, mask_zero = True)(input_query)
    query_emb = AveragePooling2D(pool_size, data_format = 'channels_first')(query_emb)
#     query_emb = MaxPooling2D(pool_size, data_format = 'channels_first')(query_emb)
    query_emb = Reshape(page_embbed_shape, name="query_emb_out")(query_emb)

    # Dense block over the concatenated class + query embeddings.
    input_tags = Concatenate()([class_emb, query_emb])
#     input_tags = Concatenate()([class_emb, query_emb, input_tag_information])
    input_tags_FFN = Dense(units = 256, activation = 'relu')(input_tags)
    input_tags_FFN = Dense(units = 128, activation = 'relu', name="input_tag_FFN_out")(input_tags_FFN)

    # Dense block over the fastText text embeddings.
    ft_FFN = Dense(units = 256, activation = 'relu', name="ft_FFN_01")(input_ft_embedding)
    ft_FFN = Dense(units = 128, activation = 'relu', name="ft_FFN_out")(ft_FFN)


    # The tag information joins the two projected streams without a dense block of its own.
    merged = Concatenate()([ft_FFN, input_tags_FFN, input_tag_information])
    # Each LSTM direction gets HIDDEN_UNITS//2 units.
    model = Bidirectional(LSTM(units = HIDDEN_UNITS//2, return_sequences=True))(merged)
    if use_crf:
        crf=CRF(NUM_CLASS, name='crf_layer')
        out =crf(model)
    else:
        out = Dense(units = NUM_CLASS, activation='softmax')(model)
    # `model` is rebound here from the BiLSTM output tensor to the Keras Model.
    model = Model([input_ft_embedding, input_tag_information, input_class, input_query], out)
    if use_crf:
        model.compile('adam',loss={'crf_layer': crf.get_loss})
    else:
        model.compile('adam',loss=tf.keras.losses.SparseCategoricalCrossentropy())
    return model

In [198]:
model = get_custom_emb_model(use_crf=True)

In [199]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_class (InputLayer)        [(None, 512, 256)]   0                                            
__________________________________________________________________________________________________
input_query (InputLayer)        [(None, 512, 256)]   0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 512, 256, 32) 122400      input_class[0][0]                
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 512, 256, 32) 9632        input_query[0][0]                
____________________________________________________________________________________________

In [200]:
model.fit(train_composite_with_token, train_y, batch_size=64, epochs=100, validation_split=0.1, verbose=1, callbacks=[bwCallback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Training Finish, Best epoch: 50, Best Val_loss: 18.434776306152344


<tensorflow.python.keras.callbacks.History at 0x7fdd23b3c668>

## Test on val set

In [148]:
def label_distribution_to_label(predict_y):
    """Collapse per-node score vectors into label indices.

    Input whose shape does not have exactly three axes is returned
    unchanged. Otherwise the result is a list of pages, each a list holding
    the index of the first maximum score of every node.
    """
    if len(predict_y.shape) != 3:
        return predict_y
    decoded = []
    for page in predict_y:
        node_scores = (node.tolist() for node in page)
        decoded.append([scores.index(max(scores)) for scores in node_scores])
    return decoded

In [149]:
def get_chunks_data_wo_position(x, y):
    """Cut every page of ``x`` and ``y`` into ``max_page_seq``-sized pieces.

    Variant of the chunking step for data without page positions.

    Returns
    -------
    tuple
        ``(x_pieces, y_pieces)`` -- flat lists with the pieces of all pages.
    """
    x_pieces, y_pieces = [], []
    for page_x, page_y in zip(x, y):
        x_pieces += chunks(page_x, max_page_seq)
        y_pieces += chunks(page_y, max_page_seq)
    return x_pieces, y_pieces

In [150]:
def prepare_for_testing(test_X_raw, test_y_raw):
    """Run the training feature pipeline on raw test pages.

    Parameters
    ----------
    test_X_raw, test_y_raw
        Raw test pages and their label sequences.

    Returns
    -------
    tuple
        ``(inputs, y_test)``: ``inputs`` is the list
        ``[text embeddings, tag information, class ids, query ids]`` and
        ``y_test`` is an array of label strings padded with ``"[PAD]"`` to
        ``max_page_seq`` entries per chunk.
    """
    x_chunks, y_chunks = get_chunks_data_wo_position(test_X_raw, test_y_raw)
    token_feats, tag_feats = get_token_tag_features_from_chunks(x_chunks)

    ptag_onehots = get_ptags_vector(token_feats)
    text_vectors = pages_to_word_vector_from_keylist(ft, token_feats, ['text-exact'])

    # Integer-id inputs
    class_ids, query_ids = prepare_input_ids(token_feats, max_len)
    class_input = token_pad_to_npdata(class_ids)
    query_input = token_pad_to_npdata(query_ids)
    ptag_input = token_pad_to_npdata(ptag_onehots)

    # Float inputs; the parent-tag one-hots are appended to the binary tag flags.
    text_input = feature_pad_to_npdata(text_vectors)
    tag_input = np.concatenate([feature_pad_to_npdata(tag_feats), ptag_input], axis = 2)

    # Labels: pad as ids, then map back to label strings.
    label_ids = pad_sequences(
        [[tag2idx.get(name) for name in page] for page in y_chunks],
        maxlen=max_page_seq, padding="post", truncating="post", value=tag2idx["[PAD]"],
    )
    y_test = np.asarray([[idx2tag.get(label_id) for label_id in page] for page in label_ids])

    return [text_input, tag_input, class_input, query_input], y_test

## Ready for testing

In [151]:
def recursive_predict_and_evaluate(models, x_test, y_test, evaluate_labels):
    """Predict with each model in turn and print a classification report.

    Predictions are reduced to label indices, mapped to label strings via
    ``idx2tag`` and scored against ``y_test`` for the labels listed in
    ``evaluate_labels``.
    """
    separator = "--------------------------"
    for idx, model in enumerate(models):
        print(f"Start predict model {idx}")
        print(separator)
        label_ids = label_distribution_to_label(model.predict(x_test))
        predict_y = np.asarray([[idx2tag.get(label_id) for label_id in page] for page in label_ids])
        report = flat_classification_report(y_test, predict_y, labels=evaluate_labels, digits=3)
        print(report)
        print(separator)

In [201]:
test_languages = storage.get_all_test_languages()
# test_languages = ["en"]

In [202]:
# Evaluate the trained model on every test language known to the storage.
TEST_MODELS = [model]
# Re-queries the language list (same call as in the previous cell).
test_languages = storage.get_all_test_languages()
for language in test_languages:
    print("Testing language: ", language)
    # The URLs are only used for the page/domain count printed below.
    test_urls = [rec['Page URL'] for rec in storage.iter_test_records_by_language(language=language)]
    test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
    print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
    # Run the raw pages through the same feature pipeline as the training data.
    _test_x, _test_y = prepare_for_testing(test_X_raw, test_y)
    # Only the PAGE and NEXT labels are included in the report.
    recursive_predict_and_evaluate(TEST_MODELS, _test_x, _test_y, ['PAGE','NEXT'])
    print("===================================")

Testing language:  en
pages: 49  domains: 34
Transform key ['text-exact'] to word_vector ... 
Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.844     0.857     0.850       126
        NEXT      0.000     0.000     0.000        29

   micro avg      0.844     0.697     0.763       155
   macro avg      0.422     0.429     0.425       155
weighted avg      0.686     0.697     0.691       155

--------------------------
Testing language:  zh
pages: 44  domains: 19
Transform key ['text-exact'] to word_vector ... 
Start predict model 0
--------------------------
              precision    recall  f1-score   support

        PAGE      0.851     0.596     0.701       277
        NEXT      1.000     0.083     0.154        24

   micro avg      0.852     0.555     0.672       301
   macro avg      0.925     0.340     0.427       301
weighted avg      0.862     0.555     0.657       301

--------------------------
Testin