In [None]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.model import _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS
from autopager.parserutils import (TagParser, MyHTMLParser, draw_scaled_page, position_check, compare_tag, get_first_tag)
# Shared parser instances. `parser` is passed to get_first_tag() inside
# link_to_features(); `tagParser` has no use in the visible part of this notebook.
parser = MyHTMLParser()
tagParser = TagParser()

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
# Fix the TensorFlow, Python `random` and NumPy seeds (all to 0).
tf.random.set_seed(0)
random.seed(0)
np.random.seed(0)

In [None]:
from ipywidgets import IntProgress
from IPython.display import display


## Set GPU

In [None]:
# Physical GPU devices reported by TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')

In [None]:
# Show the detected GPU list as the cell output.
gpus

In [None]:
# 0-based index into `gpus` of the device made visible to TensorFlow below
# (1 selects the second detected GPU).
USED_GPU = 1

In [None]:
# Turn on memory growth for every detected GPU.
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

In [None]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) != 0:
    try:
        # Restrict TensorFlow to the single GPU selected by USED_GPU.
        tf.config.experimental.set_visible_devices(gpus[USED_GPU], 'GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except IndexError:
        # USED_GPU points past the detected devices (e.g. index 1 on a
        # single-GPU machine); report it instead of aborting the notebook.
        print(f"USED_GPU={USED_GPU} is out of range for {len(gpus)} detected GPU(s); "
              "device visibility left unchanged")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
else:
    print("No GPUs visible")

## Load data

In [None]:
# Project data store; queried below for training records and test sets.
storage = Storage()

In [None]:
%%time
# Page URLs of the records selected with the same filter as the training data.
urls = [rec['Page URL'] for rec in storage.iter_records(language='en',contain_button = True, file_type='T')]
# X_raw / y / page_positions are used below as parallel per-page sequences;
# the exact element types come from Storage.get_Xy — TODO confirm.
X_raw, y, page_positions = storage.get_Xy(language='en',contain_button = True,  contain_position=True,file_type='T', scaled_page='normal')
# Report the number of pages and of distinct domains.
print("pages: {}  domains: {}".format(len(urls), len({get_domain(url) for url in urls})))

In [None]:
# Passed as `input_length` to the Embedding layers in get_custom_emb_model().
max_page_seq = 512

In [None]:
from tensorflow.keras import Model
from tensorflow.data import Dataset

In [None]:
import copy

## Slice data into chunks

In [None]:
def filter_empty(x, y):
    """Drop empty pages from two parallel per-page lists, keeping them aligned.

    A page is removed when its entry in ``x`` or its entry in ``y`` has length 0.
    Pages are paired positionally with ``zip``, so any surplus entries of the
    longer list are dropped as well.

    Args:
        x: list of per-page feature sequences.
        y: list of per-page label sequences, parallel to ``x``.

    Returns:
        ``(res_x, res_y)``: two new lists holding only the non-empty pages.
        Other per-page lists kept by the caller are not touched and must be
        filtered the same way if they need to stay index-aligned.
    """
    # Filter pairs jointly: filtering each list on its own could shift indices.
    kept = [(page_x, page_y) for page_x, page_y in zip(x, y)
            if len(page_x) != 0 and len(page_y) != 0]
    res_x = [page_x for page_x, _ in kept]
    res_y = [page_y for _, page_y in kept]
    return res_x, res_y

In [None]:
# No chunking is applied here: every page is used as a single chunk.
chunks_x, chunks_y, chunk_positions = X_raw, y, page_positions

In [None]:
# Pass the pages through filter_empty(); chunk_positions is not filtered here.
chunks_x, chunks_y = filter_empty(chunks_x, chunks_y)

## Load Pre-trained Laser embedding

In [None]:
from LaserSentenceModel import LaserSentenceModel

In [None]:
# Sentence-embedding model used by word_to_vector(..., 'Laser').
laser = LaserSentenceModel()

In [None]:
# Sanity check: display the shape of the embedding returned for a sample string.
laser.getSentenceVector('hello').shape

In [None]:
def textToMultiVector(text):
    """Embed ``text`` as the axis-0 average of the LASER vectors of its n-grams.

    The text is passed through ``replace_digits``, split by ``ngrams_wb(..., 2, 5)``
    (capped at ``AUTOPAGER_LIMITS.max_text_features`` items) and the resulting
    list is handed to ``laser.getSentenceVector``.
    """
    digitless = replace_digits(text)
    text_ngrams = _as_list(ngrams_wb(digitless, 2, 5), AUTOPAGER_LIMITS.max_text_features)
    ngram_vectors = laser.getSentenceVector(text_ngrams)
    return np.average(ngram_vectors, axis = 0)

# Feature extraction

In [None]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    """
    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    return list(generator if limit is None else islice(generator, 0, limit))

def feat_to_tokens(feat, tokenizer):
    """Tokenize a feature with ``tokenizer.tokenize``.

    A list feature is first joined into one space-separated string; any other
    value is passed to the tokenizer unchanged.
    """
    joined = ' '.join(feat) if type(feat) == type([]) else feat
    return tokenizer.tokenize(joined)

def num_token_feature_to_class(number):
    """One-hot encode a token-count bucket into a 4-element list.

    ``'=0'``, ``'=1'`` and ``'=2'`` map to slots 0, 1 and 2; every other value
    maps to slot 3.
    """
    slot = 3
    for position, bucket in enumerate(('=0', '=1', '=2')):
        if number == bucket:
            slot = position
            break
    one_hot = [0, 0, 0, 0]
    one_hot[slot] = 1
    return one_hot

def link_to_features(link):
    """Extract the features of one link selector.

    Args:
        link: selector for a single link element (must support ``.xpath``).

    Returns:
        ``[token_feature, non_token_feature]`` where ``token_feature`` is a dict
        with keys ``'text-exact'``, ``'query'``, ``'parent-tag'``, ``'class'`` and
        ``'text'``, and ``non_token_feature`` is a flat list of the eight 0/1
        flags built in ``tag_feature`` (in insertion order).

    Raises:
        IndexError: if ``link.xpath('..')`` yields nothing.
    """
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    parent = link.xpath('..').extract()
    parent = get_first_tag(parser, parent[0])
    query_parsed = parse_qsl(p.query)  # (name, value) pairs of the URL query string
    query_param_names = [k.lower() for k, v in query_parsed]
    query_param_names_ngrams = _as_list(ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    ))

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)

    token_feature = {
        'text-exact': replace_digits(text.strip()[:100].strip()),
        'query': query_param_names_ngrams,
        'parent-tag': parent,
        'class': _as_list(ngrams_wb(css_classes, 4, 5),
                          AUTOPAGER_LIMITS.max_css_features),
        'text': _as_list(ngrams_wb(replace_digits(text), 2, 5),
                         AUTOPAGER_LIMITS.max_text_features),
    }
    tag_feature = {
        'isdigit': 1 if text.isdigit() else 0,
        'isalpha': 1 if text.isalpha() else 0,
        # Compare by value: `href is ""` relied on string interning.
        'has-href': 0 if href == "" else 1,
        'path-has-page': 1 if 'page' in p.path.lower() else 0,
        'path-has-pageXX': 1 if re.search(r'[/-](?:p|page\w?)/?\d+', p.path.lower()) is not None else 0,
        'path-has-number': 1 if any(part.isdigit() for part in p.path.split('/')) else 0,
        # Raw string so that \d is a regex escape, not an invalid string escape.
        'href-has-year': 1 if re.search(r'20\d\d', href) is not None else 0,
        'class-has-disabled': 1 if 'disabled' in css_classes else 0,
    }
    # Flatten into one list; a list-valued entry (e.g. a one-hot vector) would
    # contribute one slot per element.
    non_token_feature = []
    for k, v in tag_feature.items():
        if type(v) == type([]):
            non_token_feature.extend(v)
        else:
            non_token_feature.append(v)

    return [token_feature, non_token_feature]


def page_to_features(xseq):
    """Compute link_to_features() for every link of a page and add ``'text-full'``.

    ``'text-full'`` is written into each link's token-feature dict as
    ``<normalized text before>,<text-exact>,<normalized text after>``, using the
    surrounding text returned by ``get_text_around_selector_list`` (max_length=15).

    Returns the list of ``[token_feature, non_token_feature]`` pairs.
    """
    page_features = [link_to_features(link) for link in xseq]
    surrounding = get_text_around_selector_list(xseq, max_length=15)
    for link_feature, (before, after) in zip(page_features, surrounding):
        token_feature = link_feature[0]
        token_feature['text-full'] = (
            normalize(before) + ',' + token_feature['text-exact'] + ',' + normalize(after)
        )
    return page_features

In [None]:
def get_token_tag_features_from_chunks(chunks):
    """Split the per-link features of every page into token and tag features.

    Args:
        chunks: iterable of pages, each accepted by ``page_to_features``.

    Returns:
        ``(token_features, tag_features)``: per page, a list of token-feature
        dicts and a NumPy array of the non-token feature lists.

    Raises:
        Exception: if feature extraction fails for a page; the message carries
            the page index and the original error is chained as ``__cause__``.
    """
    token_features = []
    tag_features = []
    for idx, page in enumerate(chunks):
        try:
            feat_list = page_to_features(page)
            token_features.append([node[0] for node in feat_list])
            tag_features.append(np.array([node[1] for node in feat_list]))
        except Exception as exc:
            # `except Exception` lets KeyboardInterrupt/SystemExit propagate;
            # `from exc` keeps the root cause in the traceback.
            raise Exception(f"Error occured on {idx}") from exc
    return token_features, tag_features

In [None]:
def word_to_vector(word_list, word_vector_method = None):
    """Embed a word or a list of words with the chosen backend.

    Args:
        word_list: a single value or a list of words.
        word_vector_method: ``'FastText'`` or ``'Laser'``. With ``None`` a message
            is printed and ``None`` is returned; any other value also returns ``None``.

    Returns:
        ``'Laser'``: whatever ``laser.getSentenceVector(word_list)`` returns.
        ``'FastText'``: the mean of the word vectors for a non-empty list, a zero
        vector of the model dimension for an empty list, or the single word
        vector for a non-list input. Uses the module-level ``ft`` object.
    """
    if word_vector_method is None:
        print("Need to specified a method.")
        return
    if word_vector_method == 'FastText':
        if type(word_list) != type([]):
            return ft.getWordVector(word_list)
        if len(word_list) == 0:
            return np.zeros(ft.getModel().get_dimension())
        word_vectors = [ft.getWordVector(word) for word in word_list]
        return np.mean(word_vectors, axis = 0)
    if word_vector_method == 'Laser':
        return laser.getSentenceVector(word_list)

In [None]:
def pages_to_word_vector(ft, token_features):
    """Embed the 'class', 'query' and 'parent-tag' features of every link with FastText.

    Args:
        ft: kept for backward compatibility. ``word_to_vector`` reads the
            module-level ``ft`` object, so this argument is not forwarded.
        token_features: list of pages, each a list of token-feature dicts.

    Returns:
        One NumPy array per page; each row is the concatenation of the three
        feature vectors of one link.
    """
    pages_vector = []
    for page in token_features:
        page_vectors = []
        for node in page:
            # word_to_vector's signature is (word_list, word_vector_method).
            parts = [word_to_vector(node[key], 'FastText')
                     for key in ('class', 'query', 'parent-tag')]
            page_vectors.append(np.concatenate(parts, axis = 0))
        pages_vector.append(np.array(page_vectors))
    return pages_vector

In [None]:
# Per-page token-feature dicts and per-page arrays of 0/1 tag flags.
token_features, tag_features = get_token_tag_features_from_chunks(chunks_x)
# train_tag_feature_token_list = extract_tokens_from_token_features(token_features)

In [None]:
# Feature names taken from the first link of the first page.
# NOTE(review): raises IndexError if the first page has no links — confirm empty pages cannot occur here.
token_feature_list = list(token_features[0][0].keys())

In [None]:
def pages_to_word_vector_from_keylist(word_vector_method, token_features, word_to_vec_list = token_feature_list):
    """Embed selected token features of every link and concatenate them per link.

    Args:
        word_vector_method: backend name forwarded to ``word_to_vector``.
        token_features: list of pages, each a list of token-feature dicts.
        word_to_vec_list: feature keys to embed. The default is bound to
            ``token_feature_list`` when this function is defined. Vectors are
            concatenated in the dict's own key order.

    Returns:
        One NumPy array per page with one concatenated vector per link.
        Progress is shown through an ``IntProgress`` widget.
    """
    print(f"Transform key {word_to_vec_list} to word_vector ... ")
    progress = IntProgress(max=len(token_features))
    progress.description = '(Init)'
    progress.value = 0
    display(progress)
    pages_vector = []
    for page_number, page in enumerate(token_features, start=1):
        progress.description = f"Task: {page_number}"
        progress.value = page_number
        page_vectors = []
        for node in page:
            selected = [word_to_vector(value, word_vector_method)
                        for key, value in node.items()
                        if key in word_to_vec_list]
            page_vectors.append(np.concatenate(selected, axis=0))
        pages_vector.append(np.array(page_vectors))
    progress.description = '(Done)'
    return pages_vector

### Prepare class and query features

In [None]:
# Collect the distinct class / query n-grams and, for each, count how many
# distinct n-grams exist per n-gram length (counter keys are str(length)).
class_set = set()
query_set = set()
class_counter = dict()
query_counter = dict()
for page in token_features:
    for node in page:
        for class_name in node['class']:
            if class_name in class_set:
                continue
            length_key = str(len(class_name))
            class_counter[length_key] = class_counter.get(length_key, 0) + 1
            class_set.add(class_name)
        for query_name in node['query']:
            if query_name in query_set:
                continue
            length_key = str(len(query_name))
            query_counter[length_key] = query_counter.get(length_key, 0) + 1
            query_set.add(query_name)

### Prepare top 30 parent tags

In [None]:
# Count how often each parent tag occurs, then order the (tag, count) pairs
# from most to least frequent.
top_parent_tags = {}
for page in token_features:
    for node in page:
        p_tag = node['parent-tag']
        top_parent_tags[p_tag] = top_parent_tags.get(p_tag, 0) + 1
sorted_parent_tags = sorted(top_parent_tags.items(), key=lambda tag_count: tag_count[1], reverse=True)

In [None]:
# The 30 most frequent (tag, count) pairs; their positions define the one-hot slots.
data_map_for_ptag = sorted_parent_tags[:30]

In [None]:
def sparse_representation_with_map(tag, data_map = data_map_for_ptag):
    """One-hot encode ``tag`` against the first element of each ``data_map`` entry.

    Returns a list of ``len(data_map)`` zeros with a 1 at the index of the first
    entry whose ``entry[0]`` equals ``tag``; all zeros if no entry matches.
    The default ``data_map`` is bound when this function is defined.
    """
    rt_vec = [0] * len(data_map)
    first_hit = next(
        (position for position, entry in enumerate(data_map) if tag == entry[0]),
        None,
    )
    if first_hit is not None:
        rt_vec[first_hit] = 1
    return rt_vec

In [None]:
def get_ptags_vector(token_features):
    """Return, per page, an array of one-hot parent-tag vectors (one row per link)."""
    return [
        np.array([sparse_representation_with_map(node['parent-tag']) for node in page])
        for page in token_features
    ]

In [None]:
# One-hot parent-tag vectors for every link of every training page.
ptags_vector = get_ptags_vector(token_features)

### Get Class, Query tokens by tokenizer

In [None]:
from collections import OrderedDict

In [None]:
class TagTokenizer:
    """Map tokens to integer ids using a vocabulary built from a dict's keys.

    Ids 0 and 1 are reserved for ``'[PAD]'`` and ``'[UNK]'``; the keys of
    ``myDict`` receive ids 2, 3, ... in the dict's iteration order (its values
    are ignored).
    """

    def __init__(self, myDict = None):
        vocab = {'[PAD]': 0, '[UNK]': 1}
        if myDict is not None:
            for token_id, (token, _count) in enumerate(myDict.items(), start=2):
                vocab[token] = token_id
        self.map = vocab

    def tokenize(self, word):
        """Return the id of ``word``, or a list of ids when ``word`` is a list.

        Tokens missing from the vocabulary map to the ``'[UNK]'`` id.
        """
        unknown_id = self.map['[UNK]']
        if type(word) == type([]):
            return [self.map.get(single, unknown_id) for single in word]
        return self.map.get(word, unknown_id)

    def get_size(self):
        """Return the vocabulary size, including the two reserved tokens."""
        return len(self.map)

In [None]:
# Occurrence counts of every class / query n-gram over all links. Despite the
# names, no truncation to a top-N is applied before building the tokenizers.
top_thousand_class = {}
top_thousand_query = {}
for page in token_features:
    for node in page:
        for _class in node['class']:
            top_thousand_class[_class] = top_thousand_class.get(_class, 0) + 1
        for _query in node['query']:
            top_thousand_query[_query] = top_thousand_query.get(_query, 0) + 1

class_tokenizer = TagTokenizer(top_thousand_class)
query_tokenizer = TagTokenizer(top_thousand_query)

### Get pre-trained sentence embedding

In [None]:
# Use ft to encode all token_features
# ft_full_tokens_emb = pages_to_word_vector_from_keylist('Laser', token_features, ['text-exact'])
# Despite the `ft_` prefix these are Laser embeddings of each link's 'text-full' feature.
ft_full_tokens_emb = pages_to_word_vector_from_keylist('Laser', token_features, ['text-full'])

In [None]:
# NOTE(review): ft_full_tokens_emb is a Python list of per-page arrays; if pages
# differ in link count this is a ragged save — confirm it works on the installed
# NumPy and that the embedding/train/ directory exists.
np.save('embedding/train/LaserEmb_full.npy', ft_full_tokens_emb)

In [None]:
# Alias of the per-page 0/1 tag-flag arrays.
train_tag_info_list = tag_features #features which only have tag true/false information

## Padding to fixed size and prepare for training inputs

In [None]:
def prepare_input_ids(page_tokens, max_len):
    """Convert the 'class' and 'query' n-grams of every link into fixed-length id rows.

    Each id list comes from the module-level ``class_tokenizer`` /
    ``query_tokenizer``, is right-padded with 0 and cut to ``max_len`` entries.

    Returns:
        ``(pages_class, pages_query)``: two lists with one NumPy array per page,
        each holding one id row per link.
    """
    def _fit(ids):
        # Pad with zeros up to max_len, then truncate anything beyond it.
        return (ids + [0] * (max_len - len(ids)))[:max_len]

    pages_class, pages_query = [], []
    for page in page_tokens:
        class_rows, query_rows = [], []
        for node in page:
            class_rows.append(_fit(class_tokenizer.tokenize(node['class'])))
            query_rows.append(_fit(query_tokenizer.tokenize(node['query'])))
        pages_class.append(np.array(class_rows))
        pages_query.append(np.array(query_rows))
    return pages_class, pages_query

In [None]:
# Id-row length per link; get_custom_emb_model hard-codes the same 256 in
# tag_emb_shape and pool_size.
max_len = 256

In [None]:
# Padded class / query id rows for every training page.
pages_class, pages_query = prepare_input_ids(token_features, max_len)

In [None]:
# Laser text embeddings, used as the first model input.
train_attr_x = ft_full_tokens_emb

In [None]:
# One-hot parent-tag vectors, used as the second model input.
train_ptag = ptags_vector

In [None]:
# Per-link 0/1 tag flags, used as the last model input.
train_tag_x = tag_features

In [None]:
# Feature groups in the order of the model's input list in get_custom_emb_model:
# text embedding, parent tag, class ids, query ids, tag flags.
train_composite_with_token = [train_attr_x, train_ptag, pages_class, pages_query, train_tag_x]

In [None]:
# Label vocabulary and lookup tables in both directions (index = position in `labels`).
labels = ["O", "PREV", "PAGE", "NEXT"]
tag2idx = { label:idx for idx,label in enumerate(labels)}
idx2tag = { idx:label for idx,label in enumerate(labels)}
num_tags = len(labels)

In [None]:
# Per-page arrays of label indices; a label missing from tag2idx becomes None.
train_y = [np.array([tag2idx.get(l) for l in lab]) for lab in chunks_y]

In [None]:
# Print the shape of the first page's array in every feature group.
for inputs in train_composite_with_token:
    print(inputs[0].shape)

## Import model

In [None]:
from tensorflow_addons.layers.crf import CRF

In [None]:
from tensorflow.keras.layers import (Dense, Input, Bidirectional, LSTM, Embedding, Masking, Concatenate,
                                    AveragePooling2D, MaxPooling2D, Reshape, Attention, GlobalAveragePooling1D
                                    )

In [None]:
# Show the TensorFlow version in use.
tf.__version__

# Custom training

## Build model

In [None]:
#For custom embedding

def get_custom_emb_model(use_crf = True, embedding_size = 32, hidden_size = 300):
    """Build the per-page sequence-labelling Keras model and its matching loss.

    Args:
        use_crf: when True the output layer is a tensorflow_addons ``CRF`` layer
            and the returned loss is ``crf.get_loss``; otherwise the output is a
            softmax ``Dense`` layer paired with ``SparseCategoricalCrossentropy``.
        embedding_size: output dimension of the class / query ``Embedding`` layers.
        hidden_size: the BiLSTM is built with ``hidden_size // 2`` units per direction.

    Returns:
        ``(model, loss_fn)``.

    Reads the module-level ``num_tags``, ``class_tokenizer``, ``query_tokenizer``
    and ``max_page_seq``.
    """
    # Per-link feature widths; the leading None is the variable number of links.
    ft_shape = (None, 1024)
    tag_info_shape = (None, 8)   # eight 0/1 flags built in link_to_features
    tag_emb_shape = (None, 256)  # id-row length; hard-coded, same value as max_len
    ptag_emb_shape = (None, 30)  # same width as the 30-entry data_map_for_ptag
    embbed_output_shape = embedding_size
    page_embbed_shape = (-1, embbed_output_shape)
    pool_size = (256, 1)
    HIDDEN_UNITS = hidden_size
    NUM_CLASS = num_tags
    
    input_ft_embedding = Input(shape=(ft_shape), name="input_ft_embeddings")
    input_tag_information = Input(shape=(tag_info_shape), name="input_tag_information")
    input_ptag_vector = Input(shape=(ptag_emb_shape), name="input_ptag")
    input_class = Input(shape=(tag_emb_shape), name="input_class")
    input_query = Input(shape=(tag_emb_shape), name="input_query")

    #Embedding layers
    # Presumably the pooling averages the 256 id slots of each link into one
    # embedding_size vector, padding slots included — TODO confirm.
    ## input_class
    class_emb = Embedding(input_dim = class_tokenizer.get_size(), output_dim = embbed_output_shape, input_length=max_page_seq, mask_zero = True)(input_class)
    class_emb = AveragePooling2D(pool_size, data_format = 'channels_first')(class_emb)
    class_emb = Reshape(page_embbed_shape, name="class_emb_out")(class_emb)
    ## input_query
    query_emb = Embedding(input_dim = query_tokenizer.get_size(), output_dim = embbed_output_shape, input_length=max_page_seq, mask_zero = True)(input_query)
    query_emb = AveragePooling2D(pool_size, data_format = 'channels_first')(query_emb)
    query_emb = Reshape(page_embbed_shape, name="query_emb_out")(query_emb)

    # Merge class and query embeddings and project them back to embedding_size.
    input_tags = Concatenate()([class_emb, query_emb])
    input_tags_FFN = Dense(units = 2 * embbed_output_shape, activation = 'relu')(input_tags)
    input_tags_FFN = Dense(units = embbed_output_shape, activation = 'relu', name="input_tag_FFN_out")(input_tags_FFN)


    # NOTE(review): ft_FFN is computed here but is not part of `merged` below, so
    # the text-embedding input does not reach the output — confirm this is intended.
    ft_FFN = Dense(units = 512, activation = 'relu', name="ft_FFN_01")(input_ft_embedding)
    ft_FFN = Dense(units = 256, activation = 'relu', name="ft_FFN_02")(ft_FFN)
    ft_FFN = Dense(units = 128, activation = 'relu', name="ft_FFN_out")(ft_FFN)
    
    # FFN for ptag
#     ptag_FFN = Dense(units = 128, activation = 'relu', name="ptag_FFN_01")(input_ptag_vector)
#     ptag_FFN = Dense(units = 64, activation = 'relu', name="ptag_FFN_out")(ptag_FFN)
    
    merged = Concatenate()([input_tags_FFN, input_ptag_vector, input_tag_information])
    model = Bidirectional(LSTM(units = HIDDEN_UNITS//2, return_sequences=True))(merged)
#     model = LSTM(units = HIDDEN_UNITS, return_sequences=True)(merged)
    if use_crf:
        crf=CRF(NUM_CLASS, name='crf_layer')
        out =crf(model)
    else:
        out = Dense(units = NUM_CLASS, activation='softmax')(model)
    # The input order must match the feature-group order of train_composite_with_token.
    model = Model([input_ft_embedding, input_ptag_vector, input_class, input_query, input_tag_information], out)
    if use_crf:
        loss_fn = crf.get_loss
    else:
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
    return model, loss_fn

## Split data into train/val set

In [None]:
# Not passed to the data_list_to_dataset() calls in this notebook; they rely on
# that function's own default of 1.
batch_size = 1

In [None]:
def list_to_dataSet(data, dataType):
    """Wrap an in-memory sequence in a generator-backed ``Dataset`` of ``dataType`` elements."""
    def _iterate():
        return iter(data)
    return Dataset.from_generator(_iterate, dataType)
def zip_dataSet(data):
    """Zip an iterable of datasets into one ``Dataset`` (the iterable is converted to a tuple first)."""
    return Dataset.zip(tuple(data))
def describe_dataset(dataset):
    """Print the ``element_spec`` of ``dataset``."""
    # Use the argument rather than the global `train_dataset`.
    print(dataset.element_spec)

In [None]:
def composite_splite_to_train_val(composite_x, y, number):
    """Hold out the last ``number`` pages of every feature group and of ``y``.

    Args:
        composite_x: list of feature groups, each a per-page sequence.
        y: per-page label sequence.
        number: non-negative count of trailing pages to use for validation.
            ``0`` keeps everything for training; a value larger than the data
            puts everything into validation.

    Returns:
        ``(x_train, y_train, x_val, y_val)``.
    """
    def _split(seq):
        # An explicit cut index avoids the `seq[:-0]` / `seq[-0:]` pitfall,
        # where number == 0 would give an empty train set and a full val set.
        cut = max(len(seq) - number, 0)
        return seq[:cut], seq[cut:]

    x_parts = [_split(data) for data in composite_x]
    x_train = [train_part for train_part, _ in x_parts]
    x_val = [val_part for _, val_part in x_parts]
    y_train, y_val = _split(y)
    return x_train, y_train, x_val, y_val

In [None]:
def composite_cut_data(composite_x, y, percent):
    """Keep the first ``round(len(y) * percent)`` pages of every feature group and of ``y``.

    Returns ``(new_composite_x, new_y)``.
    """
    keep = round(len(y) * percent)
    truncated_groups = [group[:keep] for group in composite_x]
    return truncated_groups, y[:keep]

In [None]:
# Hold out the last 20 pages for validation.
x_train, y_train, x_val, y_val = composite_splite_to_train_val(train_composite_with_token, train_y, 20)

In [None]:
def data_list_to_dataset(x, y, isValidation = False, batch_size = 1):
    """Build a batched ``Dataset`` of ``(features, labels)`` elements.

    Every feature group in ``x`` becomes a float32 dataset; the groups are
    zipped pairwise from left to right, so the feature element is a nested
    tuple. Labels become an int32 dataset. Training data (``isValidation``
    False) is shuffled with a 1024-element buffer before batching.
    """
    all_data = None
    for data in x:
        group_ds = list_to_dataSet(data, tf.float32)
        all_data = group_ds if all_data is None else Dataset.zip((all_data, group_ds))
    labels_ds = list_to_dataSet(y, tf.int32)
    final_set = Dataset.zip((all_data, labels_ds))
    if isValidation:
        return final_set.batch(batch_size)
    return final_set.shuffle(buffer_size=1024).batch(batch_size)

def composite_list_to_dataset(x, batch_size = 1):
    """Build a batched, label-free ``Dataset`` from the feature groups in ``x``.

    Each group becomes a float32 dataset; the groups are zipped pairwise from
    left to right, giving the same nested element structure as
    ``data_list_to_dataset`` produces for its features.
    """
    all_data = None
    for data in x:
        group_ds = list_to_dataSet(data, tf.float32)
        all_data = group_ds if all_data is None else Dataset.zip((all_data, group_ds))
    return all_data.batch(batch_size)

In [None]:
# Shuffled training dataset (default batch size of 1).
train_dataset = data_list_to_dataset(x_train, y_train, isValidation=False)

In [None]:
# Validation dataset in original order (default batch size of 1).
val_dataset = data_list_to_dataset(x_val, y_val, isValidation=True)

In [None]:
# Generate data by percentage
def GenerateData(train_composite_with_token, train_y, percent, val_number = 20):
    """Take the first ``percent`` of the pages and split them into train / validation.

    Args:
        train_composite_with_token: list of feature groups (per-page sequences).
        train_y: per-page label sequence.
        percent: fraction of pages to keep, applied per feature group via
            ``composite_cut_data``.
        val_number: number of trailing pages held out for validation. The
            default of 20 matches the split used elsewhere in this notebook.

    Returns:
        ``(x_train, y_train, x_val, y_val)``.
    """
    # Cut pages inside each feature group (not the list of groups itself),
    # then hold out the tail for validation.
    cut_x, cut_y = composite_cut_data(train_composite_with_token, train_y, percent)
    return composite_splite_to_train_val(cut_x, cut_y, val_number)

## Define Custom Training

In [None]:
# `loss_fn` is read as a global by train_on_epoch() and learning_curve().
model, loss_fn = get_custom_emb_model(use_crf=True, embedding_size = 32, hidden_size = 300)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Adam optimizer with its default settings.
optimizer = keras.optimizers.Adam()

In [None]:
# Calculate training/val f1-score
from sklearn.metrics import classification_report
from collections import Counter
def calculate_pages_metric(y_true_pages, y_predict_pages):
    """Compute per-page F1 for label 2 (PAGE) and label 3 (NEXT).

    Pages whose true-label sequence is empty are skipped.

    Returns:
        ``(pages_f1, nexts_f1, avg_f1)``: three parallel lists with, for every
        non-empty page, the PAGE F1, the NEXT F1 and their mean.
    """
    pages_f1 = []
    nexts_f1 = []
    avg_f1 = []
    for y_true, y_predict in zip(y_true_pages, y_predict_pages):
        if len(y_true) == 0:
            # Skip this page only; `break` would discard every later page too.
            continue
        # `labels` guarantees the '2' and '3' entries exist even when a page
        # contains neither label.
        report = classification_report(y_true, y_predict, labels=[2, 3], output_dict=True)
        PAGE = report['2']['f1-score']
        NEXT = report['3']['f1-score']
        pages_f1.append(PAGE)
        nexts_f1.append(NEXT)
        avg_f1.append((PAGE+NEXT)/2)
    return pages_f1, nexts_f1, avg_f1
def calculate_page_metric(y_true, y_predict):
    """Return a single F1 score for one page, chosen by which labels it contains.

    If both 2 (PAGE) and 3 (NEXT) occur in ``y_true`` the mean of their F1
    scores is returned; if only one occurs, that label's F1; if neither, the
    F1 of label 0.
    """
    report = classification_report(y_true, y_predict,labels=[0,2,3],output_dict=True)
    OTHER = report['0']['f1-score']
    PAGE = report['2']['f1-score']
    NEXT = report['3']['f1-score']
    has_page = 2 in y_true
    has_next = 3 in y_true
    if has_page and has_next:
        return (PAGE+NEXT)/2
    if has_page:
        return PAGE
    if has_next:
        return NEXT
    return OTHER

In [None]:
#Test for data predict
# Smoke test: run the untrained model on one training batch and print the
# true labels, the page metric and the full classification report.
for (batch_x, batch_y) in train_dataset.take(1):
    batch_predict_y = model(batch_x).numpy()
    batch_true_y = batch_y.numpy()
    print(batch_true_y)
    # Index 0 selects the single page of the batch.
    print(calculate_page_metric(batch_true_y[0], batch_predict_y[0]))
    print(classification_report(batch_true_y[0], batch_predict_y[0]))
    break

In [None]:
def train_on_epoch(epochs, model, optimizer, train_dataset, val_dataset, best_model_method = 'f1-score'):
    """Train ``model`` with a custom loop and restore the best validation weights.

    After every epoch the model is evaluated on ``val_dataset`` (mean loss and
    mean ``calculate_page_metric`` score over the batches; only the first page
    of each batch is scored). The loss comes from the module-level ``loss_fn``.

    Args:
        epochs: number of passes over ``train_dataset``.
        model: Keras model to train (updated in place).
        optimizer: Keras optimizer applied to ``model.trainable_weights``.
        train_dataset: iterable of ``(x_batch, y_batch)`` training batches.
        val_dataset: iterable of ``(x_batch, y_batch)`` validation batches.
        best_model_method: ``'loss'`` restores the weights with the lowest mean
            validation loss, ``'f1-score'`` those with the highest mean
            validation F1. Any other value leaves the final weights in place.

    Returns:
        ``(model, avg_epoch_result)`` where ``avg_epoch_result`` has the keys
        ``"epoch_losses"`` and ``"epoch_f1s"`` (one value per epoch).
    """
    import time

    best_weights = None
    best_f1_weights = None
    best = np.inf  # `np.Inf` was removed in NumPy 2.0
    best_f1 = 0
    avg_epoch_losses = []
    avg_epoch_f1s = []
    for epoch in range(epochs):
        print("\nStart of epoch %d" % (epoch,))
        start_time = time.time()

        # Iterate over the batches of the dataset.
        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            with tf.GradientTape() as tape:
                logits = model(x_batch_train, training=True)
                loss_value = loss_fn(y_batch_train, logits)
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

        # Run a validation loop at the end of each epoch.
        val_losses = []
        val_f1s = []
        for x_batch_val, y_batch_val in val_dataset:
            val_logits = model(x_batch_val, training=False)
            val_loss_value = loss_fn(y_batch_val, val_logits)
            val_avg_f1 = calculate_page_metric(y_batch_val.numpy()[0], val_logits.numpy()[0])
            val_losses.append(val_loss_value)
            val_f1s.append(val_avg_f1)
        average_val_loss = np.average(val_losses)
        average_val_f1 = np.average(val_f1s)
        avg_epoch_losses.append(average_val_loss)
        avg_epoch_f1s.append(average_val_f1)
        if average_val_loss < best:
            best_weights = model.get_weights()
            best = average_val_loss
        if average_val_f1 > best_f1:
            best_f1_weights = model.get_weights()
            best_f1 = average_val_f1
        print("Validation loss: %.4f" % (float(average_val_loss),))
        print("Validation F1: %.4f" % (float(average_val_f1),))
        print("Time taken: %.2fs" % (time.time() - start_time))
    print(f"Best loss: {best}, Best F1: {best_f1}")
    print(f"Training finish, load best weights. {best_model_method}")

    if best_model_method == 'loss':
        selected_weights = best_weights
    elif best_model_method == 'f1-score':
        selected_weights = best_f1_weights
    else:
        selected_weights = None
    # No snapshot exists when no epoch ran or none improved on the initial
    # best; keep the current weights instead of calling set_weights(None).
    if selected_weights is not None:
        model.set_weights(selected_weights)
    avg_epoch_result = {"epoch_losses": avg_epoch_losses, "epoch_f1s": avg_epoch_f1s}
    return model, avg_epoch_result

In [None]:
def learning_curve(epochs, model, optimizer, train_dataset, val_dataset):
    """Train for ``epochs`` epochs and report the scores of the best validation epoch.

    Each epoch records the mean ``calculate_page_metric`` score over the
    training batches (computed from the training-mode forward pass) and over
    the validation batches; only the first page of each batch is scored. The
    loss comes from the module-level ``loss_fn``. Model weights are not restored.

    Returns:
        ``(best_train, best_f1)``: the mean training F1 and mean validation F1
        of the epoch with the highest mean validation F1 (both 0 if no epoch
        scored above 0).
    """
    best_val_score = 0
    train_score_at_best = 0

    for _ in range(epochs):
        epoch_train_scores = []
        for x_batch_train, y_batch_train in train_dataset:
            with tf.GradientTape() as tape:
                logits = model(x_batch_train, training=True)
                loss_value = loss_fn(y_batch_train, logits)
            # Score the batch before the weights are updated.
            epoch_train_scores.append(
                calculate_page_metric(y_batch_train.numpy()[0], logits.numpy()[0])
            )
            grads = tape.gradient(loss_value, model.trainable_weights)
            optimizer.apply_gradients(zip(grads, model.trainable_weights))

        mean_train_score = np.average(epoch_train_scores)

        # Validation pass at the end of the epoch.
        epoch_val_scores = []
        for x_batch_val, y_batch_val in val_dataset:
            val_logits = model(x_batch_val, training=False)
            # The validation loss is evaluated but does not affect the result.
            loss_fn(y_batch_val, val_logits)
            epoch_val_scores.append(
                calculate_page_metric(y_batch_val.numpy()[0], val_logits.numpy()[0])
            )

        mean_val_score = np.average(epoch_val_scores)

        if mean_val_score > best_val_score:
            best_val_score = mean_val_score
            train_score_at_best = mean_train_score

    print(f"Best train f1: {train_score_at_best}, Best val f1: {best_val_score}")

    return train_score_at_best, best_val_score

## Test data evaluation

In [None]:
import pandas as pd

In [None]:
# Transfer distribution to corresponding label
def label_distribution_to_label(predict_y):
    """Turn per-link score vectors into label indices.

    A 3-D array is converted to a list of pages, each a list holding the index
    of the first maximum of every link's score vector. Input of any other rank
    is returned unchanged.
    """
    if len(predict_y.shape) != 3:
        return predict_y
    label_y = []
    for page in predict_y:
        score_rows = [link_scores.tolist() for link_scores in page]
        label_y.append([row.index(max(row)) for row in score_rows])
    return label_y

In [None]:
# Prepare for testing inputs
def prepare_for_testing(test_X_raw, test_y_raw): #ft-bert -no chunks
    """Turn raw test pages into the model's composite input and string labels.

    Applies the same steps as the training data: ``filter_empty``, feature
    extraction, parent-tag one-hot vectors, Laser embeddings of 'text-full'
    and padded class / query id rows (length ``max_len``).

    Returns:
        ``(test_composite_input, y_test)``: the five feature groups in the
        model's input order, and the per-page label names as a NumPy array.
        Labels not in ``tag2idx`` become ``None``.
    """
    chunks_test_x, chunks_test_y = test_X_raw, test_y_raw
    chunks_test_x, chunks_test_y = filter_empty(chunks_test_x, chunks_test_y)
    test_token_features, test_tag_features = get_token_tag_features_from_chunks(chunks_test_x)

    test_ptags_vector = get_ptags_vector(test_token_features)
    test_ft_emb = pages_to_word_vector_from_keylist('Laser', test_token_features, ['text-full'])
    test_tag_info_list = test_tag_features
    ## Tokens prepare
    # prepare_input_ids returns exactly two lists (class ids, query ids).
    test_pages_class, test_pages_query = prepare_input_ids(test_token_features, max_len)
    ## X_test_input
    test_composite_input = [test_ft_emb, test_ptags_vector, test_pages_class, test_pages_query, test_tag_info_list]

    ## y_test_input
    # Round-trip through tag2idx / idx2tag so unknown labels become None.
    y_test = [[tag2idx.get(l) for l in lab] for lab in chunks_test_y]
    y_test = [[idx2tag.get(lab) for lab in page] for page in y_test]
    y_test = np.asarray(y_test)

    return test_composite_input, y_test

In [None]:
def evaluate_from_batch(model, x, y, evaluate_labels, multiTask = False):
    """Predict every test page and return one combined PAGE/NEXT F1 score.

    Pages are fed one at a time through ``composite_list_to_dataset``. A page
    whose label sequence in ``y`` is empty gets an empty prediction without
    calling the model. Predictions that are not 1-D are reduced to the index of
    the first maximum per link. Page-level and node-level reports are printed.

    Args:
        model: callable model; with ``multiTask`` the prediction is read from
            ``model(batch)[0][0]``, otherwise from ``model(batch)[0]``.
        x: composite feature groups as produced by ``prepare_for_testing``.
        y: per-page label names.
        evaluate_labels: accepted but not used by this function.
        multiTask: see ``model``.

    Returns:
        The mean of the page-level and node-level averages of PAGE F1 and NEXT F1.
    """
    print("Start predicting test data ...")
    predicted_y = []
    for pageIdx, batch_x_test in enumerate(composite_list_to_dataset(x)):
        if len(y[pageIdx]) == 0:
            batch_predict_y = np.array([])
        elif multiTask:
            batch_predict_y = model(batch_x_test)[0][0].numpy()
        else:
            batch_predict_y = model(batch_x_test)[0].numpy()
        if len(batch_predict_y.shape) != 1:
            # Score matrix: pick the first best-scoring label per link.
            score_rows = [link_scores.tolist() for link_scores in batch_predict_y]
            batch_predict_y = [row.index(max(row)) for row in score_rows]
        predicted_y.append(batch_predict_y)
    print("Start evaluating test data ...")
    predict_y = np.asarray([[idx2tag.get(lab) for lab in page] for page in predicted_y])
    macro_report = page_level_score(predict_y, y)
    micro_report = node_level_score(predict_y, y)
    print("Macro")
    print(macro_report)
    print("Micro")
    print(micro_report)
    macro_part = 0.5*(macro_report['page_f1'] + macro_report['next_f1'])
    micro_part = 0.5*(micro_report['page_f1'] + micro_report['next_f1'])
    return (macro_part + micro_part)/2

In [None]:
def evaluate_model(model, target = "all"):
    """Evaluate ``model`` on the stored test set of one or several languages.

    Args:
        model: model passed on to ``evaluate_from_batch``.
        target: a language code, or ``"all"`` for en, de, ru, zh, ja and ko.

    Returns:
        The score of the last language evaluated; with ``"all"`` the scores of
        the earlier languages are only printed.
    """
    if target != "all":
        test_languages = [target]
    else:
        test_languages = ['en','de','ru','zh','ja','ko']
    for language in test_languages:
        print("Testing language: ", language)
        test_urls = [rec['Page URL'] for rec in storage.iter_test_records_by_language(language=language)]
        test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
        test_domains = {get_domain(url) for url in test_urls}
        print("pages: {}  domains: {}".format(len(test_urls), len(test_domains)))
        _test_x, _test_y = prepare_for_testing(test_X_raw, test_y)
        score = evaluate_from_batch(model, _test_x, _test_y, ['PAGE','NEXT'])
        print("===================================")
    return score

In [None]:
def calculate_macro_avg(reports):
    """Average the ``['macro avg']['f1-score']`` entry over all reports.

    ``reports`` maps a language to a classification-report dict; an empty
    mapping raises ``ZeroDivisionError``.
    """
    total = sum(report['macro avg']['f1-score'] for report in reports.values())
    return total/len(reports)

## Page/Node level evaluation

In [None]:
def node_level_score(y_pred, y_true):
    """Compute PAGE / NEXT precision, recall and F1 over all links of all pages.

    Returns a dict with the keys ``page_prec``, ``page_rec``, ``page_f1``,
    ``next_prec``, ``next_rec`` and ``next_f1``, taken from
    ``flat_classification_report``.
    """
    reports = flat_classification_report(y_true, y_pred, labels=['PAGE', 'NEXT'], digits=3, output_dict = True)

    record = {}
    for prefix, label in (("page", 'PAGE'), ("next", 'NEXT')):
        for suffix, metric in (("prec", 'precision'), ("rec", 'recall'), ("f1", 'f1-score')):
            record[prefix + "_" + suffix] = reports[label][metric]
    return record

def page_level_score(y_pred, y_true):
    """Per-page averaged precision, recall and F1 of the PAGE and NEXT labels.

    A classification report is computed separately for every page whose gold
    labels contain at least one of ``NEXT``, ``PAGE`` or ``PREV``; each metric
    is then averaged over those pages. Pages without any of these gold labels
    are skipped.

    NOTE(review): ``classification_report`` must already be in scope; it is
    not among the imports at the top of the notebook — presumably
    ``sklearn.metrics.classification_report``, confirm.

    Args:
        y_pred: Predicted label sequences, one per page.
        y_true: Gold label sequences, one per page, aligned with ``y_pred``.

    Returns:
        Dict with the keys ``page_prec``, ``page_rec``, ``page_f1``,
        ``next_prec``, ``next_rec`` and ``next_f1``. Every value is ``0.0``
        when no page qualifies (this case used to raise ZeroDivisionError).
    """
    scored_labels = ('PAGE', 'NEXT')
    pagination_labels = ('NEXT', 'PAGE', 'PREV')
    # (suffix used in the returned key, metric name inside the report)
    metric_names = (("prec", "precision"), ("rec", "recall"), ("f1", "f1-score"))

    totals = {
        f"{label.lower()}_{suffix}": 0.0
        for label in scored_labels
        for suffix, _ in metric_names
    }
    size = 0
    for page_pred, page_true in zip(y_pred, y_true):
        if not any(label in page_true for label in pagination_labels):
            continue
        size += 1
        reports = classification_report(page_true, page_pred, labels=['PAGE', 'NEXT'], digits=3, output_dict = True)
        for label in scored_labels:
            for suffix, metric in metric_names:
                totals[f"{label.lower()}_{suffix}"] += reports[label][metric]

    if size == 0:
        # No page carries pagination labels: report zeros instead of dividing by zero.
        return totals
    return {key: total/size for key, total in totals.items()}

In [None]:
# Evaluate the current `model` on the English test set only and keep the returned score.
score = evaluate_model(model, target='en')

In [None]:
score

### Learning Curve

In [None]:
# Learning-curve experiment: for each fraction of the training data, build and
# train five fresh models and record the best train / validation score of each run.
Learning_records = []
for percent in [0.2, 0.4, 0.6, 0.8, 1]:
    for Iteration in range(5):
        # Presumably keeps only `percent` of the training pages — confirm in composite_cut_data.
        new_composite_x, new_y = composite_cut_data(train_composite_with_token, train_y, percent)
        train_case = len(new_y)
        test_case = round(train_case * 0.2)
        print(f"Train case: {train_case}, Test case: {test_case}")
        # Only the training half of this split is used; its validation half is discarded.
        x_train, y_train, _, _ = composite_splite_to_train_val(new_composite_x, new_y, test_case)
        # The validation data is split off the full training set (third argument 20), not the reduced one.
        _, _, x_val, y_val = composite_splite_to_train_val(train_composite_with_token, train_y, 20)
        train_dataset = data_list_to_dataset(x_train, y_train, isValidation=False)
        val_dataset = data_list_to_dataset(x_val, y_val, isValidation=True)
        model, loss_fn = get_custom_emb_model(use_crf=True, embedding_size = 32, hidden_size = 300)
        # NOTE(review): `optimizer` is defined outside this cell and shared by every
        # freshly built model — confirm its state is meant to carry over between runs.
        best_train, best_f1 = learning_curve(15, model, optimizer, train_dataset, val_dataset)
        # "Pages" stores len(new_y), i.e. the size before the train/validation split.
        Learning_records.append({"Pages": train_case, "Iteration": Iteration, "best_train": best_train, "best_test": best_f1})

In [None]:
pd.DataFrame(Learning_records)

In [None]:
# Group the recorded scores by training-set size: one inner list per size,
# holding the scores of every run whose "Pages" value equals that size.
lc_train_f1s = []
lc_val_f1s = []
for train_pages in [33, 66, 98, 131, 164]:
    tmp_train = [run['best_train'] for run in Learning_records if run['Pages'] == train_pages]
    tmp_val = [run['best_test'] for run in Learning_records if run['Pages'] == train_pages]
    lc_train_f1s.append(tmp_train)
    lc_val_f1s.append(tmp_val)

In [None]:
import matplotlib.pyplot as plt

In [None]:
import numpy as np

In [None]:
# Hard-coded scores that replace the lists built from Learning_records in the
# earlier cell: one inner list per training-set size, five runs each.
lc_train_f1s = [[0.819,0.679,0.705,0.793,0.774],[0.966,0.377,0.935,0.934,0.912],[0.926,0.938,0.965,0.951,0.925],[0.98,0.944,0.958,0.964,0.96],[0.965,0.937,0.942,0.939,0.96]]
lc_val_f1s   = [[0.628,0.695,0.636,0.638,0.67],[0.804,0.3,0.78,0.789,0.762],[0.774,0.746,0.806,0.778,0.767],[0.829,0.829,0.789,0.802,0.791],[0.806,0.809,0.804,0.788,0.809]]

In [None]:
# Empty frame; the following two cells add one row per (train/val, sample size, iteration).
learning_curve_pd = pd.DataFrame()

In [None]:
# Add one 'train' row per run to learning_curve_pd. The rows are collected
# first and concatenated once: DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
train_rows = [
    {"train_type": 'train', "sample_size": page_sample, "iteration": run_idx, "macro F1": run_f1}
    for run_scores, page_sample in zip(lc_train_f1s, [33, 66, 98, 131, 164])
    for run_idx, run_f1 in enumerate(run_scores)
]
learning_curve_pd = pd.concat([learning_curve_pd, pd.DataFrame(train_rows)], ignore_index=True)

In [None]:
# Add one 'val' row per run to learning_curve_pd. The rows are collected first
# and concatenated once: DataFrame.append was removed in pandas 2.0 and copied
# the whole frame on every call.
val_rows = [
    {"train_type": 'val', "sample_size": page_sample, "iteration": run_idx, "macro F1": run_f1}
    for run_scores, page_sample in zip(lc_val_f1s, [33, 66, 98, 131, 164])
    for run_idx, run_f1 in enumerate(run_scores)
]
learning_curve_pd = pd.concat([learning_curve_pd, pd.DataFrame(val_rows)], ignore_index=True)

In [None]:
# Mean and standard deviation across the runs (axis 1) of every training-set size.
lc_train_means, lc_train_std = (stat(lc_train_f1s, axis = 1) for stat in (np.mean, np.std))
lc_val_means, lc_val_std = (stat(lc_val_f1s, axis = 1) for stat in (np.mean, np.std))

In [None]:
train_sizes = np.array([33, 66, 98, 131, 164])
_, axes = plt.subplots(1,3,figsize=(20, 5))
axes[0].set(xlabel="Training pages", ylabel="Macro F1 Score")

# Plot learning curve: a +/- one-standard-deviation band and the mean line per split.
curves = (
    (lc_train_means, lc_train_std, "r", "Training score"),
    (lc_val_means, lc_val_std, "g", "Cross-validation score"),
)
axes[0].grid()
for curve_mean, curve_std, curve_color, curve_label in curves:
    axes[0].fill_between(train_sizes, curve_mean - curve_std,
                         curve_mean + curve_std, alpha=0.1,
                         color=curve_color)
for curve_mean, curve_std, curve_color, curve_label in curves:
    axes[0].plot(train_sizes, curve_mean, 'x-', color=curve_color,
                 label=curve_label)
axes[0].legend(loc="best")
axes[0].set_title("Macro F1 on EN dev --> EN test")

### Tag Vector Effect

In [None]:
# Tag-vector experiment: train five fresh models and keep each run's score on
# the English test set.
TAG_EXP_RECORDS = []
for iteration in range(5):
    print("Iteration start ",iteration)
    model, loss_fn = get_custom_emb_model(use_crf=True, embedding_size = 32, hidden_size = 300)
    model, avg_epoch_result = train_on_epoch(25, model, optimizer, train_dataset, val_dataset)
    score = evaluate_model(model, target='en')
    # Build this run's record here; previously a stale `record` left over from
    # another cell was appended on every iteration.
    record = {"iter": iteration, "score": score}
    TAG_EXP_RECORDS.append(record)

In [None]:
# Evaluate the current `model` on the Korean test set. Despite its name,
# `reports` receives the single score that evaluate_model returns.
reports = evaluate_model(model, target = 'ko')

### Ablation Tag Vector

In [None]:
# Ablation of the tag vector: train three fresh ablation models, evaluate each
# on all test languages and record its macro average.
PTAG_ABLA_EXP_RECORDS = []
for iteration in range(3):
    print("Iteration start ",iteration)
    model, loss_fn = get_ablation_ptag_model(use_crf=True)
    model, avg_epoch_result = train_on_epoch(25, model, optimizer, train_dataset, val_dataset)
    # NOTE(review): evaluate_model returns a single score here, not a
    # per-language dict of reports — confirm calculate_macro_avg handles that.
    reports = evaluate_model(model)
    macro_avg = calculate_macro_avg(reports)
    print(f"Iter: {iteration}, macro f1: {macro_avg}")
    record = {"iter": iteration, "macro_avg": macro_avg}
    PTAG_ABLA_EXP_RECORDS.append(record)

In [None]:
PTAG_ABLA_EXP_RECORDS

### Heuristic label

In [None]:
# Heuristic-label experiment: train two fresh models and keep each run's score
# on the English test set.
HUE_EXP_RECORDS = []
for iteration in range(2):
    print("Iteration start ",iteration)
    model, loss_fn = get_ablation_model_hl(use_crf=True)
    model, avg_epoch_result = train_on_epoch(25, model, optimizer, train_dataset, val_dataset)
    score = evaluate_model(model, target='en')
    print(score)
    # Store the run; the list was previously never filled, so the cell that
    # displays HUE_EXP_RECORDS always showed an empty list.
    HUE_EXP_RECORDS.append({"iter": iteration, "score": score})
In [None]:
HUE_EXP_RECORDS

### Attribute Embedding Size Experiment

In [None]:
# Collects one result record per (embedding size, iteration) run of the loop in the next cell.
EMB_EXP_RECORDS = []

In [None]:
# Embedding-size sweep: five fresh models per embedding size, each trained and
# then evaluated on all test languages.
for EMB_SIZE in [16,32,64,128]:
    for iteration in range(5):
        print("Iteration start ",iteration)
        model, loss_fn = get_custom_emb_model(use_crf=True, embedding_size = EMB_SIZE, hidden_size = 300)
        model, avg_epoch_result = train_on_epoch(25, model, optimizer, train_dataset, val_dataset)
        # NOTE(review): `best` is not assigned in this cell; it has to come
        # from earlier notebook state — confirm it exists on a fresh kernel.
        print("best: ",best)
        # NOTE(review): evaluate_model returns a single score here, not a
        # per-language dict of reports — confirm calculate_macro_avg handles that.
        reports = evaluate_model(model)
        macro_avg = calculate_macro_avg(reports)
        print(f"Iter: {iteration}, macro f1: {macro_avg}")
        record = {"EMB_SIZE": EMB_SIZE, "iter": iteration, "macro_avg": macro_avg}
        EMB_EXP_RECORDS.append(record)

In [None]:
EMB_EXP_RECORDS

In [None]:
pd.DataFrame(EMB_EXP_RECORDS)

### LSTM Hidden Size

In [None]:
# Collects one result record per (hidden size, iteration) run of the loop in the next cell.
HD_SIZE_RECORDS = []

In [None]:
# LSTM hidden-size sweep: five fresh models per hidden size, each trained and
# then evaluated on all test languages.
# The spelling NUERON_SIZE is kept as is: the same string is the record key
# here and the column name read from lstmData in the plotting section.
for NUERON_SIZE in [400, 500]:
    for iteration in range(5):
        print("Iteration start ",iteration)
        model, loss_fn = get_custom_emb_model(use_crf=True, embedding_size = 32, hidden_size=NUERON_SIZE)
        model, avg_epoch_result = train_on_epoch(25, model, optimizer, train_dataset, val_dataset)
        # NOTE(review): evaluate_model returns a single score here, not a
        # per-language dict of reports — confirm calculate_macro_avg handles that.
        reports = evaluate_model(model)
        macro_avg = calculate_macro_avg(reports)
        print(f"Iter: {iteration}, macro f1: {macro_avg}")
        record = {"NUERON_SIZE": NUERON_SIZE, "iter": iteration, "macro_avg": macro_avg}
        HD_SIZE_RECORDS.append(record)

In [None]:
HD_SIZE_RECORDS

In [None]:
pd.DataFrame(HD_SIZE_RECORDS)

### Attr Ablation

In [None]:
# Collects one result record per (iteration, attrReq) run of the loop in the next cell.
Ablation_records = []

In [None]:
# Attribute ablation: in each of three iterations, train one fresh ablation
# model per attrReq value ('class', 'query') and evaluate it on all test languages.
for iteration in range(3):
    for attrReq in ['class','query']:
        # Printed once per (iteration, attrReq) pair, so each iteration number appears twice.
        print("Iteration start ",iteration)
        print("Attr: ",attrReq)
        model, loss_fn = get_ablation_model(attrReq=attrReq)
        model, avg_epoch_result = train_on_epoch(25, model, optimizer, train_dataset, val_dataset)
        # NOTE(review): evaluate_model returns a single score here, not a
        # per-language dict of reports — confirm calculate_macro_avg handles that.
        reports = evaluate_model(model)
        macro_avg = calculate_macro_avg(reports)
        print(f"Iter: {iteration}, attrReq: {attrReq}, macro f1: {macro_avg}")
        record = {"iter": iteration, "macro_avg": macro_avg, "attrReq": attrReq}
        Ablation_records.append(record)

In [None]:
Ablation_records

In [None]:
# Train four fresh models with the default get_custom_emb_model settings and
# evaluate each on the English test set.
# NOTE(review): `score` is overwritten on every iteration and nothing is
# stored, so only the last run's score remains after the loop — confirm intended.
for iteration in range(4):
    print("Iteration start ",iteration)
    model, loss_fn = get_custom_emb_model()
    model, avg_epoch_result = train_on_epoch(15, model, optimizer, train_dataset, val_dataset)
    score = evaluate_model(model, target = 'en')

## Draw Result Plots

In [None]:
import seaborn as sns

In [None]:
import pandas as pd

In [None]:
# Data for the attribute-embedding-size box plot; the EMB_SIZE and macro_avg
# columns are read further down. The path is relative to the working directory.
attrData = pd.read_csv('AttrEmbData.csv')

In [None]:
# Data for the LSTM-hidden-size box plot; the NUERON_SIZE and macro_avg
# columns are read further down. The path is relative to the working directory.
lstmData = pd.read_csv('lstmData.csv')

In [None]:
# Drop every row that contains a missing value; this rebinds attrData, so the
# unfiltered frame is only recoverable by re-reading the CSV.
attrData = attrData.dropna()

In [None]:
# Box plot of macro F1 per attribute embedding size, drawn without outlier markers.
ax = sns.boxplot(data=attrData, x="EMB_SIZE", y="macro_avg", showfliers=False)
ax.set(xlabel="Attribute Embedding Size")
ax.set_ylabel("Macro F1")

In [None]:
# Write the figure that holds the current box plot to a PNG in the working directory.
fig = ax.figure
fig.savefig("AttributeEmbeddingSize.png")

In [None]:
# Box plot of macro F1 per LSTM hidden size, drawn without outlier markers.
ax = sns.boxplot(data=lstmData, x="NUERON_SIZE", y="macro_avg", showfliers=False)
ax.set(xlabel="LSTM Hidden Size")
ax.set_ylabel("Macro F1")

In [None]:
# Write the figure that holds the current box plot to a PNG in the working directory.
fig = ax.figure
fig.savefig("LSTMHiddenSize.png")

In [None]:
# Data for the macro-F1 comparison plot; the Label, F1 and Method columns are
# read further down. The path is relative to the working directory.
enDev_macro = pd.read_csv('en_dev_macro_sns.csv')

In [None]:
# Show the 'CRFSuite' method under the name 'Autopager' in the plot legend.
is_crfsuite_macro = enDev_macro['Method'] == 'CRFSuite'
enDev_macro.loc[is_crfsuite_macro, 'Method'] = 'Autopager'

In [None]:
# Macro F1 per label, one box per method; the y axis is clipped to [0.5, 1].
ax = sns.boxplot(data=enDev_macro, x="Label", y="F1", hue="Method", palette="Set3")
ax.set(title="Macro F1 on En Dev --> En Test", ylim=(0.5, 1), xlabel="")
ax.set_ylabel("Macro F1")

In [None]:
# Data for the micro-F1 comparison plot; the Label, F1 and Method columns are
# read further down. The path is relative to the working directory.
enDev_micro = pd.read_csv('en_dev_micro_sns.csv')

In [None]:
# Show the 'CRFSuite' method under the name 'Autopager' in the plot legend.
is_crfsuite_micro = enDev_micro['Method'] == 'CRFSuite'
enDev_micro.loc[is_crfsuite_micro, 'Method'] = 'Autopager'

In [None]:
# Micro F1 per label, one box per method; the y axis is clipped to [0.5, 1].
ax = sns.boxplot(data=enDev_micro, x="Label", y="F1", hue="Method", palette="Set3")
ax.set(title="Micro F1 on En Dev --> En Test", ylim=(0.5, 1), xlabel="")
ax.set_ylabel("Micro F1")