In [17]:
import os
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score
from sklearn.model_selection import cross_val_predict, GroupKFold

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, tokenize, replace_digits
)
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS
from autopager.parserutils import (TagParser, MyHTMLParser, draw_scaled_page, position_check, compare_tag, get_first_tag)
storage = Storage()
parser = MyHTMLParser()
tagParser = TagParser()

Current test file:  ['en', 'zh', 'ko', 'ja', 'de', 'ru']


In [18]:
from autopager.autopager import get_shared_autopager

In [19]:
best_autopager = get_shared_autopager()

In [20]:
best_crf = best_autopager.crf

In [21]:
best_crf



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.001, c2=0.05,
    keep_tempfiles=None, max_iterations=100)

In [22]:
%%time
urls = [rec['Page URL'] for rec in storage.iter_records(language='en',contain_button = True, file_type='T')]
X_raw, y, page_positions = storage.get_Xy(language='en',contain_button = True,  contain_position=True,file_type='T', scaled_page='normal')
print("pages: {}  domains: {}".format(len(urls), len({get_domain(url) for url in urls})))

pages: 164  domains: 55
CPU times: user 4.14 s, sys: 26.8 ms, total: 4.17 s
Wall time: 4.17 s


In [23]:
len(X_raw)

164

## Use href as feature

In [24]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    return list(generator if limit is None else islice(generator, 0, limit))

def link_to_features(link):
    """Build the feature dict describing a single link.

    Parameters
    ----------
    link
        Selector for one link element; it must support ``.xpath()``.
        NOTE(review): presumably a ``parsel.Selector`` produced by
        ``Storage`` -- confirm against the caller.

    Returns
    -------
    dict
        Feature name -> value (bool, float, str or list of str).
    """
    text = normalize(get_link_text(link))
    href = get_link_href(link)
    if href is None:
        href = ""
    p = urlsplit(href)
    # Only the lower-cased parameter *names* of the query string are used.
    query_param_names = [name.lower() for name, _ in parse_qsl(p.query)]
    path_lower = p.path.lower()

    # Tag of the element that directly contains the link.
    parent = link.xpath('..').extract()
    parent = get_first_tag(parser, parent[0])

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)

    # Features that were tried and dropped:
    #   'elem-target' / 'elem-rel'   -- not effective
    #   'href-had-self-redirection'  -- ('#' in href) hurt performance
    return {
        'bias': 3.0,
        'isdigit': text.isdigit(),
        'isalpha': text.isalpha(),
        'num-tokens%s' % _num_tokens_feature(text): 1.0,
        'text-exact': replace_digits(text.strip()[:20].strip()),
        'parent-tag': parent,  # Really good at Normal, but gets worse in EVENT_SOURCE
        'class': css_classes,
        'class_disabled': 'disabled' in css_classes,  # Effective
        'query': query_param_names,
        # Compare by value: `href is ""` relied on string interning.
        'has-href': href != "",
        'path-has-page': 'page' in path_lower,
        'path-has-pageXX': re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None,
        'path-has-number': any(part.isdigit() for part in p.path.split('/')),
        # Raw string: '\d' in a plain string literal is an invalid escape sequence.
        'href-has-year': re.search(r'20\d\d', href) is not None,
    }


def page_to_features(xseq, positions=None):
    """Turn the links of one page into a list of feature dicts.

    Parameters
    ----------
    xseq
        Sequence of link selectors belonging to a single page.
    positions
        Optional; when it is not None the surrounding-text features are
        NOT added.
        NOTE(review): the value itself is never read here -- confirm that
        skipping the context features is the intended effect.

    Returns
    -------
    list of dict
        One feature dict per link, in the same order as ``xseq``.
    """
    features = [link_to_features(a) for a in xseq]
    around = get_text_around_selector_list(xseq, max_length=15)

    if positions is None:
        for feat, (before, after) in zip(features, around):
            feat['text-before'] = normalize(before)
            # NOTE(review): 'text-after' is stored un-normalised while
            # 'text-before' is normalised -- confirm this asymmetry is wanted.
            feat['text-after'] = after
    return features

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 10.5 µs


In [25]:
X = [page_to_features(xseq) for xseq in X_raw]

In [63]:
# Count how often each parent tag occurs over all links, and print the parent
# tag of every link that carries a pagination label.
# NOTE: the name `Counter` shadows `collections.Counter` imported at the top of
# the notebook; it is kept because the following cells read `Counter`.
Counter = {}
for idx, page in enumerate(X):
    for node, true_label in zip(page, y[idx]):
        # Read the tag for every node; previously it was only assigned inside
        # the label check, so unlabelled nodes were counted under a stale tag
        # (or raised NameError on a fresh kernel).
        ptag = node['parent-tag']
        if true_label in ('PAGE', 'NEXT', 'PREV'):
            print(ptag, true_label)
        Counter[ptag] = Counter.get(ptag, 0) + 1

nav PAGE
span PAGE
span PAGE
span PAGE
span PAGE
span PAGE
nav PAGE
nav NEXT
nav PAGE
span PAGE
span PAGE
span PAGE
span PAGE
span PAGE
nav PAGE
nav NEXT
nav PREV
nav PAGE
span PAGE
span PAGE
span PAGE
span PAGE
span PAGE
nav PAGE
nav NEXT
nav PREV
nav PAGE
span PAGE
span PAGE
span PAGE
span PAGE
span PAGE
nav PAGE
nav NEXT
nav PREV
nav PAGE
span PAGE
span PAGE
span PAGE
span PAGE
span PAGE
nav PAGE
nav PREV
nav PAGE
span PAGE
span PAGE
span PAGE
span PAGE
span PAGE
nav PAGE
li PAGE
li PAGE
li PAGE
li PAGE
li NEXT
li PAGE
li PAGE
li PAGE
li PAGE
li NEXT
li PREV
li PAGE
li PAGE
li PAGE
li PAGE
li PAGE
li PAGE
li NEXT
li PREV
li PAGE
li PAGE
li PAGE
li PAGE
li PAGE
li PAGE
li NEXT
div PAGE
div PAGE
div PAGE
div PAGE
div PAGE
div PAGE
div NEXT
div PAGE
div PAGE
div PREV
div PAGE
div PAGE
div PAGE
div PAGE
div PAGE
div NEXT
div PAGE
div PAGE
div PAGE
div PAGE
div NEXT
div PREV
div PAGE
div PAGE
div PAGE
div PAGE
div NEXT
div PAGE
div PAGE
div PAGE
div NEXT
div PREV
div PAGE
div PAGE
div PA

In [16]:
sorted(Counter.items(), key=lambda x:x[1], reverse=True)

[('li', 17964),
 ('div', 12920),
 ('span', 3254),
 ('p', 1271),
 ('td', 1188),
 ('h3', 629),
 ('h2', 621),
 ('font', 287),
 ('dd', 286),
 ('strong', 260),
 ('h4', 146),
 ('h1', 124),
 ('article', 104),
 ('label', 101),
 ('nav', 72),
 ('figure', 65),
 ('h5', 60),
 ('h6', 58),
 ('figcaption', 56),
 ('form', 41),
 ('b', 34),
 ('ul', 34),
 ('time', 34),
 ('blockquote', 26),
 ('noscript', 24),
 ('body', 21),
 ('a', 21),
 ('center', 20),
 ('header', 17),
 ('th', 13),
 ('footer', 12),
 ('dt', 6),
 ('section', 5),
 ('address', 5),
 ('home-logo', 4),
 ('cart-header', 4),
 ('cite', 4),
 ('small', 3),
 ('aside', 2),
 ('em', 1),
 ('textarea', 1)]

In [14]:
sorted(Counter, reverse = True)

['ul',
 'time',
 'th',
 'textarea',
 'td',
 'strong',
 'span',
 'small',
 'section',
 'p',
 'noscript',
 'nav',
 'li',
 'label',
 'home-logo',
 'header',
 'h6',
 'h5',
 'h4',
 'h3',
 'h2',
 'h1',
 'form',
 'footer',
 'font',
 'figure',
 'figcaption',
 'em',
 'dt',
 'div',
 'dd',
 'cite',
 'center',
 'cart-header',
 'body',
 'blockquote',
 'b',
 'aside',
 'article',
 'address',
 'a']

In [44]:
# Show the features of every PAGE/NEXT link on the first page that contains
# at least one PAGE label.
first_paginated = next(
    ((feats, labels) for feats, labels in zip(X, y) if 'PAGE' in labels),
    None,
)
if first_paginated is not None:
    for node_x, node_y in zip(*first_paginated):
        if node_y in ('PAGE', 'NEXT'):
            print("Label: ", node_y)
            print(node_x)

Label:  PAGE
{'bias': 3.0, 'isdigit': True, 'isalpha': False, 'num-tokens=1': 1.0, 'text-exact': 'X', 'parent-tag': 'nav', 'class': ' currentpage ', 'class_disabled': False, 'query': [], 'has-href': True, 'path-has-page': False, 'path-has-pageXX': False, 'path-has-number': False, 'href-has-year': False, 'text-before': ' page 1 of 349', 'text-after': ''}
Label:  PAGE
{'bias': 3.0, 'isdigit': True, 'isalpha': False, 'num-tokens=1': 1.0, 'text-exact': 'X', 'parent-tag': 'span', 'class': 'items ', 'class_disabled': False, 'query': [], 'has-href': True, 'path-has-page': True, 'path-has-pageXX': False, 'path-has-number': False, 'href-has-year': False, 'text-before': '', 'text-after': ''}
Label:  PAGE
{'bias': 3.0, 'isdigit': True, 'isalpha': False, 'num-tokens=1': 1.0, 'text-exact': 'X', 'parent-tag': 'span', 'class': 'items ', 'class_disabled': False, 'query': [], 'has-href': True, 'path-has-page': True, 'path-has-pageXX': False, 'path-has-number': False, 'href-has-year': False, 'text-befor

In [61]:
# Collect the vocabulary of the categorical features:
#   count_class - distinct CSS class names
#   count_query - distinct query-parameter names
#   count_text  - distinct values over all text features together
#   text_dict   - distinct values per text feature ('text-exact', ...)
count_class = set()
count_query = set()
count_text = set()  # read by later cells; it was never defined before
text_dict = dict()
for page in X:
    for node in page:
        count_class.update(node['class'].split())
        count_query.update(node['query'])
        for key, val in node.items():
            if 'text' in key:
                text_dict.setdefault(key, set()).add(val)
                count_text.add(val)

In [57]:
len(count_class)

3823

In [58]:
len(count_query)

299

In [59]:
len(count_text)

20265

In [62]:
for k, v in text_dict.items():
    print(k, len(v))

text-exact 9791
text-before 5684
text-after 5153


In [60]:
count_text

{'',
 'nt for toggling',
 'Apr 7, 2020 12:',
 'There are not t',
 'weekend though.',
 'Departments',
 'new releases & bests',
 'reality tv',
 '-aug-2015 02:44',
 'rom yours so...',
 '18-Oct-2014 04:',
 'calendars',
 'de to look… by',
 '-oct-2013 02:31',
 'list your property',
 'tours',
 'If you spend an',
 'hqpremier',
 'Hi people, I ha',
 'vario watch straps',
 'https://telegram.me/',
 'mexico',
 '(109 replies)  ',
 'Game related di',
 '#comment-##  De',
 '-jul-2013 00:28',
 'smp levels guide',
 'speaking orders in c',
 'random games',
 '519  Thread ico',
 '<li class="no-m',
 'hp node enclosures',
 '66 pt (1%)  Shi',
 'Shop By Age',
 'cables',
 '03-Jul-2015 03:',
 'I should watch ',
 ' [twd\xa0$616.11]',
 '-jun-2011 09:21',
 '16-Dec-2014 03:',
 'view officialvisitor',
 ' 8,492 17 # 2',
 'compostable tablewar',
 '36,989  Thread ',
 '26-Mar-2016 10:',
 'XXmb video cards',
 ' 316l stainless',
 'prevention of the us',
 'breastfeeding',
 '-may-2015 07:50',
 '|  Apr 30, 2020',
 '14-Jan-2016

In [None]:
X[0][0]

## Without href as a feature

In [26]:
groups = [get_domain(url) for url in urls]

In [27]:
# TRAIN_SIZE = 80
# X_train, y_train = X[:TRAIN_SIZE], y[:TRAIN_SIZE]
# X_test, y_test = X[TRAIN_SIZE:], y[TRAIN_SIZE:]
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.001,
    c2=0.05,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=False,
)
# crf.fit(X_train, y_train, X_test, y_test)

In [28]:
crf



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.001, c2=0.05,
    keep_tempfiles=None, max_iterations=100)

We must be careful when splitting the dataset into training and
evaluation parts: pages from the same domain should be in the same
"bin". There could be several pages from the same domain, and these
pages may have duplicate or similar link patterns
(e.g. a particular CSS class for paginator links). If we put one such page in the training dataset and another in
the evaluation dataset, then the metrics will be too optimistic,
and they can lead us to choose the wrong features/models.

In [29]:
from sklearn.model_selection import cross_validate, cross_val_predict

In [30]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold

In [31]:
# Pages hold different numbers of links, so the nested lists are ragged.
# dtype=object keeps one Python list per page; without it recent NumPy
# versions refuse to build the array.
X = np.array(X, dtype=object)
y = np.array(y, dtype=object)

In [32]:
groups = [get_domain(url) for url in urls]
group_kfold = GroupKFold(n_splits=5)
group_kfold.get_n_splits(X, y, groups)

5

In [38]:
def filter_empty(x, y):
    """Drop empty pages while keeping features and labels aligned.

    Parameters
    ----------
    x : sequence of per-page link sequences.
    y : sequence of per-page label sequences, parallel to ``x``.

    Returns
    -------
    (list, list)
        The pages of ``x`` and ``y`` for which both entries are non-empty,
        in their original order.
    """
    # Filter pairwise so that a removed page disappears from both sides.
    # (The previous version tested len(x) / len(y) instead of the page and
    # then returned the unfiltered inputs.)
    kept = [
        (page_x, page_y)
        for page_x, page_y in zip(x, y)
        if len(page_x) != 0 and len(page_y) != 0
    ]
    res_x = [page_x for page_x, _ in kept]
    res_y = [page_y for _, page_y in kept]
    return res_x, res_y

In [43]:
# For every domain-grouped fold: train a fresh CRF on the training split and
# score it on the separate test set returned by `storage` for each language.
# NOTE(review): X_test / y_test (the held-out fold) are never used below --
# evaluation always runs on the storage test set; confirm this is intended.
# NOTE(review): `evaluate_test` is defined in a cell further down the notebook
# and must have been executed before this cell.
for train_index, test_index in group_kfold.split(X, y, groups):

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Same hyper-parameters as the `crf` built in the earlier cell.
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.001,
        c2=0.05,
        max_iterations=100,
        all_possible_transitions=True,
        verbose=False,
    )
    crf.fit(X_train, y_train)
    for language in ['en']:
#     for language in ['en','de','ru','zh','ja','ko']:
        print("Testing language: ", language)
        # test_urls is only needed by the commented-out summary print below.
        test_urls = [rec['Page URL'] for rec in storage.iter_test_records_by_language(language=language)]
        # The test set does not depend on the fold, yet it is reloaded on every iteration.
        test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
        test_X_raw, test_y = filter_empty(test_X_raw, test_y)
        #         print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
        evaluate_test(test_X_raw, test_y, crf)
        print("===================================")
        
    print("=====================")

Testing language:  en


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Macro
{'page_prec': 0.4318840579710145, 'page_rec': 0.3840579710144928, 'page_f1': 0.39912186763760976, 'next_prec': 0.30434782608695654, 'next_rec': 0.30434782608695654, 'next_f1': 0.30434782608695654}
Micro
{'page_prec': 0.9868421052631579, 'page_rec': 0.5952380952380952, 'page_f1': 0.7425742574257426, 'next_prec': 0.8, 'next_rec': 0.41379310344827586, 'next_f1': 0.5454545454545454}
Testing language:  en
Macro
{'page_prec': 0.6024844720496895, 'page_rec': 0.5314872325741892, 'page_f1': 0.5528619930793844, 'next_prec': 0.6086956521739131, 'next_rec': 0.6086956521739131, 'next_f1': 0.6086956521739131}
Micro
{'page_prec': 0.9894736842105263, 'page_rec': 0.746031746031746, 'page_f1': 0.8506787330316742, 'next_prec': 0.76, 'next_rec': 0.6551724137931034, 'next_f1': 0.7037037037037037}
Testing language:  en
Macro
{'page_prec': 0.5002121138310397, 'page_rec': 0.515648723257419, 'page_f1': 0.4972271028021379, 'next_prec': 0.2608695652173913, 'next_rec': 0.2608695652173913, 'next_f1': 0.26086

In [None]:
y_pred = cross_val_predict(crf, X, y, cv=5, n_jobs=-1)

In [None]:
len(y_pred)

In [None]:
%%time
group_kfold = GroupKFold(n_splits=5)
y_pred = cross_val_predict(crf, X, y, cv=group_kfold, groups=groups, n_jobs=-1)
print(flat_classification_report(y, y_pred, labels=['PAGE', 'NEXT'], digits=3))
print("Sequence accuracy: {:0.3f}".format(sequence_accuracy_score(y, y_pred)))

In [None]:
crf.fit(X, y)
# crf.attributes_
crf.num_attributes_

In [None]:
crf.classes_

In [None]:
# [a for a in sorted(crf.attributes_) if a.startswith('id')]

## What are important features?

In [None]:
import eli5

In [None]:
# XXX: weight for correlated features don't show their importance
# XXX: weights for features of different scale don't show their importance
# (e.g. coefficients to text-after and text-before features are high, but only
# because input is scaled down for these features)

eli5.show_weights(crf, top=50)

## Let's check errors the model is making

In [None]:
group_kfold = GroupKFold(n_splits=6)
groups = [get_domain(url) for url in urls]
y_pred = cross_val_predict(crf, X, y, cv=group_kfold, groups=groups, n_jobs=-1)

# One boolean per page: True when the predicted label sequence differs from
# the true one. Sequences are compared page by page because pages have
# different lengths and cannot be stacked into a regular array.
errors = np.array(
    [list(seq_true) != list(seq_pred) for seq_true, seq_pred in zip(y, y_pred)],
    dtype=bool,
)

# Use the same record filter as the cell that loaded `urls` / `X_raw` / `y`
# (language='en') so that rows line up with the pages.
records = list(storage.iter_records(language='en', contain_button=True, file_type='T'))
assert len(records) == len(errors), "records and pages are not aligned"

error_rows = [row for row, is_err in zip(records, errors) if is_err]
error_links = [links for links, is_err in zip(X_raw, errors) if is_err]
error_y_pred = [seq for seq, is_err in zip(y_pred, errors) if is_err]
error_y_true = [seq for seq, is_err in zip(y, errors) if is_err]

## Unused code

# Test data

In [34]:
def get_test_data(type=None, scaled_page='normal'):
    """Load test pages from the NORMAL and/or EVENT_SOURCE test file.

    Parameters
    ----------
    type
        'NORMAL' or 'EVENT_SOURCE' loads only that file; any other non-None
        value loads both and concatenates them (NORMAL first). With None a
        hint is printed and None is returned.
    scaled_page
        Forwarded unchanged to ``storage.get_test_Xy``.

    Returns
    -------
    tuple or None
        ``(X_raw, y, page_positions)``.

    Side effects: sets ``storage.test_file`` and prints a page/domain summary
    for every file that is loaded.
    """
    def _load(test_file):
        # Point the shared storage at the requested file before reading from it.
        storage.test_file = test_file
        page_urls = [rec['Page URL'] for rec in storage.iter_test_records(exclude_en=None)]
        pages, labels, positions = storage.get_test_Xy(validate=False, contain_position=True,scaled_page=scaled_page,exclude_en=None)
        print("pages: {}  domains: {}".format(len(page_urls), len({get_domain(url) for url in page_urls})))
        return pages, labels, positions

    if type is None:
        print("Please assign type of test_data")
        return
    if type == 'NORMAL':
        return _load('NORMAL')
    if type == 'EVENT_SOURCE':
        return _load('EVENT_SOURCE')

    # Any other value: combine both files, NORMAL first.
    normal_X, normal_y, normal_positions = _load('NORMAL')
    event_X, event_y, event_positions = _load('EVENT_SOURCE')
    return (
        normal_X + event_X,
        normal_y + event_y,
        normal_positions + event_positions,
    )

In [None]:
# test_X_raw, test_y, test_page_positions = get_test_data('EVENT_SOURCE')
test_X_raw, test_y, test_page_positions = get_test_data('NORMAL', scaled_page='normal')

In [37]:
def evaluate_test(test_X_raw, test_y, crf):
    """Predict labels for the test pages and print macro and micro scores.

    Parameters
    ----------
    test_X_raw : sequence of pages, each a sequence of link selectors.
    test_y : sequence of true label sequences, parallel to ``test_X_raw``.
    crf : fitted model exposing ``predict``.

    Returns
    -------
    None; the two reports are printed.
    """
    test_X = [page_to_features(xseq) for xseq in test_X_raw]
    # Predictions are ragged (one sequence per page, of varying length), so
    # the array must hold Python objects; a plain np.asarray fails on recent
    # NumPy versions.
    test_y_pred = np.asarray(crf.predict(test_X), dtype=object)

    macro_report = page_level_score(test_y_pred, test_y)  # averaged per page
    micro_report = node_level_score(test_y_pred, test_y)  # over all links at once
    print("Macro")
    print(macro_report)
    print("Micro")
    print(micro_report)
    return

In [None]:
test_crf = crf

In [None]:
evaluate_test(test_X_raw, test_y, test_crf)

In [None]:
test_languages = ['en']

In [65]:
test_languages = storage.get_all_test_languages()

In [None]:
for language in test_languages:
    print("Testing language: ", language)
    test_urls = [rec['Page URL'] for rec in storage.iter_test_records_by_language(language=language)]
    test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
    print("pages: {}  domains: {}".format(len(test_urls), len({get_domain(url) for url in test_urls})))
    evaluate_test(test_X_raw, test_y, crf)
    print("===================================")

In [70]:
import numpy as np
def count_labels(y):
    """Return the total number of labels over all pages in ``y``."""
    page_sizes = list(map(len, y))
    return np.sum(page_sizes)

In [74]:
print("Dev en", count_labels(y)/len(y))
for language in ['en','de','ru','zh','ja','ko']:
    print("Testing language: ", language)
    test_urls = [rec['Page URL'] for rec in storage.iter_test_records_by_language(language=language)]
    test_X_raw, test_y = storage.get_test_Xy_by_language(language=language)
    print(count_labels(test_y)/len(test_y))

Dev en 242.67073170731706
Testing language:  en
459.734693877551
Testing language:  de
401.55
Testing language:  ru
160.42857142857142
Testing language:  zh
237.61363636363637
Testing language:  ja
180.34782608695653
Testing language:  ko
484.375


In [None]:
test_X_raw, test_y = storage.get_test_Xy_by_language(language='en')

In [None]:
len(test_X_raw)

In [None]:
len(test_y)

In [41]:
from sklearn.metrics import classification_report

In [42]:
import pandas as pd

In [36]:
def node_level_score(y_pred, y_true):
    """Precision, recall and F1 for PAGE and NEXT, computed over all links.

    All pages are flattened into one label list before scoring.

    Returns
    -------
    dict
        Keys ``page_prec``, ``page_rec``, ``page_f1``, ``next_prec``,
        ``next_rec``, ``next_f1``.
    """
    reports = flat_classification_report(
        y_true, y_pred, labels=['PAGE', 'NEXT'], digits=3, output_dict=True
    )
    # (output key, label in the report, metric in the report)
    fields = (
        ("page_prec", 'PAGE', 'precision'),
        ("page_rec", 'PAGE', 'recall'),
        ("page_f1", 'PAGE', 'f1-score'),
        ("next_prec", 'NEXT', 'precision'),
        ("next_rec", 'NEXT', 'recall'),
        ("next_f1", 'NEXT', 'f1-score'),
    )
    return {key: reports[label][metric] for key, label, metric in fields}

def page_level_score(y_pred, y_true):
    """Precision, recall and F1 for PAGE and NEXT, averaged over pages.

    A report is computed for each page separately and the values are
    averaged. Pages whose true labels contain none of PAGE, NEXT or PREV
    are skipped and do not count towards the average.

    Parameters
    ----------
    y_pred : sequence of predicted label sequences, one per page.
    y_true : sequence of true label sequences, parallel to ``y_pred``.

    Returns
    -------
    dict
        Keys ``page_prec``, ``page_rec``, ``page_f1``, ``next_prec``,
        ``next_rec``, ``next_f1``. Every value is 0.0 when no page
        qualifies (previously this raised ZeroDivisionError).
    """
    # (output key, label in the report, metric in the report)
    fields = (
        ("page_prec", 'PAGE', 'precision'),
        ("page_rec", 'PAGE', 'recall'),
        ("page_f1", 'PAGE', 'f1-score'),
        ("next_prec", 'NEXT', 'precision'),
        ("next_rec", 'NEXT', 'recall'),
        ("next_f1", 'NEXT', 'f1-score'),
    )
    totals = {key: 0.0 for key, _, _ in fields}
    size = 0
    for page_pred, page_true in zip(y_pred, y_true):
        if 'NEXT' not in page_true and 'PAGE' not in page_true and 'PREV' not in page_true:
            continue
        size += 1
        reports = classification_report(page_true, page_pred, labels=['PAGE', 'NEXT'], digits=3, output_dict=True)
        for key, label, metric in fields:
            totals[key] += reports[label][metric]

    if size == 0:
        # Nothing to average over.
        return totals
    return {key: total / size for key, total in totals.items()}

In [None]:
test_X = [page_to_features(xseq) for xseq in test_X_raw]
# test_X = [page_to_features(xseq) for xseq in test_X_raw]
test_y_pred = crf.predict(test_X)
test_y_pred = np.asarray(test_y_pred)
# print(flat_classification_report(test_y, test_y_pred, labels=['PAGE', 'NEXT'], digits=3))

In [None]:
page_level_score(test_y_pred, test_y)

In [None]:
node_level_score(test_y_pred, test_y)

In [None]:
# Summary of the predictions on the test set:
#   PAGE_count / NEXT_count    - number of links predicted as PAGE / NEXT
#   multi_count / single_count - pages with / without any predicted PAGE or NEXT
PAGE_count = sum(1 for page in test_y_pred for label in page if label == 'PAGE')
NEXT_count = sum(1 for page in test_y_pred for label in page if label == 'NEXT')
multi_count = sum(
    1 for page in test_y_pred
    if any(label == 'PAGE' or label == 'NEXT' for label in page)
)
single_count = len(test_y_pred) - multi_count
print(PAGE_count)
print(NEXT_count)
print(multi_count)
print(single_count)

In [None]:
# One boolean per test page: True when the predicted label sequence differs
# from the true one. Compared page by page because the sequences are ragged.
errors = np.array(
    [list(seq_true) != list(seq_pred) for seq_true, seq_pred in zip(test_y, test_y_pred)],
    dtype=bool,
)

# NOTE(review): the records must come from the same test set as
# test_X_raw / test_y -- confirm that iter_test_records(exclude_en=None)
# matches the loader that produced them.
test_records = list(storage.iter_test_records(exclude_en=None))
assert len(test_records) == len(errors), "test records and pages are not aligned"

error_rows = [row for row, is_err in zip(test_records, errors) if is_err]
error_links = [links for links, is_err in zip(test_X_raw, errors) if is_err]
error_y_pred = [seq for seq, is_err in zip(test_y_pred, errors) if is_err]
error_y_true = [seq for seq, is_err in zip(test_y, errors) if is_err]

In [None]:
errors

In [None]:
sum(errors)

In [None]:
# For every page with at least one wrong prediction, print its URL followed by
# each link whose predicted label differs from the true label.
for links, yseq_pred, yseq_true, row in zip(error_links, error_y_pred, error_y_true, error_rows):
    print(row['Page URL'])
    # links.extract() gives one extracted string per link, parallel to the labels.
    for label_correct, label_pred, link in zip(yseq_true, yseq_pred, links.extract()):
        if label_correct != label_pred:
            print("CORRECT: %4s, PREDICT: %4s, LINK: %s" % (label_correct, label_pred, link))
    print("\n")

## Count the parent tag of every link labelled PAGE

In [None]:
# Count the parent tag of every link whose true label is PAGE in the training
# pages; for <span> parents the parent markup is printed for inspection.
counter = {}
for page_links, page_labels in zip(X_raw, y):
    for link, label in zip(page_links, page_labels):
        if label != 'PAGE':
            continue
        parent_html = link.xpath('..').extract()
        parent = get_first_tag(parser, parent_html[0])
        if parent == 'span':
            print(parent_html)
        counter[parent] = counter.get(parent, 0) + 1

In [None]:
counter

In [None]:
# Count the parent tag of every link whose true label is PAGE in the test pages.
counter = {}
for page_links, page_labels in zip(test_X_raw, test_y):
    for link, label in zip(page_links, page_labels):
        if label != 'PAGE':
            continue
        parent_html = link.xpath('..').extract()
        parent = get_first_tag(parser, parent_html[0])
        counter[parent] = counter.get(parent, 0) + 1

In [None]:
counter

## Visualize node positions on the training pages

In [None]:
import matplotlib.pyplot as plt

In [None]:
target_page = 12 - 1

In [None]:
all_tag = page_positions[target_page]

In [None]:
def draw_page(page):
    """Scatter-plot the nodes of ``page`` with the y axis pointing down.

    Each node is indexed as ``node[0]`` (horizontal) and ``node[1]``
    (vertical).
    """
    xs = [node[0] for node in page]
    ys = [node[1] for node in page]
    plt.scatter(xs, ys)
#     plt.xlim([-0.05, 1.05])
#     plt.ylim([-0.05, 1.05])
    axes = plt.gca()
    axes.invert_yaxis()

In [None]:
draw_page(all_tag)

In [None]:
label_nodes = [node for node, label in zip(page_positions[target_page], y[target_page]) if label != 'O']

In [None]:
draw_page(label_nodes)

In [None]:
all_pages = [[node for node, label in zip(page_positions[page], y[page]) if label != 'O'] for page in range(len(y))]

In [None]:
from itertools import chain 

In [None]:
flatten_list = list(chain.from_iterable(all_pages)) 

In [None]:
draw_page(flatten_list)