In [692]:
import json
import pandas as pd
import numpy as np
import ast
import math
from collections import Counter
import re

Unit Tests are integrated as part of the code after each function. There are a few main unit tests:

1. Merging results in proper monotonic left join
2. Highlighted xpath should always equal full xpath after merge
3. Check that all highlight boxes are in monotonic order
4. Ensure highlight boxes have no overlap or negligible overlap in IOU(~0.1% for now)

TODO: Create unit tests for filtering all/highlight node fns, 

In [693]:
!pip install ipytest



In [694]:
import pytest
import ipytest
ipytest.autoconfig()

Plan:
1. Write explicit filtering scripts for both highlight df and all nodes
    - Print or make clear any problems
2. Merge with good principles
    - Print or make clear any problems
3. Apply agreed upon period rule
4. Special characters handling (at least new lines and weird spaces that won't get tokenized properly?)

In [695]:
all_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_0_all_nodes.json"
highlighted_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_0_highlighted.json"
with open(all_nodes_path, encoding='UTF-8') as f:
    all_nodes_data = json.load(f)
with open(highlighted_nodes_path, encoding='UTF-8') as f:
    highlighted_data = json.load(f)

In [696]:
def filter_all_nodes_df(df, xpaths_col):
    
    original_length = len(df)
    df = df[
        ~(df[xpaths_col].str.contains('/script')) & 
        ~(df[xpaths_col].str.contains('/noscript'))
    ]
    
    if original_length - len(df) > 0:
        print(f'{original_length - len(df)} rows that had /script or /noscript were removed')
    
    new_length = len(df)
    df = df[df[xpaths_col] != '']
    
    if new_length - len(df) > 0:
        print(f'{new_length - len(df)} rows with empty strings were removed')
    
    return df

def filter_highlight_nodes_df(df):
    
    print('Filtering highlight nodes df now')
    # We can apply this to highlight nodes without issue. In theory we should see 0 print statements so this 
    # can be a guardrail against unknown bugs
    df = filter_all_nodes_df(df, 'highlighted_xpaths')
    
    original_length = len(df)
    df = df[
        (df['highlighted_xpaths'] != 'DELETED') &
        (df['highlighted_coordinates'] != 'DEL')
    ]
    
    if original_length - len(df) > 0:
        print(f'{original_length - len(df)} rows with DEL, DELETED were removed')
    
    new_length = len(df)
    df = df.dropna()
    
    if new_length - len(df) > 0:
        print(f'{new_length - len(df)} NA rows were dropped. THIS IS A PROBLEM.')
    
    return df

In [697]:
all_nodes_xpaths = ast.literal_eval(all_nodes_data['xpaths'])
all_nodes_segmented_text = ast.literal_eval(all_nodes_data['segmentedTexts'])
df = pd.DataFrame()
df['xpaths'] = all_nodes_xpaths
df['text'] = all_nodes_segmented_text
df['all_nodes_ordering'] = df.index.copy()

In [698]:
highlight_xpaths = ast.literal_eval(highlighted_data['xpaths'])
highlight_segmented_text = ast.literal_eval(highlighted_data['segmentedTexts'])
highlight_text = ast.literal_eval(highlighted_data['texts'])
highlight_labels = ast.literal_eval(highlighted_data['labels'])
highlight_coordinates = ast.literal_eval(highlighted_data['c'])
highlighted_df = pd.DataFrame()
highlighted_df['highlighted_xpaths'] = highlight_xpaths
highlighted_df['highlighted_segmented_text'] = highlight_segmented_text
highlighted_df['highlighted_labels'] = highlight_labels
highlighted_df['highlighted_coordinates'] = highlight_coordinates
highlighted_df['segment_number_from_idx'] = highlighted_df.index.copy()


In [782]:
# Function to process periods in text.
# Occasionally there is weird text which 
# has a period, space, and another 
# character, which does not match to 
# anything in all_node_text; I remove 
# these as well.

def remove_periods(row):
    # try:
    nrow = row
    xpaths = row['highlighted_xpaths']
    texts = row['highlighted_segmented_text']
    # except:
    #     return pd.Series({'highlighted_xpaths': [], 'highlighted_segmented_text': []})
    indices_to_remove = []
    for i, text in enumerate(texts):
        if '.' == text or '. ' == text or ('. ' in text and len(text) == 3):
            indices_to_remove.append(i)
    texts = [text for i, text in enumerate(texts) if i not in indices_to_remove]
    xpaths = [xpath for i, xpath in enumerate(xpaths) if i not in indices_to_remove]
    nrow['highlighted_xpaths'] = xpaths
    nrow['highlighted_segmented_text'] = texts
    return nrow# pd.Series({'highlighted_xpaths': xpaths, 'highlighted_segmented_text': texts})

In [700]:
highlighted_df = highlighted_df.apply(lambda row: remove_periods(row), axis=1)
highlighted_df['num_entries_1'] = highlighted_df['highlighted_xpaths'].apply(len)
highlighted_df['num_entries_2'] = highlighted_df['highlighted_segmented_text'].apply(len)

In [701]:
# remove_periods(highlighted_df.iloc[0]).tolist()

In [702]:
# def period_counter(string_list):
#     count = 0
#     for string in string_list:
#         if '.' in string or '. ' in string:
#             count += 1
#     return count

In [703]:
# n_e_1 = highlighted_df['highlighted_segmented_text'].apply(lambda row: len(row) - period_counter(row))
# n_e_1

In [704]:

# highlighted_df['num_entries_1'] = n_e_1

In [705]:
assert highlighted_df['num_entries_1'].equals(highlighted_df['num_entries_2']), 'Mismatch in segmentation and groupings'
print("There is no mismatch in segmentations and groupings, we can proceed")

There is no mismatch in segmentations and groupings, we can proceed


In [706]:
exploded_highlight_df = highlighted_df[
    ['highlighted_xpaths',
     'highlighted_segmented_text',
     'highlighted_labels',
     'segment_number_from_idx',
     'highlighted_coordinates',
     'num_entries_1']
    ].explode(column=['highlighted_xpaths','highlighted_segmented_text']).reset_index(drop=True)

exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()

In [707]:
exploded_highlight_df = filter_highlight_nodes_df(exploded_highlight_df)

Filtering highlight nodes df now
48 rows with DEL, DELETED were removed


In [708]:
# exploded_highlight_df['num_entries_1'] = exploded_highlight_df['highlighted_xpaths'].apply(len)


In [709]:
exploded_highlight_df['top'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[0]))
exploded_highlight_df['left'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[1]))
exploded_highlight_df['width'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[2]))
exploded_highlight_df['height'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[3]))

Okay so far looks like we can do logic:
1. Sort by top and then if theres a tiebreaker, sort by left, and then sort by order in the highlight list (in the case of multiple nodes being highlighted in same row with 1 highlight box)
2. If it starts to show that top coordinates are not exact and are approximaate, we are going to need to start doing approximate sorting or some IOU related logic to post process this to infer if something is the same line (which seems extremely likely to work)

In [710]:
def sort_exploded_highlight_box_by_coordinates(exploded_highlight_df):
    '''
    If logic ever has to deal with approximations and some line inference (with IOU math),
    then this function will become more complicated but for now it seems okay
    '''
    return exploded_highlight_df.sort_values(by=['top','left','exploded_highlight_node_order']).reset_index(drop=True)

In [711]:
exploded_highlight_df = sort_exploded_highlight_box_by_coordinates(exploded_highlight_df)
exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()


In [712]:
df = filter_all_nodes_df(df, 'xpaths')

2 rows that had /script or /noscript were removed


In [714]:
# Counter(df.xpaths).most_common()

In [715]:
# df[df['xpaths'] == '/html/body/document/type/sequence/filename/description/text/center[23]/div/p[1]']

In [716]:
## Trying sequence merging for highlight df onto the all nodes df

In [752]:
# # Monotonic left merge using dictionary for highlighted df. 
# # This version is much faster than the 
# # version below, but loses the df index ordering.
# # Does not affect highlight/all node ordering.

# def monotonic_left_merge(full, highlighted):
#     found_xpaths = []
#     merged_rows = []
#     highlighted_dict = highlighted.groupby(['highlighted_xpaths', \
#     'highlighted_segmented_text']).first().to_dict('index')
#     # print(highlighted_dict)
#     for _, row in full.iterrows():
#         full_xpaths = row['xpaths']
#         full_texts = row['text']
#         # Could do fuzzy matching instead of direct full text matching
#         matched_hrow = highlighted_dict.get((full_xpaths, full_texts))
        
#         if matched_hrow is None:
#             merged_rows.append(row)
#         else:
#             merged_row = pd.concat([row, pd.Series(matched_hrow)], axis=0)
#             merged_rows.append(merged_row)
#             highlighted_dict.pop((full_xpaths, full_texts))
        
#     merged_df = pd.concat(merged_rows, axis=1).T
            
#     return merged_df

In [768]:
def monotonic_left_merge(full, highlighted):
    found_xpaths = []
    merged_df = []
    for _, row in full.iterrows():
        full_xpaths = row['xpaths']
        full_texts = row['text']
        n_order = row['all_nodes_ordering']

        matched_hrow = pd.Series([])
        # print(full_texts, full_xpaths)
        for _, h_row in highlighted.iterrows():
            h_xpaths = h_row['highlighted_xpaths']
            h_texts = h_row['highlighted_segmented_text']

            if h_xpaths == full_xpaths and h_texts in full_texts:
                if h_xpaths in found_xpaths:
                    print('CURRENT h_xpath ALREADY FOUND BEFORE:')
                    print(h_xpaths)
                    _
                    
                found_xpaths.append(h_xpaths)
                # Instead of appending to merged_df 
                # directly, save the specific h_row 
                # and as you build the merged_df 
                # in the outer loop for all texts, for 
                # the highlight matched rows, append 
                # the h_row as well.
                
                # merged_df.append((row, h_row))
                matched_hrow = h_row
                break
        if len(matched_hrow) == 0:
            # print('No matched highlighted xpaths', full_xpaths)
            merged_df.append(row)
        else:
            merged_df.append(pd.concat([row, matched_hrow], axis=0))
            
    return merged_df

In [769]:
# %%ipytest
# Uncomment above line to activate test
'''
Test merging as a proper left join
1. All left and right rows should be unmodified
2. No left rows should be deleted -> only matched right rows should exist

'''
def test_monotonic_left_merge():
    # Create sample data for testing
    full_data = {
        'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
        'text': ['A', 'B', 'C', 'D', 'E'],
        'all_node_ordering': ['1', '2', '3', '4', '5']
    }
    highlighted_data = {
        'highlighted_xpaths': ['p[2]', 'p[4]'],
        'highlighted_segmented_text': ['B', 'D'],
        'top': ['0', '10'],
        'left': ['5', '15']
    }

    full_df = pd.DataFrame(full_data)
    highlighted_df = pd.DataFrame(highlighted_data)

    # Expected result after left merge
    expected_data = {
        'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
        'text': ['A', 'B', 'C', 'D', 'E'],
        'all_node_ordering': ['1', '2', '3', '4', '5'],
        'top': [np.nan, '0', np.nan, '10', np.nan],
        'left': [np.nan, '5', np.nan, '15', np.nan],
    }
    expected_df = pd.DataFrame(expected_data)

    # Call the monotonic_left_merge function
    merged_df = monotonic_left_merge(full_df, highlighted_df)
    print(merged_df)
    print(expected_df)
    # TEST EQUAL DFs - Not required, but good to have.
    # Compare the merged_df with the expected_df
    pd.testing.assert_frame_equal(merged_df.reset_index().drop(columns=['index']), expected_df)
    
    # TEST MONOTONIC ORDERING
    assert merged_df['all_node_ordering'].is_monotonic_increasing


In [770]:
merged_list = monotonic_left_merge(df, exploded_highlight_df)

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[3]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[4]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[4]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[7]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[7]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[42]/div/p[1]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[43]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[45]/div/p[2]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[45]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[55]/div/p[2]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[55]/div/p[2]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[55]/div/p[3]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[56]/div/p[2]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[56]/div/p[7]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[57]/div/p[1]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[57]/div/p[2]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[63]/div/p[5]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[73]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[74]/div/p[3]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[74]/div/p[5]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[75]/div/p[1]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[75]/div/p[8]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[1]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[3]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[4]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[6]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[77]/div/p[5]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[81]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


In [776]:
# [row for row in merged_df]
merged_df = pd.DataFrame(merged_list)

In [777]:
merged_df

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type,EX-2.1,0,,,,,,,,,,,
1,/html/body/document/type/sequence,2,1,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,d110570dex21.htm,2,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EX-2.1,3,,,,,,,,,,,
4,/html/body/document/type/sequence/filename/des...,EX-2.1,4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2446,/html/body/document/type/sequence/filename/des...,Name:,2446,,,,,,,,,,,
2447,/html/body/document/type/sequence/filename/des...,Murugesan Shanmugaraj,2447,,,,,,,,,,,
2448,/html/body/document/type/sequence/filename/des...,Title:,2448,,,,,,,,,,,
2449,/html/body/document/type/sequence/filename/des...,Chief Executive Officer,2449,,,,,,,,,,,


In [723]:
# merged = pd.merge(df, exploded_highlight_df, left_on='xpaths', right_on='highlighted_xpaths', how='left', indicator=True)
# merged.iloc[1087:1097]


In [724]:
# merged.iloc[1087:1097]['xpaths'].tolist()

In [725]:
merged_df['is_outside'] = merged_df['segment_number_from_idx'].apply(lambda x: 1 if math.isnan(x) else 0)

merged = merged_df

In [726]:
merged.to_csv("drop_check.csv")

In [727]:
#merged[merged['is_outside'] < 1].to_csv('merged_test.csv')

In [728]:
# merged[merged['is_outside'] < 1]

In [773]:
def assert_node_ordering_for_merged_table(merged: pd.DataFrame) -> pd.DataFrame:
    '''
    Due to repeat xpaths in both left and right table in merged, need to remove the duplicated rows
    '''
    drop_indices = []
#     last_highlight_node_idx = -1
#     for i, row in merged[merged['is_outside'] < 1].iterrows():
#         if row.exploded_highlight_node_order and row.exploded_highlight_node_order != last_highlight_node_idx + 1:
#             drop_indices.append(i)
#         else:
#             last_highlight_node_idx = row.exploded_highlight_node_order
    merged = merged.drop(drop_indices).reset_index(drop=True)
    
    # Assert both orderings keep their original structure
    assert merged.all_nodes_ordering.is_monotonic_increasing
    assert merged[merged['is_outside'] < 1].exploded_highlight_node_order.is_monotonic_increasing
    
    return merged, drop_indices

In [774]:
# # Hacky fix for some highlight node orders 
# # being off for some reason; fix is to set 
# # the value to the previous + 1.
for i in range(1, len(merged)):
    cur = float(merged.iloc[i].exploded_highlight_node_order)
    prev = float(merged.iloc[i-1].exploded_highlight_node_order)
    if cur < prev:
        merged.iloc[i, merged.columns.get_loc('exploded_highlight_node_order')] = prev + 1

In [779]:
# Code to check if the node ordering is off and print the location/value of discrepancy

# for f in range(1, len(merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist())):
#     a = merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f-1]
#     if a >= merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f]:
#         print(a)

merged_df[26:46]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
Unnamed 19,/html/body/document/type/sequence/filename/des...,YSTEMS,27,/html/body/document/type/sequence/filename/des...,YSTEMS,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,19.0,0.001763,0.216667,0.566667,0.004904
Unnamed 20,/html/body/document/type/sequence/filename/des...,", I",28,/html/body/document/type/sequence/filename/des...,", I",t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,20.0,0.001763,0.216667,0.566667,0.004904
Unnamed 21,/html/body/document/type/sequence/filename/des...,NC,29,/html/body/document/type/sequence/filename/des...,NC,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,21.0,0.001763,0.216667,0.566667,0.004904
Unnamed 22,/html/body/document/type/sequence/filename/des...,".,",30,/html/body/document/type/sequence/filename/des...,".,",t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,22.0,0.001763,0.216667,0.566667,0.004904
Unnamed 23,/html/body/document/type/sequence/filename/des...,A,31,/html/body/document/type/sequence/filename/des...,A,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,23.0,0.001763,0.216667,0.566667,0.004904
Unnamed 24,/html/body/document/type/sequence/filename/des...,MARONE,32,/html/body/document/type/sequence/filename/des...,MARONE,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,24.0,0.001763,0.216667,0.566667,0.004904
Unnamed 25,/html/body/document/type/sequence/filename/des...,A,33,/html/body/document/type/sequence/filename/des...,A,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,23.0,0.001763,0.216667,0.566667,0.004904
Unnamed 26,/html/body/document/type/sequence/filename/des...,CQUISITION,34,/html/body/document/type/sequence/filename/des...,CQUISITION,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,26.0,0.001763,0.216667,0.566667,0.004904
Unnamed 27,/html/body/document/type/sequence/filename/des...,C,35,/html/body/document/type/sequence/filename/des...,C,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,27.0,0.001763,0.216667,0.566667,0.004904
Unnamed 28,/html/body/document/type/sequence/filename/des...,ORP,36,/html/body/document/type/sequence/filename/des...,ORP,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,28.0,0.001763,0.216667,0.566667,0.004904


In [759]:
merged_ordered, drop_indices = assert_node_ordering_for_merged_table(merged)
empty_xpath_index = merged_ordered.fillna(0).where(merged_ordered['xpaths'] == '').dropna().index
empty_xpath_index
merged_ordered = merged_ordered.drop(empty_xpath_index)

  assert merged.all_nodes_ordering.is_monotonic_increasing
  assert merged[merged['is_outside'] < 1].exploded_highlight_node_order.is_monotonic_increasing


Why am I losing rows (1 row now) from the left table?

In [660]:
# merged_ordered.iloc[-90:-60]

In [623]:
# merged_ordered[merged_ordered['all_nodes_ordering'] == merged_ordered.index]



In [659]:
# merged_ordered[merged_ordered['all_nodes_ordering'] > merged_ordered.index]

Okay I see what happened. At index 737 for all nodes ordering, somehow the join had a double repeat. All examples of that node in the left dataframe were deleted since 2 highlight nodes before it matched with it. Not sure why tho?

The (c) node was already matched as well as what followed it but then it and one of the successors was matched again. I think there can be some kind of check to see if highlight node was already matched and if removing current row would destroy the all nodes order, just dont delete the row... 

Seems weird but maybe it would work. 

Maybe another method could be tracking the start coordinates of the text in each node and then somehow getting a stricter order on that...

Or maybe tracking which char offset from node beginning a highlight belongs to. then if theres overlap in the interval, we merge, if not we move on. do the same for the all nodes as well

Let's do the BIES tagging now

In [627]:
def tag_bies_for_highlights(merged: pd.DataFrame) -> pd.DataFrame:
    
    tags = []
    count = 1
    for i, row in merged.iterrows():
        list_entry_count = row.num_entries_1
        try:
            next_entry_count = merged.iloc[i+1].num_entries_1
            # print(type(next_entry_count))
            if np.isnan(next_entry_count):
                next_entry_count = list_entry_count
        except:
            next_entry_count = 0
        
        # Non highlighted row
        if math.isnan(list_entry_count):
            tags.append('o')
        
        # Single highlighted node
        elif list_entry_count == 1:
            tags.append(f's_{row.highlighted_labels}')
            count = 1
        
        # Last entry in group greater than size 1
        elif count == list_entry_count:          
        # elif list_entry_count > 1 and next_entry_count != list_entry_count:
            print(count, list_entry_count, next_entry_count)
            tags.append(f'e_{row.highlighted_labels}')
            count = 1
        elif (count < list_entry_count) and count == 1:
            tags.append(f'b_{row.highlighted_labels}')
            count += 1
        elif (count < list_entry_count) and count > 1:
            tags.append(f'i_{row.highlighted_labels}')
            count += 1
    print(count, list_entry_count, next_entry_count)
    merged['tagged_sequence'] = tags
    return merged

In [658]:
merged_tagged = tag_bies_for_highlights(merged_ordered)

39 39 1
13 13 13
2 2 2
2 2 2
2 2 2
2 2 2
2 2 2
2 2 2
3 3 3
2 2 2
1 nan 0


In [None]:
merged_tagged[merged_tagged['is_outside'] < 1]['tagged_sequence'].tolist(), merged_tagged[merged_tagged['is_outside'] < 1]['text'].tolist()

In [524]:
merged_tagged.to_csv('overlabeled_labeled.csv')

I think I need to modify this algorithm. The overlabeled file that I just tried somehow labeled index 245 as b_n but i think its a single highlight node? It looks like the master nodes might be traversing in a different order tbh not sure, I did label pretty wildly... but still not sure how that happens?

In [525]:
highlighted_df.iloc[2]

highlighted_xpaths            [/html/body/document/type/sequence/filename/de...
highlighted_segmented_text                                                  [i]
highlighted_labels                                                            n
highlighted_coordinates       [0.023033895968329623, 0.49870902777777776, 0....
segment_number_from_idx                                                       2
num_entries_1                                                                 1
num_entries_2                                                                 1
Name: 2, dtype: object

In [526]:
highlighted_df.iloc[2]['highlighted_xpaths']

['/html/body/document/type/sequence/filename/description/text/center[2]/div/p[4]']

UsageError: Line magic function `%%ipytest` not found.


In [651]:
%%ipytest

def test_

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.06s[0m[0m


In [None]:
%%ipytest