In [19]:
import json
import pandas as pd
import ast
import math
from collections import Counter

Plan:
1. Write explicit filtering scripts for both highlight df and all nodes
    - Print or make clear any problems
2. Merge with good principles
    - Print or make clear any problems
3. Apply agreed upon period rule
4. Special characters handling (at least new lines and weird spaces that won't get tokenized properly?)

In [2]:
all_nodes_path = r"C:\Users\islam\Desktop\2023 Research\contracts\labeled\contract_0_all_nodes.json"
highlighted_nodes_path = r"C:\Users\islam\Desktop\2023 Research\contracts\labeled\contract_0_highlighted.json"
with open(all_nodes_path, encoding='UTF-8') as f:
    all_nodes_data = json.load(f)
with open(highlighted_nodes_path, encoding='UTF-8') as f:
    highlighted_data = json.load(f)

In [3]:
def filter_all_nodes_df(df, xpaths_col):
    
    original_length = len(df)
    df = df[
        ~(df[xpaths_col].str.contains('/script')) & 
        ~(df[xpaths_col].str.contains('/noscript'))
    ]
    
    if original_length - len(df) > 0:
        print(f'{original_length - len(df)} rows that had /script or /noscript were removed')
    
    new_length = len(df)
    df = df[df[xpaths_col] != '']
    
    if new_length - len(df) > 0:
        print(f'{new_length - len(df)} rows with empty strings were removed')
    
    return df

def filter_highlight_nodes_df(df):
    
    print('Filtering highlight nodes df now')
    # We can apply this to highlight nodes without issue. In theory we should see 0 print statements so this 
    # can be a guardrail against unknown bugs
    df = filter_all_nodes_df(df, 'highlighted_xpaths')
    
    original_length = len(df)
    df = df[
        (df['highlighted_xpaths'] != 'DELETED') &
        (df['highlighted_xpaths'] != 'DEL')
    ]
    
    if original_length - len(df) > 0:
        print(f'{original_length - len(df)} rows with DEL or DELETED were removed')
    
    new_length = len(df)
    df = df.dropna()
    
    if new_length - len(df) > 0:
        print(f'{new_length - len(df)} NA rows were dropped. THIS IS A PROBLEM.')
    
    return df

In [4]:
all_nodes_xpaths = ast.literal_eval(all_nodes_data['xpaths'])
all_nodes_segmented_text = ast.literal_eval(all_nodes_data['segmentedTexts'])
df = pd.DataFrame()
df['xpaths'] = all_nodes_xpaths
df['text'] = all_nodes_segmented_text
df['all_nodes_ordering'] = df.index.copy()

In [5]:
highlight_xpaths = ast.literal_eval(highlighted_data['xpaths'])
highlight_segmented_text = ast.literal_eval(highlighted_data['segmentedTexts'])
highlight_text = ast.literal_eval(highlighted_data['texts'])
highlight_labels = ast.literal_eval(highlighted_data['labels'])
highlight_coordinates = ast.literal_eval(highlighted_data['c'])
highlighted_df = pd.DataFrame()
highlighted_df['highlighted_xpaths'] = highlight_xpaths
highlighted_df['highlighted_segmented_text'] = highlight_segmented_text
highlighted_df['highlighted_labels'] = highlight_labels
highlighted_df['highlighted_coordinates'] = highlight_coordinates
highlighted_df['segment_number_from_idx'] = highlighted_df.index.copy()
highlighted_df['num_entries_1'] = highlighted_df['highlighted_xpaths'].apply(len)
highlighted_df['num_entries_2'] = highlighted_df['highlighted_segmented_text'].apply(len)

In [6]:
highlighted_df['highlighted_segmented_text'].tolist()

[['A',
  'MENDED',
  'AND',
  'R',
  'ESTATED',
  'A',
  'GREEMENT',
  'AND',
  'P',
  'LAN',
  'OF',
  'M',
  'ERGER',
  'BY',
  'AND',
  'AMONG',
  'C',
  'ISCO',
  'S',
  'YSTEMS',
  ', I',
  'NC',
  '.,',
  'A',
  'MARONE',
  'A',
  'CQUISITION',
  'C',
  'ORP',
  '.',
  'AND',
  'A',
  'CACIA',
  'C',
  'OMMUNICATIONS',
  ', I',
  'NC',
  '.',
  'J',
  'ANUARY',
  '14, 2021'],
 ['TABLE OF CONTENTS'],
 ['i'],
 ['ii'],
 ['iii'],
 ['iv'],
 ['A',
  'MENDED',
  'AND',
  'R',
  'ESTATED',
  'A',
  'GREEMENT',
  'AND',
  'P',
  'LAN',
  'OF',
  'M',
  'ERGER'],
 ['R', 'ECITALS'],
 ['ARTICLE I'],
 ['THE MERGER'],
 ['1.1.'],
 ['Certain Definitions', '.'],
 'DEL',
 'DEL',
 ['(a)'],
 ['1'],
 ['2'],
 ['3'],
 ['4'],
 ['5'],
 ['6'],
 ['7'],
 ['(b)'],
 ['8'],
 ['9'],
 ['1.2.'],
 ['The Merger', '.'],
 ['1.3.'],
 ['Closing', '.'],
 ['1.4.'],
 ['Effective Time', '.'],
 ['1.5.'],
 ['Effect of the Merger', '.'],
 ['1.6.'],
 ['Certificate of Incorporation; Bylaws', '.'],
 ['(a)'],
 ['(b)'],
 ['10'],
 

In [7]:
assert highlighted_df['num_entries_1'].equals(highlighted_df['num_entries_2']), 'Mismatch in segmentation and groupings'
print("There is no mismatch in segmentations and groupings, we can proceed")

There is no mismatch in segmentations and groupings, we can proceed


In [8]:
exploded_highlight_df = highlighted_df[
    ['highlighted_xpaths',
     'highlighted_segmented_text',
     'highlighted_labels',
     'segment_number_from_idx',
     'highlighted_coordinates',
     'num_entries_1']
    ].explode(column=['highlighted_xpaths','highlighted_segmented_text']).reset_index(drop=True)

exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()

In [9]:
exploded_highlight_df = filter_highlight_nodes_df(exploded_highlight_df)

Filtering highlight nodes df now
16 rows with DEL or DELETED were removed


In [11]:
exploded_highlight_df['top'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[0]))
exploded_highlight_df['left'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[1]))
exploded_highlight_df['width'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[2]))
exploded_highlight_df['height'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[3]))

In [12]:
exploded_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type/sequence/filename/des...,A,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,0,0.001763,0.216667,0.566667,0.004904
1,/html/body/document/type/sequence/filename/des...,MENDED,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,1,0.001763,0.216667,0.566667,0.004904
2,/html/body/document/type/sequence/filename/des...,AND,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,2,0.001763,0.216667,0.566667,0.004904
3,/html/body/document/type/sequence/filename/des...,R,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,3,0.001763,0.216667,0.566667,0.004904
4,/html/body/document/type/sequence/filename/des...,ESTATED,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,4,0.001763,0.216667,0.566667,0.004904
...,...,...,...,...,...,...,...,...,...,...,...
757,/html/body/document/type/sequence/filename/des...,.,sst,553,"[0.988359955094243, 0.2596465277777778, 0.1064...",2,757,0.988360,0.259647,0.106467,0.000295
758,/html/body/document/type/sequence/filename/des...,8.12.,ssn,554,"[0.9897386406160755, 0.2393284722222222, 0.018...",1,758,0.989739,0.239328,0.018517,0.000295
759,/html/body/document/type/sequence/filename/des...,Original Agreement,sst,555,"[0.9897386406160755, 0.26016180555555557, 0.07...",2,759,0.989739,0.260162,0.076096,0.000295
760,/html/body/document/type/sequence/filename/des...,.,sst,555,"[0.9897386406160755, 0.26016180555555557, 0.07...",2,760,0.989739,0.260162,0.076096,0.000295


Okay so far looks like we can do logic:
1. Sort by top and then if theres a tiebreaker, sort by left, and then sort by order in the highlight list (in the case of multiple nodes being highlighted in same row with 1 highlight box)
2. If it starts to show that top coordinates are not exact and are approximaate, we are going to need to start doing approximate sorting or some IOU related logic to post process this to infer if something is the same line (which seems extremely likely to work)

In [13]:
def sort_exploded_highlight_box_by_coordinates(exploded_highlight_df):
    '''
    If logic ever has to deal with approximations and some line inference (with IOU math),
    then this function will become more complicated but for now it seems okay
    '''
    return exploded_highlight_df.sort_values(by=['top','left','exploded_highlight_node_order']).reset_index(drop=True)

In [14]:
exploded_highlight_df = sort_exploded_highlight_box_by_coordinates(exploded_highlight_df)
exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()
exploded_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type/sequence/filename/des...,A,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,0,0.001763,0.216667,0.566667,0.004904
1,/html/body/document/type/sequence/filename/des...,MENDED,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,1,0.001763,0.216667,0.566667,0.004904
2,/html/body/document/type/sequence/filename/des...,AND,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,2,0.001763,0.216667,0.566667,0.004904
3,/html/body/document/type/sequence/filename/des...,R,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,3,0.001763,0.216667,0.566667,0.004904
4,/html/body/document/type/sequence/filename/des...,ESTATED,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",41,4,0.001763,0.216667,0.566667,0.004904
...,...,...,...,...,...,...,...,...,...,...,...
741,/html/body/document/type/sequence/filename/des...,.,sst,553,"[0.988359955094243, 0.2596465277777778, 0.1064...",2,741,0.988360,0.259647,0.106467,0.000295
742,/html/body/document/type/sequence/filename/des...,8.12.,ssn,554,"[0.9897386406160755, 0.2393284722222222, 0.018...",1,742,0.989739,0.239328,0.018517,0.000295
743,/html/body/document/type/sequence/filename/des...,Original Agreement,sst,555,"[0.9897386406160755, 0.26016180555555557, 0.07...",2,743,0.989739,0.260162,0.076096,0.000295
744,/html/body/document/type/sequence/filename/des...,.,sst,555,"[0.9897386406160755, 0.26016180555555557, 0.07...",2,744,0.989739,0.260162,0.076096,0.000295


In [23]:
df

Unnamed: 0,xpaths,text,all_nodes_ordering
0,/html/body/document/type,EX-2.1,0
1,/html/body/document/type/sequence,2,1
2,/html/body/document/type/sequence/filename,d110570dex21.htm,2
3,/html/body/document/type/sequence/filename/des...,EX-2.1,3
4,/html/body/document/type/sequence/filename/des...,EX-2.1,4
...,...,...,...
2447,/html/body/document/type/sequence/filename/des...,Murugesan Shanmugaraj,2447
2448,/html/body/document/type/sequence/filename/des...,Title:,2448
2449,/html/body/document/type/sequence/filename/des...,Chief Executive Officer,2449
2450,/html/body/document/type/sequence/filename/des...,[SIGNATURE PAGE TO AMENDED AND RESTATED AGREEM...,2450


In [16]:
df = filter_all_nodes_df(df, 'xpaths')

2 rows that had /script or /noscript were removed


In [17]:
df

Unnamed: 0,xpaths,text,all_nodes_ordering
0,/html/body/document/type,EX-2.1,0
1,/html/body/document/type/sequence,2,1
2,/html/body/document/type/sequence/filename,d110570dex21.htm,2
3,/html/body/document/type/sequence/filename/des...,EX-2.1,3
4,/html/body/document/type/sequence/filename/des...,EX-2.1,4
...,...,...,...
2446,/html/body/document/type/sequence/filename/des...,Name:,2446
2447,/html/body/document/type/sequence/filename/des...,Murugesan Shanmugaraj,2447
2448,/html/body/document/type/sequence/filename/des...,Title:,2448
2449,/html/body/document/type/sequence/filename/des...,Chief Executive Officer,2449


In [21]:
Counter(df.xpaths).most_common()

[('/html/body/document/type/sequence/filename/description/text/center[23]/div/p[1]',
  10),
 ('/html/body/document/type/sequence/filename/description/text/center[26]/div/p[3]',
  9),
 ('/html/body/document/type/sequence/filename/description/text/center[62]/div/p[1]',
  8),
 ('/html/body/document/type/sequence/filename/description/text/center[64]/div/p[4]',
  8),
 ('/html/body/document/type/sequence/filename/description/text/center[7]/div/p[8]',
  7),
 ('/html/body/document/type/sequence/filename/description/text/center[10]/div/p[4]',
  7),
 ('/html/body/document/type/sequence/filename/description/text/center[16]/div/p[7]',
  7),
 ('/html/body/document/type/sequence/filename/description/text/center[24]/div/p[1]',
  7),
 ('/html/body/document/type/sequence/filename/description/text/center[27]/div/p[3]',
  7),
 ('/html/body/document/type/sequence/filename/description/text/center[6]/div/p[2]',
  6),
 ('/html/body/document/type/sequence/filename/description/text/center[12]/div/p[9]',
  6),


In [22]:
df[df['xpaths'] == '/html/body/document/type/sequence/filename/description/text/center[23]/div/p[1]']

Unnamed: 0,xpaths,text,all_nodes_ordering
1273,/html/body/document/type/sequence/filename/des...,"(c) The Company has no Company Options, Compan...",1273
1275,/html/body/document/type/sequence/filename/des...,of the Company Disclosure Letter sets forth a ...,1275
1277,/html/body/document/type/sequence/filename/des...,”) of all holders of outstanding Company Optio...,1277
1279,/html/body/document/type/sequence/filename/des...,of the Company Disclosure Letter (which Schedu...,1279
1281,/html/body/document/type/sequence/filename/des...,of the Company Disclosure Letter) sets forth a...,1281
1283,/html/body/document/type/sequence/filename/des...,"directors, consultants,\nadvisory board member...",1283
1285,/html/body/document/type/sequence/filename/des...,of the Company Disclosure\nLetter sets forth a...,1285
1287,/html/body/document/type/sequence/filename/des...,of the Company Disclosure Letter (which Schedu...,1287
1289,/html/body/document/type/sequence/filename/des...,of the Company Disclosure Letter) sets forth a...,1289
1291,/html/body/document/type/sequence/filename/des...,"directors, consultants, advisory board members...",1291


In [25]:
df.iloc[1270:1292]

Unnamed: 0,xpaths,text,all_nodes_ordering
1271,/html/body/document/type/sequence/filename/des...,and are not subject to or issued in violation ...,1271
1272,/html/body/document/type/sequence/filename/des...,17,1272
1273,/html/body/document/type/sequence/filename/des...,"(c) The Company has no Company Options, Compan...",1273
1274,/html/body/document/type/sequence/filename/des...,2.2(c)-1,1274
1275,/html/body/document/type/sequence/filename/des...,of the Company Disclosure Letter sets forth a ...,1275
1276,/html/body/document/type/sequence/filename/des...,Measurement Date,1276
1277,/html/body/document/type/sequence/filename/des...,”) of all holders of outstanding Company Optio...,1277
1278,/html/body/document/type/sequence/filename/des...,2.2(c)-2,1278
1279,/html/body/document/type/sequence/filename/des...,of the Company Disclosure Letter (which Schedu...,1279
1280,/html/body/document/type/sequence/filename/des...,2.2(c)-1,1280


In [None]:
## Trying sequence merging for highlight df onto the all nodes df

In [326]:
def monotonic_left_merge(full, highlighted):
    found_xpaths = []
    merged_df = []
    for row in full.iterrows():
        full_xpaths = row[1]['xpaths']
        full_texts = row[1]['text']
        n_order = row[1]['all_nodes_ordering']

        matched_hrow = pd.Series([])
        # print(full_texts, full_xpaths)
        for h_row in highlighted.iterrows():
            h_xpaths = h_row[1]['highlighted_xpaths']
            h_texts = h_row[1]['highlighted_segmented_text']

            if h_xpaths == full_xpaths: # and h_texts == full_texts:
                if h_xpaths in found_xpaths:
                    print('CURRENT h_xpath ALREADY FOUND BEFORE:')
                    print(h_xpaths)
                    _
                    
                found_xpaths.append(h_xpaths)
                # Instead of appending to merged_df 
                # directly, save the specific h_row 
                # and as you build the merged_df 
                # in the outer loop for all texts, for 
                # the highlight matched rows, append 
                # the h_row as well.
                
                # merged_df.append((row, h_row))
                matched_hrow = h_row
                break
        if len(matched_hrow) == 0:
            # print('No matched highlighted xpaths', full_xpaths)
            merged_df.append(row[1])
        else:
            print(row[1], matched_hrow[1])
            print(pd.concat([row[1], matched_hrow[1]], axis=0))
            # print(pd.concat([row[1], matched_hrow[1]], axis=1))
            merged_df.append(pd.concat([row[1], matched_hrow[1]], axis=0))
            
    return merged_df

In [327]:
merged_list = monotonic_left_merge(df, exploded_highlight_df)

  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                       AGREEMENT AND PLAN OF MERGER
all_nodes_ordering                                                  379
Name: 379, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                            AGREEMENT AND PLAN OF MERGER
highlighted_labels                                                               t
segment_number_from_idx                                                          0
highlighted_coordinates                       [2832px, 598.539px, 242.922px, 15px]
num_entries_1                                                                    1
exploded_highlight_node_order                                                    0
top                                                                         2832.0
left                                                                       598.539
width       

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                                                  4
all_nodes_ordering                                                  478
Name: 478, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                                                       4
highlighted_labels                                                               n
segment_number_from_idx                                                          7
highlighted_coordinates                       [4532px, 716.664px, 6.67188px, 15px]
num_entries_1                                                                    1
exploded_highlight_node_order                                                    8
top                                                                         4532.0
left                                                                       716.664
width       

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                                                 16
all_nodes_ordering                                                  789
Name: 789, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                                                      16
highlighted_labels                                                               n
segment_number_from_idx                                                         19
highlighted_coordinates                     [9580.5px, 713.328px, 13.3359px, 15px]
num_entries_1                                                                    1
exploded_highlight_node_order                                                   20
top                                                                         9580.5
left                                                                       713.328
width       

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                                                 18
all_nodes_ordering                                                  909
Name: 909, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                                                      18
highlighted_labels                                                               n
segment_number_from_idx                                                         21
highlighted_coordinates                      [10841px, 713.328px, 13.3359px, 15px]
num_entries_1                                                                    1
exploded_highlight_node_order                                                   22
top                                                                        10841.0
left                                                                       713.328
width       

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                                       Section 1.02
all_nodes_ordering                                                 1073
Name: 1073, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                                            Section 1.02
highlighted_labels                                                             ssn
segment_number_from_idx                                                         24
highlighted_coordinates                          [12212px, 104px, 66.6484px, 15px]
num_entries_1                                                                    1
exploded_highlight_node_order                                                   24
top                                                                        12212.0
left                                                                         104.0
width      

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                                                 22
all_nodes_ordering                                                 1132
Name: 1132, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                                                      22
highlighted_labels                                                               n
segment_number_from_idx                                                         34
highlighted_coordinates                      [13087px, 713.328px, 13.3359px, 15px]
num_entries_1                                                                    1
exploded_highlight_node_order                                                   37
top                                                                        13087.0
left                                                                       713.328
width      

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                                                 24
all_nodes_ordering                                                 1212
Name: 1212, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                                                      24
highlighted_labels                                                               n
segment_number_from_idx                                                         43
highlighted_coordinates                      [13869px, 713.328px, 13.3359px, 15px]
num_entries_1                                                                    1
exploded_highlight_node_order                                                   48
top                                                                        13869.0
left                                                                       713.328
width      

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


xpaths                /html/body/document/type/sequence/filename/des...
text                                                  Dissenting Shares
all_nodes_ordering                                                 1277
Name: 1277, dtype: object highlighted_xpaths               /html/body/document/type/sequence/filename/des...
highlighted_segmented_text                                       Dissenting Shares
highlighted_labels                                                             sst
segment_number_from_idx                                                         48
highlighted_coordinates                      [14639px, 199.648px, 99.2344px, 15px]
num_entries_1                                                                    2
exploded_highlight_node_order                                                   52
top                                                                        14639.0
left                                                                       199.648
width      

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

In [328]:
# [row for row in merged_df]
merged_df = pd.DataFrame(merged_list)

In [329]:
merged_df

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type,EX-4.7,0,,,,,,,,,,,
1,/html/body/document/type/sequence,3,1,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,a21-3954_1ex4d7.htm,2,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EX-4.7,3,,,,,,,,,,,
4,/html/body/document/type/sequence/filename/des...,Exhibit 4.7,4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3364,/html/body/document/type/sequence/filename/des...,Title:,3364,,,,,,,,,,,
3365,/html/body/document/type/sequence/filename/des...,[,3365,,,,,,,,,,,
3366,/html/body/document/type/sequence/filename/des...,Signature Page to Merger Agreement,3366,,,,,,,,,,,
3367,/html/body/document/type/sequence/filename/des...,],3367,,,,,,,,,,,


In [330]:
# merged = pd.merge(df, exploded_highlight_df, left_on='xpaths', right_on='highlighted_xpaths', how='left', indicator=True)
# merged.iloc[1087:1097]


In [331]:
# merged.iloc[1087:1097]['xpaths'].tolist()

In [332]:
merged_df.iloc[1364:1375]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
Unnamed 80,/html/body/document/type/sequence/filename/des...,Section 2.11,1364,/html/body/document/type/sequence/filename/des...,Section 2.11,ssn,61.0,"[15889.5px, 104px, 66.1562px, 15px]",1.0,69.0,15889.5,104.0,66.1562,15.0
Unnamed 81,/html/body/document/type/sequence/filename/des...,Lost Certificates,1365,/html/body/document/type/sequence/filename/des...,Lost Certificates,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,70.0,15889.5,199.156,91.8047,15.0
Unnamed 82,/html/body/document/type/sequence/filename/des...,". If any Certificate shall have been lost, st...",1366,/html/body/document/type/sequence/filename/des...,.,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,71.0,15889.5,199.156,91.8047,15.0
1367,/html/body/document/type/sequence/filename/des...,Article II,1367,,,,,,,,,,,
Unnamed 83,/html/body/document/type/sequence/filename/des...,(including,1368,/html/body/document/type/sequence/filename/des...,.,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,71.0,15889.5,199.156,91.8047,15.0
1369,/html/body/document/type/sequence/filename/des...,Section 2.05,1369,,,,,,,,,,,
Unnamed 84,/html/body/document/type/sequence/filename/des...,).,1370,/html/body/document/type/sequence/filename/des...,.,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,71.0,15889.5,199.156,91.8047,15.0
Unnamed 85,/html/body/document/type/sequence/filename/des...,Section 2.12,1371,/html/body/document/type/sequence/filename/des...,Section 2.12,ssn,63.0,"[15967px, 104px, 68.6484px, 15px]",1.0,72.0,15967.0,104.0,68.6484,15.0
Unnamed 86,/html/body/document/type/sequence/filename/des...,Further Assurances,1372,/html/body/document/type/sequence/filename/des...,Further Assurances,sst,64.0,"[15967px, 199.648px, 105.883px, 15px]",2.0,73.0,15967.0,199.648,105.883,15.0
Unnamed 87,/html/body/document/type/sequence/filename/des...,". At and after the Second Effective Time, the...",1373,/html/body/document/type/sequence/filename/des...,.,sst,64.0,"[15967px, 199.648px, 105.883px, 15px]",2.0,74.0,15967.0,199.648,105.883,15.0


In [333]:
merged_df['is_outside'] = merged_df['segment_number_from_idx'].apply(lambda x: 1 if math.isnan(x) else 0)
merged_df[1087:1097]
merged = merged_df

In [334]:
merged.to_csv("drop_check.csv")

In [335]:
#merged[merged['is_outside'] < 1].to_csv('merged_test.csv')

In [336]:
merged[merged['is_outside'] < 1]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,is_outside
Unnamed 0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,379,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0.0,"[2832px, 598.539px, 242.922px, 15px]",1.0,0.0,2832.0,598.539,242.9220,15.0,0
Unnamed 1,/html/body/document/type/sequence/filename/des...,ARTICLE I,404,/html/body/document/type/sequence/filename/des...,ARTICLE I,sn,1.0,"[3331.5px, 685.602px, 68.7969px, 15px]",1.0,1.0,3331.5,685.602,68.7969,15.0,0
Unnamed 2,/html/body/document/type/sequence/filename/des...,DEFINITIONS,405,/html/body/document/type/sequence/filename/des...,DEFINITIONS,st,2.0,"[3362.5px, 675.922px, 88.1562px, 15px]",1.0,2.0,3362.5,675.922,88.1562,15.0,0
Unnamed 3,/html/body/document/type/sequence/filename/des...,Section 1.01,406,/html/body/document/type/sequence/filename/des...,Section 1.01,ssn,3.0,"[3393.5px, 104px, 67.6484px, 15px]",1.0,3.0,3393.5,104.000,67.6484,15.0,0
Unnamed 4,/html/body/document/type/sequence/filename/des...,Definitions,407,/html/body/document/type/sequence/filename/des...,Definitions,sst,4.0,"[3393.5px, 199.648px, 63.3203px, 15px]",2.0,4.0,3393.5,199.648,63.3203,15.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Unnamed 83,/html/body/document/type/sequence/filename/des...,(including,1368,/html/body/document/type/sequence/filename/des...,.,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,71.0,15889.5,199.156,91.8047,15.0,0
Unnamed 84,/html/body/document/type/sequence/filename/des...,).,1370,/html/body/document/type/sequence/filename/des...,.,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,71.0,15889.5,199.156,91.8047,15.0,0
Unnamed 85,/html/body/document/type/sequence/filename/des...,Section 2.12,1371,/html/body/document/type/sequence/filename/des...,Section 2.12,ssn,63.0,"[15967px, 104px, 68.6484px, 15px]",1.0,72.0,15967.0,104.000,68.6484,15.0,0
Unnamed 86,/html/body/document/type/sequence/filename/des...,Further Assurances,1372,/html/body/document/type/sequence/filename/des...,Further Assurances,sst,64.0,"[15967px, 199.648px, 105.883px, 15px]",2.0,73.0,15967.0,199.648,105.8830,15.0,0


In [337]:
def assert_node_ordering_for_merged_table(merged: pd.DataFrame) -> pd.DataFrame:
    '''
    Due to repeat xpaths in both left and right table in merged, need to remove the duplicated rows
    '''
    drop_indices = []
#     last_highlight_node_idx = -1
#     for i, row in merged[merged['is_outside'] < 1].iterrows():
#         if row.exploded_highlight_node_order and row.exploded_highlight_node_order != last_highlight_node_idx + 1:
#             drop_indices.append(i)
#         else:
#             last_highlight_node_idx = row.exploded_highlight_node_order
    merged = merged.drop(drop_indices).reset_index(drop=True)
    
    # Assert both orderings keep their original structure
    assert merged.all_nodes_ordering.is_monotonic_increasing
    assert merged[merged['is_outside'] < 1].exploded_highlight_node_order.is_monotonic_increasing
    
    return merged, drop_indices

In [338]:
merged_ordered, drop_indices = assert_node_ordering_for_merged_table(merged)
empty_xpath_index = merged_ordered.fillna(0).where(merged_ordered['xpaths'] == '').dropna().index
empty_xpath_index
merged_ordered = merged_ordered.drop(empty_xpath_index)

Why am I losing rows (1 row now) from the left table?

In [339]:
merged_ordered.iloc[1075:1085]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,is_outside
1101,/html/body/document/type/sequence/filename/des...,Section 2.02,1101,/html/body/document/type/sequence/filename/des...,Section 2.02,ssn,32.0,"[12792.5px, 104px, 66.6484px, 15px]",1.0,34.0,12792.5,104.0,66.6484,15.0,0
1102,/html/body/document/type/sequence/filename/des...,The Mergers,1102,/html/body/document/type/sequence/filename/des...,The Mergers,sst,33.0,"[12792.5px, 199.648px, 71.5703px, 15px]",2.0,35.0,12792.5,199.648,71.5703,15.0,0
1103,/html/body/document/type/sequence/filename/des...,.,1103,/html/body/document/type/sequence/filename/des...,.,sst,33.0,"[12792.5px, 199.648px, 71.5703px, 15px]",2.0,36.0,12792.5,199.648,71.5703,15.0,0
1104,/html/body/document/type/sequence/filename/des...,(a),1104,,,,,,,,,,,,1
1105,/html/body/document/type/sequence/filename/des...,"At the Closing, (i) the Company shall file a c...",1105,,,,,,,,,,,,1
1106,/html/body/document/type/sequence/filename/des...,First Certificate of Merger,1106,,,,,,,,,,,,1
1107,/html/body/document/type/sequence/filename/des...,”) with the Delaware Secretary of State and ma...,1107,,,,,,,,,,,,1
1108,/html/body/document/type/sequence/filename/des...,DGCL,1108,,,,,,,,,,,,1
1109,/html/body/document/type/sequence/filename/des...,”) in connection with the First Merger and (ii...,1109,,,,,,,,,,,,1
1110,/html/body/document/type/sequence/filename/des...,Second Certificate of Merger,1110,,,,,,,,,,,,1


In [340]:
merged_ordered[merged_ordered['all_nodes_ordering'] == merged_ordered.index]



Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,is_outside
0,/html/body/document/type,EX-4.7,0,,,,,,,,,,,,1
1,/html/body/document/type/sequence,3,1,,,,,,,,,,,,1
2,/html/body/document/type/sequence/filename,a21-3954_1ex4d7.htm,2,,,,,,,,,,,,1
3,/html/body/document/type/sequence/filename/des...,EX-4.7,3,,,,,,,,,,,,1
4,/html/body/document/type/sequence/filename/des...,Exhibit 4.7,4,,,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,/html/body/document/type/sequence/filename/des...,Name:,3363,,,,,,,,,,,,1
3364,/html/body/document/type/sequence/filename/des...,Title:,3364,,,,,,,,,,,,1
3365,/html/body/document/type/sequence/filename/des...,[,3365,,,,,,,,,,,,1
3366,/html/body/document/type/sequence/filename/des...,Signature Page to Merger Agreement,3366,,,,,,,,,,,,1


In [341]:
merged_ordered[merged_ordered['all_nodes_ordering'] > merged_ordered.index]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,is_outside


In [342]:
merged_ordered[1087:1097]['text'].tolist()

['”) in connection with the Second Merger.\xa0 The First Merger shall become effective at such time (the “',
 'First Effective Time',
 '”) as the First Certificate of Merger is duly filed with the Delaware Secretary of State (or at such later time as Parent and the Company shall agree and is specified in the First Certificate of Merger) and the Second Merger shall become effective at such time (the “',
 'Second Effective Time',
 '”) as the Second Certificate of Merger is duly filed with the Delaware Secretary of State (or at such later time as Parent and the Company shall agree and is specified in the Second Certificate of Merger, but in any event following the First Effective Time and as soon as practicable following the First Effective Time).',
 '(b)',
 '(i) At the First Effective Time, Merger Sub I shall be merged with and into the Company in accordance with the DGCL (the “',
 'First Merger',
 '”), whereupon the separate existence of Merger Sub I shall cease and the Company shall be

In [343]:
merged_ordered

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,is_outside
0,/html/body/document/type,EX-4.7,0,,,,,,,,,,,,1
1,/html/body/document/type/sequence,3,1,,,,,,,,,,,,1
2,/html/body/document/type/sequence/filename,a21-3954_1ex4d7.htm,2,,,,,,,,,,,,1
3,/html/body/document/type/sequence/filename/des...,EX-4.7,3,,,,,,,,,,,,1
4,/html/body/document/type/sequence/filename/des...,Exhibit 4.7,4,,,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3363,/html/body/document/type/sequence/filename/des...,Name:,3363,,,,,,,,,,,,1
3364,/html/body/document/type/sequence/filename/des...,Title:,3364,,,,,,,,,,,,1
3365,/html/body/document/type/sequence/filename/des...,[,3365,,,,,,,,,,,,1
3366,/html/body/document/type/sequence/filename/des...,Signature Page to Merger Agreement,3366,,,,,,,,,,,,1


Okay I see what happened. At index 737 for all nodes ordering, somehow the join had a double repeat. All examples of that node in the left dataframe were deleted since 2 highlight nodes before it matched with it. Not sure why tho?

The (c) node was already matched as well as what followed it but then it and one of the successors was matched again. I think there can be some kind of check to see if highlight node was already matched and if removing current row would destroy the all nodes order, just dont delete the row... 

Seems weird but maybe it would work. 

Maybe another method could be tracking the start coordinates of the text in each node and then somehow getting a stricter order on that...

Or maybe tracking which char offset from node beginning a highlight belongs to. then if theres overlap in the interval, we merge, if not we move on. do the same for the all nodes as well

Let's do the BIES tagging now

In [344]:
def tag_bies_for_highlights(merged: pd.DataFrame) -> pd.DataFrame:
    
    tags = []
    count = 1
    for i, row in merged.iterrows():
        list_entry_count = row.num_entries_1
        
        # Non highlighted row
        if math.isnan(list_entry_count):
            tags.append('o')
        
        # Single highlighted node
        elif list_entry_count == 1:
            tags.append(f's_{row.highlighted_labels}')
            count = 1
        
        # Last entry in group greater than size 1
        elif count == list_entry_count:
            tags.append(f'e_{row.highlighted_labels}')
            count = 1
        elif (count < list_entry_count) and count == 1:
            tags.append(f'b_{row.highlighted_labels}')
            count += 1
        elif (count < list_entry_count) and count > 1:
            tags.append(f'i_{row.highlighted_labels}')
            count += 1
    merged['tagged_sequence'] = tags
    return merged

In [345]:
merged_tagged = tag_bies_for_highlights(merged_ordered)
merged_tagged
merged_tagged[merged_tagged['is_outside'] < 1]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,is_outside,tagged_sequence
379,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,379,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0.0,"[2832px, 598.539px, 242.922px, 15px]",1.0,0.0,2832.0,598.539,242.9220,15.0,0,s_t
404,/html/body/document/type/sequence/filename/des...,ARTICLE I,404,/html/body/document/type/sequence/filename/des...,ARTICLE I,sn,1.0,"[3331.5px, 685.602px, 68.7969px, 15px]",1.0,1.0,3331.5,685.602,68.7969,15.0,0,s_sn
405,/html/body/document/type/sequence/filename/des...,DEFINITIONS,405,/html/body/document/type/sequence/filename/des...,DEFINITIONS,st,2.0,"[3362.5px, 675.922px, 88.1562px, 15px]",1.0,2.0,3362.5,675.922,88.1562,15.0,0,s_st
406,/html/body/document/type/sequence/filename/des...,Section 1.01,406,/html/body/document/type/sequence/filename/des...,Section 1.01,ssn,3.0,"[3393.5px, 104px, 67.6484px, 15px]",1.0,3.0,3393.5,104.000,67.6484,15.0,0,s_ssn
407,/html/body/document/type/sequence/filename/des...,Definitions,407,/html/body/document/type/sequence/filename/des...,Definitions,sst,4.0,"[3393.5px, 199.648px, 63.3203px, 15px]",2.0,4.0,3393.5,199.648,63.3203,15.0,0,b_sst
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,/html/body/document/type/sequence/filename/des...,(including,1368,/html/body/document/type/sequence/filename/des...,.,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,71.0,15889.5,199.156,91.8047,15.0,0,b_sst
1370,/html/body/document/type/sequence/filename/des...,).,1370,/html/body/document/type/sequence/filename/des...,.,sst,62.0,"[15889.5px, 199.156px, 91.8047px, 15px]",2.0,71.0,15889.5,199.156,91.8047,15.0,0,e_sst
1371,/html/body/document/type/sequence/filename/des...,Section 2.12,1371,/html/body/document/type/sequence/filename/des...,Section 2.12,ssn,63.0,"[15967px, 104px, 68.6484px, 15px]",1.0,72.0,15967.0,104.000,68.6484,15.0,0,s_ssn
1372,/html/body/document/type/sequence/filename/des...,Further Assurances,1372,/html/body/document/type/sequence/filename/des...,Further Assurances,sst,64.0,"[15967px, 199.648px, 105.883px, 15px]",2.0,73.0,15967.0,199.648,105.8830,15.0,0,b_sst


In [346]:
# merged_tagged[merged_tagged['is_outside'] < 1]['tagged_sequence'].tolist()

In [347]:
merged_tagged[merged_tagged['is_outside'] < 1]['tagged_sequence'].tolist(), merged_tagged[merged_tagged['is_outside'] < 1]['text'].tolist()

(['s_t',
  's_sn',
  's_st',
  's_ssn',
  'b_sst',
  'e_sst',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_n',
  's_ssn',
  'b_sst',
  'e_sst',
  's_n',
  's_sn',
  's_st',
  's_ssn',
  'b_sst',
  'e_sst',
  'b_sst',
  'e_sst',
  'b_sst',
  's_n',
  's_ssn',
  'b_sst',
  'e_sst',
  's_n',
  's_ssn',
  'b_sst',
  'e_sst',
  's_ssn',
  'b_sst',
  'e_sst',
  's_n',
  's_ssn',
  'b_sst',
  'e_sst',
  's_n',
  's_n',
  's_n',
  's_ssn',
  'b_sst',
  'e_sst',
  'b_sst',
  'e_sst',
  'b_sst',
  'e_sst',
  'b_sst',
  'e_sst',
  's_ssn',
  'b_sst',
  'e_sst',
  's_n',
  's_n',
  's_ssn',
  'b_sst',
  'e_sst',
  'b_sst',
  'e_sst',
  's_n',
  's_ssn',
  'b_sst',
  'e_sst',
  's_ssn',
  'b_sst',
  'e_sst',
  's_ssn',
  'b_sst',
  'e_sst',
  'b_sst',
  'e_sst',
  's_ssn',
  'b_sst',
  'e_sst'],
 ['AGREEMENT AND PLAN OF MERGER',
  'ARTICLE I',
  'DEFINITIONS',
  'Section 1.01',
  'Defini

In [348]:
merged_tagged.to_csv('overlabeled_labeled.csv')

I think I need to modify this algorithm. The overlabeled file that I just tried somehow labeled index 245 as b_n but i think its a single highlight node? It looks like the master nodes might be traversing in a different order tbh not sure, I did label pretty wildly... but still not sure how that happens?

In [349]:
highlighted_df.iloc[2]

highlighted_xpaths            [/html/body/document/type/sequence/filename/de...
highlighted_segmented_text                                        [DEFINITIONS]
highlighted_labels                                                           st
highlighted_coordinates                  [3362.5px, 675.922px, 88.1562px, 15px]
segment_number_from_idx                                                       2
num_entries_1                                                                 1
num_entries_2                                                                 1
Name: 2, dtype: object

In [350]:
highlighted_df.iloc[2]['highlighted_xpaths']

['/html/body/document/type/sequence/filename/description/text/div[13]/p[10]/b/font']