In [11]:
import json
import pandas as pd
import numpy as np
import ast
import math
from collections import Counter
import re
import os
import post_process_helper
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unit Tests are integrated as part of the code after each function. There are a few main unit tests:

1. Merging results in proper monotonic left join
2. Highlighted xpath should always equal full xpath after merge
3. Check that all highlight boxes are in monotonic order
4. Ensure highlight boxes have no overlap or negligible overlap in IOU(~0.1% for now)

TODO: Create unit tests for filtering all/highlight node fns, 

In [12]:
import pytest
import ipytest
ipytest.autoconfig()

Plan:
1. Write explicit filtering scripts for both highlight df and all nodes
    - Print or make clear any problems
2. Merge with good principles
    - Print or make clear any problems
3. Apply agreed upon period rule
4. Special characters handling (at least new lines and weird spaces that won't get tokenized properly?)

In [13]:
#all_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_0_all_nodes.json"
#highlighted_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_0_highlighted.json"
#highlighted_nodes_path = r"C:\Users\islam\Downloads\contract_saved (10).json"

path_to_labeled_contracts = "C:/Users/islam/Desktop/2023 Research/contracts/labeled/"
contract_num = 29
#contract_num = 34

all_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_all_nodes.json"
highlighted_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_highlighted.json"
highlight_nodes_edits_path = f"{path_to_labeled_contracts}edits/contract_{contract_num}_highlighted.json"


with open(all_nodes_path, encoding='UTF-8') as f:
    all_nodes_data = json.load(f)
with open(highlighted_nodes_path, encoding='UTF-8') as f:
    highlighted_data = json.load(f)
highlighted_data_edits = None
if os.path.exists(highlight_nodes_edits_path):
    with open(highlight_nodes_edits_path, encoding='UTF-8') as f:
        highlighted_data_edits = json.load(f)

In [17]:
highlight_nodes_edits_path

'C:/Users/islam/Desktop/2023 Research/contracts/labeled/edits/contract_29_highlighted.json'

In [24]:
all_nodes_df = post_process_helper.create_all_nodes_df(all_nodes_data)
exploded_highlight_df = post_process_helper.create_highlight_nodes_df(highlighted_data)

if os.path.exists(highlight_nodes_edits_path):
    highlight_edits = post_process_helper.create_highlight_nodes_df(highlighted_data_edits)

364 rows with empty strings were removed
There is no mismatch in segmentations and groupings, we can proceed
Filtering highlight nodes df now
4 rows with empty strings were removed
30 rows with DEL, DELETED were removed
There is no mismatch in segmentations and groupings, we can proceed
Filtering highlight nodes df now


In [5]:
exploded_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0014997905320485966, 0.21368421052631578, 0...",8,0,0.001500,0.213684,0.572632,0.003946,0.001500,8
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0014997905320485966, 0.21368421052631578, 0...",8,1,0.001500,0.213684,0.572632,0.003946,0.001500,8
2,/html/body/document/type/sequence/filename/des...,"THE\nGOODYEAR TIRE & RUBBER COMPANY,",t,0,"[0.0014997905320485966, 0.21368421052631578, 0...",8,2,0.001500,0.213684,0.572632,0.003946,0.001500,8
3,/html/body/document/type/sequence/filename/des...,"VULCAN MERGER SUB INC.,",t,0,"[0.0014997905320485966, 0.21368421052631578, 0...",8,3,0.001500,0.213684,0.572632,0.003946,0.001500,8
4,/html/body/document/type/sequence/filename/des...,and,t,0,"[0.0014997905320485966, 0.21368421052631578, 0...",8,4,0.001500,0.213684,0.572632,0.003946,0.001500,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,/html/body/document/type/sequence/filename/des...,80,n,331,"[0.9773104315039799, 0.49531789473684207, 0.00...",1,362,0.977310,0.495318,0.009359,0.000251,0.977310,1
363,/html/body/document/type/sequence/filename/des...,Section 10.3,ssn,332,"[0.9858818600754085, 0.23658456140350875, 0.04...",1,363,0.985882,0.236585,0.046771,0.000251,0.985882,1
364,/html/body/document/type/sequence/filename/des...,Other Definitional and Interpretative Provisions,sst,333,"[0.9858818600754085, 0.28569052631578945, 0.18...",1,364,0.985882,0.285691,0.181590,0.000251,0.985882,1
365,/html/body/document/type/sequence/filename/des...,81,n,334,"[0.9889400921658986, 0.49531789473684207, 0.00...",1,365,0.988940,0.495318,0.009359,0.000251,0.988940,1


### Okay so the EDIT df is where the issue comes in. Somehow the variables from the actual df are leaked to the edit df?

In [6]:
all_nodes_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_all_nodes.csv')
exploded_highlight_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_highlighted.csv')
if os.path.exists(highlight_nodes_edits_path):
    highlight_edits.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_edited.csv')

## Now we have to solve the repeat section title bug and then finally test pipeline and merging

In [9]:
obj_cols = [
    'highlighted_xpaths',
    'highlighted_segmented_text',
    'highlighted_labels',
    'segment_number_from_idx',
    'highlighted_coordinates'
]
all_nodes_copy = all_nodes_df.copy(deep=True)

for col in exploded_highlight_df.columns:
    all_nodes_copy[col] = np.nan
    if col in obj_cols:
        all_nodes_copy[col] = all_nodes_copy[col].astype(object)
highlight_copy = exploded_highlight_df.copy(deep=True)

In [10]:
def fill_row(all_nodes, highlight_df, all_idx, highlight_idx):
    for col in highlight_df.columns:
        all_nodes.at[all_idx, col] = highlight_df.at[highlight_idx, col]
    return None

In [11]:
def merge(all_nodes_df, highlight_nodes_df):
    '''
    Curr status: I think this function simply stops when theres a lack of a match based on two pointers
    '''
    curr_highlight_node_idx = 0
    
    for i, row in all_nodes_df.iterrows():
        xpath_match = (
            row.xpaths == highlight_nodes_df.at[
            curr_highlight_node_idx,
            'highlighted_xpaths'
        ])
    
        #text_subset = highlight_nodes_df.at[
        #    curr_highlight_node_idx,
        #    'highlighted_segmented_text'
        #] == row.text
        
        text_subset = True
        if xpath_match and text_subset:
            fill_row(
                all_nodes=all_nodes_df,
                highlight_df=highlight_nodes_df, 
                all_idx=i,
                highlight_idx=curr_highlight_node_idx
            )
            curr_highlight_node_idx += 1
        print(curr_highlight_node_idx)
        if curr_highlight_node_idx >= len(highlight_nodes_df):
            return
        


In [12]:
merge(all_nodes_copy, highlight_copy)
all_nodes_copy

0
0
0
0
0
0
1
2
3
4
5
6
7
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
11
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
14
14
15
16
17
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
21
21
21
21
21
21
21
21
21
21
21
21


Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size
0,/html/body/document/type,EX-2.1,0,,,,,,,,,,,,
1,/html/body/document/type/sequence,2,1,,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,tm2111917d1_ex2-1.htm,2,,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EXHIBIT 2.1,3,,,,,,,,,,,,
4,/html/body/document/type/sequence/filename/des...,Exhibit 2.1,4,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2817,/html/body/document/type/sequence/filename/des...,".\nThe corporate seal, if any, shall be in suc...",2817,,,,,,,,,,,,
2818,/html/body/document/type/sequence/filename/des...,ARTICLE X,2818,,,,,,,,,,,,
2819,/html/body/document/type/sequence/filename/des...,AMENDMENTS,2819,,,,,,,,,,,,
2820,/html/body/document/type/sequence/filename/des...,These bylaws may be altered or repealed by a m...,2820,,,,,,,,,,,,


In [13]:
all_nodes_copy.to_csv("trial.csv")

## Merging potenetially completed. Check output

In [14]:
exploded_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size,top_via_line_group
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,0,0.001842,0.005614,0.988772,0.005896,8,0.001842
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,1,0.001842,0.005614,0.988772,0.005896,8,0.001842
2,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,2,0.001842,0.005614,0.988772,0.005896,8,0.001842
3,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,3,0.001842,0.005614,0.988772,0.005896,8,0.001842
4,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,4,0.001842,0.005614,0.988772,0.005896,8,0.001842
...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,/html/body/document/type/sequence/filename/des...,84,n,332,"[0.9193895961983177, 0.49531789473684207, 0.00...",1,385,0.919390,0.495318,0.009359,0.000381,1,0.919390
386,/html/body/document/type/sequence/filename/des...,85,n,333,"[0.9320449289725802, 0.49531789473684207, 0.00...",1,386,0.932045,0.495318,0.009359,0.000381,1,0.932045
387,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1,387,0.952184,0.495318,0.009359,0.000381,1,0.952184
388,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,388,0.972323,0.495318,0.009359,0.000381,1,0.972323


In [15]:
exploded_highlight_df[exploded_highlight_df['top'] != exploded_highlight_df['top_via_line_group']]

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size,top_via_line_group


## I noticed that the period removing function when applied added more rows to delete but I dont understand why that is the case at all. It was hard to debug. I left it alone though to solve other problems since the function looked like it made sense... But still worried

In [50]:
highlighted_df = pd.DataFrame()
highlighted_df['highlighted_xpaths'] = ast.literal_eval(highlighted_data['xpaths'])
highlighted_df['highlighted_segmented_text'] = ast.literal_eval(highlighted_data['segmentedTexts'])
highlighted_df['highlighted_labels'] = ast.literal_eval(highlighted_data['labels'])
highlighted_df['highlighted_coordinates'] = ast.literal_eval(highlighted_data['c'])
highlighted_df['segment_number_from_idx'] = highlighted_df.index.copy()

#highlighted_df = highlighted_df.apply(lambda row: remove_periods(row), axis=1)
highlighted_df['num_entries_1'] = highlighted_df['highlighted_xpaths'].apply(len)
highlighted_df['num_entries_2'] = highlighted_df['highlighted_segmented_text'].apply(len)

assert highlighted_df['num_entries_1'].equals(highlighted_df['num_entries_2']), 'Mismatch in segmentation and groupings'
print("There is no mismatch in segmentations and groupings, we can proceed")

# Step 2: explode
exploded_highlight_df = highlighted_df[
['highlighted_xpaths',
 'highlighted_segmented_text',
 'highlighted_labels',
 'segment_number_from_idx',
 'highlighted_coordinates',
 'num_entries_1']
].explode(column=['highlighted_xpaths','highlighted_segmented_text']).reset_index(drop=True)

exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()

There is no mismatch in segmentations and groupings, we can proceed


In [51]:
hr = pd.DataFrame()
hr['highlighted_xpaths'] = ast.literal_eval(highlighted_data['xpaths'])
hr['highlighted_segmented_text'] = ast.literal_eval(highlighted_data['segmentedTexts'])
hr['highlighted_labels'] = ast.literal_eval(highlighted_data['labels'])
hr['highlighted_coordinates'] = ast.literal_eval(highlighted_data['c'])
hr['segment_number_from_idx'] = hr.index.copy()

hr = hr.apply(lambda row: remove_periods(row), axis=1)
hr['num_entries_1'] = hr['highlighted_xpaths'].apply(len)
hr['num_entries_2'] = hr['highlighted_segmented_text'].apply(len)

assert hr['num_entries_1'].equals(hr['num_entries_2']), 'Mismatch in segmentation and groupings'
print("There is no mismatch in segmentations and groupings, we can proceed")

# Step 2: explode
hr_e = hr[
['highlighted_xpaths',
 'highlighted_segmented_text',
 'highlighted_labels',
 'segment_number_from_idx',
 'highlighted_coordinates',
 'num_entries_1']
].explode(column=['highlighted_xpaths','highlighted_segmented_text']).reset_index(drop=True)

hr_e['exploded_highlight_node_order'] = hr_e.index.copy()

There is no mismatch in segmentations and groupings, we can proceed


In [52]:
exploded_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,0
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,1
2,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,2
3,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,3
4,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,4
...,...,...,...,...,...,...,...
447,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1,447
448,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,448
449,,,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,449
450,,,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,450


In [53]:
hr_e

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,0
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,1
2,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,2
3,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,3
4,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,4
...,...,...,...,...,...,...,...
419,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1,419
420,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,420
421,,,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,421
422,,,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,422


In [48]:
exploded_highlight_df = filter_highlight_nodes_df(exploded_highlight_df)

Filtering highlight nodes df now
4 rows with empty strings were removed
10 rows with DEL, DELETED were removed


In [49]:
hr_e = filter_highlight_nodes_df(hr_e)

Filtering highlight nodes df now
4 rows with empty strings were removed
30 rows with DEL, DELETED were removed


In [56]:
hr_e[hr_e['highlighted_coordinates'] == 'DEL']

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order
13,D,D,DEL,6,DEL,3,13
14,E,E,DEL,6,DEL,3,14
15,L,L,DEL,6,DEL,3,15
26,D,D,DEL,15,DEL,3,26
27,E,E,DEL,15,DEL,3,27
28,L,L,DEL,15,DEL,3,28
29,D,D,DEL,16,DEL,3,29
30,E,E,DEL,16,DEL,3,30
31,L,L,DEL,16,DEL,3,31
33,D,D,DEL,18,DEL,3,33


In [57]:
exploded_highlight_df[exploded_highlight_df['highlighted_coordinates'] == 'DEL']

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order
13,DEL,DEL,DEL,6,DEL,3,13
26,DEL,DEL,DEL,15,DEL,3,26
27,DEL,DEL,DEL,16,DEL,3,27
29,DEL,DEL,DEL,18,DEL,3,29
30,DEL,DEL,DEL,19,DEL,3,30
92,DEL,DEL,DEL,64,DEL,3,92
137,DEL,DEL,DEL,97,DEL,3,137
178,DEL,DEL,DEL,127,DEL,3,178
207,DEL,DEL,DEL,149,DEL,3,207
387,DEL,DEL,DEL,290,DEL,3,387


In [16]:
# # Monotonic left merge using dictionary for highlighted df. 
# # This version is much faster than the 
# # version below, but loses the df index ordering.
# # Does not affect highlight/all node ordering.

# def monotonic_left_merge(full, highlighted):
#     found_xpaths = []
#     merged_rows = []
#     highlighted_dict = highlighted.groupby(['highlighted_xpaths', \
#     'highlighted_segmented_text']).first().to_dict('index')
#     # print(highlighted_dict)
#     for _, row in full.iterrows():
#         full_xpaths = row['xpaths']
#         full_texts = row['text']
#         # Could do fuzzy matching instead of direct full text matching
#         matched_hrow = highlighted_dict.get((full_xpaths, full_texts))
        
#         if matched_hrow is None:
#             merged_rows.append(row)
#         else:
#             merged_row = pd.concat([row, pd.Series(matched_hrow)], axis=0)
#             merged_rows.append(merged_row)
#             highlighted_dict.pop((full_xpaths, full_texts))
        
#     merged_df = pd.concat(merged_rows, axis=1).T
            
#     return merged_df

In [17]:
def monotonic_left_merge(full, highlighted):
    found_xpaths = []
    merged_df = []
    for _, row in full.iterrows():
        full_xpaths = row['xpaths']
        full_texts = row['text']
        n_order = row['all_nodes_ordering']

        matched_hrow = pd.Series([])
        # print(full_texts, full_xpaths)
        for _, h_row in highlighted.iterrows():
            h_xpaths = h_row['highlighted_xpaths']
            h_texts = h_row['highlighted_segmented_text']

            if h_xpaths == full_xpaths and h_texts in full_texts:
                if h_xpaths in found_xpaths:
                    print('CURRENT h_xpath ALREADY FOUND BEFORE:')
                    print(h_xpaths)
                    _
                    
                found_xpaths.append(h_xpaths)
                # Instead of appending to merged_df 
                # directly, save the specific h_row 
                # and as you build the merged_df 
                # in the outer loop for all texts, for 
                # the highlight matched rows, append 
                # the h_row as well.
                
                # merged_df.append((row, h_row))
                matched_hrow = h_row
                break
        if len(matched_hrow) == 0:
            # print('No matched highlighted xpaths', full_xpaths)
            merged_df.append(row)
        else:
            merged_df.append(pd.concat([row, matched_hrow], axis=0))
            
    return merged_df

In [19]:
merged_bad = monotonic_left_merge(df, exploded_highlight_df)
merged_bad

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[3]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[4]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[4]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[7]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[7]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

  matched_hrow = pd.Series([])


KeyboardInterrupt: 

In [27]:
merged_bad = pd.DataFrame(merged_bad)
merged_bad

In [28]:
merged_bad

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type,EX-2.1,0,,,,,,,,,,,
1,/html/body/document/type/sequence,2,1,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,d110570dex21.htm,2,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EX-2.1,3,,,,,,,,,,,
4,/html/body/document/type/sequence/filename/des...,EX-2.1,4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2446,/html/body/document/type/sequence/filename/des...,Name:,2446,,,,,,,,,,,
2447,/html/body/document/type/sequence/filename/des...,Murugesan Shanmugaraj,2447,,,,,,,,,,,
2448,/html/body/document/type/sequence/filename/des...,Title:,2448,,,,,,,,,,,
2449,/html/body/document/type/sequence/filename/des...,Chief Executive Officer,2449,,,,,,,,,,,


In [36]:
merged_bad[~merged_bad['highlighted_segmented_text'].isna()].to_csv('badmerge.csv')

In [37]:
x = merged_bad.copy()

In [39]:
x.iloc[240]

xpaths                           /html/body/document/type/sequence/filename/des...
text                                                                         5.15.
all_nodes_ordering                                                             241
highlighted_xpaths                                                             NaN
highlighted_segmented_text                                                     NaN
highlighted_labels                                                             NaN
segment_number_from_idx                                                        NaN
highlighted_coordinates                                                        NaN
num_entries_1                                                                  NaN
exploded_highlight_node_order                                                  NaN
top                                                                            NaN
left                                                                           NaN
widt

In [61]:
count = 0
for col in exploded_highlight_df.columns:
    
    x.at[241, col] = count
    count += 1

In [18]:
exploded_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type/sequence/filename/des...,A,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",39,0,0.001763,0.216667,0.566667,0.004904
1,/html/body/document/type/sequence/filename/des...,MENDED,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",39,1,0.001763,0.216667,0.566667,0.004904
2,/html/body/document/type/sequence/filename/des...,AND,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",39,2,0.001763,0.216667,0.566667,0.004904
3,/html/body/document/type/sequence/filename/des...,R,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",39,3,0.001763,0.216667,0.566667,0.004904
4,/html/body/document/type/sequence/filename/des...,ESTATED,t,0,"[0.0017627479172000866, 0.21666666666666667, 0...",39,4,0.001763,0.216667,0.566667,0.004904
...,...,...,...,...,...,...,...,...,...,...,...
606,/html/body/document/type/sequence/filename/des...,8.11.,ssn,552,"[0.988359955094243, 0.2393284722222222, 0.0181...",1,606,0.988360,0.239328,0.018175,0.000295
607,/html/body/document/type/sequence/filename/des...,WAIVER OF JURY TRIAL,sst,553,"[0.988359955094243, 0.2596465277777778, 0.1064...",1,607,0.988360,0.259647,0.106467,0.000295
608,/html/body/document/type/sequence/filename/des...,8.12.,ssn,554,"[0.9897386406160755, 0.2393284722222222, 0.018...",1,608,0.989739,0.239328,0.018517,0.000295
609,/html/body/document/type/sequence/filename/des...,Original Agreement,sst,555,"[0.9897386406160755, 0.26016180555555557, 0.07...",1,609,0.989739,0.260162,0.076096,0.000295


In [None]:
def monotonic_left_merge(full, highlighted):
    

## Other plan is to just bring in all node text from highlights as well (beyond just highlighted text! and then simply do a join)

In [769]:
# %%ipytest
# Uncomment above line to activate test
'''
Test merging as a proper left join
1. All left and right rows should be unmodified
2. No left rows should be deleted -> only matched right rows should exist

'''
def test_monotonic_left_merge():
    # Create sample data for testing
    full_data = {
        'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
        'text': ['A', 'B', 'C', 'D', 'E'],
        'all_node_ordering': ['1', '2', '3', '4', '5']
    }
    highlighted_data = {
        'highlighted_xpaths': ['p[2]', 'p[4]'],
        'highlighted_segmented_text': ['B', 'D'],
        'top': ['0', '10'],
        'left': ['5', '15']
    }

    full_df = pd.DataFrame(full_data)
    highlighted_df = pd.DataFrame(highlighted_data)

    # Expected result after left merge
    expected_data = {
        'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
        'text': ['A', 'B', 'C', 'D', 'E'],
        'all_node_ordering': ['1', '2', '3', '4', '5'],
        'top': [np.nan, '0', np.nan, '10', np.nan],
        'left': [np.nan, '5', np.nan, '15', np.nan],
    }
    expected_df = pd.DataFrame(expected_data)

    # Call the monotonic_left_merge function
    merged_df = monotonic_left_merge(full_df, highlighted_df)
    print(merged_df)
    print(expected_df)
    # TEST EQUAL DFs - Not required, but good to have.
    # Compare the merged_df with the expected_df
    pd.testing.assert_frame_equal(merged_df.reset_index().drop(columns=['index']), expected_df)
    
    # TEST MONOTONIC ORDERING
    assert merged_df['all_node_ordering'].is_monotonic_increasing


In [770]:
merged_list = monotonic_left_merge(df, exploded_highlight_df)

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[3]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[4]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[4]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[6]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[7]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[1]/div/p[7]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[6]/div/p[1]/b


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[42]/div/p[1]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[43]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[45]/div/p[2]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[45]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[55]/div/p[2]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[55]/div/p[2]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[55]/div/p[3]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[56]/div/p[2]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[56]/div/p[7]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[57]/div/p[1]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[57]/div/p[2]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[63]/div/p[5]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[73]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[74]/div/p[3]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[74]/div/p[5]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[75]/div/p[1]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[75]/div/p[8]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[1]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[3]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[4]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[76]/div/p[6]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[77]/div/p[5]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/center[81]/div/p[4]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])


In [776]:
# [row for row in merged_df]
merged_df = pd.DataFrame(merged_list)

In [777]:
merged_df

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type,EX-2.1,0,,,,,,,,,,,
1,/html/body/document/type/sequence,2,1,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,d110570dex21.htm,2,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EX-2.1,3,,,,,,,,,,,
4,/html/body/document/type/sequence/filename/des...,EX-2.1,4,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2446,/html/body/document/type/sequence/filename/des...,Name:,2446,,,,,,,,,,,
2447,/html/body/document/type/sequence/filename/des...,Murugesan Shanmugaraj,2447,,,,,,,,,,,
2448,/html/body/document/type/sequence/filename/des...,Title:,2448,,,,,,,,,,,
2449,/html/body/document/type/sequence/filename/des...,Chief Executive Officer,2449,,,,,,,,,,,


In [723]:
# merged = pd.merge(df, exploded_highlight_df, left_on='xpaths', right_on='highlighted_xpaths', how='left', indicator=True)
# merged.iloc[1087:1097]


In [724]:
# merged.iloc[1087:1097]['xpaths'].tolist()

In [725]:
merged_df['is_outside'] = merged_df['segment_number_from_idx'].apply(lambda x: 1 if math.isnan(x) else 0)

merged = merged_df

In [726]:
merged.to_csv("drop_check.csv")

In [727]:
#merged[merged['is_outside'] < 1].to_csv('merged_test.csv')

In [728]:
# merged[merged['is_outside'] < 1]

In [773]:
def assert_node_ordering_for_merged_table(merged: pd.DataFrame) -> pd.DataFrame:
    '''
    Due to repeat xpaths in both left and right table in merged, need to remove the duplicated rows
    '''
    drop_indices = []
#     last_highlight_node_idx = -1
#     for i, row in merged[merged['is_outside'] < 1].iterrows():
#         if row.exploded_highlight_node_order and row.exploded_highlight_node_order != last_highlight_node_idx + 1:
#             drop_indices.append(i)
#         else:
#             last_highlight_node_idx = row.exploded_highlight_node_order
    merged = merged.drop(drop_indices).reset_index(drop=True)
    
    # Assert both orderings keep their original structure
    assert merged.all_nodes_ordering.is_monotonic_increasing
    assert merged[merged['is_outside'] < 1].exploded_highlight_node_order.is_monotonic_increasing
    
    return merged, drop_indices

In [774]:
# # Hacky fix for some highlight node orders 
# # being off for some reason; fix is to set 
# # the value to the previous + 1.
for i in range(1, len(merged)):
    cur = float(merged.iloc[i].exploded_highlight_node_order)
    prev = float(merged.iloc[i-1].exploded_highlight_node_order)
    if cur < prev:
        merged.iloc[i, merged.columns.get_loc('exploded_highlight_node_order')] = prev + 1

In [779]:
# Code to check if the node ordering is off and print the location/value of discrepancy

# for f in range(1, len(merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist())):
#     a = merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f-1]
#     if a >= merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f]:
#         print(a)

merged_df[26:46]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
Unnamed 19,/html/body/document/type/sequence/filename/des...,YSTEMS,27,/html/body/document/type/sequence/filename/des...,YSTEMS,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,19.0,0.001763,0.216667,0.566667,0.004904
Unnamed 20,/html/body/document/type/sequence/filename/des...,", I",28,/html/body/document/type/sequence/filename/des...,", I",t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,20.0,0.001763,0.216667,0.566667,0.004904
Unnamed 21,/html/body/document/type/sequence/filename/des...,NC,29,/html/body/document/type/sequence/filename/des...,NC,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,21.0,0.001763,0.216667,0.566667,0.004904
Unnamed 22,/html/body/document/type/sequence/filename/des...,".,",30,/html/body/document/type/sequence/filename/des...,".,",t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,22.0,0.001763,0.216667,0.566667,0.004904
Unnamed 23,/html/body/document/type/sequence/filename/des...,A,31,/html/body/document/type/sequence/filename/des...,A,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,23.0,0.001763,0.216667,0.566667,0.004904
Unnamed 24,/html/body/document/type/sequence/filename/des...,MARONE,32,/html/body/document/type/sequence/filename/des...,MARONE,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,24.0,0.001763,0.216667,0.566667,0.004904
Unnamed 25,/html/body/document/type/sequence/filename/des...,A,33,/html/body/document/type/sequence/filename/des...,A,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,23.0,0.001763,0.216667,0.566667,0.004904
Unnamed 26,/html/body/document/type/sequence/filename/des...,CQUISITION,34,/html/body/document/type/sequence/filename/des...,CQUISITION,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,26.0,0.001763,0.216667,0.566667,0.004904
Unnamed 27,/html/body/document/type/sequence/filename/des...,C,35,/html/body/document/type/sequence/filename/des...,C,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,27.0,0.001763,0.216667,0.566667,0.004904
Unnamed 28,/html/body/document/type/sequence/filename/des...,ORP,36,/html/body/document/type/sequence/filename/des...,ORP,t,0.0,"[0.0017627479172000866, 0.21666666666666667, 0...",39.0,28.0,0.001763,0.216667,0.566667,0.004904


In [759]:
merged_ordered, drop_indices = assert_node_ordering_for_merged_table(merged)
empty_xpath_index = merged_ordered.fillna(0).where(merged_ordered['xpaths'] == '').dropna().index
empty_xpath_index
merged_ordered = merged_ordered.drop(empty_xpath_index)

  assert merged.all_nodes_ordering.is_monotonic_increasing
  assert merged[merged['is_outside'] < 1].exploded_highlight_node_order.is_monotonic_increasing


Why am I losing rows (1 row now) from the left table?

In [660]:
# merged_ordered.iloc[-90:-60]

In [623]:
# merged_ordered[merged_ordered['all_nodes_ordering'] == merged_ordered.index]



In [659]:
# merged_ordered[merged_ordered['all_nodes_ordering'] > merged_ordered.index]

Okay I see what happened. At index 737 for all nodes ordering, somehow the join had a double repeat. All examples of that node in the left dataframe were deleted since 2 highlight nodes before it matched with it. Not sure why tho?

The (c) node was already matched as well as what followed it but then it and one of the successors was matched again. I think there can be some kind of check to see if highlight node was already matched and if removing current row would destroy the all nodes order, just dont delete the row... 

Seems weird but maybe it would work. 

Maybe another method could be tracking the start coordinates of the text in each node and then somehow getting a stricter order on that...

Or maybe tracking which char offset from node beginning a highlight belongs to. then if theres overlap in the interval, we merge, if not we move on. do the same for the all nodes as well

Let's do the BIES tagging now

## Use size instead of num entries

In [627]:
def tag_bies_for_highlights(merged: pd.DataFrame) -> pd.DataFrame:
    
    tags = []
    count = 1
    for i, row in merged.iterrows():
        list_entry_count = row.num_entries_1
        try:
            next_entry_count = merged.iloc[i+1].num_entries_1
            # print(type(next_entry_count))
            if np.isnan(next_entry_count):
                next_entry_count = list_entry_count
        except:
            next_entry_count = 0
        
        # Non highlighted row
        if math.isnan(list_entry_count):
            tags.append('o')
        
        # Single highlighted node
        elif list_entry_count == 1:
            tags.append(f's_{row.highlighted_labels}')
            count = 1
        
        # Last entry in group greater than size 1
        elif count == list_entry_count:          
        # elif list_entry_count > 1 and next_entry_count != list_entry_count:
            print(count, list_entry_count, next_entry_count)
            tags.append(f'e_{row.highlighted_labels}')
            count = 1
        elif (count < list_entry_count) and count == 1:
            tags.append(f'b_{row.highlighted_labels}')
            count += 1
        elif (count < list_entry_count) and count > 1:
            tags.append(f'i_{row.highlighted_labels}')
            count += 1
    print(count, list_entry_count, next_entry_count)
    merged['tagged_sequence'] = tags
    return merged

In [658]:
merged_tagged = tag_bies_for_highlights(merged_ordered)

39 39 1
13 13 13
2 2 2
2 2 2
2 2 2
2 2 2
2 2 2
2 2 2
3 3 3
2 2 2
1 nan 0


In [None]:
merged_tagged[merged_tagged['is_outside'] < 1]['tagged_sequence'].tolist(), merged_tagged[merged_tagged['is_outside'] < 1]['text'].tolist()

In [524]:
merged_tagged.to_csv('overlabeled_labeled.csv')

I think I need to modify this algorithm. The overlabeled file that I just tried somehow labeled index 245 as b_n but i think its a single highlight node? It looks like the master nodes might be traversing in a different order tbh not sure, I did label pretty wildly... but still not sure how that happens?

In [525]:
highlighted_df.iloc[2]

highlighted_xpaths            [/html/body/document/type/sequence/filename/de...
highlighted_segmented_text                                                  [i]
highlighted_labels                                                            n
highlighted_coordinates       [0.023033895968329623, 0.49870902777777776, 0....
segment_number_from_idx                                                       2
num_entries_1                                                                 1
num_entries_2                                                                 1
Name: 2, dtype: object

In [526]:
highlighted_df.iloc[2]['highlighted_xpaths']

['/html/body/document/type/sequence/filename/description/text/center[2]/div/p[4]']

UsageError: Line magic function `%%ipytest` not found.


In [651]:
%%ipytest

def test_

[32m.[0m[32m                                                                                            [100%][0m
[32m[32m[1m1 passed[0m[32m in 0.06s[0m[0m


In [None]:
%%ipytest

In [3]:
import pandas as pd
import json

In [4]:
pp = r"C:\Users\islam\Downloads\contract_saved (5).json"

with open(pp, encoding='UTF-8') as f:
    pp = json.load(f)
pp

{'_at.cww': '{"value":false,"expires":1680046239351}',
 '_at.hist.0127': '{"m":600,"k":2,"l":{"0":356,"1":300},"b":{"0":0,"1":0,"2":0,"3":0,"4":0,"5":0,"6":0,"7":0,"8":0,"9":16781312,"10":0,"11":16,"12":0,"13":0,"14":0,"15":0,"16":0,"17":1073741824,"18":0}}',
 'segTexts': '[["3.01"],["ffect on Capit"],["Effective Ti"],["Company Commo"],["(a)"]]',
 'texts': '[["3.01"],["Effect on Capital Stock"],[". At the Effective Time, as a result of the Merger and without any action on the\\npart of any Person:"],["Outstanding Company Common Stock"],["(a)"]]',
 'segmentedTexts': '["EX-2.1","2","d35268dex21.htm","EX-2.1","EX-2.1","Exhibit 2.1","AGREEMENT AND PLAN OF MERGER","dated as of March\xa022, 2021","by and between","BANC OF\\nCALIFORNIA, INC.","and","PACIFIC MERCANTILE BANCORP","TABLE OF CONTENTS","Page","RECITALS","ARTICLE 1","CERTAIN DEFINITIONS","1.01","Certain Definitions","2","ARTICLE 2","THE MERGER","2.01","The Merger","12","2.02","Closing; Effective Time","13","ARTICLE 3","CONSIDERATION

In [28]:
nodes_xpaths = ast.literal_eval(pp['xpaths'])
nodes_segmented_text = ast.literal_eval(pp['segTexts'])
nodes_text = ast.literal_eval(pp['texts'])
print(list(map(len, [nodes_xpaths,nodes_segmented_text,nodes_text])))
df = pd.DataFrame()
df['xpaths'] = nodes_xpaths
df['stext'] = nodes_segmented_text
df['text'] = nodes_text

[1781, 1781, 0]


In [29]:
df

Unnamed: 0,xpaths,stext,text
0,/html/body/document/type,EX-2.1,
1,/html/body/document/type/sequence,2,
2,/html/body/document/type/sequence/filename,d35268dex21.htm,
3,/html/body/document/type/sequence/filename/des...,EX-2.1,
4,/html/body/document/type/sequence/filename/des...,EX-2.1,
...,...,...,...
1776,/html/body/document/type/sequence/filename/des...,PACIFIC MERCANTILE BANCORP,
1777,/html/body/document/type/sequence/filename/des...,By:,
1778,/html/body/document/type/sequence/filename/des...,/s/ Denis Kalscheur,
1779,/html/body/document/type/sequence/filename/des...,Name: Denis Kalscheur,


In [31]:
df[df.stext.str.contains('means any effect, circumstance, occurrence or change that is ma')]

Unnamed: 0,xpaths,stext,text
367,/html/body/document/type/sequence/filename/des...,"” means any effect, circumstance, occurrence o...",
584,/html/body/document/type/sequence/filename/des...,"” means any effect, circumstance, occurrence o...",


In [32]:
df[df.stext.str.contains('means any effect, circumstance, occurrence or change that is ma')].stext.to_list()

['” means any effect, circumstance, occurrence or change that is material\nand adverse to the business, assets or deposit liabilities, properties, operations, results of operations or condition (financial or otherwise) of the Company and its Subsidiaries, taken as a whole, or that materially impairs the ability of the\nCompany to consummate the Merger and the transactions contemplated hereby on a timely basis;',
 '” means any effect, circumstance, occurrence or change that is material\nand adverse to the business, assets or deposit liabilities, properties, operations, results of operations or condition (financial or otherwise) of Parent and its Subsidiaries, taken as a whole, or that materially impairs the ability of Parent to\nconsummate the Merger and the transactions contemplated hereby on a timely basis;']

Basic idea:

It looksl like the segmented text when saving all nodes is the same as the nodeText piece from traverse (see where i log statemented it in labeler_.js.

We might be able to do a pure join assuming this is a full match...