In [2]:
import json
import pandas as pd
import numpy as np
import ast
import math
from collections import Counter
import re
import os

Unit Tests are integrated as part of the code after each function. There are a few main unit tests:

1. Merging results in proper monotonic left join
2. Highlighted xpath should always equal full xpath after merge
3. Check that all highlight boxes are in monotonic order
4. Ensure highlight boxes have no overlap, or only negligible overlap, in IoU (~0.1% for now)

TODO: Create unit tests for filtering all/highlight node fns 

In [3]:
import pytest
import ipytest
ipytest.autoconfig()

Plan:
1. Write explicit filtering scripts for both highlight df and all nodes
    - Print or make clear any problems
2. Merge with good principles
    - Print or make clear any problems
3. Apply agreed upon period rule
4. Special characters handling (at least new lines and weird spaces that won't get tokenized properly?)

In [4]:
#all_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_0_all_nodes.json"
#highlighted_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_0_highlighted.json"
#highlighted_nodes_path = r"C:\Users\islam\Downloads\contract_saved (10).json"

# Directory holding the labeled contract JSON files. The value is an absolute,
# machine-specific path, so it must be edited before running elsewhere.
path_to_labeled_contracts = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/"
# path_to_labeled_contracts = "C:/Users/islam/Desktop/2023 Research/contracts/labeled/"
# Number of the contract to process; it is interpolated into every file name below.
contract_num = 29

# Inputs: the "all nodes" dump, the highlighted (labeled) dump, and an optional
# file of manual edits stored under the "edits/" sub-directory.
all_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_all_nodes.json"
highlighted_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_highlighted.json"
highlight_nodes_edits_path = f"{path_to_labeled_contracts}edits/contract_{contract_num}_highlighted.json"


# The two main inputs are required: a missing file raises here.
with open(all_nodes_path, encoding='UTF-8') as f:
    all_nodes_data = json.load(f)
with open(highlighted_nodes_path, encoding='UTF-8') as f:
    highlighted_data = json.load(f)
# The edits file is optional; highlighted_data_edits stays None when it does not exist.
highlighted_data_edits = None
if os.path.exists(highlight_nodes_edits_path):
    with open(highlight_nodes_edits_path, encoding='UTF-8') as f:
        highlighted_data_edits = json.load(f)

In [5]:
def filter_all_nodes_df(df, xpaths_col):
    """Remove rows that sit under a script/noscript element or have an empty xpath.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with one row per node.
    xpaths_col : str
        Name of the column in ``df`` that holds the xpath strings.

    Returns
    -------
    pd.DataFrame
        The surviving rows with a fresh 0..n-1 index.

    Notes
    -----
    A message is printed for each filter that actually removed rows.
    Missing xpaths (NaN/None) are treated as "not a script node" and are kept,
    so callers can detect and report them themselves; without ``na=False`` the
    boolean mask becomes object dtype and the ``~`` negation raises TypeError.
    """
    original_length = len(df)

    # Plain substring matching: the patterns are literals, not regexes.
    xpaths = df[xpaths_col].str
    under_script = xpaths.contains('/script', regex=False, na=False)
    under_noscript = xpaths.contains('/noscript', regex=False, na=False)
    df = df[~under_script & ~under_noscript]

    if original_length - len(df) > 0:
        print(f'{original_length - len(df)} rows that had /script or /noscript were removed')

    new_length = len(df)
    df = df[df[xpaths_col] != '']

    if new_length - len(df) > 0:
        print(f'{new_length - len(df)} rows with empty strings were removed')

    return df.reset_index(drop=True)

def filter_highlight_nodes_df(df):
    """Filter the exploded highlight frame down to usable rows.

    Applies the generic xpath filter first, then removes rows marked as deleted
    ('DELETED' xpath or 'DEL' coordinates), and finally drops rows containing
    any NA value. Each step prints a message when it removed something.
    The returned frame keeps whatever index it has after filtering.
    """
    print('Filtering highlight nodes df now')

    # Reusing the all-nodes filter is safe here; ideally it removes nothing and
    # prints nothing, which makes it a guardrail against unknown bugs.
    df = filter_all_nodes_df(df, 'highlighted_xpaths')

    rows_before_deletion_filter = len(df)
    xpath_not_deleted = df['highlighted_xpaths'] != 'DELETED'
    coordinates_not_deleted = df['highlighted_coordinates'] != 'DEL'
    df = df[xpath_not_deleted & coordinates_not_deleted]

    deleted_count = rows_before_deletion_filter - len(df)
    if deleted_count > 0:
        print(f'{deleted_count} rows with DEL, DELETED were removed')

    rows_before_dropna = len(df)
    df = df.dropna()

    na_count = rows_before_dropna - len(df)
    if na_count > 0:
        print(f'{na_count} NA rows were dropped. THIS IS A PROBLEM.')

    return df

def sort_exploded_highlight_box_by_coordinates(exploded_highlight_df):
    '''
    Order highlight rows by box position: 'top' first, then 'left', with
    'exploded_highlight_node_order' as the final tie-breaker. The result gets
    a fresh 0..n-1 index.

    If the logic ever has to handle approximations and line inference
    (with IOU math), this will need to become more elaborate; for now a plain
    sort seems sufficient.
    '''
    sort_keys = ['top', 'left', 'exploded_highlight_node_order']
    ordered = exploded_highlight_df.sort_values(by=sort_keys)
    return ordered.reset_index(drop=True)

# Function to process periods in text.
# Occasionally there is weird text which 
# has a period, space, and another 
# character, which does not match to 
# anything in all_node_text; I remove 
# these as well.

def remove_periods(row):
    """Strip stray period fragments from one highlight segment.

    Parameters
    ----------
    row : pd.Series
        Must provide 'highlighted_xpaths' and 'highlighted_segmented_text'
        (parallel lists) and 'highlighted_labels' (a string).

    Returns
    -------
    pd.Series
        ``row`` itself when the label does not contain 'st'; otherwise a copy
        of ``row`` in which every text fragment equal to '.' or '. ', or any
        3-character fragment containing '. ', has been removed together with
        the xpath at the same position.

    Notes
    -----
    * ``'st' in label`` is a substring test, so longer labels that contain
      'st' are processed as well.
    * The input row is never modified; the original code aliased it and wrote
      the filtered lists back into the caller's object.
    * Filtering is done by position rather than by zipping the two lists, so
      a length mismatch between xpaths and texts is preserved for the caller
      to detect.
    * A segment consisting only of period fragments ends up with two empty
      lists.
    """
    label = row['highlighted_labels']
    if 'st' not in label:
        return row

    xpaths = row['highlighted_xpaths']
    texts = row['highlighted_segmented_text']

    # Positions of the fragments that are nothing but a stray period.
    indices_to_remove = {
        i for i, text in enumerate(texts)
        if text == '.' or text == '. ' or ('. ' in text and len(text) == 3)
    }

    cleaned = row.copy()
    cleaned['highlighted_xpaths'] = [
        xpath for i, xpath in enumerate(xpaths) if i not in indices_to_remove
    ]
    cleaned['highlighted_segmented_text'] = [
        text for i, text in enumerate(texts) if i not in indices_to_remove
    ]
    return cleaned

In [6]:
def test_apply_remove_periods(df):
    def test_remove_periods(row):
        nrow = row
        xpaths = row['highlighted_xpaths']
        texts = row['highlighted_segmented_text']
        # except:
        #     return pd.Series({'highlighted_xpaths': [], 'highlighted_segmented_text': []})
        indices_to_test = []
        if 'st' not in label:
            return []
        for i, text in enumerate(texts):
            if '.' == text or '. ' == text or ('. ' in text and len(text) == 3):
                indices_to_test.append(i)
        assert indices_to_test == [], \
        f'''In the following row: 
        {row}\n {indices_to_test} are the list of indices where 
        there are still periods.'''
        return indices_to_test
    indices_list = df.apply(lambda row: test_remove_periods(row), axis=1)
    assert all(isinstance(item, list) and len(item) == 0 for item in indices_list)
    print('ALL CHECKS PASSED')
    
def test_filter_all_nodes_df(df, xpaths_col):
    new_df = df[
        ~(df[xpaths_col].str.contains('/script')) & 
        ~(df[xpaths_col].str.contains('/noscript'))
    ]
    new_df = new_df[new_df[xpaths_col] != ''].dropna()
    assert (len(new_df) - len(df)) == 0
    print('ALL CHECKS PASSED')
    
def test_filter_highlight_nodes_df(df):
    new_df = df[
        (df['highlighted_xpaths'] != 'DELETED') &
        (df['highlighted_coordinates'] != 'DEL')
    ].dropna()
    assert (len(new_df) - len(df)) == 0
    print('ALL CHECKS PASSED')
    

In [7]:
def create_all_nodes_df(all_nodes_data):
    """Build the filtered all-nodes frame from the loaded JSON mapping.

    ``all_nodes_data['xpaths']`` and ``all_nodes_data['segmentedTexts']`` are
    strings holding Python literals; they are parsed with ``ast.literal_eval``
    and become the 'xpaths' and 'text' columns. After filtering, the row
    position is stored in 'all_nodes_ordering'.
    """
    parsed_xpaths = ast.literal_eval(all_nodes_data['xpaths'])
    parsed_texts = ast.literal_eval(all_nodes_data['segmentedTexts'])

    nodes = pd.DataFrame()
    for column_name, values in (('xpaths', parsed_xpaths), ('text', parsed_texts)):
        nodes[column_name] = values

    # Filtering resets the index, so the index is the post-filter node order.
    nodes = filter_all_nodes_df(nodes, 'xpaths')
    nodes['all_nodes_ordering'] = nodes.index.copy()

    return nodes

def create_highlight_nodes_df(highlighted_data):
    """Build the exploded, filtered and position-sorted highlight frame.

    ``highlighted_data`` is the loaded JSON mapping; its 'xpaths',
    'segmentedTexts', 'labels' and 'c' entries are strings holding Python
    literals, parsed here with ``ast.literal_eval`` into one row per
    highlighted segment.

    Steps:
      1. Parse the inputs, clean each segment with ``remove_periods`` and
         assert that every segment has as many xpaths as text fragments.
      2. Explode the parallel xpath/text lists into one row per fragment.
      3. Filter the rows with ``filter_highlight_nodes_df`` and split
         'highlighted_coordinates' into 'top', 'left', 'width', 'height'
         (elements 0..3, converted to float).
      4. Sort by position and renumber 'exploded_highlight_node_order'.
      5. Attach the post-filter row count of each segment as column 'size'.

    Note: 'num_entries_1' is the fragment count after period removal but
    before filtering, so it can differ from 'size'.

    Raises AssertionError when xpath and text counts disagree for any segment.
    """
    
    # Step 1: read in the data and do basic checks:
    #highlight_text = ast.literal_eval(highlighted_data['texts'])
    highlighted_df = pd.DataFrame()
    highlighted_df['highlighted_xpaths'] = ast.literal_eval(highlighted_data['xpaths'])
    highlighted_df['highlighted_segmented_text'] = ast.literal_eval(highlighted_data['segmentedTexts'])
    highlighted_df['highlighted_labels'] = ast.literal_eval(highlighted_data['labels'])
    highlighted_df['highlighted_coordinates'] = ast.literal_eval(highlighted_data['c'])
    # Remember which original segment each row came from (pre-explode position).
    highlighted_df['segment_number_from_idx'] = highlighted_df.index.copy()
    
    highlighted_df = highlighted_df.apply(lambda row: remove_periods(row), axis=1)
    highlighted_df['num_entries_1'] = highlighted_df['highlighted_xpaths'].apply(len)
    highlighted_df['num_entries_2'] = highlighted_df['highlighted_segmented_text'].apply(len)
    
    assert highlighted_df['num_entries_1'].equals(highlighted_df['num_entries_2']), 'Mismatch in segmentation and groupings'
    print("There is no mismatch in segmentations and groupings, we can proceed")
    
    # Step 2: explode
    # The xpath and text lists are exploded together so fragments stay paired.
    exploded_highlight_df = highlighted_df[
    ['highlighted_xpaths',
     'highlighted_segmented_text',
     'highlighted_labels',
     'segment_number_from_idx',
     'highlighted_coordinates',
     'num_entries_1']
    ].explode(column=['highlighted_xpaths','highlighted_segmented_text']).reset_index(drop=True)

    # Provisional order (explode order); used as the last sort key in Step 4
    # and then overwritten with the sorted position.
    exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()
    
    # Step 3: Filter and extract coordinates
    exploded_highlight_df = filter_highlight_nodes_df(exploded_highlight_df)
    
    exploded_highlight_df['top'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[0]))
    exploded_highlight_df['left'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[1]))
    exploded_highlight_df['width'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[2]))
    exploded_highlight_df['height'] = exploded_highlight_df['highlighted_coordinates'].apply(lambda x: float(x[3]))
    
    # Step 4: Sort
    exploded_highlight_df = sort_exploded_highlight_box_by_coordinates(exploded_highlight_df)
    exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()
    
    # Get final group sizes post filtering
    final_group_sizes = exploded_highlight_df.groupby('segment_number_from_idx', as_index=False).size()
    exploded_highlight_df = pd.merge(exploded_highlight_df, final_group_sizes, on='segment_number_from_idx')
    
    
    
    return exploded_highlight_df

In [8]:
def remove_rows_and_add_edits(exploded_highlight_df, remove_indices, highlight_edits):
    """Replace selected highlight rows with manually edited ones.

    Drops the rows labelled ``remove_indices`` from ``exploded_highlight_df``,
    appends ``highlight_edits``, re-sorts everything by box position and
    renumbers 'exploded_highlight_node_order' to match the new order.
    """
    kept_rows = exploded_highlight_df.drop(remove_indices)
    combined = pd.concat([kept_rows, highlight_edits])

    combined = sort_exploded_highlight_box_by_coordinates(combined)
    combined['exploded_highlight_node_order'] = combined.index.copy()

    return combined

In [9]:
# Build the filtered all-nodes frame and verify the filter left nothing behind.
all_nodes_df = create_all_nodes_df(all_nodes_data)
# TEST FOR FILTERED ALL NODES
test_filter_all_nodes_df(all_nodes_df, 'xpaths')
# NOTE: this is a second name for the same object, not a copy.
original_df = all_nodes_df
# Build the exploded highlight frame and verify it the same way.
exploded_highlight_df = create_highlight_nodes_df(highlighted_data)
# TEST FOR FILTERED HIGHLIGHT NODES
test_filter_highlight_nodes_df(exploded_highlight_df)
# highlight_edits is only defined when an edits file exists for this contract.
if os.path.exists(highlight_nodes_edits_path):
    highlight_edits = create_highlight_nodes_df(highlighted_data_edits)
# exploded_highlight_df[:50]

364 rows with empty strings were removed
ALL CHECKS PASSED
There is no mismatch in segmentations and groupings, we can proceed
Filtering highlight nodes df now
4 rows with empty strings were removed
10 rows with DEL, DELETED were removed
ALL CHECKS PASSED
There is no mismatch in segmentations and groupings, we can proceed
Filtering highlight nodes df now


In [10]:
# all_nodes_df[all_nodes_df['text'].str.contains('8.10')]

In [11]:
# Write the intermediate frames to the "csvs/" sub-directory of the labeled
# contracts folder; the target directory is not created here.
all_nodes_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_all_nodes.csv')
exploded_highlight_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_highlighted.csv')
# The edited highlights are only exported when an edits file was found.
if os.path.exists(highlight_nodes_edits_path):
    highlight_edits.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_edited.csv')

In [12]:
# Highlight columns that are cast to object dtype on the merge target
# (presumably because they receive non-float values such as strings and
# lists; confirm against the data).
obj_cols = [
    'highlighted_xpaths',
    'highlighted_segmented_text',
    'highlighted_labels',
    'segment_number_from_idx',
    'highlighted_coordinates'
]
# Work on deep copies so the merge does not modify the source frames.
all_nodes_copy = all_nodes_df.copy(deep=True)

# Pre-create every highlight column on the all-nodes copy, filled with NaN,
# so the merge can write matched values cell by cell.
for col in exploded_highlight_df.columns:
    all_nodes_copy[col] = np.nan
    if col in obj_cols:
        all_nodes_copy[col] = all_nodes_copy[col].astype(object)
highlight_copy = exploded_highlight_df.copy(deep=True)

In [13]:
highlight_copy

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,0,0.001842,0.005614,0.988772,0.005896,8
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,1,0.001842,0.005614,0.988772,0.005896,8
2,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,2,0.001842,0.005614,0.988772,0.005896,8
3,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,3,0.001842,0.005614,0.988772,0.005896,8
4,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,4,0.001842,0.005614,0.988772,0.005896,8
...,...,...,...,...,...,...,...,...,...,...,...,...
385,/html/body/document/type/sequence/filename/des...,84,n,332,"[0.9193895961983177, 0.49531789473684207, 0.00...",1,385,0.919390,0.495318,0.009359,0.000381,1
386,/html/body/document/type/sequence/filename/des...,85,n,333,"[0.9320449289725802, 0.49531789473684207, 0.00...",1,386,0.932045,0.495318,0.009359,0.000381,1
387,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1,387,0.952184,0.495318,0.009359,0.000381,1
388,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,388,0.972323,0.495318,0.009359,0.000381,1


In [14]:
def fill_row(all_nodes, highlight_df, all_idx, highlight_idx):
    """Copy one highlight row into one all-nodes row, in place.

    Every column of ``highlight_df`` is written into the same-named column of
    ``all_nodes``: the value at row label ``highlight_idx`` goes to row label
    ``all_idx``. Nothing is returned.
    """
    for column_name in highlight_df.columns:
        value = highlight_df.at[highlight_idx, column_name]
        all_nodes.at[all_idx, column_name] = value

In [15]:
def merge(all_nodes_df, highlight_nodes_df):
    '''
    Two-pointer, in-place merge of highlight rows into the all-nodes frame.

    Walks ``all_nodes_df`` from top to bottom while keeping a pointer to the
    next unmerged row of ``highlight_nodes_df``. Whenever the current node's
    'xpaths' equals the pointed-to 'highlighted_xpaths', that highlight row is
    copied into the node row with ``fill_row`` and the pointer advances.

    Parameters
    ----------
    all_nodes_df : pd.DataFrame
        Merge target; modified in place. Must already contain every column of
        ``highlight_nodes_df``.
    highlight_nodes_df : pd.DataFrame
        Highlight rows in the order they are expected to occur in
        ``all_nodes_df``. Rows are addressed by the labels 0..n-1, so the
        frame needs a default (reset) index.

    Returns
    -------
    None

    Curr status: the pointer only advances on a match, so a highlight row
    whose xpath never appears further down blocks every later highlight row.
    This used to happen silently; a warning is now printed with the number of
    unmerged rows and the xpath that blocked the merge. An empty highlight
    frame is a no-op instead of a KeyError.
    '''
    total_highlights = len(highlight_nodes_df)
    if total_highlights == 0:
        return

    curr_highlight_node_idx = 0

    for i, row in all_nodes_df.iterrows():
        xpath_match = (
            row['xpaths'] == highlight_nodes_df.at[
                curr_highlight_node_idx,
                'highlighted_xpaths'
            ]
        )

        # Text comparison is intentionally disabled for now; only the xpath
        # decides whether two rows match.
        if xpath_match:
            fill_row(
                all_nodes=all_nodes_df,
                highlight_df=highlight_nodes_df,
                all_idx=i,
                highlight_idx=curr_highlight_node_idx
            )
            curr_highlight_node_idx += 1
            if curr_highlight_node_idx >= total_highlights:
                return

    # Reaching this point means at least one highlight row was never matched.
    blocking_xpath = highlight_nodes_df.at[curr_highlight_node_idx, 'highlighted_xpaths']
    print(
        f'WARNING: {total_highlights - curr_highlight_node_idx} of {total_highlights} '
        f'highlight rows were not merged; no match found for highlight row '
        f'{curr_highlight_node_idx} with xpath {blocking_xpath}'
    )



In [16]:
highlight_copy

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,0,0.001842,0.005614,0.988772,0.005896,8
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,1,0.001842,0.005614,0.988772,0.005896,8
2,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,2,0.001842,0.005614,0.988772,0.005896,8
3,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,3,0.001842,0.005614,0.988772,0.005896,8
4,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,4,0.001842,0.005614,0.988772,0.005896,8
...,...,...,...,...,...,...,...,...,...,...,...,...
385,/html/body/document/type/sequence/filename/des...,84,n,332,"[0.9193895961983177, 0.49531789473684207, 0.00...",1,385,0.919390,0.495318,0.009359,0.000381,1
386,/html/body/document/type/sequence/filename/des...,85,n,333,"[0.9320449289725802, 0.49531789473684207, 0.00...",1,386,0.932045,0.495318,0.009359,0.000381,1
387,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1,387,0.952184,0.495318,0.009359,0.000381,1
388,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,388,0.972323,0.495318,0.009359,0.000381,1


In [17]:
def test_merge(full, highlight, merged):
    # Check for monotonic increasing and find index of violation for all and highlight orders
    for i, row in merged.iterrows():
        if i == 0:
            continue
        prev = merged.iloc[i-1].all_nodes_ordering
        cur = row.all_nodes_ordering
        assert prev < cur, f'At index {i}, the merged dataframe\'s all node order is not monotonically increasing.'
    for i, row in merged.iterrows():
        if i == 0:
            continue
        if str(row.highlighted_xpaths) == 'nan':
            continue
        prev = merged.iloc[i-1].exploded_highlight_node_order
        if str(prev) == 'nan':
            continue
        cur = row.exploded_highlight_node_order
        print(prev, cur)
        assert prev < cur, f'At index {i}, the merged dataframe\'s exploded highlight node order is not monotonically increasing.'
    
    # Check for no lost left rows
    assert len(full) == len(merged)
    print('ALL CHECKS PASSED')
    

In [18]:
# merge() writes the matched highlight values into all_nodes_copy in place.
merge(all_nodes_copy, highlight_copy)
# all_nodes_copy[all_nodes_copy.highlighted_xpaths.astype(str) != 'nan']
# Only the last expression of a cell is rendered; the two bare expressions
# below are leftovers from interactive inspection and show nothing.
all_nodes_copy[440:500]
all_nodes_copy.iloc[475].xpaths
highlight_copy[:50]

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,0,0.001842,0.005614,0.988772,0.005896,8
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,1,0.001842,0.005614,0.988772,0.005896,8
2,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,2,0.001842,0.005614,0.988772,0.005896,8
3,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,3,0.001842,0.005614,0.988772,0.005896,8
4,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,4,0.001842,0.005614,0.988772,0.005896,8
5,/html/body/document/type/sequence/filename/des...,and,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,5,0.001842,0.005614,0.988772,0.005896,8
6,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER SUB, LLC",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,6,0.001842,0.005614,0.988772,0.005896,8
7,/html/body/document/type/sequence/filename/des...,"Dated as of September 7, 2021",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,7,0.001842,0.005614,0.988772,0.005896,8
8,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,st,1,"[0.010825646108103987, 0.45009894736842104, 0....",1,8,0.010826,0.450099,0.099803,0.000381,1
9,/html/body/document/type/sequence/filename/des...,CONTENTS,st,2,"[0.012401209626184849, 0.47426526315789475, 0....",1,9,0.012401,0.474265,0.051464,0.000381,1


In [19]:
original_df

Unnamed: 0,xpaths,text,all_nodes_ordering
0,/html/body/document/type,EX-2.1,0
1,/html/body/document/type/sequence,2,1
2,/html/body/document/type/sequence/filename,tm2127034d1_ex2-1.htm,2
3,/html/body/document/type/sequence/filename/des...,EXHIBIT 2.1,3
4,/html/body/document/type/sequence/filename/des...,Exhibit 2.1,4
...,...,...,...
2687,/html/body/document/type/sequence/filename/des...,/s/ Devin Chen,2687
2688,/html/body/document/type/sequence/filename/des...,Name:,2688
2689,/html/body/document/type/sequence/filename/des...,Devin Chen,2689
2690,/html/body/document/type/sequence/filename/des...,Title:,2690


In [20]:
test_merge(original_df, highlight_copy, all_nodes_copy)

0.0 1.0
1.0 2.0
2.0 3.0
3.0 4.0
4.0 5.0
5.0 6.0
6.0 7.0
7.0 8.0
11.0 12.0
13.0 14.0
16.0 17.0
17.0 18.0
ALL CHECKS PASSED


In [21]:
all_nodes_copy.to_csv("good_merge_maybe.csv")

In [22]:
all_nodes_copy[2040:2100]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size
2040,/html/body/document/type/sequence/filename/des...,Section 8.7,2040,,,,,,,,,,,,
2041,/html/body/document/type/sequence/filename/des...,;,2041,,,,,,,,,,,,
2042,/html/body/document/type/sequence/filename/des...,provided,2042,,,,,,,,,,,,
2043,/html/body/document/type/sequence/filename/des...,", that such notification shall only be effecti...",2043,,,,,,,,,,,,
2044,/html/body/document/type/sequence/filename/des...,72,2044,,,,,,,,,,,,
2045,/html/body/document/type/sequence/filename/des...,Section 8.8,2045,,,,,,,,,,,,
2046,/html/body/document/type/sequence/filename/des...,Assignment;\nBinding Effect,2046,,,,,,,,,,,,
2047,/html/body/document/type/sequence/filename/des...,. Neither this Agreement nor any of the rights...,2047,,,,,,,,,,,,
2048,/html/body/document/type/sequence/filename/des...,Section 8.8,2048,,,,,,,,,,,,
2049,/html/body/document/type/sequence/filename/des...,",\nthis Agreement shall be binding upon and sh...",2049,,,,,,,,,,,,


## I noticed that applying the period-removing function added more rows to delete, but I don't understand why that is the case at all. It was hard to debug. I left it alone to solve other problems, since the function looked like it made sense... but I am still worried.

In [23]:
# highlighted_df = pd.DataFrame()
# highlighted_df['highlighted_xpaths'] = ast.literal_eval(highlighted_data['xpaths'])
# highlighted_df['highlighted_segmented_text'] = ast.literal_eval(highlighted_data['segmentedTexts'])
# highlighted_df['highlighted_labels'] = ast.literal_eval(highlighted_data['labels'])
# highlighted_df['highlighted_coordinates'] = ast.literal_eval(highlighted_data['c'])
# highlighted_df['segment_number_from_idx'] = highlighted_df.index.copy()

# #highlighted_df = highlighted_df.apply(lambda row: remove_periods(row), axis=1)
# highlighted_df['num_entries_1'] = highlighted_df['highlighted_xpaths'].apply(len)
# highlighted_df['num_entries_2'] = highlighted_df['highlighted_segmented_text'].apply(len)

# assert highlighted_df['num_entries_1'].equals(highlighted_df['num_entries_2']), 'Mismatch in segmentation and groupings'
# print("There is no mismatch in segmentations and groupings, we can proceed")

# # Step 2: explode
# exploded_highlight_df = highlighted_df[
# ['highlighted_xpaths',
#  'highlighted_segmented_text',
#  'highlighted_labels',
#  'segment_number_from_idx',
#  'highlighted_coordinates',
#  'num_entries_1']
# ].explode(column=['highlighted_xpaths','highlighted_segmented_text']).reset_index(drop=True)

# exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()

In [24]:
# hr = pd.DataFrame()
# hr['highlighted_xpaths'] = ast.literal_eval(highlighted_data['xpaths'])
# hr['highlighted_segmented_text'] = ast.literal_eval(highlighted_data['segmentedTexts'])
# hr['highlighted_labels'] = ast.literal_eval(highlighted_data['labels'])
# hr['highlighted_coordinates'] = ast.literal_eval(highlighted_data['c'])
# hr['segment_number_from_idx'] = hr.index.copy()

# hr = hr.apply(lambda row: remove_periods(row), axis=1)
# hr['num_entries_1'] = hr['highlighted_xpaths'].apply(len)
# hr['num_entries_2'] = hr['highlighted_segmented_text'].apply(len)

# assert hr['num_entries_1'].equals(hr['num_entries_2']), 'Mismatch in segmentation and groupings'
# print("There is no mismatch in segmentations and groupings, we can proceed")

# # Step 2: explode
# hr_e = hr[
# ['highlighted_xpaths',
#  'highlighted_segmented_text',
#  'highlighted_labels',
#  'segment_number_from_idx',
#  'highlighted_coordinates',
#  'num_entries_1']
# ].explode(column=['highlighted_xpaths','highlighted_segmented_text']).reset_index(drop=True)

# hr_e['exploded_highlight_node_order'] = hr_e.index.copy()

In [25]:
highlight_copy[16:25]

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size
16,/html/body/document/type/sequence/filename/des...,Article I.,sn,9,"[0.08095092882010622, 0.4746933333333333, 0.05...",1,16,0.080951,0.474693,0.050614,0.000381,1
17,/html/body/document/type/sequence/filename/des...,THE\nMERGERS,st,10,"[0.08173871057914665, 0.4653017543859649, 0.06...",1,17,0.081739,0.465302,0.069391,0.000381,1
18,/html/body/document/type/sequence/filename/des...,Section 1.1,ssn,11,"[0.08252649233818708, 0.03929824561403509, 0.0...",1,18,0.082526,0.039298,0.045839,0.000381,1
19,/html/body/document/type/sequence/filename/des...,Section 1.1,sst,12,"[0.08252649233818708, 0.10385964912280701, 0.0...",2,19,0.082526,0.10386,0.0521,0.000381,2
20,/html/body/document/type/sequence/filename/des...,The\nMergers,sst,12,"[0.08252649233818708, 0.10385964912280701, 0.0...",2,20,0.082526,0.10386,0.0521,0.000381,2
21,/html/body/document/type/sequence/filename/des...,(a),sssn,13,"[0.08331427409722751, 0.07298245614035087, 0.0...",1,21,0.083314,0.072982,0.012259,0.000381,1
22,/html/body/document/type/sequence/filename/des...,The\nPartnership Merger,ssst,14,"[0.08331427409722751, 0.10396912280701755, 0.0...",1,22,0.083314,0.103969,0.091014,0.000381,1
23,/html/body/document/type/sequence/filename/des...,5,n,17,"[0.08765977992935377, 0.49765894736842103, 0.0...",1,23,0.08766,0.497659,0.004682,0.000381,1
24,/html/body/document/type/sequence/filename/des...,(b),ssn,24,"[0.08887957104012605, 0.07298245614035087, 0.0...",1,24,0.08888,0.072982,0.01091,0.000381,1


In [26]:
all_nodes_df[all_nodes_df['text'].str.contains('Section\xa08.11')]
all_nodes_copy[all_nodes_copy.highlighted_xpaths.astype(str) != 'nan']
all_nodes_copy[340:400]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size
340,/html/body/document/type/sequence/filename/des...,"”), with the closing of such JV Sale Transacti...",340,,,,,,,,,,,,
341,/html/body/document/type/sequence/filename/des...,"NOW, THEREFORE, in consideration of the forego...",341,,,,,,,,,,,,
342,/html/body/document/type/sequence/filename/des...,Article I.,342,/html/body/document/type/sequence/filename/des...,Article I.,sn,9.0,"[0.08095092882010622, 0.4746933333333333, 0.05...",1.0,16.0,0.080951,0.474693,0.050614,0.000381,1.0
343,/html/body/document/type/sequence/filename/des...,THE\nMERGERS,343,/html/body/document/type/sequence/filename/des...,THE\nMERGERS,st,10.0,"[0.08173871057914665, 0.4653017543859649, 0.06...",1.0,17.0,0.081739,0.465302,0.069391,0.000381,1.0
344,/html/body/document/type/sequence/filename/des...,Section 1.1,344,/html/body/document/type/sequence/filename/des...,Section 1.1,ssn,11.0,"[0.08252649233818708, 0.03929824561403509, 0.0...",1.0,18.0,0.082526,0.039298,0.045839,0.000381,1.0
345,/html/body/document/type/sequence/filename/des...,The\nMergers,345,,,,,,,,,,,,
346,/html/body/document/type/sequence/filename/des...,.,346,,,,,,,,,,,,
347,/html/body/document/type/sequence/filename/des...,(a),347,,,,,,,,,,,,
348,/html/body/document/type/sequence/filename/des...,The\nPartnership Merger,348,,,,,,,,,,,,
349,/html/body/document/type/sequence/filename/des...,.,349,,,,,,,,,,,,


In [27]:
# Counter(all_nodes_df['text']).most_common()

In [19]:
# # Monotonic left merge using dictionary for highlighted df. 
# # This version is much faster than the 
# # version below, but loses the df index ordering.
# # Does not affect highlight/all node ordering.

# def monotonic_left_merge(full, highlighted):
#     found_xpaths = []
#     merged_rows = []
#     highlighted_dict = highlighted.groupby(['highlighted_xpaths', \
#     'highlighted_segmented_text']).first().to_dict('index')
#     # print(highlighted_dict)
#     for _, row in full.iterrows():
#         full_xpaths = row['xpaths']
#         full_texts = row['text']
#         # Could do fuzzy matching instead of direct full text matching
#         matched_hrow = highlighted_dict.get((full_xpaths, full_texts))
        
#         if matched_hrow is None:
#             merged_rows.append(row)
#         else:
#             merged_row = pd.concat([row, pd.Series(matched_hrow)], axis=0)
#             merged_rows.append(merged_row)
#             highlighted_dict.pop((full_xpaths, full_texts))
        
#     merged_df = pd.concat(merged_rows, axis=1).T
            
#     return merged_df

In [20]:
# def monotonic_left_merge(full, highlighted):
#     found_xpaths = []
#     merged_df = []
#     for _, row in full.iterrows():
#         full_xpaths = row['xpaths']
#         full_texts = row['text']
#         n_order = row['all_nodes_ordering']

#         matched_hrow = pd.Series([])
#         # print(full_texts, full_xpaths)
#         for _, h_row in highlighted.iterrows():
#             h_xpaths = h_row['highlighted_xpaths']
#             h_texts = h_row['highlighted_segmented_text']

#             if h_xpaths == full_xpaths and h_texts in full_texts:
#                 if h_xpaths in found_xpaths:
#                     print('CURRENT h_xpath ALREADY FOUND BEFORE:')
#                     print(h_xpaths)
#                     _
                    
#                 found_xpaths.append(h_xpaths)
#                 # Instead of appending to merged_df 
#                 # directly, save the specific h_row 
#                 # and as you build the merged_df 
#                 # in the outer loop for all texts, for 
#                 # the highlight matched rows, append 
#                 # the h_row as well.
                
#                 # merged_df.append((row, h_row))
#                 matched_hrow = h_row
#                 break
#         if len(matched_hrow) == 0:
#             # print('No matched highlighted xpaths', full_xpaths)
#             merged_df.append(row)
#         else:
#             merged_df.append(pd.concat([row, matched_hrow], axis=0))
            
#     return merged_df

In [21]:
# merged_bad = monotonic_left_merge(all_nodes_copy, highlight_copy)
# merged_bad

In [22]:
# merged_bad[~merged_bad['highlighted_segmented_text'].isna()].to_csv('badmerge.csv')

In [23]:
# x = merged_bad.copy()

In [24]:
# x.iloc[240]

In [25]:
# count = 0
# for col in exploded_highlight_df.columns:
    
#     x.at[241, col] = count
#     count += 1

In [26]:
# exploded_highlight_df

## Other plan is to just bring in all node text from highlights as well (beyond just highlighted text) and then simply do a join

In [27]:
# # %%ipytest
# # Uncomment above line to activate test
# '''
# Test merging as a proper left join
# 1. All left and right rows should be unmodified
# 2. No left rows should be deleted -> only matched right rows should exist

# '''
# def test_monotonic_left_merge():
#     # Create sample data for testing
#     full_data = {
#         'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
#         'text': ['A', 'B', 'C', 'D', 'E'],
#         'all_node_ordering': ['1', '2', '3', '4', '5']
#     }
#     highlighted_data = {
#         'highlighted_xpaths': ['p[2]', 'p[4]'],
#         'highlighted_segmented_text': ['B', 'D'],
#         'top': ['0', '10'],
#         'left': ['5', '15']
#     }

#     full_df = pd.DataFrame(full_data)
#     highlighted_df = pd.DataFrame(highlighted_data)

#     # Expected result after left merge
#     expected_data = {
#         'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
#         'text': ['A', 'B', 'C', 'D', 'E'],
#         'all_node_ordering': ['1', '2', '3', '4', '5'],
#         'top': [np.nan, '0', np.nan, '10', np.nan],
#         'left': [np.nan, '5', np.nan, '15', np.nan],
#     }
#     expected_df = pd.DataFrame(expected_data)

#     # Call the monotonic_left_merge function
#     merged_df = monotonic_left_merge(full_df, highlighted_df)
#     print(merged_df)
#     print(expected_df)
#     # TEST EQUAL DFs - Not required, but good to have.
#     # Compare the merged_df with the expected_df
#     pd.testing.assert_frame_equal(merged_df.reset_index().drop(columns=['index']), expected_df)
    
#     # TEST MONOTONIC ORDERING
#     assert merged_df['all_node_ordering'].is_monotonic_increasing


In [28]:
# merged_list = monotonic_left_merge(df, exploded_highlight_df)

In [29]:
# # [row for row in merged_df]
# merged_df = pd.DataFrame(merged_list)

In [30]:
# merged_df

In [31]:
# merged = pd.merge(df, exploded_highlight_df, left_on='xpaths', right_on='highlighted_xpaths', how='left', indicator=True)
# merged.iloc[1087:1097]


In [32]:
# merged.iloc[1087:1097]['xpaths'].tolist()

In [33]:
# merged_df['is_outside'] = merged_df['segment_number_from_idx'].apply(lambda x: 1 if math.isnan(x) else 0)

# merged = merged_df

In [34]:
# merged.to_csv("drop_check.csv")

In [35]:
# merged[merged['is_outside'] < 1].to_csv('merged_test.csv')

In [36]:
# merged[merged['is_outside'] < 1]

In [37]:
def assert_node_ordering_for_merged_table(merged: pd.DataFrame) -> "tuple[pd.DataFrame, list]":
    '''
    Check that a merged all-nodes/highlight table keeps both node orderings intact.

    The function was written to remove rows duplicated by the merge (xpaths can repeat
    in both the left and the right table), but that de-duplication is currently
    disabled: ``drop_indices`` is always empty, so the only structural change is that
    the index is reset to 0..n-1.

    Parameters
    ----------
    merged : pd.DataFrame
        Merged table. Must contain the columns ``all_nodes_ordering``, ``is_outside``
        and ``exploded_highlight_node_order``.

    Returns
    -------
    (pd.DataFrame, list)
        The re-indexed table and the list of dropped index labels (currently always
        empty).

    Raises
    ------
    AssertionError
        If ``all_nodes_ordering`` is not monotonically increasing, or if
        ``exploded_highlight_node_order`` is not monotonically increasing over the
        rows whose ``is_outside`` value is below 1.
    '''
    drop_indices = []
    # Disabled de-duplication pass, kept for reference:
#     last_highlight_node_idx = -1
#     for i, row in merged[merged['is_outside'] < 1].iterrows():
#         if row.exploded_highlight_node_order and row.exploded_highlight_node_order != last_highlight_node_idx + 1:
#             drop_indices.append(i)
#         else:
#             last_highlight_node_idx = row.exploded_highlight_node_order
    merged = merged.drop(drop_indices).reset_index(drop=True)

    # Assert both orderings keep their original structure
    assert merged.all_nodes_ordering.is_monotonic_increasing, \
        'all_nodes_ordering is not monotonically increasing after the merge'
    inside_rows = merged[merged['is_outside'] < 1]
    assert inside_rows.exploded_highlight_node_order.is_monotonic_increasing, \
        'exploded_highlight_node_order is not monotonically increasing over the highlighted rows'

    return merged, drop_indices

In [38]:
# # # Hacky fix for some highlight node orders 
# # # being off for some reason; fix is to set 
# # # the value to the previous + 1.
# for i in range(1, len(merged)):
#     cur = float(merged.iloc[i].exploded_highlight_node_order)
#     prev = float(merged.iloc[i-1].exploded_highlight_node_order)
#     if cur < prev:
#         merged.iloc[i, merged.columns.get_loc('exploded_highlight_node_order')] = prev + 1

In [39]:
# Code to check if the node ordering is off and print the location/value of discrepancy

# for f in range(1, len(merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist())):
#     a = merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f-1]
#     if a >= merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f]:
#         print(a)

# merged_df[26:46]

In [40]:
# merged_ordered, drop_indices = assert_node_ordering_for_merged_table(merged)
# empty_xpath_index = merged_ordered.fillna(0).where(merged_ordered['xpaths'] == '').dropna().index
# empty_xpath_index
# merged_ordered = merged_ordered.drop(empty_xpath_index)

Why am I losing rows (1 row now) from the left table?

In [41]:
# merged_ordered.iloc[-90:-60]

In [42]:
# merged_ordered[merged_ordered['all_nodes_ordering'] == merged_ordered.index]

In [43]:
# merged_ordered[merged_ordered['all_nodes_ordering'] > merged_ordered.index]

Okay, I see what happened. At index 737 of the all-nodes ordering, the join somehow had a double repeat. All examples of that node in the left dataframe were deleted since 2 highlight nodes before it matched with it. Not sure why, though.

The (c) node was already matched, as well as what followed it, but then it and one of the successors were matched again. I think there can be some kind of check to see if a highlight node was already matched, and if removing the current row would destroy the all-nodes order, just don't delete the row...

Seems weird but maybe it would work. 

Maybe another method could be tracking the start coordinates of the text in each node and then somehow getting a stricter order on that...

Or maybe tracking which char offset from the node beginning a highlight belongs to. Then if there's overlap in the interval, we merge; if not, we move on. Do the same for the all nodes as well.

Let's do the BIES tagging now

In [34]:
def tag_bies_for_highlights(merged: pd.DataFrame) -> pd.DataFrame:
    '''
    Add a ``tagged_sequence`` column holding one BIES-style tag per row.

    Rows are visited in order and tagged from their ``size`` value (presumably the
    number of node rows making up the highlight the row belongs to -- confirm against
    the code that builds the column) and their ``highlighted_labels`` value:

    * ``o``          -- ``highlighted_segmented_text`` prints as ``nan`` or ``size`` is NaN
    * ``s_<label>``  -- ``size`` is 1
    * ``b_<label>``  -- first row of a group with ``size`` > 1
    * ``i_<label>``  -- later row of such a group, before the last one
    * ``e_<label>``  -- row whose position in the group equals ``size``

    An ``o`` row does not reset the position counter, so a group may be interrupted by
    untagged rows and still be completed afterwards.

    Parameters
    ----------
    merged : pd.DataFrame
        Must contain the columns ``highlighted_segmented_text``, ``highlighted_labels``
        and ``size``.

    Returns
    -------
    pd.DataFrame
        The same frame object; the column is assigned on ``merged`` itself.

    Raises
    ------
    ValueError
        If a row cannot be tagged because the running position is already past the
        row's ``size`` (a new group started before the previous one was completed, or
        ``size`` is smaller than 1).
    '''
    tags = []
    # 1-based position of the next highlighted row inside its multi-row group.
    count = 1
    for i, row in merged.iterrows():
        list_entry_count = row['size']

        # Non highlighted row
        if str(row.highlighted_segmented_text) == 'nan' or math.isnan(list_entry_count):
            tags.append('o')

        # Single highlighted node
        elif list_entry_count == 1:
            tags.append(f's_{row.highlighted_labels}')
            count = 1

        # Last entry in group greater than size 1
        elif count == list_entry_count:
            tags.append(f'e_{row.highlighted_labels}')
            count = 1

        # First ('b') or inner ('i') entry of a group that is not finished yet
        elif count < list_entry_count:
            prefix = 'b' if count == 1 else 'i'
            tags.append(f'{prefix}_{row.highlighted_labels}')
            count += 1

        # No rule applies. Without this branch the row would get no tag at all and the
        # column assignment below would fail with an unrelated length-mismatch error.
        else:
            raise ValueError(
                f'Cannot assign a BIES tag at row {i}: position {count} in the current '
                f'group is past the group size {list_entry_count}'
            )

    merged['tagged_sequence'] = tags
    return merged

In [35]:
def test_tag_bies_for_highlights(df, h_df):
    # Check that count of bies does not exceed number of highlighted rows
    temp = df.reset_index().drop(columns=['index'])
    n_o = temp[temp['size'].astype(str) != 'nan']

    assert len(n_o) <= len(h_df)
    
    # Check that b,e and b,i,e rules are not violated
    b_tags = temp[temp.tagged_sequence.str.contains('b_')].index
    i_tags = temp[temp.tagged_sequence.str.contains('i_')].index
    e_tags = temp[temp.tagged_sequence.str.contains('e_')].index

    for b_tag in b_tags:
        assert 's_' in df.iloc[b_tag - 1].tagged_sequence \
        or 'o' in df.iloc[b_tag - 1].tagged_sequence  \
        or 'e_' in df.iloc[b_tag - 1].tagged_sequence
        
        # FAILING HERE -> b followed by s tag
        # assert 'e_' in df.iloc[b_tag + 1].tagged_sequence \
        # or 'i_' in df.iloc[b_tag + 1].tagged_sequence
        
    for e_tag in e_tags:
        # FAILING HERE -> e following s tag
        # assert 'i_' in df.iloc[b_tag - 1].tagged_sequence \
        # or 'b_' in df.iloc[b_tag - 1].tagged_sequence
        
        assert 's_' in df.iloc[b_tag + 1].tagged_sequence \
        or 'o' in df.iloc[b_tag + 1].tagged_sequence \
        or 'b_' in df.iloc[b_tag + 1].tagged_sequence
        
    for i_tag in i_tags:
        assert 'b_' in df.iloc[i_tag - 1].tagged_sequence or 'i_' in df.iloc[i_tag - 1].tagged_sequence
        assert 'e_' in df.iloc[i_tag + 1].tagged_sequence or 'i_' in df.iloc[i_tag + 1].tagged_sequence
    print('ALL CHECKS PASSED')
    

In [36]:
# Add the BIES `tagged_sequence` column to the merged node table.
# NOTE(review): tag_bies_for_highlights assigns the column on the frame it receives,
# so all_nodes_copy is modified as well and merged_tagged refers to the same frame.
merged_tagged = tag_bies_for_highlights(all_nodes_copy)

In [37]:
# Show only the rows whose tag is something other than 'o'.
merged_tagged[merged_tagged.tagged_sequence != 'o']

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size,tagged_sequence
5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,0.0,0.001842,0.005614,0.988772,0.005896,8.0,b_t
6,/html/body/document/type/sequence/filename/des...,by and among,6,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,1.0,0.001842,0.005614,0.988772,0.005896,8.0,i_t
7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,2.0,0.001842,0.005614,0.988772,0.005896,8.0,i_t
8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,3.0,0.001842,0.005614,0.988772,0.005896,8.0,i_t
9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,4.0,0.001842,0.005614,0.988772,0.005896,8.0,i_t
10,/html/body/document/type/sequence/filename/des...,and,10,/html/body/document/type/sequence/filename/des...,and,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,5.0,0.001842,0.005614,0.988772,0.005896,8.0,i_t
11,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER SUB, LLC",11,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER SUB, LLC",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,6.0,0.001842,0.005614,0.988772,0.005896,8.0,i_t
12,/html/body/document/type/sequence/filename/des...,"Dated as of September 7, 2021",12,/html/body/document/type/sequence/filename/des...,"Dated as of September 7, 2021",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,7.0,0.001842,0.005614,0.988772,0.005896,8.0,e_t
13,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,13,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,st,1,"[0.010825646108103987, 0.45009894736842104, 0....",1.0,8.0,0.010826,0.450099,0.099803,0.000381,1.0,s_st
15,/html/body/document/type/sequence/filename/des...,CONTENTS,15,/html/body/document/type/sequence/filename/des...,CONTENTS,st,2,"[0.012401209626184849, 0.47426526315789475, 0....",1.0,9.0,0.012401,0.474265,0.051464,0.000381,1.0,s_st


In [173]:
# Run the tag consistency checks against the highlight table; prints
# 'ALL CHECKS PASSED' when no assertion fails.
test_tag_bies_for_highlights(merged_tagged, highlight_copy)

ALL CHECKS PASSED


In [172]:
# Rows with a non-negative `size`; rows where `size` is NaN fail the comparison and are filtered out.
all_nodes_copy[all_nodes_copy['size'] >= 0]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,size,tagged_sequence
7,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,7,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0036537963202902305, 0.4156520833333333, 0....",1.0,0.0,0.003654,0.415652,0.168696,0.000389,1.0,s_t
8,/html/body/document/type/sequence/filename/des...,BY AND AMONG,8,/html/body/document/type/sequence/filename/des...,BY AND AMONG,t,1,"[0.00445711324177248, 0.46365555555555554, 0.0...",1.0,1.0,0.004457,0.463656,0.072689,0.000389,1.0,s_t
9,/html/body/document/type/sequence/filename/des...,QAD Inc.,9,/html/body/document/type/sequence/filename/des...,QAD Inc.,t,2,"[0.005260430163254729, 0.4809680555555556, 0.0...",1.0,2.0,0.00526,0.480968,0.038059,0.000389,1.0,s_t
10,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",10,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",t,3,"[0.006063747084736978, 0.43030625, 0.139388194...",1.0,3.0,0.006064,0.430306,0.139388,0.000389,1.0,s_t
11,/html/body/document/type/sequence/filename/des...,AND,11,/html/body/document/type/sequence/filename/des...,AND,t,4,"[0.006867064006219227, 0.48996874999999995, 0....",1.0,4.0,0.006867,0.489969,0.020057,0.000389,1.0,s_t
12,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK MERGER SUB, INC.",12,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK MERGER SUB, INC.",t,5,"[0.007670380927701477, 0.4175347222222222, 0.1...",1.0,5.0,0.00767,0.417535,0.164931,0.000389,1.0,s_t
13,/html/body/document/type/sequence/filename/des...,"Dated as of June 27, 2021",13,/html/body/document/type/sequence/filename/des...,"Dated as of June 27, 2021",t,6,"[0.008473697849183726, 0.4498534722222222, 0.1...",1.0,6.0,0.008474,0.449853,0.100287,0.000389,1.0,s_t
14,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,14,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,t,7,"[0.014398574760300596, 0.4530541666666667, 0.0...",1.0,7.0,0.014399,0.453054,0.093885,0.000389,1.0,s_t
113,/html/body/document/type/sequence/filename/des...,i,113,/html/body/document/type/sequence/filename/des...,i,n,8,"[0.02925991189427313, 0.49870902777777776, 0.0...",1.0,8.0,0.02926,0.498709,0.002577,0.000389,1.0,s_n
220,/html/body/document/type/sequence/filename/des...,ii,220,/html/body/document/type/sequence/filename/des...,ii,n,9,"[0.045723503498315624, 0.49742291666666666, 0....",1.0,9.0,0.045724,0.497423,0.005149,0.000389,1.0,s_n


In [47]:
# Rows whose `size` equals 1.
# NOTE(review): the recorded output of this cell is a NameError -- merged_tagged has to be
# created by the tagging cell in the same kernel session before this cell is run.
merged_tagged[merged_tagged['size'] == 1]

NameError: name 'merged_tagged' is not defined

In [None]:
# Write the tagged table to 'overlabeled_labeled.csv' (relative to the current working directory).
merged_tagged.to_csv('overlabeled_labeled.csv')

I think I need to modify this algorithm. The overlabeled file that I just tried somehow labeled index 245 as b_n, but I think it's a single highlight node? It looks like the master nodes might be traversing in a different order; to be honest I'm not sure, as I did label pretty wildly... but I'm still not sure how that happens.

In [None]:
highlighted_df.iloc[2]

In [None]:
highlighted_df.iloc[2]['highlighted_xpaths']

In [None]:
%%ipytest

def test_

In [None]:
%%ipytest

In [None]:
import pandas as pd
import json

In [None]:
# Load one saved contract export into `pp` (the parsed JSON object).
# NOTE(review): machine-specific absolute path -- point it at your own export before running.
saved_contract_path = r"C:\Users\islam\Downloads\contract_saved (5).json"

# The path gets its own name so that `pp` only ever refers to the loaded data.
with open(saved_contract_path, encoding='UTF-8') as f:
    pp = json.load(f)


In [None]:
# The 'xpaths', 'segTexts' and 'texts' entries of the export are stored as strings holding
# Python literals, so each one is parsed with ast.literal_eval.
# (presumably each parses to a list with one element per node -- TODO confirm)
nodes_xpaths = ast.literal_eval(pp['xpaths'])
nodes_segmented_text = ast.literal_eval(pp['segTexts'])
nodes_text = ast.literal_eval(pp['texts'])
# Print the three lengths side by side; the column assignments below need them to match.
print(list(map(len, [nodes_xpaths,nodes_segmented_text,nodes_text])))
# NOTE(review): this rebinds the notebook-level name `df`; any earlier value is replaced.
df = pd.DataFrame()
df['xpaths'] = nodes_xpaths
df['stext'] = nodes_segmented_text
df['text'] = nodes_text

In [None]:
df

In [None]:
df[df.stext.str.contains('means any effect, circumstance, occurrence or change that is ma')]

In [None]:
df[df.stext.str.contains('means any effect, circumstance, occurrence or change that is ma')].stext.to_list()

Basic idea:

It looks like the segmented text saved with all nodes is the same as the nodeText piece from traverse (see where I added a log statement for it in labeler_.js).

We might be able to do a pure join assuming this is a full match...