In [1]:
import json
import pandas as pd
import numpy as np
import ast
import math
from collections import Counter
import re
import os
import post_process_helper
import postprocessor_tests
%load_ext autoreload
%autoreload 2

Unit Tests are integrated as part of the code after each function. There are a few main unit tests:

1. Merging results in proper monotonic left join
2. Highlighted xpath should always equal full xpath after merge
3. Check that all highlight boxes are in monotonic order
4. Ensure highlight boxes have no overlap, or only negligible overlap, in IoU (~0.1% for now)

TODO: Create unit tests for the functions that filter the all-nodes and highlight-node DataFrames.

Plan:
1. Write explicit filtering scripts for both highlight df and all nodes
    - Print or make clear any problems
2. Merge with good principles
    - Print or make clear any problems
3. Apply agreed upon period rule
4. Special characters handling (at least new lines and weird spaces that won't get tokenized properly?)

In [2]:
# Directory holding the labeled contract JSON exports. The trailing slash is
# required because the paths below are built by plain string interpolation.
path_to_labeled_contracts = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/"
#path_to_labeled_contracts = "C:/Users/islam/Desktop/2023 Research/contracts/labeled/"
# Number of the contract to process; it selects the contract_<num>_* files.
contract_num = 29
#contract_num = 34

# Input files for the selected contract. The file under edits/ is optional.
all_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_all_nodes.json"
highlighted_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_highlighted.json"
highlight_nodes_edits_path = f"{path_to_labeled_contracts}edits/contract_{contract_num}_highlighted.json"


# Parse the two mandatory inputs as UTF-8 encoded JSON.
with open(all_nodes_path, encoding='UTF-8') as f:
    all_nodes_data = json.load(f)
with open(highlighted_nodes_path, encoding='UTF-8') as f:
    highlighted_data = json.load(f)
# Stays None when no edits file exists for this contract.
highlighted_data_edits = None
if os.path.exists(highlight_nodes_edits_path):
    with open(highlight_nodes_edits_path, encoding='UTF-8') as f:
        highlighted_data_edits = json.load(f)

In [3]:
# Build the all-nodes and highlight DataFrames with the helper module, and run
# the corresponding postprocessor_tests check on each result.
all_nodes_df = post_process_helper.create_all_nodes_df(all_nodes_data)
postprocessor_tests.test_filter_all_nodes_df(all_nodes_df)
exploded_highlight_df = post_process_helper.create_highlight_nodes_df(highlighted_data)
postprocessor_tests.test_filter_highlight_nodes_df(exploded_highlight_df)

# The edits DataFrame is only built when the edits file exists; in this cell
# `highlight_edits` is not assigned at all for contracts without one.
if os.path.exists(highlight_nodes_edits_path):
    highlight_edits = post_process_helper.create_highlight_nodes_df(highlighted_data_edits)

364 rows with empty strings were removed
test_filter_all_nodes_df - ALL CHECKS PASSED
['.']
['.']
['.']
['.']
['.']
['.']
['. The parties int']
['. At the Effective Time,']
['. Subject to']
['. Each share of common stock, par']
['. As of the Partnership Merger E']
['. Each Series\xa0A Common Unit of Merger Sub issued a']
['. A number of Company OP Common Units design']
['. Each Company OP Common Unit issued and outstanding immediat']
['. Each Company OP Series\xa0A Preferred Unit issued and']
['.']
['. At or immedia']
['.']
['. The cash consideration iss']
['. Any portion of the Exchange']
['. Each of Parent, M']
['. In the event that (i)\xa0a di']
['. If any Company C']
['. No dissenters’ or ap']
['. If, at any time f']
['.']
['.']
['.']
['.']
['.']
['. There are no Liabilities ']
['.']
['. Except as would not ']
['.']
['.']
['. Since December\xa031, 2020 through the']
['.']
['. The information su']
['. Except as ']
['.']
['. Except as would not ']
['. The Company Board has recei']
['

In [4]:
# Save the intermediate DataFrames as CSV files under the csvs/ subfolder of
# the labeled-contracts directory.
# NOTE(review): the csvs/ folder is not created here — presumably it must already exist.
all_nodes_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_all_nodes.csv')
exploded_highlight_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_highlighted.csv')
# Same existence check that guards the creation of `highlight_edits`.
if os.path.exists(highlight_nodes_edits_path):
    highlight_edits.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_edited.csv')

Prepare columns and copies for merging

In [5]:
# Highlight columns whose placeholder must use object dtype instead of float,
# so that non-numeric values can later be written into individual cells.
obj_cols = [
    'highlighted_xpaths',
    'highlighted_segmented_text',
    'highlighted_labels',
    'segment_number_from_idx',
    'highlighted_coordinates'
]

# Work on deep copies so the frames built earlier stay untouched by the merge.
all_nodes_copy = all_nodes_df.copy(deep=True)

# Give the all-nodes copy one NaN-filled placeholder column per highlight column.
for col in exploded_highlight_df.columns:
    all_nodes_copy[col] = np.nan
    if col not in obj_cols:
        continue
    all_nodes_copy[col] = all_nodes_copy[col].astype(object)

highlight_copy = exploded_highlight_df.copy(deep=True)

In [6]:
def fill_row(all_nodes, highlight_df, all_idx, highlight_idx):
    """Copy one row of ``highlight_df`` into one row of ``all_nodes``, in place.

    Every column of ``highlight_df`` is written to the column of the same name
    in ``all_nodes``. Both rows are addressed by index label (``.at``).

    Args:
        all_nodes: DataFrame that receives the values; it is mutated.
        highlight_df: DataFrame the values are read from.
        all_idx: index label of the target row in ``all_nodes``.
        highlight_idx: index label of the source row in ``highlight_df``.

    Returns:
        None.
    """
    for column_name in highlight_df.columns:
        value = highlight_df.at[highlight_idx, column_name]
        all_nodes.at[all_idx, column_name] = value

In [7]:
def remove_highlighted_duplicates(df):
    """Drop highlight rows that repeat the start of the preceding row.

    A row is dropped when, compared with the row immediately before it in
    ``df`` (whether or not that neighbour is dropped as well):

    * the previous row's ``highlighted_segmented_text`` starts with this
      row's text,
    * both rows have equal ``highlighted_xpaths``, and
    * ``'st'`` is contained in this row's ``highlighted_labels``.

    Afterwards ``size`` is recomputed as the number of remaining rows per
    ``segment_number_from_idx`` and ``exploded_highlight_node_order`` is
    renumbered 0..n-1 in row order.

    Args:
        df: highlight DataFrame with the columns named above. An existing
            ``size`` column is replaced; a missing one is tolerated. ``df``
            itself is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # drop=True discards the old index directly, so this also works when the
    # index is named or when the frame already has a column called 'index'.
    highlight_df = df.reset_index(drop=True)

    texts = highlight_df['highlighted_segmented_text'].tolist()
    xpaths = highlight_df['highlighted_xpaths'].tolist()
    labels = highlight_df['highlighted_labels'].tolist()

    # After the reset above, index labels and row positions coincide.
    drop_index = [
        pos
        for pos in range(1, len(highlight_df))
        if texts[pos - 1].startswith(texts[pos])
        and xpaths[pos - 1] == xpaths[pos]
        and 'st' in labels[pos]
    ]

    # Printed so the notebook output shows which rows were removed.
    print(drop_index)

    # The stale per-segment 'size' is removed (if present) and rebuilt below.
    highlight_df = (
        highlight_df
        .drop(index=drop_index)
        .drop(columns='size', errors='ignore')
    )

    final_group_sizes = highlight_df.groupby('segment_number_from_idx', as_index=False).size()
    highlight_df = pd.merge(highlight_df, final_group_sizes, on='segment_number_from_idx', how='left')

    highlight_df = highlight_df.reset_index(drop=True)
    highlight_df['exploded_highlight_node_order'] = highlight_df.index

    return highlight_df

In [8]:
def merge(all_nodes_df, highlight_nodes_df):
    '''
    Copy highlight rows onto matching rows of all_nodes_df, in place.

    Two-pointer walk: all_nodes_df is scanned once from top to bottom while a
    single pointer moves through highlight_nodes_df. Whenever the current
    all-nodes row has the same xpath as the highlight row under the pointer,
    that highlight row is copied into the all-nodes row (via fill_row) and the
    pointer advances. Only xpaths are compared; text is not.

    The scan ends when every highlight row has been placed or when
    all_nodes_df is exhausted. A highlight row whose xpath never shows up in
    the remaining all-nodes rows therefore blocks every highlight row after
    it; nothing is raised in that case.

    all_nodes_df: DataFrame with an `xpaths` column and one column per column
        of highlight_nodes_df; it is mutated.
    highlight_nodes_df: DataFrame with a `highlighted_xpaths` column. Rows
        are looked up by the labels 0, 1, 2, ..., so its index must be a
        0..n-1 range.

    Returns None.
    '''
    total_highlight_nodes = len(highlight_nodes_df)
    # Nothing to place. Without this guard the first lookup below raises a
    # KeyError on an empty highlight frame.
    if total_highlight_nodes == 0:
        return

    curr_highlight_node_idx = 0

    for i, row in all_nodes_df.iterrows():
        xpath_match = (
            row.xpaths == highlight_nodes_df.at[
                curr_highlight_node_idx,
                'highlighted_xpaths'
            ]
        )
        if xpath_match:
            fill_row(
                all_nodes=all_nodes_df,
                highlight_df=highlight_nodes_df,
                all_idx=i,
                highlight_idx=curr_highlight_node_idx
            )
            curr_highlight_node_idx += 1
            # The pointer only moves on a match, so this is the only place
            # where it can run past the last highlight row.
            if curr_highlight_node_idx >= total_highlight_nodes:
                return
        


In [9]:
# De-duplicate the highlight rows. The list printed below this cell comes from
# remove_highlighted_duplicates and holds the row positions it dropped.
dropped_highlight_df = remove_highlighted_duplicates(highlight_copy)
# dropped_highlight_df[:50]

[19, 39, 285, 305, 311, 314]


In [10]:
# merge() writes the highlight columns into all_nodes_copy in place and
# returns None, so the frame is displayed on its own line.
merge(all_nodes_copy, dropped_highlight_df)
all_nodes_copy

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
0,/html/body/document/type,EX-2.1,0,,,,,,,,,,,,,
1,/html/body/document/type/sequence,2,1,,,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,tm2127034d1_ex2-1.htm,2,,,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EXHIBIT 2.1,3,,,,,,,,,,,,,
4,/html/body/document/type/sequence/filename/des...,Exhibit 2.1,4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2687,/html/body/document/type/sequence/filename/des...,/s/ Devin Chen,2687,,,,,,,,,,,,,
2688,/html/body/document/type/sequence/filename/des...,Name:,2688,,,,,,,,,,,,,
2689,/html/body/document/type/sequence/filename/des...,Devin Chen,2689,,,,,,,,,,,,,
2690,/html/body/document/type/sequence/filename/des...,Title:,2690,,,,,,,,,,,,,


In [11]:
# Show only the rows whose highlighted_xpaths is no longer the NaN placeholder,
# i.e. the rows that were filled during the merge.
all_nodes_copy[all_nodes_copy.highlighted_xpaths.astype(str) != 'nan']
#highlight_copy.iloc[90:100]
#dropped_highlight_df[dropped_highlight_df.highlighted_xpaths.astype(str) != 'nan']


Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,0.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
6,/html/body/document/type/sequence/filename/des...,by and among,6,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,1.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,2.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,3.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,4.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2394,/html/body/document/type/sequence/filename/des...,84,2394,/html/body/document/type/sequence/filename/des...,84,n,332,"[0.9193895961983177, 0.49531789473684207, 0.00...",1.0,330.0,0.919390,0.495318,0.009359,0.000381,0.919390,1.0
2429,/html/body/document/type/sequence/filename/des...,85,2429,/html/body/document/type/sequence/filename/des...,85,n,333,"[0.9320449289725802, 0.49531789473684207, 0.00...",1.0,331.0,0.932045,0.495318,0.009359,0.000381,0.932045,1.0
2522,/html/body/document/type/sequence/filename/des...,86,2522,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1.0,332.0,0.952184,0.495318,0.009359,0.000381,0.952184,1.0
2615,/html/body/document/type/sequence/filename/des...,87,2615,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3.0,333.0,0.972323,0.495318,0.009359,0.000381,0.972323,1.0


In [12]:
# Run the merge checks from postprocessor_tests on the merged frame.
# NOTE(review): all_nodes_copy is passed as both the first and the third
# argument — confirm against test_merge's signature that this is intended.
postprocessor_tests.test_merge(all_nodes_copy, dropped_highlight_df, all_nodes_copy)

test_merge - ALL CHECKS PASSED


In [13]:
# Write the merged frame to merge_test.csv (path relative to the working directory).
all_nodes_copy.to_csv('merge_test.csv')

In [14]:
# Display the de-duplicated highlight frame for comparison with the merged rows.
dropped_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,0,0.001842,0.005614,0.988772,0.005896,0.001842,8
1,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,1,0.001842,0.005614,0.988772,0.005896,0.001842,8
2,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,2,0.001842,0.005614,0.988772,0.005896,0.001842,8
3,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,3,0.001842,0.005614,0.988772,0.005896,0.001842,8
4,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8,4,0.001842,0.005614,0.988772,0.005896,0.001842,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
330,/html/body/document/type/sequence/filename/des...,84,n,332,"[0.9193895961983177, 0.49531789473684207, 0.00...",1,330,0.919390,0.495318,0.009359,0.000381,0.919390,1
331,/html/body/document/type/sequence/filename/des...,85,n,333,"[0.9320449289725802, 0.49531789473684207, 0.00...",1,331,0.932045,0.495318,0.009359,0.000381,0.932045,1
332,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1,332,0.952184,0.495318,0.009359,0.000381,0.952184,1
333,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3,333,0.972323,0.495318,0.009359,0.000381,0.972323,1


In [15]:
# Rows of all_nodes_copy whose highlighted_xpaths is not the NaN placeholder.
all_nodes_copy[all_nodes_copy.highlighted_xpaths.astype(str) != 'nan']

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,0.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
6,/html/body/document/type/sequence/filename/des...,by and among,6,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,1.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,2.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,3.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,4.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2394,/html/body/document/type/sequence/filename/des...,84,2394,/html/body/document/type/sequence/filename/des...,84,n,332,"[0.9193895961983177, 0.49531789473684207, 0.00...",1.0,330.0,0.919390,0.495318,0.009359,0.000381,0.919390,1.0
2429,/html/body/document/type/sequence/filename/des...,85,2429,/html/body/document/type/sequence/filename/des...,85,n,333,"[0.9320449289725802, 0.49531789473684207, 0.00...",1.0,331.0,0.932045,0.495318,0.009359,0.000381,0.932045,1.0
2522,/html/body/document/type/sequence/filename/des...,86,2522,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1.0,332.0,0.952184,0.495318,0.009359,0.000381,0.952184,1.0
2615,/html/body/document/type/sequence/filename/des...,87,2615,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3.0,333.0,0.972323,0.495318,0.009359,0.000381,0.972323,1.0


In [16]:
# Write the merged frame to trial.csv (path relative to the working directory).
# NOTE(review): the same frame is also written to merge_test.csv above — confirm both files are needed.
all_nodes_copy.to_csv("trial.csv")

## Merging potentially completed. Check output

## Use size instead of num entries

In [22]:
def tag_bies_for_highlights(merged: pd.DataFrame) -> pd.DataFrame:
    """Add a ``tagged_sequence`` column with one position tag per row.

    Rows are visited in order. ``size`` is read as the number of rows in the
    highlight group the row belongs to, and a running counter tracks the
    position inside the current group:

    * ``'o'``          - ``size`` is missing (row without highlight data)
    * ``'s_<label>'``  - ``size`` is 1
    * ``'b_<label>'``  - first row of a group with ``size`` > 1
    * ``'i_<label>'``  - row between the first and the last of such a group
    * ``'e_<label>'``  - row at which the counter reaches ``size``

    ``<label>`` is the row's ``highlighted_labels`` value. Rows tagged
    ``'o'`` do not touch the counter, so they may sit inside a group.

    Args:
        merged: DataFrame with ``size`` and ``highlighted_labels`` columns.
            It is modified in place.

    Returns:
        The same ``merged`` object, now carrying ``tagged_sequence``.

    Raises:
        ValueError: if the counter has already passed a row's ``size``. This
            happens when a group has fewer rows in ``merged`` than its
            ``size`` claims and a smaller group follows.
    """
    tags = []
    # 1-based position of the next highlighted row within its group.
    count = 1
    for row_label, row in merged.iterrows():
        # Must be item access: `row.size` is the length of the row Series.
        list_entry_count = row['size']

        # Non highlighted row (pd.isna also covers None / pd.NA).
        if pd.isna(list_entry_count):
            tags.append('o')

        # Single highlighted node
        elif list_entry_count == 1:
            tags.append(f's_{row.highlighted_labels}')
            count = 1

        # Last entry in group greater than size 1
        elif count == list_entry_count:
            tags.append(f'e_{row.highlighted_labels}')
            count = 1

        # First or middle entry of a group greater than size 1
        elif count < list_entry_count:
            prefix = 'b' if count == 1 else 'i'
            tags.append(f'{prefix}_{row.highlighted_labels}')
            count += 1

        # Counter ran past the group size: fail here with a clear message
        # instead of producing a tag list shorter than the frame.
        else:
            raise ValueError(
                f"Inconsistent group size at row {row_label!r}: position {count} "
                f"exceeds size {list_entry_count}"
            )

    merged['tagged_sequence'] = tags
    return merged

In [23]:
# tag_bies_for_highlights adds 'tagged_sequence' to all_nodes_copy itself and
# returns that same object, so merged_tagged is an alias of it, not a copy.
merged_tagged = tag_bies_for_highlights(all_nodes_copy)

In [25]:
# Show only the rows whose tag is not 'o'.
merged_tagged[merged_tagged.tagged_sequence != 'o']

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size,tagged_sequence
5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,5,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,0.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0,b_t
6,/html/body/document/type/sequence/filename/des...,by and among,6,/html/body/document/type/sequence/filename/des...,by and among,t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,1.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0,i_t
7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",7,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,2.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0,i_t
8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",8,/html/body/document/type/sequence/filename/des...,"COLUMBIA PROPERTY TRUST OPERATING PARTNERSHIP,...",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,3.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0,i_t
9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",9,/html/body/document/type/sequence/filename/des...,"PANTHER MERGER PARENT, INC.,",t,0,"[0.0018423928235622982, 0.005614035087719298, ...",8.0,4.0,0.001842,0.005614,0.988772,0.005896,0.001842,8.0,i_t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2394,/html/body/document/type/sequence/filename/des...,84,2394,/html/body/document/type/sequence/filename/des...,84,n,332,"[0.9193895961983177, 0.49531789473684207, 0.00...",1.0,330.0,0.919390,0.495318,0.009359,0.000381,0.919390,1.0,s_n
2429,/html/body/document/type/sequence/filename/des...,85,2429,/html/body/document/type/sequence/filename/des...,85,n,333,"[0.9320449289725802, 0.49531789473684207, 0.00...",1.0,331.0,0.932045,0.495318,0.009359,0.000381,0.932045,1.0,s_n
2522,/html/body/document/type/sequence/filename/des...,86,2522,/html/body/document/type/sequence/filename/des...,86,n,334,"[0.9521841884577267, 0.49531789473684207, 0.00...",1.0,332.0,0.952184,0.495318,0.009359,0.000381,0.952184,1.0,s_n
2615,/html/body/document/type/sequence/filename/des...,87,2615,/html/body/document/type/sequence/filename/des...,87,n,335,"[0.9723234479428732, 0.49531789473684207, 0.00...",3.0,333.0,0.972323,0.495318,0.009359,0.000381,0.972323,1.0,s_n


In [21]:
# merged_tagged[merged_tagged['is_outside'] < 1]['tagged_sequence'].tolist(), merged_tagged[merged_tagged['is_outside'] < 1]['text'].tolist()

In [27]:
# Output folder for tagged contracts (absolute, machine-specific path); the
# file name is derived from the contract number chosen at the top.
path_to_tagged_folder = '/Users/rohith/Documents/Independent Study - DSGA1006/contracts/tagged'
merged_tagged.to_csv(f'{path_to_tagged_folder}/contract_{contract_num}_tagged.csv')