In [73]:
import json
import pandas as pd
import numpy as np
import ast
import math
from collections import Counter
import re
import os
import post_process_helper
import postprocessor_tests
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Unit Tests are integrated as part of the code after each function. There are a few main unit tests:

1. Merging results in proper monotonic left join
2. Highlighted xpath should always equal full xpath after merge
3. Check that all highlight boxes are in monotonic order
4. Ensure highlight boxes have no overlap or negligible overlap in IoU (~0.1% for now)

TODO: Create unit tests for the all-nodes and highlight-nodes filtering functions.

Plan:
1. Write explicit filtering scripts for both highlight df and all nodes
    - Print or make clear any problems
2. Merge with good principles
    - Print or make clear any problems
3. Apply agreed upon period rule
4. Special characters handling (at least new lines and weird spaces that won't get tokenized properly?)

In [74]:
# Root folder that holds the labeled contract JSON exports (machine-specific absolute path).
path_to_labeled_contracts = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/"
#path_to_labeled_contracts = "C:/Users/islam/Desktop/2023 Research/contracts/labeled/"

# Contract processed by this run. This cell previously also assigned 29 and 34
# before overriding them; only the last assignment ever took effect.
contract_num = 116

# Input files for the selected contract; the edits file is optional.
all_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_all_nodes.json"
highlighted_nodes_path = f"{path_to_labeled_contracts}contract_{contract_num}_highlighted.json"
highlight_nodes_edits_path = f"{path_to_labeled_contracts}edits/contract_{contract_num}_highlighted.json"


# Load the all-nodes and highlighted-nodes JSON files for the selected contract.
with open(all_nodes_path, encoding='UTF-8') as f:
    all_nodes_data = json.load(f)
with open(highlighted_nodes_path, encoding='UTF-8') as f:
    highlighted_data = json.load(f)
# The edits file is optional: highlighted_data_edits stays None when it does not exist.
highlighted_data_edits = None
if os.path.exists(highlight_nodes_edits_path):
    with open(highlight_nodes_edits_path, encoding='UTF-8') as f:
        highlighted_data_edits = json.load(f)

In [75]:
# Build the all-nodes frame with the project helper, then run its accompanying check.
all_nodes_df = post_process_helper.create_all_nodes_df(all_nodes_data)
postprocessor_tests.test_filter_all_nodes_df(all_nodes_df)
# Same for the highlighted nodes.
# NOTE(review): the variable name suggests one row per highlighted node after exploding — confirm in post_process_helper.
exploded_highlight_df = post_process_helper.create_highlight_nodes_df(highlighted_data)
postprocessor_tests.test_filter_highlight_nodes_df(exploded_highlight_df)

# The edits file is optional, so highlight_edits is only defined when that file exists.
if os.path.exists(highlight_nodes_edits_path):
    highlight_edits = post_process_helper.create_highlight_nodes_df(highlighted_data_edits)

1 rows with empty strings were removed
test_filter_all_nodes_df - ALL CHECKS PASSED
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['. ']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
['.']
There is no mismatch in segmentations and groupings, we can proceed
Filtering highlight nodes df now
43 rows with DEL, DELETED were removed
test_filter_highlight_nodes_df - ALL CHECKS PASSED


In [76]:
# Write the intermediate frames as CSV files into the csvs/ subfolder of the labeled folder.
all_nodes_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_all_nodes.csv')
exploded_highlight_df.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_highlighted.csv')
# highlight_edits is only defined when the optional edits file exists, so check the path again.
if os.path.exists(highlight_nodes_edits_path):
    highlight_edits.to_csv(f'{path_to_labeled_contracts}csvs/contract_{contract_num}_edited.csv')

Prepare columns and copies for merging

In [77]:
# Highlight columns that are cast to object dtype once they are added to the all-nodes copy.
obj_cols = [
    'highlighted_xpaths',
    'highlighted_segmented_text',
    'highlighted_labels',
    'segment_number_from_idx',
    'highlighted_coordinates'
]
# Deep copy, so the in-place merge further down modifies the copy rather than all_nodes_df.
all_nodes_copy = all_nodes_df.copy(deep=True)

# Add every highlight column to the copy, initialised to NaN; the merge fills these in later.
for col in exploded_highlight_df.columns:
    all_nodes_copy[col] = np.nan
    if col in obj_cols:
        all_nodes_copy[col] = all_nodes_copy[col].astype(object)
# Independent copy of the highlight frame, used as the input of the de-duplication step.
highlight_copy = exploded_highlight_df.copy(deep=True)

In [78]:
def fill_row(all_nodes, highlight_df, all_idx, highlight_idx):
    """Copy one highlight row into one row of the all-nodes frame, in place.

    Every column of ``highlight_df`` is written to the column of the same
    name in ``all_nodes`` using label-based scalar access (``.at``).

    Args:
        all_nodes: DataFrame that receives the values; modified in place.
        highlight_df: DataFrame that supplies the values and the column names.
        all_idx: Index label of the destination row in ``all_nodes``.
        highlight_idx: Index label of the source row in ``highlight_df``.

    Returns:
        None.
    """
    for column_name in highlight_df.columns:
        source_value = highlight_df.at[highlight_idx, column_name]
        all_nodes.at[all_idx, column_name] = source_value

In [79]:
def remove_highlighted_duplicates(df):
    """Drop empty and duplicated highlight rows, then rebuild group sizes and ordering.

    A row is dropped when either of these holds:

    * its ``highlighted_segmented_text`` is empty (this now also applies to
      the very first row, which the previous implementation skipped), or
    * the previous row's text starts with this row's text, both rows share
      the same ``highlighted_xpaths`` and this row's ``highlighted_labels``
      contains ``'st'``.

    The "previous row" is always the preceding row of the input, even if that
    row is itself being dropped. The dropped positions are printed.

    Afterwards the ``size`` column is recomputed as the number of remaining
    rows per ``segment_number_from_idx`` and ``exploded_highlight_node_order``
    is reset to 0..n-1.

    Args:
        df: Highlight DataFrame with at least the columns
            ``highlighted_segmented_text``, ``highlighted_xpaths``,
            ``highlighted_labels`` and ``segment_number_from_idx``. A
            pre-existing ``size`` column is replaced. The input is not
            modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # drop=True discards the old index whatever it is called, so a named index
    # or a real column called 'index' no longer breaks this step.
    highlight_df = df.reset_index(drop=True)

    texts = highlight_df['highlighted_segmented_text'].tolist()
    xpaths = highlight_df['highlighted_xpaths'].tolist()
    labels = highlight_df['highlighted_labels'].tolist()

    drop_index = []
    for i, text in enumerate(texts):
        if len(text) < 1:
            # Empty text is always dropped; `continue` keeps the position from
            # being recorded a second time by the prefix rule below, since
            # every string starts with ''.
            drop_index.append(i)
            continue
        if i == 0:
            # The first row has no predecessor to compare against.
            continue
        if (
            texts[i - 1].startswith(text)
            and xpaths[i - 1] == xpaths[i]
            and 'st' in labels[i]
        ):
            drop_index.append(i)

    print(drop_index)

    # errors='ignore' tolerates input that has no 'size' column yet.
    highlight_df = (
        highlight_df
        .drop(index=drop_index)
        .drop(columns='size', errors='ignore')
    )

    # Recompute the group sizes from the rows that survived.
    final_group_sizes = highlight_df.groupby('segment_number_from_idx', as_index=False).size()
    highlight_df = pd.merge(highlight_df, final_group_sizes, on='segment_number_from_idx', how='left')

    highlight_df = highlight_df.reset_index(drop=True)
    highlight_df['exploded_highlight_node_order'] = highlight_df.index

    return highlight_df

In [80]:
def merge(all_nodes_df, highlight_nodes_df):
    '''
    Copy highlight rows onto the matching rows of ``all_nodes_df``, in place.

    Both frames are walked in order with two pointers: for each all-nodes row
    the ``xpaths`` value is compared with ``highlighted_xpaths`` of the current
    highlight row. On a match the highlight row is copied over with
    ``fill_row`` and the highlight pointer advances. Only the xpath is
    compared; the text is not checked.

    The walk stops as soon as every highlight row has been placed. If some
    highlight row never matches, the remaining highlight rows are left
    unmerged and no error is raised; callers that need a full merge must
    verify the result themselves.

    Args:
        all_nodes_df: DataFrame with an ``xpaths`` column; it must already
            contain the highlight columns and is modified in place.
        highlight_nodes_df: DataFrame with a ``highlighted_xpaths`` column and
            a 0..n-1 index (rows are addressed by label via ``.at``).

    Returns:
        None.
    '''
    total_highlights = len(highlight_nodes_df)
    if total_highlights == 0:
        # Nothing to merge; without this guard the first lookup below raises KeyError.
        return

    curr_highlight_node_idx = 0
    for i, node_xpath in all_nodes_df['xpaths'].items():
        current_highlight_xpath = highlight_nodes_df.at[
            curr_highlight_node_idx,
            'highlighted_xpaths'
        ]
        if node_xpath == current_highlight_xpath:
            fill_row(
                all_nodes=all_nodes_df,
                highlight_df=highlight_nodes_df,
                all_idx=i,
                highlight_idx=curr_highlight_node_idx
            )
            curr_highlight_node_idx += 1
            if curr_highlight_node_idx >= total_highlights:
                # Every highlight row has been placed.
                return
        


In [81]:
# De-duplicate the highlight rows and preview the first 50 rows of the result.
dropped_highlight_df = remove_highlighted_duplicates(highlight_copy)
dropped_highlight_df[:50]

[20, 55, 61, 134, 283]


Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0036537963202902305, 0.4156520833333333, 0....",1,0,0.003654,0.415652,0.168696,0.000389,0.003654,1
1,/html/body/document/type/sequence/filename/des...,BY AND AMONG,t,1,"[0.00445711324177248, 0.46365555555555554, 0.0...",1,1,0.004457,0.463656,0.072689,0.000389,0.004457,1
2,/html/body/document/type/sequence/filename/des...,QAD Inc.,t,2,"[0.005260430163254729, 0.4809680555555556, 0.0...",1,2,0.00526,0.480968,0.038059,0.000389,0.00526,1
3,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",t,3,"[0.006063747084736978, 0.43030625, 0.139388194...",1,3,0.006064,0.430306,0.139388,0.000389,0.006064,1
4,/html/body/document/type/sequence/filename/des...,AND,t,4,"[0.006867064006219227, 0.48996874999999995, 0....",1,4,0.006867,0.489969,0.020057,0.000389,0.006867,1
5,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK MERGER SUB, INC.",t,5,"[0.007670380927701477, 0.4175347222222222, 0.1...",1,5,0.00767,0.417535,0.164931,0.000389,0.00767,1
6,/html/body/document/type/sequence/filename/des...,"Dated as of June 27, 2021",t,6,"[0.008473697849183726, 0.4498534722222222, 0.1...",1,6,0.008474,0.449853,0.100287,0.000389,0.008474,1
7,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,t,7,"[0.014398574760300596, 0.4530541666666667, 0.0...",1,7,0.014399,0.453054,0.093885,0.000389,0.014399,1
8,/html/body/document/type/sequence/filename/des...,i,n,8,"[0.02925991189427313, 0.49870902777777776, 0.0...",1,8,0.02926,0.498709,0.002577,0.000389,0.02926,1
9,/html/body/document/type/sequence/filename/des...,ii,n,9,"[0.045723503498315624, 0.49742291666666666, 0....",1,9,0.045724,0.497423,0.005149,0.000389,0.045724,1


In [82]:
# merge() fills all_nodes_copy in place and returns nothing, so display the frame afterwards.
merge(all_nodes_copy, dropped_highlight_df)
all_nodes_copy

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
0,/html/body/document/type,EX-2.1,0,,,,,,,,,,,,,
1,/html/body/document/type/sequence,2,1,,,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,ex_260368.htm,2,,,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EXHIBIT 2.1,3,,,,,,,,,,,,,
4,/html/body/document/type/sequence/filename/des...,ex_260368.htm,4,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2370,/html/body/document/type/sequence/filename/des...,Name:,2370,,,,,,,,,,,,,
2371,/html/body/document/type/sequence/filename/des...,S. Scott Crabill,2371,,,,,,,,,,,,,
2372,/html/body/document/type/sequence/filename/des...,Title:,2372,,,,,,,,,,,,,
2373,/html/body/document/type/sequence/filename/des...,President and Assistant Treasurer,2373,,,,,,,,,,,,,


In [83]:
# Show only the rows that received highlight data, i.e. whose highlighted_xpaths is not missing.
# notna() tests for missing values directly instead of comparing the string form against 'nan'.
all_nodes_copy[all_nodes_copy.highlighted_xpaths.notna()]
#highlight_copy.iloc[90:100]
#dropped_highlight_df[dropped_highlight_df.highlighted_xpaths.astype(str) != 'nan']


Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
7,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,7,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0036537963202902305, 0.4156520833333333, 0....",1.0,0.0,0.003654,0.415652,0.168696,0.000389,0.003654,1.0
8,/html/body/document/type/sequence/filename/des...,BY AND AMONG,8,/html/body/document/type/sequence/filename/des...,BY AND AMONG,t,1,"[0.00445711324177248, 0.46365555555555554, 0.0...",1.0,1.0,0.004457,0.463656,0.072689,0.000389,0.004457,1.0
9,/html/body/document/type/sequence/filename/des...,QAD Inc.,9,/html/body/document/type/sequence/filename/des...,QAD Inc.,t,2,"[0.005260430163254729, 0.4809680555555556, 0.0...",1.0,2.0,0.005260,0.480968,0.038059,0.000389,0.005260,1.0
10,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",10,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",t,3,"[0.006063747084736978, 0.43030625, 0.139388194...",1.0,3.0,0.006064,0.430306,0.139388,0.000389,0.006064,1.0
11,/html/body/document/type/sequence/filename/des...,AND,11,/html/body/document/type/sequence/filename/des...,AND,t,4,"[0.006867064006219227, 0.48996874999999995, 0....",1.0,4.0,0.006867,0.489969,0.020057,0.000389,0.006867,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2339,/html/body/document/type/sequence/filename/des...,Survival,2339,/html/body/document/type/sequence/filename/des...,Survival,sst,370,"[0.9784322363306556, 0.13619791666666667, 0.03...",1.0,331.0,0.978432,0.136198,0.033686,0.000389,0.978432,1.0
2346,/html/body/document/type/sequence/filename/des...,76,2346,/html/body/document/type/sequence/filename/des...,76,n,371,"[0.98003887017362, 0.4953666666666666, 0.00926...",1.0,332.0,0.980039,0.495367,0.009261,0.000389,0.980039,1.0
2347,/html/body/document/type/sequence/filename/des...,Section 9.12,2347,/html/body/document/type/sequence/filename/des...,Section 9.12,ssn,372,"[0.9816429126716766, 0.07888472222222222, 0.04...",1.0,333.0,0.981643,0.078885,0.046414,0.000389,0.981643,1.0
2348,/html/body/document/type/sequence/filename/des...,Special Committee Approval,2348,/html/body/document/type/sequence/filename/des...,Special Committee Approval,sst,373,"[0.9816429126716766, 0.13508541666666665, 0.11...",1.0,334.0,0.981643,0.135085,0.110048,0.000389,0.981643,1.0


In [84]:
# Run the project's merge checks on the merged frame.
# NOTE(review): all_nodes_copy is passed as both the first and the third argument. merge()
# modified all_nodes_copy in place, so confirm against test_merge's signature that the first
# argument is not meant to be the unmerged all_nodes_df.
postprocessor_tests.test_merge(all_nodes_copy, dropped_highlight_df, all_nodes_copy)

test_merge - ALL CHECKS PASSED


In [85]:
# Dump the merged frame to merge_test.csv in the current working directory (presumably for manual inspection).
all_nodes_copy.to_csv('merge_test.csv')

In [86]:
# Display the de-duplicated highlight frame.
dropped_highlight_df

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size
0,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0036537963202902305, 0.4156520833333333, 0....",1,0,0.003654,0.415652,0.168696,0.000389,0.003654,1
1,/html/body/document/type/sequence/filename/des...,BY AND AMONG,t,1,"[0.00445711324177248, 0.46365555555555554, 0.0...",1,1,0.004457,0.463656,0.072689,0.000389,0.004457,1
2,/html/body/document/type/sequence/filename/des...,QAD Inc.,t,2,"[0.005260430163254729, 0.4809680555555556, 0.0...",1,2,0.005260,0.480968,0.038059,0.000389,0.005260,1
3,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",t,3,"[0.006063747084736978, 0.43030625, 0.139388194...",1,3,0.006064,0.430306,0.139388,0.000389,0.006064,1
4,/html/body/document/type/sequence/filename/des...,AND,t,4,"[0.006867064006219227, 0.48996874999999995, 0....",1,4,0.006867,0.489969,0.020057,0.000389,0.006867,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,/html/body/document/type/sequence/filename/des...,Survival,sst,370,"[0.9784322363306556, 0.13619791666666667, 0.03...",1,331,0.978432,0.136198,0.033686,0.000389,0.978432,1
332,/html/body/document/type/sequence/filename/des...,76,n,371,"[0.98003887017362, 0.4953666666666666, 0.00926...",1,332,0.980039,0.495367,0.009261,0.000389,0.980039,1
333,/html/body/document/type/sequence/filename/des...,Section 9.12,ssn,372,"[0.9816429126716766, 0.07888472222222222, 0.04...",1,333,0.981643,0.078885,0.046414,0.000389,0.981643,1
334,/html/body/document/type/sequence/filename/des...,Special Committee Approval,sst,373,"[0.9816429126716766, 0.13508541666666665, 0.11...",1,334,0.981643,0.135085,0.110048,0.000389,0.981643,1


In [89]:
# Keep only the rows that received highlight data during the merge (non-missing highlighted_xpaths).
# notna() tests for missing values directly instead of comparing the string form against 'nan'.
all_nodes_highlighted = all_nodes_copy[all_nodes_copy.highlighted_xpaths.notna()]

In [93]:
# Sanity check: the merged rows carry exactly the order values of dropped_highlight_df, in the same sequence.
assert all_nodes_highlighted.exploded_highlight_node_order.tolist() == dropped_highlight_df.exploded_highlight_node_order.tolist()

In [88]:
# Dump the merged frame to trial.csv in the current working directory (presumably for manual inspection).
all_nodes_copy.to_csv("trial.csv")

## Merging potentially completed. Check output

## Use size instead of num entries

In [61]:
def tag_bies_for_highlights(merged: pd.DataFrame) -> pd.DataFrame:
    """Add a ``tagged_sequence`` column holding one b/i/e/s/o tag per row.

    Rows are visited in order and tagged from their ``size`` value (the number
    of rows in the row's highlight group):

    * ``o``         - ``size`` is NaN (row carries no highlight).
    * ``s_<label>`` - ``size`` is 1 (single-row group).
    * ``b_<label>`` - first row of a group with ``size`` > 1.
    * ``i_<label>`` - middle row of such a group.
    * ``e_<label>`` - last row of such a group.

    ``<label>`` is the row's ``highlighted_labels`` value. Rows tagged ``o``
    do not reset the running position, so a group may be interrupted by
    non-highlighted rows.

    Args:
        merged: DataFrame with ``size`` and ``highlighted_labels`` columns.
            It is modified in place.

    Returns:
        The same ``merged`` object, with the ``tagged_sequence`` column added.

    Raises:
        ValueError: If more rows of a group are seen than its ``size`` allows
            (for example when a previous group was left incomplete).
            Previously this case surfaced only as a length-mismatch
            ValueError when the column was assigned.
    """
    tags = []
    count = 1  # 1-based position of the current row inside its multi-row group
    rows = zip(merged.index, merged['size'], merged['highlighted_labels'])
    for i, list_entry_count, label in rows:
        # Non highlighted row
        if math.isnan(list_entry_count):
            tags.append('o')

        # Single highlighted node
        elif list_entry_count == 1:
            tags.append(f's_{label}')
            count = 1

        # Last entry in group greater than size 1
        elif count == list_entry_count:
            tags.append(f'e_{label}')
            count = 1

        # First (count == 1) or inner (count > 1) entry of a multi-row group
        elif count < list_entry_count:
            tags.append(f'b_{label}' if count == 1 else f'i_{label}')
            count += 1

        else:
            # Without a tag for this row the column assignment below could not succeed.
            raise ValueError(
                f"Inconsistent highlight group at row {i!r}: position {count} "
                f"exceeds group size {list_entry_count}"
            )

    merged['tagged_sequence'] = tags
    return merged

In [62]:
# Tag every node. The function adds 'tagged_sequence' to all_nodes_copy in place and returns that same frame.
merged_tagged = tag_bies_for_highlights(all_nodes_copy)

In [63]:
# Show only the rows whose tag is not 'o'.
merged_tagged[merged_tagged.tagged_sequence != 'o']

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,top_via_line_group,size,tagged_sequence
7,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,7,/html/body/document/type/sequence/filename/des...,AGREEMENT AND PLAN OF MERGER,t,0,"[0.0036537963202902305, 0.4156520833333333, 0....",1.0,0.0,0.003654,0.415652,0.168696,0.000389,0.003654,1.0,s_t
8,/html/body/document/type/sequence/filename/des...,BY AND AMONG,8,/html/body/document/type/sequence/filename/des...,BY AND AMONG,t,1,"[0.00445711324177248, 0.46365555555555554, 0.0...",1.0,1.0,0.004457,0.463656,0.072689,0.000389,0.004457,1.0,s_t
9,/html/body/document/type/sequence/filename/des...,QAD Inc.,9,/html/body/document/type/sequence/filename/des...,QAD Inc.,t,2,"[0.005260430163254729, 0.4809680555555556, 0.0...",1.0,2.0,0.00526,0.480968,0.038059,0.000389,0.00526,1.0,s_t
10,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",10,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK PARENT, LLC",t,3,"[0.006063747084736978, 0.43030625, 0.139388194...",1.0,3.0,0.006064,0.430306,0.139388,0.000389,0.006064,1.0,s_t
11,/html/body/document/type/sequence/filename/des...,AND,11,/html/body/document/type/sequence/filename/des...,AND,t,4,"[0.006867064006219227, 0.48996874999999995, 0....",1.0,4.0,0.006867,0.489969,0.020057,0.000389,0.006867,1.0,s_t
12,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK MERGER SUB, INC.",12,/html/body/document/type/sequence/filename/des...,"PROJECT QUICK MERGER SUB, INC.",t,5,"[0.007670380927701477, 0.4175347222222222, 0.1...",1.0,5.0,0.00767,0.417535,0.164931,0.000389,0.00767,1.0,s_t
13,/html/body/document/type/sequence/filename/des...,"Dated as of June 27, 2021",13,/html/body/document/type/sequence/filename/des...,"Dated as of June 27, 2021",t,6,"[0.008473697849183726, 0.4498534722222222, 0.1...",1.0,6.0,0.008474,0.449853,0.100287,0.000389,0.008474,1.0,s_t
14,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,14,/html/body/document/type/sequence/filename/des...,TABLE OF CONTENTS,t,7,"[0.014398574760300596, 0.4530541666666667, 0.0...",1.0,7.0,0.014399,0.453054,0.093885,0.000389,0.014399,1.0,s_t
113,/html/body/document/type/sequence/filename/des...,i,113,/html/body/document/type/sequence/filename/des...,i,n,8,"[0.02925991189427313, 0.49870902777777776, 0.0...",1.0,8.0,0.02926,0.498709,0.002577,0.000389,0.02926,1.0,s_n
220,/html/body/document/type/sequence/filename/des...,ii,220,/html/body/document/type/sequence/filename/des...,ii,n,9,"[0.045723503498315624, 0.49742291666666666, 0....",1.0,9.0,0.045724,0.497423,0.005149,0.000389,0.045724,1.0,s_n


In [21]:
# merged_tagged[merged_tagged['is_outside'] < 1]['tagged_sequence'].tolist(), merged_tagged[merged_tagged['is_outside'] < 1]['text'].tolist()

In [27]:
# Folder that receives the tagged per-contract CSV (machine-specific absolute path).
path_to_tagged_folder = '/Users/rohith/Documents/Independent Study - DSGA1006/contracts/tagged'
# Write the tagged frame as contract_<contract_num>_tagged.csv inside that folder.
merged_tagged.to_csv(path_to_tagged_folder + f'/contract_{contract_num}_tagged.csv')