In [909]:
import json
import pandas as pd
import numpy as np
import ast
import math
from collections import Counter
import re

Unit Tests are integrated as part of the code after each function. There are a few main unit tests:

1. Merging results in proper monotonic left join
2. Highlighted xpath should always equal full xpath after merge
3. Check that all highlight boxes are in monotonic order
4. Ensure highlight boxes have no overlap, or only negligible overlap, in IoU (~0.1% for now)

TODO: Create unit tests for the all-nodes and highlight-nodes filtering functions.

In [910]:
!pip install ipytest



In [911]:
import pytest
import ipytest
ipytest.autoconfig()

Plan:
1. Write explicit filtering scripts for both highlight df and all nodes
    - Print or make clear any problems
2. Merge with good principles
    - Print or make clear any problems
3. Apply agreed upon period rule
4. Special characters handling (at least new lines and weird spaces that won't get tokenized properly?)

In [912]:
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs as-is on a machine that has these files at these locations.
all_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_116_all_nodes.json"
highlighted_nodes_path = "/Users/rohith/Documents/Independent Study - DSGA1006/contracts/labeled/contract_116_highlighted.json"

# Parse both exports in one go; each file is read as UTF-8 JSON.
with open(all_nodes_path, encoding='UTF-8') as all_nodes_file, \
        open(highlighted_nodes_path, encoding='UTF-8') as highlighted_file:
    all_nodes_data = json.load(all_nodes_file)
    highlighted_data = json.load(highlighted_file)

In [913]:
def filter_all_nodes_df(df, xpaths_col):
    '''
    Drop rows whose xpath contains '/script' or '/noscript', or is the empty string.

    A message is printed for each filter that actually removed rows, so that
    unexpected removals are visible in the notebook output.

    Parameters
    ----------
    df : pd.DataFrame
        Table with one node per row.
    xpaths_col : str
        Name of the column of ``df`` holding the xpath strings.

    Returns
    -------
    pd.DataFrame
        The surviving rows of ``df`` (original index labels are kept).
        Rows whose xpath is missing (NaN/None) are neither script nodes nor
        empty strings, so they are kept and left for the caller to handle.
    '''
    original_length = len(df)
    xpaths = df[xpaths_col]

    # na=False: a missing xpath counts as "no match" instead of producing NaN,
    # which would make the boolean negation below fail.
    # regex=False: the patterns are plain substrings, not regular expressions.
    in_script = xpaths.str.contains('/script', regex=False, na=False)
    in_noscript = xpaths.str.contains('/noscript', regex=False, na=False)
    df = df[~(in_script | in_noscript)]

    removed_script_rows = original_length - len(df)
    if removed_script_rows > 0:
        print(f'{removed_script_rows} rows that had /script or /noscript were removed')

    new_length = len(df)
    df = df[df[xpaths_col] != '']

    removed_empty_rows = new_length - len(df)
    if removed_empty_rows > 0:
        print(f'{removed_empty_rows} rows with empty strings were removed')

    return df

def filter_highlight_nodes_df(df):
    '''
    Clean the exploded highlight table.

    Steps, in order:
      1. run ``filter_all_nodes_df`` on the 'highlighted_xpaths' column,
      2. drop rows whose xpath is 'DELETED' or whose coordinates are 'DEL',
      3. drop every row that still contains a missing value.

    Each step prints how many rows it removed (only when that number is > 0).
    Returns the filtered DataFrame.
    '''
    print('Filtering highlight nodes df now')

    # The generic node filter is safe to reuse here. It is expected to remove
    # nothing, so any message it prints points at an unknown bug upstream.
    df = filter_all_nodes_df(df, 'highlighted_xpaths')

    rows_before_sentinel_filter = len(df)
    xpath_not_deleted = df['highlighted_xpaths'] != 'DELETED'
    coordinates_not_deleted = df['highlighted_coordinates'] != 'DEL'
    df = df[xpath_not_deleted & coordinates_not_deleted]

    sentinel_rows_removed = rows_before_sentinel_filter - len(df)
    if sentinel_rows_removed > 0:
        print(f'{sentinel_rows_removed} rows with DEL, DELETED were removed')

    rows_before_dropna = len(df)
    df = df.dropna()

    na_rows_removed = rows_before_dropna - len(df)
    if na_rows_removed > 0:
        print(f'{na_rows_removed} NA rows were dropped. THIS IS A PROBLEM.')

    return df

In [914]:
def test_filter_all_nodes_df(df, xpaths_col):
    new_df = df[
        ~(df[xpaths_col].str.contains('/script')) & 
        ~(df[xpaths_col].str.contains('/noscript'))
    ]
    new_df = new_df[new_df[xpaths_col] != ''].dropna()
    assert (len(new_df) - len(df)) == 0

def test_filter_highlight_nodes_df(df):
    new_df = df[
        (df['highlighted_xpaths'] != 'DELETED') &
        (df['highlighted_coordinates'] != 'DEL')
    ].dropna()
    assert (len(new_df) - len(df)) == 0

In [915]:
# The 'xpaths' and 'segmentedTexts' fields are parsed with ast.literal_eval
# into Python objects before being placed in the table.
all_nodes_xpaths = ast.literal_eval(all_nodes_data['xpaths'])
all_nodes_segmented_text = ast.literal_eval(all_nodes_data['segmentedTexts'])

# One row per node, columns added in this order.
df = pd.DataFrame()
for column_name, column_values in (
    ('xpaths', all_nodes_xpaths),
    ('text', all_nodes_segmented_text),
):
    df[column_name] = column_values

# Remember the original row position so later steps can check ordering.
df['all_nodes_ordering'] = df.index.copy()

In [916]:
# Every field of the highlight export is parsed with ast.literal_eval.
highlight_xpaths = ast.literal_eval(highlighted_data['xpaths'])
highlight_segmented_text = ast.literal_eval(highlighted_data['segmentedTexts'])
highlight_text = ast.literal_eval(highlighted_data['texts'])  # parsed but not stored in the table below
highlight_labels = ast.literal_eval(highlighted_data['labels'])
highlight_coordinates = ast.literal_eval(highlighted_data['c'])

# One row per highlight, columns added in this order.
highlighted_df = pd.DataFrame()
for column_name, column_values in (
    ('highlighted_xpaths', highlight_xpaths),
    ('highlighted_segmented_text', highlight_segmented_text),
    ('highlighted_labels', highlight_labels),
    ('highlighted_coordinates', highlight_coordinates),
):
    highlighted_df[column_name] = column_values

# Row position of each highlight, kept so exploded rows can be traced back.
highlighted_df['segment_number_from_idx'] = highlighted_df.index.copy()


In [917]:
# Function to process periods in text.
# Occasionally there is weird text which 
# has a period, space, and another 
# character, which does not match to 
# anything in all_node_text; I remove 
# these as well.

def _is_period_fragment(text):
    '''Return True if ``text`` is '.', '. ', or a 3-character string containing '. '.'''
    return text == '.' or text == '. ' or ('. ' in text and len(text) == 3)


def remove_periods(row):
    '''
    Remove stray-period entries from one highlight row.

    Parameters
    ----------
    row : pd.Series
        Must provide 'highlighted_segmented_text' and 'highlighted_xpaths',
        each a list. Entry ``i`` of both lists is dropped when text ``i`` is a
        period fragment (see ``_is_period_fragment``).

    Returns
    -------
    pd.Series
        A copy of ``row`` with the two list fields replaced by the filtered
        lists. All other fields are carried over unchanged, and the ``row``
        passed in is not modified.
    '''
    xpaths = row['highlighted_xpaths']
    texts = row['highlighted_segmented_text']

    # A set gives constant-time membership checks while filtering both lists.
    indices_to_remove = {i for i, text in enumerate(texts) if _is_period_fragment(text)}

    # Work on a copy so the caller's row is never mutated as a side effect.
    cleaned_row = row.copy()
    cleaned_row['highlighted_xpaths'] = [
        xpath for i, xpath in enumerate(xpaths) if i not in indices_to_remove
    ]
    cleaned_row['highlighted_segmented_text'] = [
        text for i, text in enumerate(texts) if i not in indices_to_remove
    ]
    return cleaned_row

In [1059]:
def test_apply_remove_periods(df):
    def test_remove_periods(row):
        nrow = row
        xpaths = row['highlighted_xpaths']
        texts = row['highlighted_segmented_text']
        # except:
        #     return pd.Series({'highlighted_xpaths': [], 'highlighted_segmented_text': []})
        indices_to_test = []
        for i, text in enumerate(texts):
            if '.' == text or '. ' == text or ('. ' in text and len(text) == 3):
                indices_to_test.append(i)
        assert indices_to_test == [], \
        f'''In the following row: 
        {row}\n {indices_to_test} are the list of indices where 
        there are still periods.'''
        return indices_to_test
    indices_list = df.apply(lambda row: test_remove_periods(row), axis=1)
    assert all(isinstance(item, list) and len(item) == 0 for item in indices_list)
    print('ALL CHECKS PASSED')
    
    

In [919]:
# Strip stray-period entries row by row, then record how many entries each
# list column holds afterwards.
highlighted_df = highlighted_df.apply(remove_periods, axis=1)
for count_column, list_column in (
    ('num_entries_1', 'highlighted_xpaths'),
    ('num_entries_2', 'highlighted_segmented_text'),
):
    highlighted_df[count_column] = highlighted_df[list_column].map(len)

In [920]:
test_apply_remove_periods(highlighted_df)

In [921]:
# Columns carried into the exploded table.
columns_to_keep = [
    'highlighted_xpaths',
    'highlighted_segmented_text',
    'highlighted_labels',
    'segment_number_from_idx',
    'highlighted_coordinates',
    'num_entries_1',
]

# The two list columns are exploded together, giving one row per list entry.
exploded_highlight_df = (
    highlighted_df[columns_to_keep]
    .explode(column=['highlighted_xpaths', 'highlighted_segmented_text'])
    .reset_index(drop=True)
)

# Position of each exploded row, taken from the freshly reset index.
exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()

In [922]:
exploded_highlight_df = filter_highlight_nodes_df(exploded_highlight_df)
test_filter_highlight_nodes_df(exploded_highlight_df)

Filtering highlight nodes df now
129 rows with DEL, DELETED were removed


In [923]:
# exploded_highlight_df['num_entries_1'] = exploded_highlight_df['highlighted_xpaths'].apply(len)


In [924]:
# Split the coordinate list into four float columns; entry k of the list
# becomes the k-th name below.
for coordinate_position, coordinate_name in enumerate(['top', 'left', 'width', 'height']):
    exploded_highlight_df[coordinate_name] = exploded_highlight_df['highlighted_coordinates'].map(
        lambda box, coordinate_position=coordinate_position: float(box[coordinate_position])
    )

Okay so far looks like we can do logic:
1. Sort by top; if there's a tie, sort by left, and then by order in the highlight list (for the case where multiple nodes are highlighted in the same row with 1 highlight box)
2. If it turns out that top coordinates are approximate rather than exact, we will need approximate sorting or some IoU-related post-processing to infer whether two boxes are on the same line (which seems extremely likely to work)

In [925]:
def sort_exploded_highlight_box_by_coordinates(exploded_highlight_df):
    '''
    Return the highlight rows ordered by 'top', then 'left', then
    'exploded_highlight_node_order', with a fresh 0..n-1 index.

    The comparison is exact. Should coordinates ever need approximate
    matching or line inference (e.g. via IOU), this is the place that has
    to grow that logic.
    '''
    sort_keys = ['top', 'left', 'exploded_highlight_node_order']
    ordered = exploded_highlight_df.sort_values(by=sort_keys)
    return ordered.reset_index(drop=True)

In [926]:
exploded_highlight_df = sort_exploded_highlight_box_by_coordinates(exploded_highlight_df)
exploded_highlight_df['exploded_highlight_node_order'] = exploded_highlight_df.index.copy()


In [1003]:
exploded_highlight_df.iloc[50:60]

Unnamed: 0,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
50,/html/body/document/type/sequence/filename/des...,Section 2.4,ssn,42,"[0.2822233739310702, 0.07888472222222222, 0.04...",1,50,0.282223,0.078885,0.041905,0.000389
51,/html/body/document/type/sequence/filename/des...,Directors and Officers of the Surviving Corpor...,sst,145,"[0.2822233739310702, 0.13104375000000001, 0.19...",1,51,0.282223,0.131044,0.195936,0.000389
52,/html/body/document/type/sequence/filename/des...,ARTICLE III,sn,44,"[0.28423166623477586, 0.4725097222222222, 0.05...",1,52,0.284232,0.47251,0.05498,0.000389
53,/html/body/document/type/sequence/filename/des...,MERGER CONSIDERATION; EXCHANGE PROCEDURES,st,45,"[0.284633324695517, 0.37625833333333336, 0.247...",1,53,0.284633,0.376258,0.247477,0.000389
54,/html/body/document/type/sequence/filename/des...,Section 3.1,ssn,46,"[0.2854366416169992, 0.07888472222222222, 0.04...",1,54,0.285437,0.078885,0.041656,0.000389
55,/html/body/document/type/sequence/filename/des...,Section 3.1,sst,146,"[0.2854366416169992, 0.1274736111111111, 0.144...",2,55,0.285437,0.127474,0.14414,0.000389
56,/html/body/document/type/sequence/filename/des...,Effect of the Merger on Capital Stock,sst,146,"[0.2854366416169992, 0.1274736111111111, 0.144...",2,56,0.285437,0.127474,0.14414,0.000389
57,/html/body/document/type/sequence/filename/des...,(a),sssn,210,"[0.2862399585384815, 0.11221805555555556, 0.01...",1,57,0.28624,0.112218,0.010276,0.000389
58,/html/body/document/type/sequence/filename/des...,(i),sssn,204,"[0.2870432754599637, 0.12888472222222222, 0.00...",1,58,0.287043,0.128885,0.00874,0.000389
59,/html/body/document/type/sequence/filename/des...,Merger Sub Common Stock,ssst,205,"[0.2870432754599637, 0.14816597222222222, 0.10...",1,59,0.287043,0.148166,0.107253,0.000389


In [928]:
df = filter_all_nodes_df(df, 'xpaths')
test_filter_all_nodes_df(df, 'xpaths')

1 rows with empty strings were removed


In [1006]:
# Counter(exploded_highlight_df.highlighted_segmented_text).most_common()

In [930]:
# df[df['xpaths'] == '/html/body/document/type/sequence/filename/description/text/center[23]/div/p[1]']

In [931]:
## Trying sequence merging for highlight df onto the all nodes df

In [932]:
# # Monotonic left merge using dictionary for highlighted df. 
# # This version is much faster than the 
# # version below, but loses the df index ordering.
# # Does not affect highlight/all node ordering.

# def monotonic_left_merge(full, highlighted):
#     found_xpaths = []
#     merged_rows = []
#     highlighted_dict = highlighted.groupby(['highlighted_xpaths', \
#     'highlighted_segmented_text']).first().to_dict('index')
#     # print(highlighted_dict)
#     for _, row in full.iterrows():
#         full_xpaths = row['xpaths']
#         full_texts = row['text']
#         # Could do fuzzy matching instead of direct full text matching
#         matched_hrow = highlighted_dict.get((full_xpaths, full_texts))
        
#         if matched_hrow is None:
#             merged_rows.append(row)
#         else:
#             merged_row = pd.concat([row, pd.Series(matched_hrow)], axis=0)
#             merged_rows.append(merged_row)
#             highlighted_dict.pop((full_xpaths, full_texts))
        
#     merged_df = pd.concat(merged_rows, axis=1).T
            
#     return merged_df

In [933]:
def monotonic_left_merge(full, highlighted):
    found_xpaths = []
    merged_df = []
    for _, row in full.iterrows():
        full_xpaths = row['xpaths']
        full_texts = row['text']
        n_order = row['all_nodes_ordering']

        matched_hrow = pd.Series([])
        # print(full_texts, full_xpaths)
        for _, h_row in highlighted.iterrows():
            h_xpaths = h_row['highlighted_xpaths']
            h_texts = h_row['highlighted_segmented_text']

            if h_xpaths == full_xpaths and h_texts == full_texts: # h_texts in full_texts:
                if h_xpaths in found_xpaths:
                    print('CURRENT h_xpath ALREADY FOUND BEFORE:')
                    print(h_xpaths)
                    _
                    
                found_xpaths.append(h_xpaths)
                # Instead of appending to merged_df 
                # directly, save the specific h_row 
                # and as you build the merged_df 
                # in the outer loop for all texts, for 
                # the highlight matched rows, append 
                # the h_row as well.
                
                # merged_df.append((row, h_row))
                matched_hrow = h_row
                break
        if len(matched_hrow) == 0:
            # print('No matched highlighted xpaths', full_xpaths)
            merged_df.append(row)
        else:
            merged_df.append(pd.concat([row, matched_hrow], axis=0))
            
    return merged_df

In [1057]:
# %%ipytest
# Uncomment above line to activate test
'''
Test merging as a proper left join
1. All left and right rows should be unmodified
2. No left rows should be deleted -> only matched right rows should exist

'''
# def test_monotonic_left_merge():
#     # Create sample data for testing
#     full_data = {
#         'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
#         'text': ['A', 'B', 'C', 'D', 'E'],
#         'all_node_ordering': ['1', '2', '3', '4', '5']
#     }
#     highlighted_data = {
#         'highlighted_xpaths': ['p[2]', 'p[4]'],
#         'highlighted_segmented_text': ['B', 'D'],
#         'top': ['0', '10'],
#         'left': ['5', '15']
#     }

#     full_df = pd.DataFrame(full_data)
#     highlighted_df = pd.DataFrame(highlighted_data)

#     # Expected result after left merge
#     expected_data = {
#         'xpaths': ['p[1]', 'p[2]', 'p[3]', 'p[4]', 'p[5]'],
#         'text': ['A', 'B', 'C', 'D', 'E'],
#         'all_node_ordering': ['1', '2', '3', '4', '5'],
#         'top': [np.nan, '0', np.nan, '10', np.nan],
#         'left': [np.nan, '5', np.nan, '15', np.nan],
#     }
#     expected_df = pd.DataFrame(expected_data)

#     # Call the monotonic_left_merge function
#     merged_df = monotonic_left_merge(full_df, highlighted_df)
#     print(merged_df)
#     print(expected_df)
#     # TEST EQUAL DFs - Not required, but good to have.
#     # Compare the merged_df with the expected_df
#     pd.testing.assert_frame_equal(merged_df.reset_index().drop(columns=['index']), expected_df)
    
#     # TEST MONOTONIC ORDERING
#     assert merged_df['all_node_ordering'].is_monotonic_increasing

def test_monotonic_left_merge(full, highlight, df):
    # Check for monotonic increasing and find index of violation
    for i, row in df.iterrows():
        if i == 0:
            continue
        prev = df.iloc[i-1].all_nodes_ordering
        cur = row.all_nodes_ordering
        assert prev < cur, f'At index {i}, the merged dataframe is not monotonically increasing.'
    
    # Check for no lost left rows
    assert len(full) == len(df)
    print('ALL CHECKS PASSED')
    
    

In [935]:
merged_list = monotonic_left_merge(df, exploded_highlight_df)
merged_df = pd.DataFrame(merged_list)

  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/p[483]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/p[922]
CURRENT h_xpath ALREADY FOUND BEFORE:
/html/body/document/type/sequence/filename/description/text/p[922]


  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matched_hrow = pd.Series([])
  matche

In [936]:
merged_df

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height
0,/html/body/document/type,EX-2.1,0,,,,,,,,,,,
1,/html/body/document/type/sequence,2,1,,,,,,,,,,,
2,/html/body/document/type/sequence/filename,ex_260368.htm,2,,,,,,,,,,,
3,/html/body/document/type/sequence/filename/des...,EXHIBIT 2.1,3,,,,,,,,,,,
5,/html/body/document/type/sequence/filename/des...,ex_260368.htm,5,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2371,/html/body/document/type/sequence/filename/des...,Name:,2371,,,,,,,,,,,
2372,/html/body/document/type/sequence/filename/des...,S. Scott Crabill,2372,,,,,,,,,,,
2373,/html/body/document/type/sequence/filename/des...,Title:,2373,,,,,,,,,,,
2374,/html/body/document/type/sequence/filename/des...,President and Assistant Treasurer,2374,,,,,,,,,,,


In [937]:
# LEFT MERGE TESTS
test_monotonic_left_merge(df, exploded_highlight_df, merged_df.reset_index().drop(columns=['index']))

In [938]:
# [row for row in merged_df]

In [939]:
# merged = pd.merge(df, exploded_highlight_df, left_on='xpaths', right_on='highlighted_xpaths', how='left', indicator=True)
# merged.iloc[1087:1097]


In [940]:
# merged.iloc[1087:1097]['xpaths'].tolist()

In [941]:
# 1 for rows that received no highlight match (segment number is NaN), else 0.
merged_df['is_outside'] = merged_df['segment_number_from_idx'].map(
    lambda segment_number: int(math.isnan(segment_number))
)

# `merged` is another name for the same object, not a copy.
merged = merged_df

In [942]:
merged.to_csv("drop_check.csv")

In [943]:
#merged[merged['is_outside'] < 1].to_csv('merged_test.csv')

In [944]:
# merged[merged['is_outside'] < 1]

In [945]:
def assert_node_ordering_for_merged_table(merged: pd.DataFrame) -> tuple:
    '''
    Reset the index of the merged table and assert that both orderings hold.

    Parameters
    ----------
    merged : pd.DataFrame
        Needs the columns 'all_nodes_ordering', 'is_outside' and
        'exploded_highlight_node_order'.

    Returns
    -------
    tuple
        ``(merged, drop_indices)``: the table with a fresh 0..n-1 index, and
        the list of index labels that were dropped.

    Raises
    ------
    AssertionError
        If 'all_nodes_ordering' is not monotonically increasing, or if
        'exploded_highlight_node_order' is not monotonically increasing over
        the highlighted rows (those with ``is_outside < 1``).

    Notes
    -----
    Repeated xpaths in the left and right tables can produce duplicated rows
    in the merge. An earlier draft removed highlighted rows whose
    'exploded_highlight_node_order' was not exactly the previous kept value
    plus one; that rule is disabled, so ``drop_indices`` is currently always
    empty and no row is removed here.
    '''
    drop_indices = []
    merged = merged.drop(drop_indices).reset_index(drop=True)

    # Assert both orderings keep their original structure
    assert merged.all_nodes_ordering.is_monotonic_increasing, \
        'all_nodes_ordering is not monotonically increasing.'
    highlighted_order = merged[merged['is_outside'] < 1].exploded_highlight_node_order
    assert highlighted_order.is_monotonic_increasing, \
        'exploded_highlight_node_order is not monotonically increasing over the highlighted rows.'

    return merged, drop_indices

In [946]:
# Hacky fix for some highlight node orders being off for some reason:
# whenever a row's order is lower than the order of the row directly above
# it, overwrite it with (previous order + 1).
#
# NOTE(review): `merged` was assigned from `merged_df` without a copy, so
# these writes change `merged_df` as well.
# NOTE(review): rows without a highlight match appear to hold NaN in this
# column. Any comparison with NaN is False, so a row is only adjusted when
# both it and the row directly above it carry a number - confirm that this is
# the intended scope of the fix.
for i in range(1, len(merged)):
    cur = float(merged.iloc[i].exploded_highlight_node_order)
    prev = float(merged.iloc[i-1].exploded_highlight_node_order)
    if cur < prev:
        # Positional write: row i, column looked up by name.
        merged.iloc[i, merged.columns.get_loc('exploded_highlight_node_order')] = prev + 1

In [949]:
# Code to check if the node ordering is off and print the location/value of discrepancy

# for f in range(1, len(merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist())):
#     a = merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f-1]
#     if a >= merged[merged['is_outside'] < 1]['exploded_highlight_node_order'].tolist()[f]:
#         print(a)


In [950]:
merged_ordered, drop_indices = assert_node_ordering_for_merged_table(merged)

# Index labels of the rows whose xpath is the empty string; those rows are
# removed from the ordered table.
has_empty_xpath = merged_ordered['xpaths'] == ''
empty_xpath_index = merged_ordered.index[has_empty_xpath]
merged_ordered = merged_ordered.drop(empty_xpath_index)

Why am I losing rows (1 row now) from the left table?

In [1002]:
merged_ordered.iloc[880:890]

Unnamed: 0,xpaths,text,all_nodes_ordering,highlighted_xpaths,highlighted_segmented_text,highlighted_labels,segment_number_from_idx,highlighted_coordinates,num_entries_1,exploded_highlight_node_order,top,left,width,height,is_outside
880,/html/body/document/type/sequence/filename/des...,. The parties shall take all necessary actions...,881,,,,,,,,,,,,1
881,/html/body/document/type/sequence/filename/des...,ARTICLE III,882,/html/body/document/type/sequence/filename/des...,ARTICLE III,sn,44.0,"[0.28423166623477586, 0.4725097222222222, 0.05...",1.0,52.0,0.284232,0.47251,0.05498,0.000389,0
882,/html/body/document/type/sequence/filename/des...,MERGER CONSIDERATION; EXCHANGE PROCEDURES,883,/html/body/document/type/sequence/filename/des...,MERGER CONSIDERATION; EXCHANGE PROCEDURES,st,45.0,"[0.284633324695517, 0.37625833333333336, 0.247...",1.0,53.0,0.284633,0.376258,0.247477,0.000389,0
883,/html/body/document/type/sequence/filename/des...,Section 3.1,884,/html/body/document/type/sequence/filename/des...,Section 3.1,ssn,46.0,"[0.2854366416169992, 0.07888472222222222, 0.04...",1.0,54.0,0.285437,0.078885,0.041656,0.000389,0
884,/html/body/document/type/sequence/filename/des...,Effect of the Merger on Capital Stock,885,/html/body/document/type/sequence/filename/des...,Effect of the Merger on Capital Stock,sst,146.0,"[0.2854366416169992, 0.1274736111111111, 0.144...",2.0,56.0,0.285437,0.127474,0.14414,0.000389,0
885,/html/body/document/type/sequence/filename/des...,.,886,,,,,,,,,,,,1
886,/html/body/document/type/sequence/filename/des...,"(a) At the Effective Time, by virtue of the...",887,,,,,,,,,,,,1
887,/html/body/document/type/sequence/filename/des...,(i),888,/html/body/document/type/sequence/filename/des...,(i),sssn,204.0,"[0.2870432754599637, 0.12888472222222222, 0.00...",1.0,58.0,0.287043,0.128885,0.00874,0.000389,0
888,/html/body/document/type/sequence/filename/des...,Merger Sub Common Stock,889,/html/body/document/type/sequence/filename/des...,Merger Sub Common Stock,ssst,205.0,"[0.2870432754599637, 0.14816597222222222, 0.10...",1.0,59.0,0.287043,0.148166,0.107253,0.000389,0
889,/html/body/document/type/sequence/filename/des...,". Each share of common stock, par value $0.01 ...",890,,,,,,,,,,,,1


In [952]:
# merged_ordered[merged_ordered['all_nodes_ordering'] == merged_ordered.index]



In [953]:
# merged_ordered[merged_ordered['all_nodes_ordering'] > merged_ordered.index]

Okay, I see what happened. At index 737 of the all-nodes ordering, the join somehow produced a double repeat. All examples of that node in the left dataframe were deleted because 2 highlight nodes before it had matched with it. Not sure why, though.

The (c) node and what followed it had already been matched, but then it and one of its successors were matched again. I think there could be some kind of check: if the highlight node was already matched and removing the current row would destroy the all-nodes order, just don't delete the row...

Seems weird but maybe it would work. 

Maybe another method could be tracking the start coordinates of the text in each node and then somehow getting a stricter order on that...

Or maybe track which character offset from the node's beginning a highlight belongs to. Then, if there is overlap in the interval, we merge; if not, we move on. Do the same for the all-nodes table as well.

Let's do the BIES tagging now

In [954]:
def tag_bies_for_highlights(merged: pd.DataFrame) -> pd.DataFrame:
    '''
    Add a 'tagged_sequence' column with one tag per row of ``merged``.

    Tags, based on 'num_entries_1' (the size of the highlight group the row
    belongs to) and 'highlighted_labels':
      * 'o'          - 'num_entries_1' is NaN (row has no highlight),
      * 's_<label>'  - group of size 1,
      * 'b_<label>'  - first row of a larger group,
      * 'i_<label>'  - a middle row of a larger group,
      * 'e_<label>'  - the row that completes a larger group.

    Parameters
    ----------
    merged : pd.DataFrame
        Needs the columns 'num_entries_1' and 'highlighted_labels'. The new
        column is written into this object (it is modified in place).

    Returns
    -------
    pd.DataFrame
        The same ``merged`` object, now carrying 'tagged_sequence'.

    Raises
    ------
    ValueError
        If a row's group size is smaller than the number of rows already
        counted for the running group, so no tag can be assigned.
    '''
    tags = []
    # Position (1-based) that the next row of a multi-entry group would take.
    # NOTE(review): count is only reset by an s_/e_ tag. 'o' rows leave it
    # untouched and the group identity is never compared, so a group whose
    # rows are not all present carries its position into the next
    # multi-entry group - this may explain unexpected b_/e_ tags.
    count = 1
    for row_label, row in merged.iterrows():
        list_entry_count = row.num_entries_1

        # Non highlighted row
        if math.isnan(list_entry_count):
            tags.append('o')

        # Single highlighted node
        elif list_entry_count == 1:
            tags.append(f's_{row.highlighted_labels}')
            count = 1

        # Last entry in group greater than size 1
        elif count == list_entry_count:
            tags.append(f'e_{row.highlighted_labels}')
            count = 1

        # First ('b') or middle ('i') entry of a group greater than size 1
        elif count < list_entry_count:
            prefix = 'b' if count == 1 else 'i'
            tags.append(f'{prefix}_{row.highlighted_labels}')
            count += 1

        # count > list_entry_count: fail here with context instead of
        # silently skipping the row and failing later on a length mismatch.
        else:
            raise ValueError(
                f'Cannot tag row {row_label!r}: it is entry {count} of the running '
                f'group but its group size is {list_entry_count}.'
            )

    merged['tagged_sequence'] = tags
    return merged

In [995]:
merged_tagged = tag_bies_for_highlights(merged)
# exploded_highlight_df[:30]

In [994]:
# merged_tagged[merged_tagged['is_outside'] < 1][:30]
# exploded_highlight_df

In [1055]:
def test_tag_bies_for_highlights(df, h_df):
    # Check that count of bies does not exceed number of highlighted rows
    temp = df.reset_index().drop(columns=['index'])
    n_o = temp[temp['is_outside'] < 1]

    assert len(n_o) <= len(h_df)
    
    # Check that b,e and b,i,e rules are not violated
    b_tags = temp[temp.tagged_sequence.str.contains('b_')].index
    i_tags = temp[temp.tagged_sequence.str.contains('i_')].index
    e_tags = temp[temp.tagged_sequence.str.contains('e_')].index

    for b_tag in b_tags:
        assert 's_' in df.iloc[b_tag - 1].tagged_sequence \
        or 'o' in df.iloc[b_tag - 1].tagged_sequence  \
        or 'e_' in df.iloc[b_tag - 1].tagged_sequence
        
        # FAILING HERE -> b followed by s tag
        # assert 'e_' in df.iloc[b_tag + 1].tagged_sequence \
        # or 'i_' in df.iloc[b_tag + 1].tagged_sequence
        
    for e_tag in e_tags:
        # FAILING HERE -> e following s tag
        # assert 'i_' in df.iloc[b_tag - 1].tagged_sequence \
        # or 'b_' in df.iloc[b_tag - 1].tagged_sequence
        
        assert 's_' in df.iloc[b_tag + 1].tagged_sequence \
        or 'o' in df.iloc[b_tag + 1].tagged_sequence \
        or 'b_' in df.iloc[b_tag + 1].tagged_sequence
        
    for i_tag in i_tags:
        assert 'b_' in df.iloc[i_tag - 1].tagged_sequence or 'i_' in df.iloc[i_tag - 1].tagged_sequence
        assert 'e_' in df.iloc[i_tag + 1].tagged_sequence or 'i_' in df.iloc[i_tag + 1].tagged_sequence
    print('ALL CHECKS PASSED')
    

In [1056]:
test_tag_bies_for_highlights(merged_tagged, exploded_highlight_df)

ALL CHECKS PASSED


In [965]:
# merged_tagged[merged_tagged['is_outside'] < 1]['tagged_sequence'].tolist(), merged_tagged[merged_tagged['is_outside'] < 1]['text'].tolist()

In [None]:
merged_tagged.to_csv('overlabeled_labeled.csv')

In [1082]:
merged_tagged.highlighted_coordinates

0       NaN
1       NaN
2       NaN
3       NaN
5       NaN
       ... 
2371    NaN
2372    NaN
2373    NaN
2374    NaN
2375    NaN
Name: highlighted_coordinates, Length: 2375, dtype: object

I think I need to modify this algorithm. The overlabeled file that I just tried somehow labeled index 245 as b_n, but I think it is a single highlighted node. It looks like the master nodes might be traversed in a different order; I'm not sure — I did label pretty wildly — but I still don't see how that happens.

In [1066]:
tagged = merged_tagged[merged_tagged.is_outside < 1]
tagged.highlighted_coordinates

Unnamed 0      [0.0036537963202902305, 0.4156520833333333, 0....
Unnamed 1      [0.00445711324177248, 0.46365555555555554, 0.0...
Unnamed 2      [0.005260430163254729, 0.4809680555555556, 0.0...
Unnamed 3      [0.006063747084736978, 0.43030625, 0.139388194...
Unnamed 4      [0.006867064006219227, 0.48996874999999995, 0....
                                     ...                        
Unnamed 329    [0.9784322363306556, 0.13619791666666667, 0.03...
Unnamed 330    [0.98003887017362, 0.4953666666666666, 0.00926...
Unnamed 331    [0.9816429126716766, 0.07888472222222222, 0.04...
Unnamed 332    [0.9816429126716766, 0.13508541666666665, 0.11...
Unnamed 333    [0.9844545218968644, 0.4953666666666666, 0.009...
Name: highlighted_coordinates, Length: 334, dtype: object

In [1102]:
# # Some issue occurring with the calculate_iou function.
# # Many IOU values are very high, almost 0.9ish. 
# # This is likely a technical logic issue as the visualizer 
# # gets the coordinates correctly.
# def calculate_iou(box1, box2):
#     # Calculate the coordinates of the intersection rectangle
#     # print('x1', box1[0], box2[0])
#     # print('y1', box1[1], box2[1])
#     # print('x2', box1[2], box2[2])
#     # print('y2', box1[3], box2[3])
#     x1 = max(box1[0], box2[0])
#     y1 = max(box1[1], box2[1])
#     x2 = min(box1[2], box2[2])
#     y2 = min(box1[3], box2[3])

#     # Calculate the area of intersection rectangle
#     intersection_area = max(0, x2 - x1 + 1) * max(0, y2 - y1 + 1)
    
#     # Calculate the areas of both bounding boxes
#     box1_area = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
#     box2_area = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)

#     # Calculate the union area by subtracting the intersection area from the sum of the box areas
#     union_area = box1_area + box2_area - intersection_area

#     # Calculate the IoU
#     iou = intersection_area / union_area

#     return pd.Series({"iou": 1 - iou})


# def create_consecutive_pairs(series):
#     series_shifted = series.shift(1)
#     series_pairs = pd.concat([series_shifted, series], axis=1)
#     series_pairs.columns = ['previous', 'current']
#     return series_pairs[1:]

# # TODO: Fix IOU 
# def test_highlight_box_order(df):
#     consec_boxes_series = create_consecutive_pairs(df.highlighted_coordinates)
#     ious = consec_boxes_series.apply(lambda row: calculate_iou(row.current, row.previous), axis=1)['iou']
#     print((ious < 0.05).value_counts()) 


In [1117]:
# test_highlight_box_order(tagged)

False    297
True      36
Name: iou, dtype: int64


In [None]:
%%ipytest

def test_

In [None]:
%%ipytest