# Import Modules

In [None]:
import os
import glob
import json
import string
from pathlib import Path

import pandas as pd
from nltk import edit_distance as lev_dist

from tqdm.auto import tqdm

In [None]:
# set pandas viewing options
# max_rows = None removes the row limit, so DataFrames are displayed in full
pd.options.display.max_rows = None

# Outstanding Issues
* The CUAD dataset doesn't have clear page boundaries in the ground truth text files
* If there is a region of bad segmentation (multiple boxes have bad text), it throws the alignment with respect to the GT segments completely off, and all succeeding GT segments go unlabeled. E.g.: DeltathreeInc_19991102_S-1A_EX-10.19_6227850_EX-10.19_Co-Branding Agreement_ Service Agreement with Window Size: 2
    * We need to find a way to advance the window of GT segments we're looking at
* More bbox preds than number of gt segments. Should every bbox be assigned to some gt? Or should every gt be assigned a bbox?

# Define Functions

In [None]:
def read_gt_file(fname, split_patt='\n ', debug=False):
    '''Reads the GT .txt files from CUAD dataset

    The file is decoded as UTF-8 (independent of the platform's default
    encoding), split into text segments on `split_patt` and returned as a
    DataFrame with one row per segment.

    :param fname: str or path-like. Path of the GT .txt file
    :param split_patt: str. Separator the raw text is split on
    :param debug: bool. If True, prints the name of the file being read
    :return: pd.DataFrame with the columns
        'text_segment' (segment text with leading/trailing whitespace stripped),
        'assigned_bbox' and 'assigned_page' (both initialised to None)
    '''
    if debug:
        print(f'fname: {fname}')

    # explicit encoding: without it open() falls back to the locale's encoding,
    # which makes the result platform dependent for non-ASCII text
    with open(fname, 'r', encoding='utf-8') as fh:
        raw_text = fh.read()

    # split text by split pattern
    split_lines = raw_text.split(split_patt)
    split_lines_df = pd.DataFrame({'text_segment': split_lines})

    # create cols and clean text
    split_lines_df['assigned_bbox'] = None
    split_lines_df['assigned_page'] = None
    split_lines_df['text_segment'] = split_lines_df['text_segment'].str.strip()

    return split_lines_df

In [None]:
def match_pred_bbox(pred_row, unassigned_gt_fc, percent_match=0.5, remove_punc=True, lowercase=True):
    '''Matches Pred Text Segment with GT Text Segment

    Computes the edit distance between the text of one predicted bbox and every
    candidate GT segment and selects the closest candidate, provided it is
    close enough (see `percent_match`).

    NOTE: `unassigned_gt_fc` is modified in place. Its 'text_segment' column is
    overwritten with the normalized text and an 'edit_distances' column is
    added. The same object is returned as the last element of the result.

    :param pred_row: pd.Series. One predicted bbox. The keys read are
        'text', 'xmin', 'ymin', 'xmax', 'ymax' and 'page'
    :param unassigned_gt_fc: pd.DataFrame. Candidate GT segments; must have a
        'text_segment' column of strings
    :param percent_match: float. The bbox is left unmatched when the smallest
        edit distance is >= int(percent_match * len(normalized pred text))
    :param remove_punc: bool. Drop every non-alphanumeric char (this includes
        whitespace) from both texts before comparing
    :param lowercase: bool. Lowercase both texts before comparing
    :return: tuple (match_idx, coords, page, unassigned_gt_fc). match_idx is the
        index label of the matched GT segment, coords is a JSON string with the
        keys xmin/ymin/xmax/ymax and page is pred_row['page']. The first three
        are None when no candidate is close enough or there are no candidates.
    '''
    # nothing to compare against: idxmin() below would raise on an empty frame
    if unassigned_gt_fc.empty:
        return None, None, None, unassigned_gt_fc

    pred_seg = pred_row.loc['text']

    # a missing text value (e.g. NaN) is not a string and would crash the
    # normalization below; treat it as empty text, which can never match
    if not isinstance(pred_seg, str):
        pred_seg = '' if pd.isna(pred_seg) else str(pred_seg)

    if remove_punc:
        pred_seg = ''.join(char for char in pred_seg if char.isalnum())
        unassigned_gt_fc['text_segment'] = unassigned_gt_fc['text_segment'].apply(
            lambda gt_text: ''.join(char for char in gt_text if char.isalnum()))

    if lowercase:
        pred_seg = pred_seg.lower()
        unassigned_gt_fc['text_segment'] = unassigned_gt_fc['text_segment'].str.lower()

    # compute edit distances between the pred text and every candidate gt text segment
    unassigned_gt_fc['edit_distances'] = unassigned_gt_fc['text_segment'].apply(
        lambda gt_text: lev_dist(gt_text, pred_seg))

    # if the smallest edit distance reaches the threshold then ignore the bbox
    # NOTE(review): because the comparison is `>=`, a threshold of 0 (very short
    # pred text) rejects even an exact match -- confirm this is intended
    match_threshold = int(percent_match * len(pred_seg))
    if unassigned_gt_fc['edit_distances'].min() >= match_threshold:
        return None, None, None, unassigned_gt_fc

    match_idx = unassigned_gt_fc['edit_distances'].idxmin()

    # serialize the bbox corners so they fit into a single DataFrame cell
    coords = json.dumps({key: pred_row.loc[key] for key in ('xmin', 'ymin', 'xmax', 'ymax')})

    page = pred_row.loc['page']

    return match_idx, coords, page, unassigned_gt_fc

In [None]:
#TODO: Can this workflow be optimized? Avoid looping through df using df.iterrows()
def process_doc(doc_id, pred_file_contents, gt_file_contents, 
                window_size=2, debug=False, percent_match=0.5,
                remove_punc=True, lowercase=True):
    '''Defines Execution Flow for a Single Doc

    Walks over the predicted bboxes in row order. For each bbox, the first
    `window_size` GT segments that are still unassigned and come after the
    last assigned GT segment are offered to `match_pred_bbox`. On a match the
    bbox coords and page are written into `gt_file_contents` (in place).

    :param doc_id: identifier of the doc, only used in messages
    :param pred_file_contents: pd.DataFrame. Predicted bboxes; the columns read
        here are 'text', 'page' and 'bbId' (plus those read by match_pred_bbox)
    :param gt_file_contents: pd.DataFrame. GT segments with the columns
        'text_segment', 'assigned_bbox' and 'assigned_page'; updated in place
    :param window_size: int. Number of candidate GT segments per bbox
    :param debug: bool. If True, prints/displays intermediate results
    :param percent_match: float. Forwarded to match_pred_bbox
    :param remove_punc: bool. Forwarded to match_pred_bbox
    :param lowercase: bool. Forwarded to match_pred_bbox
    :return: tuple (gt_file_contents, updated_bboxes). updated_bboxes is the
        list of index labels of the rows of `pred_file_contents` that were
        matched to a GT segment
    '''

    # index labels (in pred_file_contents) of the bboxes that were matched to a gt segment
    updated_bboxes = []
    for row_idx, pred_row in tqdm(pred_file_contents.iterrows(), total=len(pred_file_contents), desc='Per Doc Prog Bar'):
        #TODO: Add debug statements to verify this logic
        # get idx of last updated entry and fetch unassigned bboxes after that.
        # If a box was missed earlier and
        # succeeding segments were assigned bboxes, then ignore that
        not_null_cond = gt_file_contents['assigned_bbox'].notnull()
        not_null_gt_file_contents = gt_file_contents.loc[not_null_cond]

        if not_null_gt_file_contents.empty:
            last_assigned_gt_idx = 0
        else:
            last_assigned_gt_idx = not_null_gt_file_contents.index.max()

        # index only those gt text segments that do not have any succeeding entries that have been assigned
        candidate_gt_file_contents = gt_file_contents.loc[last_assigned_gt_idx: ]

        # define cond to fetch just those gt_segments that haven't been assigned yet
        unassigned_gt_cond = candidate_gt_file_contents['assigned_bbox'].isnull()
        unassigned_gt_fc = candidate_gt_file_contents.loc[unassigned_gt_cond]

        # if no gt_segments are remaining then break
        if unassigned_gt_fc.empty:
            print("ALL GT BOXES HAVE BEEN ASSIGNED. EXITING LOOP")

            if debug:
                print(f"Is Candidate GT File Contents Empty: {candidate_gt_file_contents.empty}")
                print(f"Is Unassigned Candidate GT File Contents Empty: {unassigned_gt_fc.empty}")

            break

        # fetch window_size number of gt_text_segments
        unassigned_gt_fc = unassigned_gt_fc.iloc[:window_size]

        # get results from matching; updated_gt_fc carries the 'edit_distances' col
        match_idx, coords, page_num, updated_gt_fc = match_pred_bbox(pred_row, unassigned_gt_fc,
                                                                     percent_match=percent_match,
                                                                     remove_punc=remove_punc, lowercase=lowercase)

        if match_idx is None:
            # no gt segment is assigned for this bbox; move on to the next one
            print(f"\nNO GT TEXT SEGMENT MATCH FOR BBOX, page_id: {pred_row.loc['page']} bbox_id: {pred_row.loc['bbId']}")

        else:
            # record the pred row label (not the gt index): the caller uses this
            # list to flag rows of pred_file_contents
            updated_bboxes.append(row_idx)

            # assign the matched bbox coords to the gt segment
            gt_file_contents.loc[match_idx, 'assigned_bbox'] = coords
            gt_file_contents.loc[match_idx, 'assigned_page'] = page_num

            # check if the match idx is indeed the min arg value of the col
            # NOTE: the threshold printed in the messages below is based on the raw pred
            # text length, whereas match_pred_bbox uses the length of the normalized text
            assert match_idx == updated_gt_fc.loc[:, ['edit_distances']].sort_values(by=['edit_distances'], ascending=True).index[0], \
                    f"INDEX OF MATCHED GT TEXT SEGMENT DOES NOT MATCH THE GT TEXT SEGMENT WITH LEAST EDIT DISTANCE!\n" + \
                    f"doc_id: {doc_id}, pred_text: {pred_row['text']} match_idx: {match_idx}\n" + \
                    f"Min Edit Distance: {updated_gt_fc['edit_distances'].min()} Match Threshold: {int(percent_match * len(pred_row['text']))}\n" + \
                    f"Threshold Cond Check {updated_gt_fc['edit_distances'].min() < int(percent_match * len(pred_row['text']))}\n" + \
                    f"\n{updated_gt_fc}"

            # verify if the update was performed successfully
            assert gt_file_contents.loc[match_idx, 'assigned_bbox'] is not None, f"GT FILE CONTENTS (BBOX) WAS NOT UPDATED SUCCESSFULLY! doc_id={doc_id}"
            assert gt_file_contents.loc[match_idx, 'assigned_page'] is not None, f"GT FILE CONTENTS (PAGE) WAS NOT UPDATED SUCCESSFULLY! doc_id={doc_id}"

        if debug:
            print('\n')
            print("*"* 25)
            if match_idx is not None:
                print(f"SUCCESSFULL MATCH\ndoc_id: {doc_id} page_id: {pred_row.loc['page']} bbox_id: {pred_row.loc['bbId']}\n")
                print(f'match_idx: {match_idx}')
                print(f"pred_text: {pred_row.loc['text']}\n\nLength of Pred Text: {len(pred_row.loc['text'])}")
            else:
                print(f"\nNO GT TEXT SEGMENT MATCH FOR BBOX, page_id: {pred_row.loc['page']} bbox_id: {pred_row.loc['bbId']}")
                # read the distances from the frame returned by match_pred_bbox
                # instead of relying on it having mutated our local frame
                print(
                      f"doc_id: {doc_id}\npred_text: {pred_row['text']}\n\nLength of Pred Text: {len(pred_row.loc['text'])} match_idx: {match_idx}\n" + \
                      f"Min Edit Distance: {updated_gt_fc['edit_distances'].min()} Match Threshold: {int(percent_match * len(pred_row['text']))}\n" + \
                      f"Threshold Cond Check {updated_gt_fc['edit_distances'].min() < int(percent_match * len(pred_row['text']))}\n"
                        )

            print("Candidate GT Segments With Edit Distances")
            display(updated_gt_fc)

    # return the fully updated gt contents
    return gt_file_contents, updated_bboxes

        

In [None]:
def process_dataset(merged_df, window_size=2, debug=False, percent_match=0.5, remove_punc=True, lowercase=True):
    '''Runs the per-document matching for every row of `merged_df`

    For each row, the pred frame gets an 'is_gt_assigned' column and stripped
    'text', `process_doc` is called, and the rows of the pred frame selected by
    the labels it returns are flagged. In debug mode a summary is shown and the
    loop stops after the first document.

    :param merged_df: pd.DataFrame. Consists of model preds (bboxes + text)
        merged with gt text segments at a document level. The columns read are
        'doc_id', 'file_contents_pred' and 'file_contents_gt'
    :param window_size: int. Forwarded to process_doc
    :param debug: bool. Print intermediate outputs and stop after one doc
    :param percent_match: float. Forwarded to process_doc
    :param remove_punc: bool. Forwarded to process_doc
    :param lowercase: bool. Forwarded to process_doc
    :return: pd.DataFrame. The `merged_df` object that was passed in
    '''

    print(f"RUNNING ON {len(merged_df)} docs!\nUser Args:\n\tWindow Size: {window_size}\n\tPercent Match: {percent_match}\n\tRemove Punc: {remove_punc}\n\tLowercase: {lowercase}\n\tdebug mode: {debug}\n")

    doc_rows = tqdm(merged_df.iterrows(), total=len(merged_df), desc='Dataset Progbar')
    for row_label, doc_row in doc_rows:
        doc_id = doc_row.loc['doc_id']
        if debug:
            print(f"doc_id: {doc_id}")

        gt_segments = doc_row.loc['file_contents_gt']
        pred_boxes = doc_row.loc['file_contents_pred']

        # prepare the pred frame: assignment flag defaults to False, text is stripped
        pred_boxes['is_gt_assigned'] = False
        pred_boxes['text'] = pred_boxes['text'].str.strip()

        merged_df.loc[row_label, "file_contents_pred"].update(pred_boxes)

        # run the matching for this doc
        gt_segments, matched_labels = process_doc(
            doc_id, pred_boxes, gt_segments,
            window_size=window_size, debug=debug,
            percent_match=percent_match,
            remove_punc=remove_punc, lowercase=lowercase,
        )

        # write the gt result back into merged_df
        merged_df.loc[row_label, 'file_contents_gt'].update(gt_segments)

        # flag the pred rows selected by the labels returned from process_doc
        pred_boxes.loc[matched_labels, 'is_gt_assigned'] = True

        # write the pred result back into merged_df
        merged_df.loc[row_label, 'file_contents_pred'].update(pred_boxes)

        # sanity checks: stored frames equal the local ones
        assert merged_df.loc[row_label, 'file_contents_gt'].equals(gt_segments), f"GT FILE CONTENTS WAS NOT UPDATED SUCCESSFULLY! doc_id={doc_id}"
        assert merged_df.loc[row_label, 'file_contents_pred'].equals(pred_boxes), f"PRED FILE CONTENTS WAS NOT UPDATED SUCCESSFULLY! doc_id={doc_id}"

        # sanity check: as many rows flagged as labels returned
        flagged_rows = pred_boxes.loc[pred_boxes['is_gt_assigned']]
        assert len(matched_labels) == len(flagged_rows), f"NUMBER OF UPDATED BBOXES DOESN'T MATCH OUTPUT! doc_id={doc_id}"

        if not debug:
            continue

        # debug mode: show a summary for this doc, then stop after the first one
        print(f"\n\nNumber of bboxes in pred: {len(pred_boxes)}\nNumber of Assigned Pred Bboxes: {len(matched_labels)}")
        print(f"Number of gt segments: {len(gt_segments)}\nNumber of Assigned Gt Segments: {gt_segments['assigned_bbox'].notnull().sum()}")

        print("\n\nupdated gt_file_contents")
        display(gt_segments)

        print("\n\nupdated pred_file_contents")
        display(pred_boxes)
        print("*" * 25)

        break

    return merged_df

# Define User Params

### Define data params

In [None]:
# directories searched recursively for the ground-truth .txt and PDF files
gt_txt_data_dir = "../../CUAD_v1/full_contract_txt/"
gt_pdf_data_dir = "../../CUAD_v1/full_contract_pdf/"

# directory searched recursively for the prediction .csv files (one per doc)
pred_data_dir = "../../20220925_doctr_initial_csvs/"

### Define Algo Params

In [None]:
window_size = 10 # number of unassigned gt_text segments offered as candidates for each pred bbox
debug = True # run in debug mode: prints intermediate outputs at each step; process_dataset stops after the first doc
percent_match = 0.5 # a bbox is left unassigned when its smallest edit distance is >= int(percent_match * length of the normalized pred text)
remove_punc = True  # remove every non-alphanumeric char (spaces, special chars, punctuation) from both strings before computing edit distance
lowercase = True # convert both strings to lowercase before computing edit distance


pdf_exts = ['.pdf', '.PDF'] # file extensions globbed when collecting the gt PDF files

# Main Execution

In [None]:
# read the ground truth docs and parse into required structure
# The txt and pdf lists are paired by position below, so both are sorted by file
# stem (full path as tie-breaker). Sorting by full path would order the files by
# directory first and mis-pair them when the two trees are nested differently.
gt_txt_filepaths = sorted(
    {fpath.resolve() for fpath in Path(gt_txt_data_dir).rglob("*.txt")},
    key=lambda fpath: (fpath.stem, str(fpath)),
)

# a set drops duplicates: where glob matching is case-insensitive, the patterns
# for '.pdf' and '.PDF' can return the same file twice
gt_pdf_filepaths = sorted(
    {fpath.resolve() for ext in pdf_exts for fpath in Path(gt_pdf_data_dir).rglob(f"*{ext}")},
    key=lambda fpath: (fpath.stem, str(fpath)),
)

assert len(gt_txt_filepaths) == len(gt_pdf_filepaths), "NUMBER OF GT TXT FILES DOES NOT MATCH NUMBER OF GT PDF FILES"

# surface (without failing) any row whose txt and pdf file names disagree
if any(txt_path.stem != pdf_path.stem for txt_path, pdf_path in zip(gt_txt_filepaths, gt_pdf_filepaths)):
    print("WARNING: SOME GT TXT/PDF PAIRS HAVE DIFFERENT FILE STEMS; 'pdf_filepath' MAY NOT BELONG TO THE SAME DOC")

gt_df = pd.DataFrame({
                        'txt_filepath': gt_txt_filepaths,
                        'pdf_filepath': gt_pdf_filepaths,
                     })

# create doc_id col (file name without its last extension; must be derived
# the same way as the doc_id of the pred files, since the two are joined on it)
gt_df['doc_id'] = gt_df['txt_filepath'].apply(lambda filepath: os.path.basename(filepath).rsplit('.', 1)[0])

# create col containing DF of all gt text segments
gt_df['file_contents'] = gt_df['txt_filepath'].apply(read_gt_file)

print(f"Length of GT DF: {len(gt_df)}")

In [None]:
# build pred_df: one row per prediction CSV found (recursively) under pred_data_dir
pred_df = pd.DataFrame({
    'filepath': [csv_path.resolve() for csv_path in Path(pred_data_dir).rglob("*.csv")],
})

# doc_id = file name with its last extension removed
pred_df['doc_id'] = pred_df['filepath'].apply(
    lambda csv_path: os.path.basename(csv_path).rsplit('.', 1)[0]
)

# each cell of file_contents holds the parsed CSV of that doc (first CSV column used as index)
pred_df['file_contents'] = pred_df['filepath'].apply(
    lambda csv_path: pd.read_csv(csv_path, index_col=0)
)

print(f"Length of PRED DF: {len(pred_df)}")

In [None]:
# inner join of pred and gt on doc_id: only docs present in both are kept;
# columns that exist in both frames get the '_pred' / '_gt' suffix
merged_df = pd.merge(
    pred_df,
    gt_df,
    how='inner',
    left_on='doc_id',
    right_on='doc_id',
    suffixes=['_pred', '_gt'],
)

# merged_df = merged_df.sample(frac=1.0)
print(f"Length of MERGED DF: {len(merged_df)}")

### Display sample of each df 

In [None]:
# preview the first two rows of the ground-truth frame
gt_df.head(2)

In [None]:
# preview the first two rows of the prediction frame
pred_df.head(2)

In [None]:
# preview the first two rows of the joined (pred + gt) frame
merged_df.head(2)

## Run Process on Dataset

In [None]:
# run the matching over merged_df with the user params defined above
# NOTE: process_dataset returns the merged_df object it was given, so results_df
# and merged_df refer to the same frame; with debug=True it stops after the first doc
results_df = process_dataset(merged_df, window_size=window_size, debug=debug, percent_match=percent_match, remove_punc=remove_punc, lowercase=lowercase)

# Visualize Results

In [None]:
# sample random doc or let user choose
# take gt results for the doc and draw bboxes on the pdf doc

In [None]:
# scratch: sample text used to try out punctuation stripping
a = 'from this Exhibit 10.8 \nfiled with the Securi...'

In [None]:
# scratch: translation table that deletes every character in string.punctuation
xlation_table = str.maketrans('', '', string.punctuation)

In [None]:
# scratch: apply the table to the sample text (whitespace and newlines are kept)
a.translate(xlation_table)