In [1]:
import sys
import os
sys.path.append("../doctr/")
# from doctr.models import ocr_predictor
# from doctr.io import DocumentFile
import json
import pickle
import pandas as pd
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re
import cv2
import PIL
import math
from pdf2image import convert_from_path, convert_from_bytes

In [2]:
def get_file_by_dict(json_output):
    '''
    Flatten the OCR json (pages -> blocks -> lines -> words) into parallel,
    per-line lists.

    Every OCR line contributes one entry to each of:
      * 'words'           - the word strings of the line
      * 'bboxes'          - the word geometries of the line
      * 'ymin_ymax'       - (ymin, ymax) taken from the line geometry
      * 'page_id'         - the 'page_idx' of the page the line is on
      * 'page_dimensions' - the 'dimensions' of that page
    'line_merge_next' is created but never filled here.
    '''
    words, bboxes, y_spans, page_ids, page_dims = [], [], [], [], []

    for page in json_output['pages']:
        for block in page['blocks']:
            for ocr_line in block['lines']:
                # geometry is ((xmin, ymin), (xmax, ymax)); only the vertical extent is kept
                (_, top), (_, bottom) = ocr_line['geometry']

                words.append([word['value'] for word in ocr_line['words']])
                bboxes.append([word['geometry'] for word in ocr_line['words']])

                y_spans.append((top, bottom))
                page_ids.append(page['page_idx'])
                page_dims.append(page['dimensions'])

    return {'words': words,
            'ymin_ymax': y_spans,
            'page_id': page_ids,
            'page_dimensions': page_dims,
            'line_merge_next': [],
            'bboxes': bboxes}

def getIOU(segment1, segment2, threshold):
    '''
    Decide whether two lines overlap enough vertically to be treated as one line.

    The OCR model does not always put words of the same visual line into the
    same block, so lines from different blocks are compared by the
    intersection-over-union of their vertical extents.

    Parameters
    ----------
    segment1, segment2 : (ymin, ymax)
        Vertical extents of the two lines.
    threshold : float
        Minimum intersection/union ratio (exclusive) for the lines to merge.

    Returns
    -------
    bool
        True when intersection/union is strictly greater than `threshold`.
        False when the segments do not overlap, or when both segments are
        degenerate (zero height) so that the ratio is undefined.
    '''
    ymin1, ymax1 = segment1
    ymin2, ymax2 = segment2

    intersection = min(ymax1, ymax2) - max(ymin1, ymin2)

    # no overlap
    if intersection < 0:
        return False

    union = max(ymax1, ymax2) - min(ymin1, ymin2)

    # Two identical zero-height segments give 0 / 0; treat them as "do not
    # merge" instead of raising ZeroDivisionError.
    if union <= 0:
        return False

    return bool(intersection / union > threshold)

def get_lines_to_merge(file_as_dict, threshold):
    '''
    Return every index pair (i, j), i < j, of lines that sit on the same page
    and whose vertical extents pass the getIOU test for `threshold`.
    '''
    line_count = len(file_as_dict['words'])
    merge_pairs = []

    for upper in range(line_count):
        for lower in range(upper + 1, line_count):
            # lines on different pages can never be the same visual line
            if file_as_dict['page_id'][upper] != file_as_dict['page_id'][lower]:
                continue
            if getIOU(file_as_dict['ymin_ymax'][upper], file_as_dict['ymin_ymax'][lower], threshold):
                merge_pairs.append((upper, lower))

    return merge_pairs

def line_merging(lines_to_merge_indices):
    '''
    Group pairwise merge candidates into merged lines.

    Parameters
    ----------
    lines_to_merge_indices : iterable of (i, j)
        Index pairs of lines that should be merged (get_lines_to_merge emits
        them with i < j, ordered by i).

    Returns
    -------
    lines : dict
        Maps the first index of each merged group to the list of line indices
        that make up the group (the first index included).
    follows : dict
        Maps every index that appeared as `j` to a list of the `i` values it
        was paired with.
    '''
    follows = dict()
    lines = dict()
    for i, j in lines_to_merge_indices:

        if i not in lines:
            if i not in follows:
                # i has not been seen in any role yet: it starts a new group
                lines[i] = [i, j]
                # NOTE: this replaces any predecessor list j already had, and the
                # bookkeeping at the end of the loop body appends i once more.
                # Kept as-is so the returned `follows` is unchanged.
                follows[j] = [i]
            else:
                # i already follows another line: walk back to the start of its chain
                one_link_back = follows[i][0]
                while one_link_back in follows:
                    one_link_back = follows[one_link_back][0]
                # Append to the group the membership test was made against.
                # Indexing `lines` with the direct predecessor raised KeyError
                # whenever that predecessor was itself a follower
                # (e.g. pairs (0, 1), (1, 2), (2, 3)).
                if j not in lines[one_link_back]:
                    lines[one_link_back].append(j)
        else:
            lines[i].append(j)

        if j not in follows:
            follows[j] = [i]
        else:
            follows[j].append(i)
    return lines, follows

In [3]:
def final_file_line_by_line(file_as_dict, threshold):
    '''
    Collapse OCR lines that belong to the same visual line into one entry.

    Lines that head a merge group are emitted once with the words and word
    bboxes of every member concatenated, and with the min ymin / max ymax of
    the group. Lines that only follow another line are dropped (they are
    already part of a group). All other lines are passed through unchanged.

    Returns a dict of parallel lists: 'full_line', 'page_id', 'ymax_max',
    'ymin_min', 'bboxes'.
    '''
    merge_pairs = get_lines_to_merge(file_as_dict, threshold)
    groups, followers = line_merging(merge_pairs)

    full_lines, page_ids, tops, bottoms, line_bboxes = [], [], [], [], []

    for idx in range(len(file_as_dict['words'])):
        if idx in groups:
            members = groups[idx]
            merged_words, merged_boxes = [], []
            for member in members:
                merged_words.extend(file_as_dict['words'][member])
                merged_boxes.extend(file_as_dict['bboxes'][member])

            full_lines.append(merged_words)
            line_bboxes.append(merged_boxes)
            # vertical extent of the merged line spans all of its members
            tops.append(min(file_as_dict['ymin_ymax'][member][0] for member in members))
            bottoms.append(max(file_as_dict['ymin_ymax'][member][1] for member in members))
            page_ids.append(file_as_dict['page_id'][idx])

        elif idx not in followers:
            # stand-alone line: copied through as-is
            full_lines.append(file_as_dict['words'][idx])
            line_bboxes.append(file_as_dict['bboxes'][idx])
            page_ids.append(file_as_dict['page_id'][idx])
            tops.append(file_as_dict['ymin_ymax'][idx][0])
            bottoms.append(file_as_dict['ymin_ymax'][idx][1])

    print(len(full_lines), len(line_bboxes))
    return {'full_line': full_lines,
            'page_id': page_ids,
            'ymax_max': bottoms,
            'ymin_min': tops,
            'bboxes': line_bboxes}

def get_toc_page(preprocessed_output):
    '''
    Return the page id of the first line whose lower-cased text looks like a
    "table of contents" heading, or None when no line does.
    '''
    regex_exp = r"(table of contents|tableof(?:contents)?|(?:table\s)?of*conten|contents?)"
    toc_pattern = re.compile(regex_exp)

    pages_and_lines = zip(preprocessed_output['page_id'], preprocessed_output['full_line'])
    return next(
        (page_id for page_id, words in pages_and_lines
         if toc_pattern.search(" ".join(words).lower())),
        None,
    )

def _title_matches_text(title, text, subset_match_threshold,
                        line_len_match_threshold, beg_line_match_threshold):
    '''
    Shared fuzzy test used by match_line for both the one-line and two-line case.

    `text` matches `title` when all of the following hold:
      * partial_ratio(title, text) is above `subset_match_threshold`
      * text is at least `line_len_match_threshold` times as long as title
      * partial_ratio(title, first 2*len(title) chars of text) is above
        `beg_line_match_threshold`
      * the first two words of title and of text have a partial_ratio above
        `subset_match_threshold` (design choice: the beginnings must agree)
    '''
    title_lower = title.lower()
    beg_text = text[0: len(title) * 2]

    if not (fuzz.partial_ratio(title_lower, text.lower()) > subset_match_threshold and
            len(text) >= len(title) * line_len_match_threshold and
            fuzz.partial_ratio(title_lower, beg_text.lower()) > beg_line_match_threshold):
        return False

    title_beg = " ".join(title.split()[:2])
    text_beg = " ".join(text.split()[:2])
    return fuzz.partial_ratio(title_beg.lower(), text_beg.lower()) > subset_match_threshold


def match_line(section_info, preprocessed_output, idx1, idx2,
               subset_match_threshold, line_len_match_threshold,
               beg_line_match_threshold, first_line_match_threshold, toc_page):
    '''
    Try to match one TOC section title against one line (idx2 is None) or
    against two consecutive lines joined together (both indices given).

    Parameters
    ----------
    section_info : sequence
        section_info[0] is the section title to look for.
    preprocessed_output : dict
        Parallel lists 'full_line', 'page_id', 'ymin_min', 'ymax_max', 'bboxes'.
    idx1 : int
        Index of the (first) line to test.
    idx2 : int or None
        Index of the second line for a two-line match, or None.
    subset_match_threshold, line_len_match_threshold, beg_line_match_threshold :
        Thresholds described in _title_matches_text.
    first_line_match_threshold :
        Two-line case only: minimum partial_ratio between the title and the
        first line alone.
    toc_page :
        Page id of the table of contents; lines on that page never match.

    Returns
    -------
    (match, next_index)
        On success `match` is
        (line1, line2_or_None, title, ymin, ymax, page_id, bboxes) and
        `next_index` is the index just after the last matched line. For a
        two-line match `bboxes` is [bboxes_of_line1, bboxes_of_line2].
        On failure both values are None.
    '''
    page_ids = preprocessed_output['page_id']

    if page_ids[idx1] == toc_page:
        return None, None

    title = section_info[0]
    line1 = " ".join(preprocessed_output['full_line'][idx1])

    if idx2 is not None:
        if page_ids[idx2] == toc_page:
            return None, None
        if page_ids[idx1] != page_ids[idx2]:
            return None, None

        ## 2 line merge
        multi_line = " ".join(preprocessed_output['full_line'][idx1] + preprocessed_output['full_line'][idx2])

        if (_title_matches_text(title, multi_line, subset_match_threshold,
                                line_len_match_threshold, beg_line_match_threshold) and
                fuzz.partial_ratio(title.lower(), line1.lower()) > first_line_match_threshold):
            ymin = preprocessed_output['ymin_min'][idx1]  # min of first line
            ymax = preprocessed_output['ymax_max'][idx2]  # max of second line
            bboxes = [preprocessed_output['bboxes'][idx1], preprocessed_output['bboxes'][idx2]]
            line2 = " ".join(preprocessed_output['full_line'][idx2])

            return (line1, line2, title, ymin, ymax, page_ids[idx1], bboxes), idx2 + 1

        return None, None

    if _title_matches_text(title, line1, subset_match_threshold,
                           line_len_match_threshold, beg_line_match_threshold):
        ymin = preprocessed_output['ymin_min'][idx1]
        ymax = preprocessed_output['ymax_max'][idx1]
        bboxes = preprocessed_output['bboxes'][idx1]

        return (line1, None, title, ymin, ymax, page_ids[idx1], bboxes), idx1 + 1

    return None, None
    
def find_start_new(section_info, preprocessed_output,
                   subset_match_threshold, line_len_match_threshold,
                   beg_line_match_threshold, first_line_match_threshold, 
                   last_line_pointer):
    '''
    Given a toc section title, scan the document lines forward from
    `last_line_pointer` (the line after the previous section's match) and
    return the first match.

    For every pair of consecutive lines (idx1, idx2) the candidates are tried
    in this order: idx1 alone, idx2 alone, idx1 and idx2 joined.

    Returns
    -------
    (match, next_pointer)
        Whatever match_line returned for the first successful candidate, or
        (None, last_line_pointer) when nothing matched.
    '''
    line_count = len(preprocessed_output['full_line'])

    if last_line_pointer == line_count:
        return None, last_line_pointer

    toc_page = get_toc_page(preprocessed_output)

    for idx1 in range(last_line_pointer, line_count):
        idx2 = idx1 + 1

        if idx2 < line_count:
            # Testing idx2 alone repeats work done on the next iteration, but
            # it fixes the priority order between candidates, so it is kept.
            candidates = [(idx1, None), (idx2, None), (idx1, idx2)]
        elif idx1 == last_line_pointer:
            # Only one line is left. Zipping the two index ranges yielded no
            # pair here, so this last line was silently never tested.
            candidates = [(idx1, None)]
        else:
            # The final line was already tested as idx2 of the previous pair.
            break

        for first, second in candidates:
            match = match_line(section_info, preprocessed_output, first, second,
                               subset_match_threshold, line_len_match_threshold,
                               beg_line_match_threshold, first_line_match_threshold,
                               toc_page)
            if match[0]:
                return match
        ## if no match, move onto the next pair of lines

    return None, last_line_pointer

def get_starts_all(section_dict, preprocessed_output):
    '''
    Locate every TOC section of `section_dict` in the document, in order.

    Sections are searched one after another; each search resumes from the
    line after the previous match. Sections that cannot be located are
    reported on stdout and skipped.

    Returns the list of matches produced by find_start_new.
    '''
    # Design decision to only allow subset match ratios of > 80/100
    subset_match_threshold = 80
    # Design decision to potentially only match document lines
    # that are not much smaller than table of content label
    line_len_match_threshold = 0.8
    beg_line_match_threshold = 80
    # in case of merged lines lets make sure top line is at least mildly relevant
    first_line_match_threshold = 20

    next_line = 0
    matched_sections = []

    for section_info in section_dict.values():
        hit, next_line = find_start_new(section_info, preprocessed_output,
                                        subset_match_threshold, line_len_match_threshold,
                                        beg_line_match_threshold, first_line_match_threshold,
                                        next_line)
        if not hit:
            print(f"Couldn't match {section_info[0]} with a line. Moving onto next TOC section")
            continue
        matched_sections.append(hit)

    return matched_sections

def flatten_contract_dict(nested_dict):
    '''
    Flatten a two-level section dictionary into {1: (title, {}), 2: ...}.

    Each value of `nested_dict` is (title, sub_sections) where sub_sections
    has the same value layout. Sections are numbered from 1 in document
    order, each top-level title immediately followed by its sub-sections.
    '''
    ordered_titles = []
    for section in nested_dict.values():
        ordered_titles.append(section[0])
        ordered_titles.extend(sub_section[0] for sub_section in section[1].values())

    return {position: (title, {}) for position, title in enumerate(ordered_titles, start=1)}

In [4]:
def _union_bbox(word_bboxes):
    '''
    Smallest [[x_min, y_min], [x_max, y_max]] box enclosing every word box in
    `word_bboxes` (each box is ((x_left, y_top), (x_right, y_bottom))).
    Returns None when there are no boxes.
    '''
    if not word_bboxes:
        return None

    x_min = min(box[0][0] for box in word_bboxes)
    y_min = min(box[0][-1] for box in word_bboxes)
    x_max = max(box[1][0] for box in word_bboxes)
    y_max = max(box[1][-1] for box in word_bboxes)

    return [[x_min, y_min], [x_max, y_max]]


def merge_bboxes(line1_words, line2_words, query, bboxes, window_size, match_score):
    '''
    Find the run of words that best matches `query` and return one bbox around it.

    The words are scanned in consecutive, non-overlapping windows of
    `window_size` words. The first window whose token_set_ratio against
    `query` reaches `match_score` wins; if none does, the best scoring window
    (earliest on ties) is used.

    Parameters
    ----------
    line1_words, line2_words : str or list of str (line2_words may be None)
        Words of the matched line(s); strings are split on whitespace.
    query : str
        Section title to look for.
    bboxes : list
        One line: the word bboxes of that line. Two lines:
        [bboxes_of_line1, bboxes_of_line2]; in that case the merged box always
        covers all words of both lines.
    window_size : int
        Number of words per window; values below 1 are treated as 1.
    match_score : number
        Score at which a window is accepted immediately.

    Returns
    -------
    (merged_bbox, window_text)
        merged_bbox is [[x_min, y_min], [x_max, y_max]], or None when there is
        no bbox to merge. Both are None when there are no words at all.
    '''
    if isinstance(line1_words, str):
        line1_words = line1_words.split()

    if isinstance(line2_words, str):
        line2_words = line2_words.split()

    if line2_words:
        full_line_words = line1_words + line2_words
    else:
        full_line_words = line1_words

    # range() rejects a step of 0 (e.g. a query without any word)
    step = max(window_size, 1)

    # Start below any possible score so the first window is always a valid
    # fallback, even when every window scores 0.
    best_score, best_text, best_bboxes = -1, None, None

    for start_idx in range(0, len(full_line_words), step):
        window_text = " ".join(full_line_words[start_idx : start_idx + step])

        window_score = process.extractBests(window_text, [query], scorer=fuzz.token_set_ratio)[0][-1]

        if line2_words:
            candidate_bboxes = bboxes[0] + bboxes[1]
        else:
            candidate_bboxes = bboxes[start_idx : start_idx + step]

        if window_score >= match_score:
            return _union_bbox(candidate_bboxes), window_text

        if window_score > best_score:
            best_score = window_score
            best_text = window_text
            best_bboxes = candidate_bboxes

    # in case the window_text has a lower match score than with the entire string, match with the
    # segment with highest matching score
    if best_bboxes is None:
        # no words at all: nothing to merge
        return None, None

    return _union_bbox(best_bboxes), best_text

def extract_exact_match(row):
    '''
    For one matched section (a row of the matching DataFrame) find the words
    that correspond to the section title and the bbox around them.

    Reads 'Section Title via HTML', 'Line via OCR', 'Line2 via OCR' and
    'bboxes' from `row`.

    Returns
    -------
    (merged_bbox, exact_match_text) as returned by merge_bboxes, or
    (None, None) when the row has no section title.
    '''
    query = row['Section Title via HTML']
    if query is None:
        return None, None

    line1 = row['Line via OCR']
    line2 = row['Line2 via OCR']

    # Join with a space: plain concatenation glued the last word of line 1 to
    # the first word of line 2, which distorts the token based score below.
    full_line = line1 if line2 is None else f"{line1} {line2}"

    match = process.extractBests(full_line, [query], scorer=fuzz.token_set_ratio)

    # the only choice offered is the query itself, so match_text is the query
    match_text, match_score = match[0]
    window_size = len(match_text.split())

    merged_bbox, exact_match_text = merge_bboxes(line1,
                                                 line2,
                                                 query,
                                                 row['bboxes'],
                                                 window_size,
                                                 match_score)

    return merged_bbox, exact_match_text

In [5]:
def make_images(poppler_path, parent_path, pdf_path):
    '''
    Convert every page of a pdf into a jpeg inside its own output folder.

    Parameters
    ----------
    poppler_path : str
        Folder holding the poppler binaries used by pdf2image.
    parent_path : str
        Folder under which the per-pdf image folder is created.
    pdf_path : str
        Path of the pdf to convert.

    Returns
    -------
    str
        Path of the folder the page images were written to. The folder is
        named after the pdf file (extension dropped, cut to 20 characters).
    '''
    # basename/splitext instead of split("/")[-1][:-4]: works for either path
    # separator and for extensions that are not exactly three characters long
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    print(f"Converting pdf to image for {pdf_name}")
    pdf_name = pdf_name[:20]
    # identical to parent_path + pdf_name when parent_path ends with a separator
    full_output_folder = os.path.join(parent_path, pdf_name)
    if not os.path.exists(full_output_folder):
        # makedirs also creates a missing parent folder, where os.mkdir raised
        os.makedirs(full_output_folder, exist_ok=True)
        print(f"Saving images in {full_output_folder}")
    convert_from_path(pdf_path=pdf_path, output_folder=full_output_folder, poppler_path=poppler_path, fmt='jpeg')
    return full_output_folder

def write_bbox_images(pdf_parsed_df, full_output_folder):
    '''
    Draw the matched section-title boxes onto the page images and save the
    annotated pages under "<full_output_folder>/bboxes/".

    Parameters
    ----------
    pdf_parsed_df : pandas.DataFrame
        Needs the columns 'page_id' (zero-based page index) and
        'exact_match_bbox' ([[x1, y1], [x2, y2]] as fractions of the page
        width/height, or None when there is nothing to draw).
    full_output_folder : str
        Folder holding one "...<page number>.jpg" image per page.

    Raises
    ------
    FileNotFoundError
        When no image for a page exists in `full_output_folder`, or the image
        cannot be read.
    '''
    img_write_parent_path = f"{full_output_folder}/bboxes/"

    for page_id in pdf_parsed_df['page_id'].unique():
        # use the DataFrame that was passed in (the body used to read a global `df`)
        sub_df = pdf_parsed_df[pdf_parsed_df['page_id'] == page_id]

        # Image names end in the one-based, zero-padded page number. Anchoring
        # on a non-digit boundary keeps page 1 from also matching 11, 21, ...
        page_number = int(page_id) + 1
        page_pattern = re.compile(rf"(?<!\d)0*{page_number}\.jpg$")
        candidates = sorted(name for name in os.listdir(full_output_folder)
                            if page_pattern.search(name))
        if not candidates:
            raise FileNotFoundError(
                f"No image for page {page_number} found in {full_output_folder}")
        img_name = candidates[0]

        img_read_path = full_output_folder + "/" + img_name
        img_write_full_path = img_write_parent_path + img_name

        img = cv2.imread(img_read_path)
        if img is None:
            # cv2.imread signals an unreadable file by returning None
            raise FileNotFoundError(f"Could not read image {img_read_path}")
        # size of the array that is drawn on; avoids opening the file a second time
        height, width = img.shape[:2]

        for _, row in sub_df.iterrows():
            bbox = row['exact_match_bbox']
            if bbox is None:
                # no exact match was extracted for this row
                continue
            (x1, y1), (x2, y2) = bbox

            # relative coordinates -> pixels, rounded outwards
            x1 = math.floor(width * x1)
            x2 = math.ceil(width * x2)
            y1 = math.floor(height * y1)
            y2 = math.ceil(height * y2)

            img = cv2.rectangle(img, (x1, y1), (x2, y2), (255,0,0), 2)
            print(f"Drew bounding boxes for {img_name} page ")

        os.makedirs(img_write_parent_path, exist_ok=True)
        cv2.imwrite(img_write_full_path, img)
        print(f"Wrote image with bboxes @ {img_write_full_path}")
    return None

In [6]:
def run_matching(contract_key, section_dicts, path_dict):
    '''
    Run the full TOC-to-OCR matching for one contract.

    Parameters
    ----------
    contract_key : str
        Identifier of the contract (not used by the computation).
    section_dicts : dict
        All section dictionaries, keyed by the contract's section-dict key.
    path_dict : dict
        Needs 'section_dict_key' and 'doctr_output_json' (path of the OCR json).

    Returns
    -------
    pandas.DataFrame
        One row per matched section with the columns 'Line via OCR',
        'Line2 via OCR', 'Section Title via HTML', 'ymin', 'ymax', 'page_id',
        'bboxes', 'exact_match_bbox' and 'exact_match_text'. The frame is
        empty, but has all of these columns, when no section was matched.
    '''
    section_dict = section_dicts[path_dict["section_dict_key"]]
    section_dict = flatten_contract_dict(section_dict)
    doctr_output_path = path_dict["doctr_output_json"]

    with open(doctr_output_path, 'r') as f:
        doctr_output = json.load(f)

    file_as_dict = get_file_by_dict(doctr_output)
    merged_lines = final_file_line_by_line(file_as_dict, threshold=0.65)
    # put the lines in reading order: page by page, top to bottom
    preprocessed_output = (pd.DataFrame(merged_lines)
                           .sort_values(by=['page_id', 'ymin_min'])
                           .to_dict(orient='list'))

    # Naming the columns up front keeps them present when nothing matched
    # (renaming the columns of an empty frame would leave it without any).
    match_columns = ['Line via OCR', 'Line2 via OCR', 'Section Title via HTML',
                     'ymin', 'ymax', 'page_id', 'bboxes']
    df = pd.DataFrame(get_starts_all(section_dict, preprocessed_output), columns=match_columns)

    if df.empty:
        # zip(*...) over zero rows cannot be unpacked into two columns
        df['exact_match_bbox'] = pd.Series(dtype=object)
        df['exact_match_text'] = pd.Series(dtype=object)
    else:
        df['exact_match_bbox'], df['exact_match_text'] = zip(*df.apply(extract_exact_match, axis=1))
    return df

In [7]:
# Relative paths of the contract PDFs. Each one is appended to
# cuad_master_parent_path, and a contract is linked to its PDF by checking
# whether the contract's section-dict key (minus its last four characters)
# occurs in one of these paths.
all_pdfs_relative_paths_list = \
['CUAD_v1/full_contract_pdf/Part_I/Outsourcing/ParatekPharmaceuticalsInc_20170505_10-KA_EX-10.29_10323872_EX-10.29_Outsourcing Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Manufacturing/UpjohnInc_20200121_10-12G_EX-2.6_11948692_EX-2.6_Manufacturing Agreement_ Supply Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Joint Venture/TRANSPHORM,INC_02_14_2020-EX-10.12(1)-JOINT VENTURE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_I/Development/AimmuneTherapeuticsInc_20200205_8-K_EX-10.3_11967170_EX-10.3_Development Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Development/ElPolloLocoHoldingsInc_20200306_10-K_EX-10.16_12041700_EX-10.16_Development Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Development/FuelcellEnergyInc_20191106_8-K_EX-10.1_11868007_EX-10.1_Development Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Maintenance/AzulSa_20170303_F-1A_EX-10.3_9943903_EX-10.3_Maintenance Agreement2.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Maintenance/BloomEnergyCorp_20180321_DRSA (on S-1)_EX-10_11240356_EX-10_Maintenance Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Maintenance/AtnInternationalInc_20191108_10-Q_EX-10.1_11878541_EX-10.1_Maintenance Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Maintenance/CardlyticsInc_20180112_S-1_EX-10.16_11002987_EX-10.16_Maintenance Agreement1.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/IP/GarrettMotionInc_20181001_8-K_EX-2.4_11364532_EX-2.4_Intellectual Property Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/IP/CerenceInc_20191002_8-K_EX-10.4_11827494_EX-10.4_Intellectual Property Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Franchise/PfHospitalityGroupInc_20150923_10-12G_EX-10.1_9266710_EX-10.1_Franchise Agreement1.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Franchise/SoupmanInc_20150814_8-K_EX-10.1_9230148_EX-10.1_Franchise Agreement1.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Transportation/RangeResourcesLouisianaInc_20150417_8-K_EX-10.5_9045501_EX-10.5_Transportation Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Co_Branding/StampscomInc_20001114_10-Q_EX-10.47_2631630_EX-10.47_Co-Branding Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Strategic Alliance/MOELIS_CO_03_24_2014-EX-10.19-STRATEGIC ALLIANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_I/Endorsement/PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Promotion/DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Promotion/ExactSciencesCorp_20180822_8-K_EX-10.1_11331629_EX-10.1_Promotion Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Promotion/SigaTechnologiesInc_20190603_8-K_EX-10.1_11695818_EX-10.1_Promotion Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Service/VerizonAbsLlc_20200123_8-K_EX-10.4_11952335_EX-10.4_Service Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_I/Distributor/ZogenixInc_20190509_10-Q_EX-10.2_11663313_EX-10.2_Distributor Agreement.pdf',
 'CUAD_v1/full_contract_pdf/Part_II/Outsourcing/TRICITYBANKSHARESCORP_05_15_1998-EX-10-OUTSOURCING AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Outsourcing/ASPIRITYHOLDINGSLLC_05_07_2012-EX-10.6-OUTSOURCING AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Maintenance/IMAGEWARESYSTEMSINC_12_20_1999-EX-10.22-MAINTENANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_II/IP/BABCOCK_WILCOXENTERPRISES,INC_08_04_2015-EX-10.17-INTELLECTUAL PROPERTY AGREEMENT between THE BABCOCK _ WILCOX COMPANY and BABCOCK _ WILCOX ENTERPRISES, INC..PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Hosting/HEALTHGATEDATACORP_11_24_1999-EX-10.1-HOSTING AND MANAGEMENT AGREEMENT (1).pdf',
 'CUAD_v1/full_contract_pdf/Part_II/Franchise/BUFFALOWILDWINGSINC_06_05_1998-EX-10.3-FRANCHISE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Franchise/MRSFIELDSORIGINALCOOKIESINC_01_29_1998-EX-10-FRANCHISE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Transportation/GRANTIERRAENERGYINC_05_07_2012-EX-10.6-TRANSPORTATION CONTRACT.PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Collaboration/INNOVIVA,INC_08_07_2014-EX-10.1-COLLABORATION AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Collaboration/XENCORINC_10_25_2013-EX-10.24-COLLABORATION AGREEMENT (3).PDF',
 'CUAD_v1/full_contract_pdf/Part_II/Promotion/IMMUNOMEDICSINC_08_07_2019-EX-10.1-PROMOTION AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Outsourcing/ELANDIAINTERNATIONALINC_04_25_2007-EX-10.21-Outsourcing Agreement.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Manufacturing/Sonos, Inc. - Manufacturing Agreement .PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Manufacturing/VAPOTHERM, INC. - Manufacturing and Supply Agreement.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Maintenance/SLOVAKWIRELESSFINANCECOBV_03_28_2001-EX-4.(B)(II).3-Maintenance and support contract for SICAP(R) modules.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Maintenance/SANDRIDGEENERGYINC_08_06_2009-EX-10.6-OPERATIONS AND MAINTENANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Maintenance/STARTECGLOBALCOMMUNICATIONSCORP_11_16_1998-EX-10.30-CONSTRUCTION AND MAINTENANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Maintenance/VERTEXENERGYINC_08_14_2014-EX-10.24-OPERATION AND MAINTENANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Maintenance/TELEGLOBEINTERNATIONALHOLDINGSLTD_03_29_2004-EX-10.10-CONSTRUCTION AND MAINTENANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/IP/GSITECHNOLOGYINC_11_16_2009-EX-10.2-INTELLECTUAL PROPERTY AGREEMENT between SONY ELECTRONICS INC. and GSI TECHNOLOGY, INC..PDF',
 'CUAD_v1/full_contract_pdf/Part_III/IP/OTISWORLDWIDECORP_04_03_2020-EX-10.4-INTELLECTUAL PROPERTY AGREEMENT by and among UNITED TECHNOLOGIES CORPORATION, OTIS WORLDWIDE CORPORATION and CARRIER ~1.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/IP/FIDELITYNATIONALINFORMATIONSERVICES,INC_08_05_2009-EX-10.3-INTELLECTUAL PROPERTY AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Hosting/INKTOMICORP_06_08_1998-EX-10.14-SOFTWARE HOSTING AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Franchise/INTERNATIONALFASTFOODCORP_04_04_1997-EX-99-FRANCHISE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Franchise/HOSPITALITYINVESTORSTRUST,INC_04_07_2014-EX-10.26-FRANCHISE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Franchise/GOOSEHEADINSURANCE,INC_04_02_2018-EX-10.6-Franchise Agreement.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Franchise/JOINTCORP_09_19_2014-EX-10.15-FRANCHISE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Transportation/ENTERPRISEPRODUCTSPARTNERSLP_07_08_1998-EX-10.3-TRANSPORTATION CONTRACT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Transportation/ENERGYXXILTD_05_08_2015-EX-10.13-Transportation AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Supply/VERICELCORP_08_06_2019-EX-10.10-SUPPLY AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Marketing/NUVEEN - REMARKETING AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Marketing/Monsanto Company - SECOND A_R EXCLUSIVE AGENCY AND MARKETING AGREEMENT .PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Joint Venture _ Filing/IGENEBIOTECHNOLOGYINC_05_13_2003-EX-1-JOINT VENTURE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Collaboration/HPILHOLDING_01_07_2015-EX-99.1-COOPERATION AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Collaboration/FIBROGENINC_10_01_2014-EX-10.11-COLLABORATION AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Collaboration/FOUNDATIONMEDICINE,INC_02_02_2015-EX-10.2-Collaboration Agreement.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Collaboration/CERES,INC_01_25_2012-EX-10.20-Collaboration Agreement.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Strategic Alliance/USASYNTHETICFUELCORP_10_21_2010-EX-10.10-STRATEGIC ALLIANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Strategic Alliance/CHERRYHILLMORTGAGEINVESTMENTCORP_09_26_2013-EX-10.1-Strategic Alliance Agreement.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Strategic Alliance/SUCAMPOPHARMACEUTICALS,INC_11_04_2015-EX-10.2-STRATEGIC ALLIANCE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Service/WPPPLC_04_30_2020-EX-4.28-SERVICE AGREEMENT.PDF',
 'CUAD_v1/full_contract_pdf/Part_III/Distributor/NEONSYSTEMSINC_03_01_1999-EX-10.5-DISTRIBUTOR AGREEMENT_New.pdf']

In [8]:
## enter basic local path info here
poppler_path = "C:/Users/islam/poppler-22.04.0/Library/bin/"
all_section_dicts_path = "section_dicts_new.json"
all_doctr_json_outputs_path = "saved_jsons/saved_jsons/"
cuad_master_parent_path = "C:/Users/islam/Desktop/shortcutpaths/CUAD_v1/"  ## this will be joined to relative pdf path
image_output_parent_path = "C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/"

# one entry per contract: "contract_<n>" -> the paths needed to process it
master_path_dictionary = dict()

with open(all_section_dicts_path, "r") as f:
    section_dicts = json.load(f)

for idx, (section_dict_key, section_dict) in enumerate(section_dicts.items()):
    key = f"contract_{idx}"
    # the section-dict key without its last four characters
    key_stem = section_dict_key[:-4]

    contract_paths = {
        'section_dict_key': section_dict_key,
        'doctr_output_json': all_doctr_json_outputs_path + key_stem + "json",
    }

    # 'pdf_path' is only set when a known pdf path contains the stem;
    # when several do, the last one in the list wins
    matching_pdf_paths = [pdf for pdf in all_pdfs_relative_paths_list if key_stem in pdf]
    if matching_pdf_paths:
        contract_paths['pdf_path'] = cuad_master_parent_path + matching_pdf_paths[-1]

    contract_paths['text_file_path'] = cuad_master_parent_path + "CUAD_v1/full_contract_txt/" + key_stem + "txt"

    master_path_dictionary[key] = contract_paths

In [9]:
# Number of contracts to process. The original stop condition (key ending in
# '5') ran contract_0 .. contract_5, i.e. six contracts, although its comment
# said five; the behaviour is kept and made explicit here.
MAX_CONTRACTS = 6

for contract_number, (contract_key, path_dict) in enumerate(master_path_dictionary.items()):

    if contract_number >= MAX_CONTRACTS:
        break

    print(f"On contract {contract_key}")

    # 'pdf_path' is only present when a pdf was found for the contract;
    # without it there are no page images to draw on.
    if 'pdf_path' not in path_dict:
        print(f"No pdf path known for {contract_key}. Skipping")
        continue

    # Keep the name `df`: it is a notebook-level variable other cells may read.
    df = run_matching(contract_key=contract_key,
                      section_dicts=section_dicts, 
                      path_dict=path_dict)

    full_output_folder = make_images(poppler_path=poppler_path, 
                                     parent_path=image_output_parent_path, 
                                     pdf_path=path_dict['pdf_path'])

    write_bbox_images(pdf_parsed_df=df,
                      full_output_folder=full_output_folder)

On contract contract_0
2266 2266
Couldn't match 1.7 BII   Technology with a line. Moving onto next TOC section
Couldn't match 1.17 Due   Date with a line. Moving onto next TOC section
Couldn't match 1.42 Specification(s) with a line. Moving onto next TOC section
Converting pdf to image for XENCORINC_10_25_2013-EX-10.24-COLLABORATION AGREEMENT (3)
Saving images in C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/XENCORINC_10_25_2013
Drew bounding boxes for 49b01906-34d0-4a2e-9247-7c16fc6b7d78-007.jpg page 
Drew bounding boxes for 49b01906-34d0-4a2e-9247-7c16fc6b7d78-007.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/XENCORINC_10_25_2013/bboxes/49b01906-34d0-4a2e-9247-7c16fc6b7d78-007.jpg
Drew bounding boxes for 49b01906-34d0-4a2e-9247-7c16fc6b7d78-008.jpg page 
Drew bounding boxes for 49b01906-34d0-4a2e-9247-7c16fc6b7d78-008.jpg page 
Drew bounding boxes for 49b01906-3

4190 4190
Converting pdf to image for CERES,INC_01_25_2012-EX-10.20-Collaboration Agreement
Saving images in C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/CERES,INC_01_25_2012
Drew bounding boxes for e81d87c6-3cbc-4f11-89bb-20392d84bdc1-007.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/CERES,INC_01_25_2012/bboxes/e81d87c6-3cbc-4f11-89bb-20392d84bdc1-007.jpg
On contract contract_3
644 644
Couldn't match ARTICLE II PURCHASE AND SALE with a line. Moving onto next TOC section
Couldn't match ARTICLE III CLOSING with a line. Moving onto next TOC section
Couldn't match ARTICLE IV REPRESENTATIONS AND WARRANTIES OF SELLER PARTIES with a line. Moving onto next TOC section
Couldn't match 4.10  Customers and Suppliers

Section with a line. Moving onto next TOC section
Couldn't match 4.11  Insurance

Section with a line. Moving onto next TOC section
Couldn't match 4.12  Legal 

Drew bounding boxes for f9e64928-bc1c-4930-bcba-b42bac049d79-020.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/GOOSEHEADINSURANCE,I/bboxes/f9e64928-bc1c-4930-bcba-b42bac049d79-020.jpg
Drew bounding boxes for f9e64928-bc1c-4930-bcba-b42bac049d79-026.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/GOOSEHEADINSURANCE,I/bboxes/f9e64928-bc1c-4930-bcba-b42bac049d79-026.jpg
Drew bounding boxes for f9e64928-bc1c-4930-bcba-b42bac049d79-027.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/GOOSEHEADINSURANCE,I/bboxes/f9e64928-bc1c-4930-bcba-b42bac049d79-027.jpg
Drew bounding boxes for f9e64928-bc1c-4930-bcba-b42bac049d79-029.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/GOOSEHEADINSURANC