In [1]:
import sys
import os
sys.path.append("../doctr/")
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
import json
import pickle
import pandas as pd
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_file_by_dict(json_output):
    """
    Flatten the OCR JSON export into parallel, line-level lists.

    Every line found under ``pages -> blocks -> lines`` contributes one entry to each
    populated list: its word strings ('words'), its per-word geometries ('bboxes'),
    its vertical extent ('ymin_ymax'), and the index and dimensions of the page it
    sits on ('page_id', 'page_dimensions'). The 'line_merge_next' key is created but
    never filled by this function.
    """
    words, word_boxes, y_ranges, page_ids, page_dims = [], [], [], [], []

    for page in json_output['pages']:
        for block in page['blocks']:
            for line in block['lines']:
                # Line geometry is ((xmin, ymin), (xmax, ymax)); only the y-extent is kept.
                (_, top), (_, bottom) = line['geometry']

                words.append([word['value'] for word in line['words']])
                word_boxes.append([word['geometry'] for word in line['words']])

                y_ranges.append((top, bottom))
                page_ids.append(page['page_idx'])
                page_dims.append(page['dimensions'])

    return {
        'words': words,
        'ymin_ymax': y_ranges,
        'page_id': page_ids,
        'page_dimensions': page_dims,
        'line_merge_next': [],
        'bboxes': word_boxes,
    }

def getIOU(segment1, segment2, threshold):
    '''
    Decide whether two vertical extents overlap enough to count as the same text line.

    The OCR model does not always place words of one visual line in the same block,
    so lines from different blocks are compared by their y-extents. This function
    computes the 1-D intersection-over-union of two ``(ymin, ymax)`` segments.

    Parameters:
        segment1, segment2: ``(ymin, ymax)`` pairs.
        threshold: overlap ratio that must be strictly exceeded.

    Returns:
        True when intersection / union > threshold, otherwise False. Disjoint
        segments and pairs whose combined extent has zero height return False.
    '''
    ymin1, ymax1 = segment1
    ymin2, ymax2 = segment2

    intersection = min(ymax1, ymax2) - max(ymin1, ymin2)

    # no overlap
    if intersection < 0:
        return False

    union = max(ymax1, ymax2) - min(ymin1, ymin2)

    # Two identical zero-height segments give intersection == union == 0; the ratio is
    # undefined there, so report "no merge" instead of raising ZeroDivisionError.
    if union <= 0:
        return False

    return bool(intersection / union > threshold)

def get_lines_to_merge(file_as_dict, threshold):
    """
    List every pair of line indices ``(i, j)`` with ``i < j`` that should be merged.

    A pair qualifies when both lines are on the same page and getIOU reports that
    their 'ymin_ymax' extents overlap beyond ``threshold``.
    """
    line_count = len(file_as_dict['words'])
    return [
        (upper, lower)
        for upper in range(line_count)
        for lower in range(upper + 1, line_count)
        # Page check first: lines on different pages never reach the overlap test.
        if file_as_dict['page_id'][upper] == file_as_dict['page_id'][lower]
        and getIOU(file_as_dict['ymin_ymax'][upper],
                   file_as_dict['ymin_ymax'][lower],
                   threshold)
    ]

def line_merging(lines_to_merge_indices):
    '''
    Group pairwise merge candidates into merged lines.

    Parameters:
        lines_to_merge_indices: iterable of ``(i, j)`` line-index pairs (as produced
            by get_lines_to_merge) meaning "line j belongs with line i".

    Returns:
        (lines, follows)
        lines:   maps the first index of a group to the list of indices in that
                 group, in the order they were attached (the first index included).
        follows: maps an index j to the list of indices i it was paired with.
                 Callers in this file only rely on key membership and element 0.
    '''
    follows = dict()
    lines = dict()
    for i, j in lines_to_merge_indices:

        if i not in lines:
            if i not in follows:
                # i starts a new group.
                lines[i] = [i, j]
                # NOTE: this resets follows[j]; the bookkeeping below then appends i
                # again, so the entry becomes [i, i]. Kept as-is for compatibility.
                follows[j] = [i]
            else:
                # i already belongs to another group: walk back to that group's root.
                root = follows[i][0]
                while root in follows:
                    root = follows[root][0]
                # Attach j to the ROOT's group. The previous code checked the root but
                # appended to lines[follows[i][0]], which raises KeyError whenever
                # i's direct predecessor is not itself a group root.
                if j not in lines[root]:
                    lines[root].append(j)
        else:
            lines[i].append(j)

        if j not in follows:
            follows[j] = [i]
        else:
            follows[j].append(i)
    return lines, follows

In [3]:
def final_file_line_by_line(file_as_dict, threshold):
    """
    Collapse OCR lines that share a visual row into single output lines.

    Merge candidates come from get_lines_to_merge and are grouped by line_merging.
    For each input line, in order:
      * the first line of a merge group emits one entry holding the concatenated
        words and bboxes of the whole group, with the group's smallest ymin and
        largest ymax, and the page id of that first line;
      * a line that only follows another line emits nothing;
      * any other line is emitted unchanged.

    Prints the number of output lines and output bbox lists, then returns a dict
    with keys 'full_line', 'page_id', 'ymax_max', 'ymin_min', 'bboxes'.
    """
    merge_pairs = get_lines_to_merge(file_as_dict, threshold)
    merged_groups, followers = line_merging(merge_pairs)

    full_lines, page_ids, y_bottoms, y_tops, line_bboxes = [], [], [], [], []

    for line_no, line_words in enumerate(file_as_dict['words']):
        if line_no in merged_groups:
            members = merged_groups[line_no]
            combined_words, combined_bboxes = [], []
            for member in members:
                combined_words.extend(file_as_dict['words'][member])
                combined_bboxes.extend(file_as_dict['bboxes'][member])
            spans = [file_as_dict['ymin_ymax'][member] for member in members]

            full_lines.append(combined_words)
            line_bboxes.append(combined_bboxes)
            y_tops.append(min(span[0] for span in spans))
            y_bottoms.append(max(span[1] for span in spans))
            page_ids.append(file_as_dict['page_id'][line_no])
        elif line_no not in followers:
            # Stand-alone line: pass it through untouched.
            top, bottom = file_as_dict['ymin_ymax'][line_no][0], file_as_dict['ymin_ymax'][line_no][1]
            full_lines.append(line_words)
            line_bboxes.append(file_as_dict['bboxes'][line_no])
            page_ids.append(file_as_dict['page_id'][line_no])
            y_tops.append(top)
            y_bottoms.append(bottom)
        # Otherwise the line was already absorbed into an earlier group.

    print(len(full_lines), len(line_bboxes))
    return {
        'full_line': full_lines,
        'page_id': page_ids,
        'ymax_max': y_bottoms,
        'ymin_min': y_tops,
        'bboxes': line_bboxes,
    }

def get_toc_page(preprocessed_output):
    """
    Return the page id of the first line that looks like a table-of-contents heading.

    Each entry of 'full_line' is joined with spaces, lower-cased and searched with the
    TOC pattern; the matching line's 'page_id' is returned, or None if no line matches.
    """
    toc_pattern = re.compile(r"(table of contents|tableof(?:contents)?|(?:table\s)?of*conten|contents?)")
    pages_and_lines = zip(preprocessed_output['page_id'], preprocessed_output['full_line'])

    return next(
        (page for page, words in pages_and_lines
         if toc_pattern.search(" ".join(words).lower())),
        None,
    )

def find_start(section_info, preprocessed_output,
               subset_match_threshold, line_len_match_threshold,
               beg_line_match_threshold, last_line_pointer, toc_page=1):
    '''
    Find the document line where a table-of-contents section starts.

    Scans the lines of ``preprocessed_output`` from ``last_line_pointer`` onward and
    returns the first one that fuzzily matches the section title.

    Parameters:
        section_info: sequence whose element 0 is the section title.
        preprocessed_output: dict of parallel lists with keys 'full_line', 'page_id',
            'ymin_min', 'ymax_max' and 'bboxes'.
        subset_match_threshold: fuzz.partial_ratio(title, line) must exceed this.
        line_len_match_threshold: the line must be at least this fraction of the
            title's length.
        beg_line_match_threshold: fuzz.partial_ratio(title, start of line) must
            exceed this; the "start" is the first 2 * len(title) characters.
        last_line_pointer: index of the first line to consider.
        toc_page: page id whose lines are skipped (the table-of-contents page).
            Defaults to 1, the value that used to be hard-coded here; pass the
            result of get_toc_page(preprocessed_output) to detect it, or None to
            skip no page.

    Returns:
        ((line, title, ymin, ymax, page_id, bboxes), next_pointer) on a match, where
        next_pointer is the index after the matched line; otherwise
        (None, last_line_pointer).
    '''
    total_lines = len(preprocessed_output['full_line'])
    if last_line_pointer == total_lines:
        return None, last_line_pointer

    # Loop-invariant values derived from the title.
    title = section_info[0]
    title_lower = title.lower()
    min_line_len = len(title) * line_len_match_threshold
    beg_len = len(title) * 2

    for idx in range(last_line_pointer, total_lines):

        if preprocessed_output['page_id'][idx] == toc_page:
            continue
        line = " ".join(preprocessed_output['full_line'][idx])

        # Cheap length test first so short lines never reach the fuzzy scorer.
        if len(line) < min_line_len:
            continue

        beg_line = line[0: beg_len]
        if (fuzz.partial_ratio(title_lower, line.lower()) > subset_match_threshold and
            fuzz.partial_ratio(title_lower, beg_line.lower()) > beg_line_match_threshold):

            bboxes = preprocessed_output['bboxes'][idx]
            ymin = preprocessed_output['ymin_min'][idx]
            ymax = preprocessed_output['ymax_max'][idx]
            page_id = preprocessed_output['page_id'][idx]

            return (line, title, ymin, ymax, page_id, bboxes), idx + 1

    return None, last_line_pointer

def get_starts_all(section_dict, preprocessed_output):
    """
    Locate the starting line of every section listed in ``section_dict``.

    Sections are searched in dict order with find_start; each search resumes from the
    line after the previous match, so matches move forward through the document.
    Sections without a match are reported on stdout and skipped.

    Returns the list of match tuples produced by find_start.
    """
    # Fuzzy-matching cut-offs (design decisions):
    subset_match_threshold = 80     # only accept subset match ratios above 80/100
    line_len_match_threshold = 0.8  # document line may not be much shorter than the TOC label
    beg_line_match_threshold = 80   # same cut-off applied to the beginning of the line

    starts = []
    resume_from = 0
    for section_info in section_dict.values():
        match, resume_from = find_start(section_info, preprocessed_output,
                                        subset_match_threshold, line_len_match_threshold,
                                        beg_line_match_threshold, resume_from)
        if not match:
            print(f"Couldn't match {section_info[0]} with a line. Moving onto next TOC section")
            continue
        starts.append(match)

    return starts
def flatten_contract_dict(nested_dict):
    """
    Flatten a two-level section dict into ``{position: (title, {})}``.

    Each value of ``nested_dict`` is indexed as ``(title, sub_sections)``, and each
    value of ``sub_sections`` as ``(title, ...)``. Titles are numbered from 1 in
    reading order: a section first, then its sub-sections, then the next section.
    """
    ordered_titles = []
    for section in nested_dict.values():
        ordered_titles.append(section[0])
        ordered_titles.extend(sub_section[0] for sub_section in section[1].values())

    return {position: (title, {}) for position, title in enumerate(ordered_titles, start=1)}

In [4]:
def _bbox_spanning(word_bboxes):
    """Return [[x_min, y_min], [x_max, y_max]] covering a run of consecutive word boxes."""
    # x min is the x_left of the first bbox; y_min is the min of the top-left y's.
    x_min = word_bboxes[0][0][0]
    y_min = min(box[0][-1] for box in word_bboxes)

    # x max is the x_right of the last bbox; y_max is the max of the bottom-right y's.
    x_max = word_bboxes[-1][1][0]
    y_max = max(box[1][-1] for box in word_bboxes)

    return [[x_min, y_min], [x_max, y_max]]


def merge_bboxes(words, query, bboxes, window_size, match_score):
    """
    Find the run of words that best matches ``query`` and return its bounding box.

    ``words`` is cut into consecutive, non-overlapping windows of ``window_size``
    words. Each window is scored against ``query`` with fuzz.token_set_ratio. The
    first window scoring at least ``match_score`` wins; if none does, the
    highest-scoring window is used (the earliest one on ties).

    Parameters:
        words: list of word strings of one line.
        query: text to look for.
        bboxes: per-word boxes, indexed as box[0] = (x, y) top-left and
            box[1] = (x, y) bottom-right, parallel to ``words``.
        window_size: number of words per window.
        match_score: score a window must reach to be accepted immediately.

    Returns:
        (merged_bbox, window_text), or (None, None) when there is nothing to scan
        (no words, or a window size below 1).
    """
    # Previously an empty line fell through to an unbound variable, and a zero
    # window size made range() raise ValueError.
    if not words or window_size < 1:
        return None, None

    # Start below any possible score so the first window always becomes the
    # fallback, even when every window scores 0.
    best_score, best_text, best_bboxes = -1, None, None

    for start_idx in range(0, len(words), window_size):
        window_text = " ".join(words[start_idx : start_idx + window_size])
        candidate_bboxes = bboxes[start_idx : start_idx + window_size]

        window_score = process.extractBests(window_text, [query], scorer=fuzz.token_set_ratio)[0][-1]

        if window_score >= match_score:
            return _bbox_spanning(candidate_bboxes), window_text

        if window_score > best_score:
            best_score, best_text, best_bboxes = window_score, window_text, candidate_bboxes

    # No window reached the whole-line score: fall back to the best-scoring segment.
    return _bbox_spanning(best_bboxes), best_text

def extract_exact_match(row):
    """
    Narrow a matched OCR line down to the words that correspond to the section title.

    Reads 'Section Title via HTML', 'Line via OCR' and 'bboxes' from ``row``. The
    whole line is scored against the title first; that score and the title's word
    count are then passed to merge_bboxes as the acceptance score and window size.

    Returns (merged_bbox, exact_match_text), or (None, None) when the title is None.
    """
    query = row['Section Title via HTML']
    if query is None:
        return None, None

    best = process.extractBests(row.loc['Line via OCR'], [query], scorer=fuzz.token_set_ratio)[0]
    match_text, match_score = best

    # extractBests returns the matched choice, so the window is the title's word count.
    window_size = len(match_text.split())
    line_words = row['Line via OCR'].split()

    merged_bbox, exact_match_text = merge_bboxes(line_words, query, row['bboxes'],
                                                 window_size, match_score)
    return merged_bbox, exact_match_text

In [6]:
# Load the table-of-contents labels.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load a
# TOC_Labels_Set1.pkl that comes from a trusted source.
with open("TOC_Labels_Set1.pkl", "rb") as f:
    section_dicts = pickle.load(f)
# Replace every nested section dict with its flattened {position: (title, {})} form.
section_dicts = [flatten_contract_dict(section_dict) for section_dict in section_dicts]

In [7]:
# Registry of the OCR outputs used in this notebook, keyed by 'pdf_<n>'.
# 'name' is the contract's file name and 'path' is the OCR JSON export for it.
# The numeric suffix of the key is also used to index `section_dicts`.
json_files = {
    'pdf_0': {
        "name": "ZogenixInc_20190509_10-Q_EX-10.2_11663313_EX-10.2_Distributor Agreement",
        "path": "pdf_0_from_list_in_discord.json",
    },
    'pdf_1': {
        "name": "PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement",
        "path": "pdf_1_from_list_in_discord.json",
    },
    'pdf_2': {
        "name": "OTISWORLDWIDECORP_04_03_2020-EX-10.4-INTELLECTUAL PROPERTY AGREEMENT by and among UNITED TECHNOLOGIES CORPORATION, OTIS WORLDWIDE CORPORATION and CARRIER ~1",
        "path": "pdf_2_from_list_in_discord.json",
    },
    'pdf_3': {
        "name": "NUVEEN - REMARKETING AGREEMENT",
        "path": "pdf_3_from_list_in_discord.json",
    },
    'pdf_4': {
        # A stray leading apostrophe (copy/paste residue) was removed from this name.
        "name": "ParatekPharmaceuticalsInc_20170505_10-KA_EX-10.29_10323872_EX-10.29_Outsourcing Agreement",
        "path": "pdf_4_from_list_in_discord.json",
    },
}

In [8]:
# Select the contract to process and look up its OCR export and display name.
file_key = 'pdf_1'
json_path = json_files[file_key]['path']
filename = json_files[file_key]['name']
# The numeric suffix of the key doubles as the index into section_dicts.
# NOTE(review): presumably the pickle's order matches the pdf_<n> numbering — confirm.
pickle_key = int(file_key.split('_')[1])
section_dict = section_dicts[pickle_key]


with open(json_path, 'r') as f:
    json_output = json.load(f)
# OCR JSON -> line-level lists -> lines merged by vertical overlap (threshold 0.65).
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
# Sort the merged lines by page and top edge; `df` here is only a sorting vehicle.
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

# `df` is rebound: from here on it holds one row per matched section title.
df = pd.DataFrame(get_starts_all(section_dict, preprocessed_output)).rename(columns={0:'Line via OCR',
                                     1:'Section Title via HTML',
                                     2:'ymin',
                                     3:'ymax',
                                     4:'page_id',
                                     5:'bboxes'})
# extract_exact_match returns a (bbox, text) pair per row; zip(*...) splits them into two columns.
df['exact_match_bbox'], df['exact_match_text'] = zip(*df.apply(lambda row: extract_exact_match(row), axis=1))
df

304 304


Unnamed: 0,Line via OCR,Section Title via HTML,ymin,ymax,page_id,bboxes,exact_match_bbox,exact_match_text
0,"1. Definitions. As used herein, the following ...",1. Definitions,0.334961,0.349609,2,"[[[0.0859375, 0.3359375], [0.09765625, 0.34667...","[[0.0859375, 0.3349609375], [0.1923828125, 0.3...",1. Definitions.
1,2. Term. The Term of this Agreement shall be f...,2. Term,0.09668,0.111328,3,"[[[0.0830078125, 0.09765625], [0.0986328125, 0...","[[0.0830078125, 0.09765625], [0.158203125, 0.1...",2. Term.
2,3. Grant ofLicense and Exclusivity. Subject to...,3. Grant of License and Exclusivity,0.119141,0.136719,3,"[[[0.0830078125, 0.1201171875], [0.0986328125,...","[[0.0830078125, 0.1201171875], [0.3623046875, ...",3. Grant ofLicense and Exclusivity. Subject
3,4. Retention of Rights. All rights not specifi...,4. Retention of Rights,0.253906,0.270508,3,"[[[0.0830078125, 0.2548828125], [0.0986328125,...","[[0.0830078125, 0.2548828125], [0.2412109375, ...",4. Retention of Rights.
4,5. Appearances.,5. Appearances,0.290039,0.303711,3,"[[[0.0849609375, 0.2900390625], [0.095703125, ...","[[0.0849609375, 0.2900390625], [0.2060546875, ...",5. Appearances.
5,7. Compensation.,6. Compensation,0.154297,0.167969,5,"[[[0.083984375, 0.154296875], [0.099609375, 0....","[[0.083984375, 0.154296875], [0.2119140625, 0....",7. Compensation.
6,Supply of Endorsed Products. Company shall del...,7. Supply of Endorsed Products,0.277344,0.292969,6,"[[[0.1484375, 0.279296875], [0.193359375, 0.29...","[[0.1484375, 0.2783203125], [0.3798828125, 0.2...",Supply of Endorsed Products. Company
7,8. Approval of Advertising.,8. Approval of Advertising,0.323242,0.337891,6,"[[[0.083984375, 0.3232421875], [0.0986328125, ...","[[0.083984375, 0.3232421875], [0.271484375, 0....",8. Approval of Advertising.
8,"9. Ownership. All materials, in any form whats...",9. Ownership,0.186523,0.202148,7,"[[[0.0830078125, 0.1875], [0.0986328125, 0.200...","[[0.0830078125, 0.1875], [0.19140625, 0.200195...",9. Ownership.
9,10. SAGandOrAFTRA In the event of any of the A...,10. SAG and/or AFTRA,0.254883,0.269531,7,"[[[0.0849609375, 0.2548828125], [0.1044921875,...","[[0.0849609375, 0.2548828125], [0.28515625, 0....",10. SAGandOrAFTRA In the


In [10]:
# Save the matches for inspection. The file name is derived from `filename` (set
# together with `file_key`) so the CSV always carries the name of the contract that
# `df` was built from, instead of a hard-coded copy of one contract's name.
df.to_csv(f"{filename}_debug.csv", index=False)

## That covers the file parsing. Now let's convert these files to images and draw bounding boxes around the findings

In [11]:
from pdf2image import convert_from_path, convert_from_bytes
# Machine-specific absolute paths; adjust these before running on another machine.
# poppler_path: folder passed to pdf2image as `poppler_path`.
poppler_path = "C:/Users/islam/poppler-22.04.0/Library/bin/"
# output_path: parent folder for the per-PDF image folders (note the trailing slash;
# it is concatenated with the PDF name further down).
output_path = "C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/"
# pdf_path: the contract PDF to render.
pdf_path = "C:/Users/islam/Desktop/shortcutpaths/CUAD_v1/CUAD_v1/full_contract_pdf/Part_I/Endorsement/PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement.pdf"

In [12]:
# Name the output folder after the PDF: file name without its directory or extension.
# splitext handles any extension length, unlike slicing off the last four characters.
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
print(pdf_name)
full_output_folder = output_path + pdf_name
# Creates missing parent folders too and is a no-op when the folder already exists.
os.makedirs(full_output_folder, exist_ok=True)

PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement


In [13]:
# Render every page of the PDF to a JPEG inside full_output_folder. The returned list
# of PIL images is not stored; it only appears as this cell's output.
convert_from_path(pdf_path=pdf_path, output_folder=full_output_folder, poppler_path=poppler_path, fmt='jpeg')

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1653x2339>]

In [38]:
import cv2
import PIL
import math
# Machine-specific path to one rendered page image (file name ends in -03.jpg);
# backslashes are normalised to forward slashes.
img_path = r"C:\Users\islam\Desktop\NYU Files\Classwork\Fall 2022 Semester\Capstone\cuad_image_samples\PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement\a794b1ff-a763-4698-a86a-e60c4209e5c6-03.jpg".replace("\\","/")

In [44]:
# Relative bbox of the first match, as stored in the 'exact_match_bbox' column.
(x1, y1), (x2, y2) = df.iloc[0]['exact_match_bbox']
print((x1, y1), (x2, y2))
img = cv2.imread(img_path)
# Take the size from the array that will be drawn on (shape is rows, cols, channels)
# instead of opening the file a second time through PIL and leaving that handle open.
height, width = img.shape[:2]
# Scale to pixels, rounding outwards so the box never clips the text.
x1 = math.floor(width * x1)
x2 = math.ceil(width * x2)
y1 = math.floor(height * y1)
y2 = math.ceil(height * y2)

(x1, y1), (x2, y2)

(0.0859375, 0.3349609375) (0.1923828125, 0.3466796875)


((142, 783), (319, 811))

In [45]:
# Draw a 2-pixel-thick rectangle on the page image at the pixel corners computed earlier.
# NOTE(review): OpenCV images are BGR, so (255,0,0) presumably renders blue — confirm the intended colour.
img2 = cv2.rectangle(img, (x1, y1), (x2, y2), (255,0,0), 2)
# Scratch output written to the current working directory.
cv2.imwrite("my.png", img2)

True

## Okay, with the scrapwork done, let's write everything into a pipeline

In [95]:
def make_images(poppler_path, parent_path, pdf_path):
    """
    Render every page of a PDF to JPEG files and return the folder they were written to.

    Parameters:
        poppler_path: folder passed to pdf2image as ``poppler_path``.
        parent_path: folder under which the per-PDF image folder is created.
        pdf_path: path of the PDF to convert.

    Returns:
        The output folder: ``parent_path`` joined with the PDF's file name (no
        extension), truncated to its first 20 characters.
    """
    # basename/splitext instead of split("/")[-1][:-4]: works for any extension length.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    print(f"Converting pdf to image for {pdf_name}")

    # Folder names are capped at 20 characters.
    # NOTE(review): two PDFs sharing their first 20 characters end up in the same folder.
    pdf_name = pdf_name[:20]

    # os.path.join gives the same result as concatenation when parent_path ends with
    # a separator, and still yields a child folder when it does not.
    full_output_folder = os.path.join(parent_path, pdf_name)
    if not os.path.exists(full_output_folder):
        # makedirs also creates missing parent folders.
        os.makedirs(full_output_folder, exist_ok=True)
        print(f"Saving images in {full_output_folder}")
    convert_from_path(pdf_path=pdf_path, output_folder=full_output_folder, poppler_path=poppler_path, fmt='jpeg')
    return full_output_folder

def write_bbox_images(pdf_parsed_df, full_output_folder):
    """
    Draw the matched section-title boxes onto the page images and save the results.

    Parameters:
        pdf_parsed_df: DataFrame with a 'page_id' column (0-based page index) and an
            'exact_match_bbox' column holding relative ``[[x1, y1], [x2, y2]]`` boxes.
        full_output_folder: folder containing the page images, whose names end in
            ``-<page number>.jpg`` with optional zero padding.

    For every page present in ``pdf_parsed_df`` the page image is loaded, one
    rectangle is drawn per row, and the annotated image is written under
    ``<full_output_folder>/bboxes/`` with the same file name.

    Raises:
        FileNotFoundError: when a page has no image in the folder or the image
            cannot be read.
    """
    img_write_parent_path = f"{full_output_folder}/bboxes/"
    os.makedirs(img_write_parent_path, exist_ok=True)

    # Sorted so the choice is deterministic if a folder holds several renderings.
    available_images = sorted(os.listdir(full_output_folder))

    for page_id in pdf_parsed_df['page_id'].unique():
        # Filter the DataFrame that was passed in (the old code read the global `df`).
        sub_df = pdf_parsed_df[pdf_parsed_df['page_id'] == page_id]

        # Match the page number exactly: a plain '"1.jpg" in name' test would also
        # accept '-11.jpg' or '-21.jpg'.
        page_pattern = re.compile(rf"(?:^|\D)0*{int(page_id) + 1}\.jpg$")
        matching = [name for name in available_images if page_pattern.search(name)]
        if not matching:
            raise FileNotFoundError(
                f"No image for page {int(page_id) + 1} found in {full_output_folder}")
        img_name = matching[0]

        img_read_path = full_output_folder + "/" + img_name
        img_write_full_path = img_write_parent_path + img_name

        img = cv2.imread(img_read_path)
        if img is None:
            raise FileNotFoundError(f"Could not read image {img_read_path}")
        # Size taken from the loaded array (rows, cols, channels); no second open via PIL.
        height, width = img.shape[:2]

        for _, row in sub_df.iterrows():
            bbox = row['exact_match_bbox']
            if bbox is None:
                # Nothing to draw for rows without a matched box.
                continue
            (x1, y1), (x2, y2) = bbox

            # Relative -> pixel coordinates, rounded outwards.
            x1 = math.floor(width * x1)
            x2 = math.ceil(width * x2)
            y1 = math.floor(height * y1)
            y2 = math.ceil(height * y2)

            img = cv2.rectangle(img, (x1, y1), (x2, y2), (255,0,0), 2)
            print(f"Drew bounding boxes for {img_name} page ")
        cv2.imwrite(img_write_full_path, img)
        print(f"Wrote image with bboxes @ {img_write_full_path}")
    return None

In [61]:
# Annotate the page images of the contract parsed above; prints one line per box drawn
# and one per image written.
write_bbox_images(pdf_parsed_df=df, full_output_folder=full_output_folder)

Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-03.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement/bboxes/a794b1ff-a763-4698-a86a-e60c4209e5c6-03.jpg
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-04.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-04.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-04.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-04.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement/bboxes/a794b1ff-a763-4698-a86a-e60c4209e5c6-04.jpg
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-04.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86

Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement/bboxes/a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-10.jpg page 
Drew 

Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement/bboxes/a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew bounding boxes for a794b1ff-a763-4698-a86a-e60c4209e5c6-11.jpg page 
Drew 

## Let's try another pdf now

In [96]:
# Same parsing steps as for 'pdf_1', applied to another contract.
file_key = 'pdf_2'
json_path = json_files[file_key]['path']
filename = json_files[file_key]['name']
# The numeric suffix of the key doubles as the index into section_dicts.
pickle_key = int(file_key.split('_')[1])
section_dict = section_dicts[pickle_key]


with open(json_path, 'r') as f:
    json_output = json.load(f)
# OCR JSON -> line-level lists -> lines merged by vertical overlap (threshold 0.65).
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
# Sort the merged lines by page and top edge; `df` here is only a sorting vehicle.
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

# `df` is rebound: from here on it holds one row per matched section title.
df = pd.DataFrame(get_starts_all(section_dict, preprocessed_output)).rename(columns={0:'Line via OCR',
                                     1:'Section Title via HTML',
                                     2:'ymin',
                                     3:'ymax',
                                     4:'page_id',
                                     5:'bboxes'})
# extract_exact_match returns a (bbox, text) pair per row; zip(*...) splits them into two columns.
df['exact_match_bbox'], df['exact_match_text'] = zip(*df.apply(lambda row: extract_exact_match(row), axis=1))
df

594 594
Couldn't match ARTICLE I DEFINITIONS with a line. Moving onto next TOC section
Couldn't match ARTICLE III LICENSING OF INTELLECTUAL PROPERTY RIGHTS with a line. Moving onto next TOC section
Couldn't match ARTICLE IV TRADEMARKS with a line. Moving onto next TOC section
Couldn't match ARTICLE V EXCLUDED AGREEMENTS with a line. Moving onto next TOC section
Couldn't match ARTICLE VI CONFIDENTIALITY with a line. Moving onto next TOC section
Couldn't match ARTICLE VII LIMITATIONS AND DISCLAIMERS with a line. Moving onto next TOC section
Couldn't match ARTICLE VIII GOVERNING LAW AND DISPUTE RESOLUTION with a line. Moving onto next TOC section


Unnamed: 0,Line via OCR,Section Title via HTML,ymin,ymax,page_id,bboxes,exact_match_bbox,exact_match_text
0,1.1 Defined Terms. For the purposes of this Ag...,1.1 Defined Terms,0.110352,0.125,4,"[[[0.1337890625, 0.111328125], [0.154296875, 0...","[[0.1337890625, 0.1103515625], [0.2412109375, ...",1.1 Defined Terms.
1,ASSIGNMENT OF SOLELY OWNED! INTELLECTUAL PROPE...,ARTICLE II ASSIGNMENT OF SOLELY OWNED INTELLEC...,0.255859,0.267578,9,"[[[0.2880859375, 0.2568359375], [0.375, 0.2675...","[[0.2880859375, 0.255859375], [0.7138671875, 0...",ASSIGNMENT OF SOLELY OWNED! INTELLECTUAL PROPE...
2,2.1 Assigned Intellectual Property Rights,2.1 Assigned Intellectual Property Rights,0.279297,0.293945,9,"[[[0.1328125, 0.279296875], [0.1533203125, 0.2...","[[0.1328125, 0.279296875], [0.3701171875, 0.29...",2.1 Assigned Intellectual Property Rights
3,"(collectively, ""Assigned Intellectual Property...",3.1 Licensed Intellectual Property Rights,0.098633,0.113281,10,"[[[0.08203125, 0.0986328125], [0.1572265625, 0...","[[0.08203125, 0.0986328125], [0.39453125, 0.11...","(collectively, ""Assigned Intellectual Property..."
4,3.1 Licensed Intellectual Property Rights,3.2 Reserved Intellectual Property Rights,0.15918,0.171875,10,"[[[0.1328125, 0.1591796875], [0.1533203125, 0....","[[0.1328125, 0.1591796875], [0.365234375, 0.17...",3.1 Licensed Intellectual Property Rights
5,3.3 No Rescission. The provisions of this Agre...,3.3 No Rescission,0.111328,0.126953,12,"[[[0.1328125, 0.1123046875], [0.1533203125, 0....","[[0.1328125, 0.1123046875], [0.240234375, 0.125]]",3.3 No Rescission.
6,4.1 Ownership ofUnited Technologies Trademarks.,4.1 Ownership of United Technologies Trademarks,0.21875,0.233398,12,"[[[0.1328125, 0.2197265625], [0.154296875, 0.2...","[[0.1328125, 0.21875], [0.423828125, 0.2333984...",4.1 Ownership ofUnited Technologies Trademarks.
7,infringements of the United Technologies Trade...,4.2 Use of United Technologies Trademarks,0.338867,0.354492,12,"[[[0.08203125, 0.33984375], [0.1640625, 0.3535...","[[0.08203125, 0.33984375], [0.3896484375, 0.35...",infringements of the United Technologies Trade...
8,4.3 Special Trademark Provisions. Special prov...,4.3 Special Trademark Provisions.,0.134766,0.149414,14,"[[[0.1328125, 0.1357421875], [0.1533203125, 0....","[[0.1328125, 0.134765625], [0.3232421875, 0.14...",4.3 Special Trademark Provisions.
9,5.1 No Change to Excluded Agreements. The Part...,5.1 No Change to Excluded Agreements,0.195312,0.209961,14,"[[[0.1328125, 0.1953125], [0.1533203125, 0.208...","[[0.1328125, 0.1953125], [0.3623046875, 0.2089...",5.1 No Change to Excluded Agreements.


In [99]:
# Machine-specific absolute paths; backslashes in the PDF path are normalised to forward slashes.
pdf_path = r"C:\Users\islam\Desktop\shortcutpaths\CUAD_v1\CUAD_v1\full_contract_pdf\Part_III\IP\OTISWORLDWIDECORP_04_03_2020-EX-10.4-INTELLECTUAL PROPERTY AGREEMENT by and among UNITED TECHNOLOGIES CORPORATION, OTIS WORLDWIDE CORPORATION and CARRIER ~1.PDF".replace("\\","/")
# Parent folder for the rendered page images (trailing slash included).
parent_path = "C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/"
# Render the PDF's pages and remember the folder they were written to.
full_output_folder = make_images(poppler_path, parent_path, pdf_path)

Converting pdf to image for OTISWORLDWIDECORP_04_03_2020-EX-10.4-INTELLECTUAL PROPERTY AGREEMENT by and among UNITED TECHNOLOGIES CORPORATION, OTIS WORLDWIDE CORPORATION and CARRIER ~1


In [100]:
# Annotate the page images rendered by make_images with the matches held in `df`.
write_bbox_images(pdf_parsed_df=df, full_output_folder=full_output_folder)

Drew bounding boxes for 99a7a715-5692-4abe-b734-fc6548e4aaf0-05.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/OTISWORLDWIDECORP_04/bboxes/99a7a715-5692-4abe-b734-fc6548e4aaf0-05.jpg
Drew bounding boxes for 99a7a715-5692-4abe-b734-fc6548e4aaf0-10.jpg page 
Drew bounding boxes for 99a7a715-5692-4abe-b734-fc6548e4aaf0-10.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/OTISWORLDWIDECORP_04/bboxes/99a7a715-5692-4abe-b734-fc6548e4aaf0-10.jpg
Drew bounding boxes for 99a7a715-5692-4abe-b734-fc6548e4aaf0-11.jpg page 
Drew bounding boxes for 99a7a715-5692-4abe-b734-fc6548e4aaf0-11.jpg page 
Wrote image with bboxes @ C:/Users/islam/Desktop/NYU Files/Classwork/Fall 2022 Semester/Capstone/cuad_image_samples/OTISWORLDWIDECORP_04/bboxes/99a7a715-5692-4abe-b734-fc6548e4aaf0-11.jpg
Drew bounding boxes for 99a7a715-5692-4abe-b734-fc6548e4aaf0-13.jp