In [1]:
import sys
import os
sys.path.append("../doctr/")
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
import json
import pickle
import pandas as pd
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_file_by_dict(json_output):
    '''
    Flatten the nested OCR export (pages -> blocks -> lines) into parallel lists.

    The returned dict holds one entry per OCR line, in traversal order, under:
      'words'           - list of the 'value' of every word on the line
      'ymin_ymax'       - (ymin, ymax) tuple taken from the line's 'geometry'
      'page_id'         - the 'page_idx' of the page the line is on
      'page_dimensions' - the 'dimensions' of that page
    The 'line_merge_next' key is present but always an empty list.
    '''
    words_per_line = []
    vertical_spans = []
    page_ids = []
    page_dims = []

    for page in json_output['pages']:
        for block in page['blocks']:
            for ocr_line in block['lines']:
                # geometry is ((xmin, ymin), (xmax, ymax)); only the y values are kept
                (_, top), (_, bottom) = ocr_line['geometry']

                words_per_line.append([word['value'] for word in ocr_line['words']])
                vertical_spans.append((top, bottom))
                page_ids.append(page['page_idx'])
                page_dims.append(page['dimensions'])

    return {
        'words': words_per_line,
        'ymin_ymax': vertical_spans,
        'page_id': page_ids,
        'page_dimensions': page_dims,
        'line_merge_next': [],
    }

def getIOU(segment1, segment2, threshold):
    '''
    Decide whether two vertical extents overlap enough to be treated as the same text line.

    The OCR model does not always put words of the same visual line into the same block,
    so callers use this test to find lines from different blocks that should be merged.

    Parameters:
        segment1, segment2: (ymin, ymax) pairs.
        threshold: the intersection-over-union ratio that must be strictly exceeded.

    Returns:
        True if the 1-D intersection-over-union of the two extents is > threshold,
        False otherwise (including when the extents do not overlap at all, or when
        both extents are zero-height so the ratio is undefined).
    '''
    ymin1, ymax1 = segment1
    ymin2, ymax2 = segment2

    intersection = min(ymax1, ymax2) - max(ymin1, ymin2)

    # no overlap
    if intersection < 0:
        return False

    union = max(ymax1, ymax2) - min(ymin1, ymin2)

    # Here union >= intersection >= 0, so union can only be 0 when both extents are
    # zero-height and coincide. The ratio is undefined then; report "no merge"
    # instead of raising ZeroDivisionError.
    if union == 0:
        return False

    return bool(intersection / union > threshold)

def get_lines_to_merge(file_as_dict, threshold):
    '''
    List every pair of line indices (first, second), first < second, that should be merged.

    A pair qualifies when both lines are on the same page and getIOU() reports that
    their 'ymin_ymax' extents overlap by more than `threshold`. Pairs are returned in
    ascending order of first, then second.
    '''
    line_count = len(file_as_dict['words'])
    return [
        (first, second)
        for first in range(line_count)
        for second in range(first + 1, line_count)
        # page check comes first so getIOU is only evaluated for same-page pairs
        if file_as_dict['page_id'][first] == file_as_dict['page_id'][second]
        and getIOU(file_as_dict['ymin_ymax'][first],
                   file_as_dict['ymin_ymax'][second],
                   threshold)
    ]

def line_merging(lines_to_merge_indices):
    '''
    Group pairwise merge candidates into merged lines.

    Parameters:
        lines_to_merge_indices: iterable of (i, j) index pairs. The chain walk below
            assumes every pair has i < j and that pairs arrive in ascending order,
            as produced by get_lines_to_merge (NOTE(review): other orderings are not
            guarded against).

    Returns:
        (lines, follows) where
          lines   maps a group's head index to the list of indices in that group
                  (head first, then members in the order they were attached);
          follows maps an index j to the list of indices i it was paired after.
                  Callers only need membership: an index in `follows` has been
                  absorbed into some group.
    '''
    follows = dict()
    lines = dict()
    for i, j in lines_to_merge_indices:

        if i in lines:
            # i already heads a group: attach j directly.
            lines[i].append(j)
        elif i not in follows:
            # i is unseen: it becomes the head of a new group.
            lines[i] = [i, j]
            # Resets j's predecessor list to the new head (the bookkeeping below
            # then appends i once more).
            follows[j] = [i]
        else:
            # i was absorbed earlier: walk back to the head of its group.
            root = follows[i][0]
            while root in follows:
                root = follows[root][0]
            # Attach to the root that was just checked. Appending to
            # lines[follows[i][0]] instead raised KeyError for chains longer than
            # two links, because intermediate members never head a group.
            if j not in lines[root]:
                lines[root].append(j)

        if j not in follows:
            follows[j] = [i]
        else:
            follows[j].append(i)
    return lines, follows

In [3]:
def final_file_line_by_line(file_as_dict, threshold):
    '''
    Collapse OCR lines that belong to the same visual line into single entries.

    Merge candidates come from get_lines_to_merge(file_as_dict, threshold) and are
    grouped by line_merging(). The result is a dict of parallel lists:
      'full_line' - word list of the entry (words of all members concatenated for a group)
      'page_id'   - page of the entry (the group head's page for a group)
      'ymax_max'  - largest ymax among the entry's members
      'ymin_min'  - smallest ymin among the entry's members
    Entries appear in the order of the original line indices. A line that was absorbed
    into another line's group produces no entry of its own.
    '''
    group_members, absorbed = line_merging(get_lines_to_merge(file_as_dict, threshold))

    full_lines, page_ids, tops, bottoms = [], [], [], []

    for idx in range(len(file_as_dict['words'])):
        if idx in group_members:
            # idx heads a group: concatenate member words and take the outer extent.
            members = group_members[idx]
            entry_words = []
            for member in members:
                entry_words.extend(file_as_dict['words'][member])
            top = min([file_as_dict['ymin_ymax'][member][0] for member in members])
            bottom = max([file_as_dict['ymin_ymax'][member][1] for member in members])
        elif idx in absorbed:
            # Already emitted as part of another line's group.
            continue
        else:
            # Stand-alone line: carried over unchanged.
            entry_words = file_as_dict['words'][idx]
            top = file_as_dict['ymin_ymax'][idx][0]
            bottom = file_as_dict['ymin_ymax'][idx][1]

        full_lines.append(entry_words)
        page_ids.append(file_as_dict['page_id'][idx])
        tops.append(top)
        bottoms.append(bottom)

    return {'full_line': full_lines, 'page_id': page_ids, 'ymax_max': bottoms, 'ymin_min': tops}
def get_toc_page(preprocessed_output):
    '''
    Return the page id of the first line whose space-joined, lower-cased words contain
    "table of contents", or None when no line does.
    '''
    pages_with_lines = zip(preprocessed_output['page_id'],
                           preprocessed_output['full_line'])
    return next(
        (page for page, words in pages_with_lines
         if "table of contents" in " ".join(words).lower()),
        None,
    )

def find_start(section_info, preprocessed_output,
               subset_match_threshold, line_len_match_threshold,
               beg_line_match_threshold, last_line_pointer):
    '''
    Find the first document line, at or after `last_line_pointer`, that matches a TOC title.

    section_info[0] is the TOC title. Lines on the page returned by get_toc_page() are
    skipped. A line matches when all of the following hold:
      - fuzz.partial_ratio(title, line) (both lower-cased) > subset_match_threshold
      - len(line) >= len(title) * line_len_match_threshold
      - fuzz.partial_ratio(title, first 2*len(title) characters of the line) (both
        lower-cased) > beg_line_match_threshold

    Returns ((line_text, title, ymin, ymax, page_id), index_after_match) on a match,
    otherwise (None, last_line_pointer).
    '''
    total_lines = len(preprocessed_output['full_line'])
    if last_line_pointer == total_lines:
        return None, last_line_pointer

    toc_page = get_toc_page(preprocessed_output)

    for idx in range(last_line_pointer, total_lines):
        page_id = preprocessed_output['page_id'][idx]
        if page_id == toc_page:
            continue

        title = section_info[0]
        text = " ".join(preprocessed_output['full_line'][idx])

        # Title must appear (fuzzily) somewhere in the line ...
        if fuzz.partial_ratio(title.lower(), text.lower()) <= subset_match_threshold:
            continue
        # ... the line must not be much shorter than the title ...
        if len(text) < len(title) * line_len_match_threshold:
            continue
        # ... and the title must appear near the start of the line.
        head = text[0: len(title) * 2]
        if fuzz.partial_ratio(title.lower(), head.lower()) <= beg_line_match_threshold:
            continue

        match = (text,
                 title,
                 preprocessed_output['ymin_min'][idx],
                 preprocessed_output['ymax_max'][idx],
                 page_id)
        return match, idx + 1

    return None, last_line_pointer

def get_starts_all(section_dict, preprocessed_output,
                   subset_match_threshold=80,
                   line_len_match_threshold=0.8,
                   beg_line_match_threshold=80):
    '''
    Locate, in document order, the line where each TOC section starts.

    Parameters:
        section_dict: mapping whose values are section_info tuples; section_info[0]
            is the TOC title. Sections are searched in the mapping's iteration order.
        preprocessed_output: dict of parallel lists passed through to find_start().
        subset_match_threshold: only accept fuzzy subset-match ratios above this
            value (out of 100). Default 80.
        line_len_match_threshold: only match document lines that are not much shorter
            than the TOC label (line length >= label length * this factor). Default 0.8.
        beg_line_match_threshold: minimum fuzzy ratio (exclusive) between the label
            and the beginning of the line. Default 80.

    Returns:
        List of the match tuples returned by find_start() for every section that was
        found. Sections that cannot be matched are reported with print() and skipped;
        the search for the next section then resumes from the same position.
    '''
    last_line_pointer = 0
    starts = []

    for section_info in section_dict.values():
        # Each search resumes after the previous match so sections stay in order.
        start, last_line_pointer = find_start(section_info, preprocessed_output,
                                               subset_match_threshold, line_len_match_threshold,
                                               beg_line_match_threshold, last_line_pointer)
        if start:
            starts.append(start)
        else:
            print(f"Couldn't match {section_info[0]} with a line. Moving onto next TOC section")

    return starts
def flatten_contract_dict(nested_dict):
    '''
    Flatten a two-level TOC dict into a single dict keyed 1, 2, 3, ...

    Each value of `nested_dict` is (title, sub_sections) where sub_sections is a dict
    of values shaped the same way. Every top-level title is emitted, immediately
    followed by the titles of its direct sub-sections. Each output value is
    (title, {}) with a fresh empty dict.
    '''
    ordered_titles = []
    for section in nested_dict.values():
        ordered_titles.append(section[0])
        ordered_titles.extend(sub_section[0] for sub_section in section[1].values())

    return {position: (title, {})
            for position, title in enumerate(ordered_titles, start=1)}

In [4]:
# JSON files loaded by the cells below (one per contract), addressed by list position.
json_output_paths = ['pdf_0_from_list_in_discord.json',
                     'pdf_1_from_list_in_discord.json',
                     'pdf_2_from_list_in_discord.json',
                     'pdf_3_from_list_in_discord.json',]

In [5]:
# Load the first JSON export and flatten it to one record per OCR line.
with open(json_output_paths[0], 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
# Merge same-page lines whose vertical extents overlap with IoU > 0.65.
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
# Sort by page, then by ascending ymin, and convert back to a dict of lists.
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

In [6]:
# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
with open("TOC_Labels_Set1.pkl", "rb") as f:
    section_dicts = pickle.load(f)
# Flatten each contract's nested TOC labels into a single numbered dict of (title, {}) entries.
section_dicts = [flatten_contract_dict(section_dict) for section_dict in section_dicts]

In [7]:
# NOTE(review): JSON index 0 is paired with section_dicts[0] here, while the later cells
# use section_dict_index = json_index + 1 -- confirm this pairing is intended.
section_dict = section_dicts[0]

In [8]:
# Match every TOC label to its starting line; unmatched labels are printed.
get_starts_all(section_dict, preprocessed_output)

Couldn't match Article 1. Definitions with a line. Moving onto next TOC section
Couldn't match Article 2. Grant of Rights with a line. Moving onto next TOC section
Couldn't match Article 3. Governance with a line. Moving onto next TOC section
Couldn't match Article 4. Development and Regulatory Activities with a line. Moving onto next TOC section
Couldn't match Article 5. Commercialization; Supply: Trademarks with a line. Moving onto next TOC section
Couldn't match Article 6. Payments with a line. Moving onto next TOC section
Couldn't match Article 7. Payments, Books and Records with a line. Moving onto next TOC section


[('2.1 Grant of Rights to Distributor. Subject to the terms and conditions of this Agreement, Zogenix hereby appoints',
  '2.1 Grant of Rights to Distributor',
  0.185546875,
  0.203125,
  13),
 ('2.2 Sub-distribution by Distributor. Distributor shall not have the right to (a) appoint any Sub-distributors for the',
  '2.2 Sub-distribution by Distributor',
  0.296875,
  0.314453125,
  13),
 ('2.3 Supply of Product for Distributorship. Zogenix shall supply (orhave supplied) to. Distributor, in accordance with the',
  '2.3 Supply of Product for Distributorship',
  0.5078125,
  0.5263671875,
  13),
 ('2.4 No Other Rights; Other Limitations. Except for the rights expressly granted in this Agreement, Zogenix retains all',
  '2.4 No Other Rights; Other Limitations',
  0.1455078125,
  0.1630859375,
  14),
 ('2.5 Non-Compete Covenant. During the Term, without the prior written approval of Zogenix, Distributor shall not, and',
  '2.5 Non-Compete Covenant',
  0.328125,
  0.345703125,
  14),
 ('2.

Commentary:
For Zogenix, it looks like it matched everything except the TOC labels with "Article" in them. This is because the TOC has "Article X. [SOME LABEL]" on one line, whereas the body of the contract splits it across 2 lines. We can make a reasonable fix for this edge case.

Let's do the next contract now

In [10]:
json_index = 1
# The JSON files skip the endorsement agreement (already handled), so the index into
# section_dicts differs from json_index. TODO: make this mapping explicit.
section_dict_index = 2

section_dict = section_dicts[section_dict_index]

# Same pipeline as for the first contract: load, flatten, merge lines, sort.
with open(json_output_paths[json_index], 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

In [11]:
# Match every TOC label to its starting line; unmatched labels are printed.
get_starts_all(section_dict, preprocessed_output)

[('ARTICLEIDEFNITONS', 'ARTICLE I DEFINITIONS', 0.076171875, 0.0869140625, 1),
 ('1.1 Defined Terms', '1.1 Defined Terms', 0.0869140625, 0.099609375, 1),
 ('ARTICLEIASIGAMENT OF SOLELY OWNED INTELLECTUAL PROPERIYRIGHTS',
  'ARTICLE II ASSIGNMENT OF SOLELY OWNED INTELLECTUAL PROPERTY RIGHTS',
  0.0986328125,
  0.1123046875,
  1),
 ('2.1 Assigned Intellectual Property Rights 7',
  '2.1 Assigned Intellectual Property Rights',
  0.1103515625,
  0.1240234375,
  1),
 ('ARTICLEI III LICENSING OF INTELLECTUAL PROPERTY RIGHTS 8',
  'ARTICLE III LICENSING OF INTELLECTUAL PROPERTY RIGHTS',
  0.123046875,
  0.13671875,
  1),
 ('3.1 Licensed Intellectual Property Rights',
  '3.1 Licensed Intellectual Property Rights',
  0.1357421875,
  0.1484375,
  1),
 ('3.2 Reserved Intellectual Property Rights',
  '3.2 Reserved Intellectual Property Rights',
  0.1474609375,
  0.162109375,
  1),
 ('3.3 No Rescission', '3.3 No Rescission', 0.1591796875, 0.171875, 1),
 ('ARTICLEIV TRADEMARKS', 'ARTICLE IV TRADEMARK

'OTISWORLDWIDECORP_04_03_2020-EX-10.4-INTELLECTUAL PROPERTY AGREEMENT by and among UNITED TECHNOLOGIES CORPORATION, OTIS WORLDWIDE CORPORATION and CARRIER ~1'

This matched lines on the TOC page itself. This is likely because the OCR misspelled "table of contents", so the TOC page was not detected and skipped. We can try to fix this via some broader kind of match for the TOC heading.

In [12]:
# Inspect the merged lines to see how the OCR rendered the table-of-contents heading.
preprocessed_output['full_line']

[['Exhibit', '10.4'],
 ['EXECUTION', 'VERSION'],
 ['AGREEMENT'],
 ['by', 'and', 'among'],
 ['INTEDTECINOLOGIESCORPORATON,'],
 ['OTIS', 'WORLDWIDE', 'CORPORATION'],
 ['and'],
 ['CARRIER', 'GLOBAL', 'CORPORATION'],
 ['Dated', 'as', 'of', 'April', '12,', '2020'],
 ['TABLEOFCONIENIS'],
 ['ARTICLEIDEFNITONS'],
 ['1.1', 'Defined', 'Terms'],
 ['ARTICLEIASIGAMENT',
  'OF',
  'SOLELY',
  'OWNED',
  'INTELLECTUAL',
  'PROPERIYRIGHTS'],
 ['2.1', 'Assigned', 'Intellectual', 'Property', 'Rights', '7'],
 ['ARTICLEI',
  'III',
  'LICENSING',
  'OF',
  'INTELLECTUAL',
  'PROPERTY',
  'RIGHTS',
  '8'],
 ['3.1', 'Licensed', 'Intellectual', 'Property', 'Rights'],
 ['3.2', 'Reserved', 'Intellectual', 'Property', 'Rights'],
 ['10'],
 ['3.3', 'No', 'Rescission'],
 ['ARTICLEIV', 'TRADEMARKS'],
 ['4.1', 'Ownership', 'of', 'United', 'Technologies', 'Trademarks'],
 ['4.2', 'Use', 'ofUnited', 'Technologies', 'Trademarks'],
 ['43', 'Special', 'Trademark', 'Provisions.'],
 ['ARTICLEY', 'VI', 'EXCLUDED', 'AGREEMENT

Confirmed the above hypothesis: the OCR read the heading as "TABLEOFCONIENIS". This PDF is poorly formatted, with no spacing between characters, which hurts the OCR.

Let's see the next document

In [13]:
json_index = 2
# The JSON files skip the endorsement agreement (already handled), so the index into
# section_dicts differs from json_index. TODO: make this mapping explicit.
section_dict_index = 3

section_dict = section_dicts[section_dict_index]

# Same pipeline as for the earlier contracts: load, flatten, merge lines, sort.
with open(json_output_paths[json_index], 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

In [14]:
# Match every TOC label to its starting line; unmatched labels are printed.
get_starts_all(section_dict, preprocessed_output)

[('Section 1. Definitions. Capitalized terms used herein that are not otherwise defined shall have the meanings assigned to',
  'Section1. Definitions',
  0.490234375,
  0.5068359375,
  3),
 ('Section 2. Appointment and Obligations ofthe Remarketing Agent.',
  'Section2. Appointment and Obligations of the Remarketing Agent',
  0.1044921875,
  0.1201171875,
  5),
 ('Section 3. Representations. Warranties and Covenants oft the Remarketing Agent and the Fund.',
  'Section3. Representations, Warranties and Covenants of the Remarketing Agent and the Fund',
  0.1484375,
  0.1650390625,
  8),
 ('Section 4. Feesand Expenses. For the performance ofi its services as Remarketing Agent hereunder, the Fund shall pay',
  'Section4. Fees and Expenses',
  0.1337890625,
  0.1513671875,
  9),
 ('Section 5. Resignation. Suspension and Removal oft the Remarketing. Agent.',
  'Section5. Resignation, Suspension and Removal of the Remarketing Agent',
  0.3095703125,
  0.328125,
  9),
 ('Section 6. Dealing in

In [15]:
# Compare the number of TOC labels with the number of matched section starts.
len(section_dict), len(get_starts_all(section_dict, preprocessed_output))

(22, 22)

Looks like NUVEEN - REMARKETING AGREEMENT is a perfect match

## Big concern for bias in labeling data: Will many contract section headers be biased toward these types of contracts, where the header is in a normal font size and usually in line with other words?

On to the next example

In [16]:
json_index = 3
# The JSON files skip the endorsement agreement (already handled), so the index into
# section_dicts differs from json_index. TODO: make this mapping explicit.
section_dict_index = 4

section_dict = section_dicts[section_dict_index]

# Same pipeline as for the earlier contracts: load, flatten, merge lines, sort.
with open(json_output_paths[json_index], 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

In [17]:
# Match every TOC label to its starting line; unmatched labels are printed.
get_starts_all(section_dict, preprocessed_output)

Couldn't match 3.8 [* * *] with a line. Moving onto next TOC section
Couldn't match 3.9 [* * *] with a line. Moving onto next TOC section


[('Article 1 Interpretation',
  'Article 1 Interpretation',
  0.4501953125,
  0.4638671875,
  6),
 ('1.1 Definitions', '1.1 Definitions', 0.4677734375, 0.48046875, 6),
 ('1.2 Other Definitions',
  '1.2 Other Definitions',
  0.330078125,
  0.3427734375,
  10),
 ('1.3Currency', '1.3 Currency', 0.3779296875, 0.392578125, 10),
 ('1.41 Headings', '1.4 Headings', 0.4267578125, 0.439453125, 10),
 ('1.5 Exhibits', '1.5 Exhibits', 0.486328125, 0.4990234375, 10),
 ('1.6. Applicablel Law', '1.6 Applicable Law', 0.095703125, 0.1103515625, 11),
 ('Article 2 Term', 'Article 2 Term', 0.15625, 0.16796875, 11),
 ('2.1 Term', '2.1 Term', 0.173828125, 0.185546875, 11),
 ('2.2 Effect of Expiration on Purchase Orders',
  '2.2 Effect of Expiration on Purchase Orders',
  0.306640625,
  0.3193359375,
  11),
 ('Article 3 Supply of Product',
  'Article 3 Supply of Product',
  0.3779296875,
  0.3916015625,
  11),
 ('3.1 Supply of Product',
  '3.1 Supply of Product',
  0.396484375,
  0.4091796875,
  11),
 ('3.2 M

Looks like it matches the table of contents again...let's confirm

In [18]:
# Inspect the merged lines to check whether the matches above came from the TOC page.
preprocessed_output['full_line']

[['Exhibit', '10.29'],
 ['THIS',
  'EXHIBIT',
  'HASI',
  'BEEN',
  'REDACTED.',
  'ANDI',
  'ISTHE',
  'SUBJECTOFACONFIDENTIALTREATMENT',
  'REQUEST.',
  'REDACTED!',
  'MATERIALIS'],
 ['MARKED',
  'WITHI',
  '[**',
  '*JANDHASI',
  'BEENI',
  'HILEDSEPARATELY',
  'WITH',
  'THE',
  'SECURITIESAND:',
  'EXCHANGE',
  'COMMISSION.'],
 ['Execution', 'Version'],
 ['Outsourcing', 'Agreement'],
 ['Between'],
 ['Paratek', 'Pharmaceuticals,', 'Inc.'],
 ['and'],
 ['CARBOGEN.', 'AMCIS', 'AG'],
 ['Date'],
 ['30', 'December', '2016'],
 ['Source:', 'PARATEK', 'PHARMACEUTICALS,', 'INC.,', '10-K/A,', '5/5/2017'],
 ['THIS',
  'EXHIBITHAS!',
  'BEENI',
  'REDACTED.',
  'ANDI',
  'ISTHE',
  'SURIECTOFACONIDENTALTREATMENT',
  'REQUEST.',
  'REDACTIDMATERIALIS'],
 ['MARKED',
  'WITH!*',
  '*JANDHASI',
  'BEENI',
  'FILEDSEPARATELY',
  'WITITIESECIRITESANDENCIANGE',
  'COMMISSION.'],
 ['Table', 'of', 'Contents'],
 ['Article', '1', 'Interpretation', '7'],
 ['1.1', 'Definitions', '7'],
 ['1.2', 'Other', 'De

Oh wait, no, it actually matched the lines in the body of the contract! They just looked like TOC matches, but they were not.

## Out of 5 contracts, here are the results:
1. PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement
    - perfect match

2. ZogenixInc_20190509_10-Q_EX-10.2_11663313_EX-10.2_Distributor Agreement 
    - many good matches; doesn't match lines where the TOC label is actually split across 2 lines in the document (specifically cases where the label is "ARTICLE X ....")

3. 'OTISWORLDWIDECORP_04_03_2020-EX-10.4-INTELLECTUAL PROPERTY AGREEMENT by and among UNITED TECHNOLOGIES CORPORATION, OTIS WORLDWIDE CORPORATION and CARRIER ~1'
    - matched the TOC page itself. Need better method to find that page. Use state dictionary and match twice method OR fuzzy matching OR some other method
    
4. NUVEEN - REMARKETING AGREEMENT
    - Perfect match
    
5. ParatekPharmaceuticalsInc_20170505_10-KA_EX-10.29_10323872_EX-10.29_Outsourcing Agreement
    - Looks like a perfect match except for two labels in the actual TOC that are ambiguous. They appear to be redacted (the exhibit states that redacted material is marked with [* * *]), so there is no title text to match. The two lines were:
        - 3.8 [* * *]
        - 3.9 [* * *]