## Multiline match problem:
- The mapping from matched words to horizontal (x-coordinate) positions does not carry over to the window-style exact match when the match spans 2 lines
- solution:
    - choose the line that has more characters and try to do some x-coordinate matching against this line
        - might need to look at character-level windows? not sure
    - extend this x-coordinate range to the y-coordinates of the line above / below

In [29]:
import sys
import os
sys.path.append("../doctr/")
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
import json
import pickle
import pandas as pd
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

In [30]:
def get_file_by_dict(json_output):
    '''
    Flatten the OCR json output into parallel, per-line lists.

    Every line of every block of every page contributes one entry to each of
    'words' (the word strings of the line), 'bboxes' (the geometry of each word),
    'ymin_ymax' (the vertical extent taken from the line geometry), 'page_id'
    and 'page_dimensions'. 'line_merge_next' is created but left empty here.
    '''
    words, bboxes, spans, page_ids, page_dims = [], [], [], [], []

    for page in json_output['pages']:
        for block in page['blocks']:
            for line in block['lines']:
                # line geometry is ((xmin, ymin), (xmax, ymax)); only the y extent is kept
                (_, line_top), (_, line_bottom) = line['geometry']

                words.append([word['value'] for word in line['words']])
                bboxes.append([word['geometry'] for word in line['words']])
                spans.append((line_top, line_bottom))
                page_ids.append(page['page_idx'])
                page_dims.append(page['dimensions'])

    return {'words': words,
            'ymin_ymax': spans,
            'page_id': page_ids,
            'page_dimensions': page_dims,
            'line_merge_next': [],
            'bboxes': bboxes}

def getIOU(segment1, segment2, threshold):
    '''
    Decide whether two vertical extents overlap enough to be treated as the same line.

    Each segment is a (ymin, ymax) pair. The overlap is measured as
    intersection-over-union of the two extents, and the function returns True
    only when that ratio is strictly greater than threshold. It is needed because
    the OCR model does not always put words of the same visual line in the same block.

    Two identical zero-height extents have a union of 0; they are treated as a
    full overlap (ratio 1.0) instead of raising ZeroDivisionError.
    '''
    ymin1, ymax1 = segment1
    ymin2, ymax2 = segment2

    intersection = min(ymax1, ymax2) - max(ymin1, ymin2)

    # no overlap
    if intersection < 0:
        return False

    union = max(ymax1, ymax2) - min(ymin1, ymin2)

    # union == 0 with a non-negative intersection means all four coordinates coincide
    ratio = 1.0 if union == 0 else intersection / union
    return ratio > threshold

def get_lines_to_merge(file_as_dict, threshold):
    '''
    Return every (i, j) pair of line indices, i < j, that sit on the same page
    and whose vertical extents pass the getIOU test for the given threshold.
    '''
    page_ids = file_as_dict['page_id']
    spans = file_as_dict['ymin_ymax']
    n_lines = len(file_as_dict['words'])

    return [
        (first, second)
        for first in range(n_lines)
        for second in range(first + 1, n_lines)
        # the page check comes first so lines on different pages are never compared
        if page_ids[first] == page_ids[second]
        and getIOU(spans[first], spans[second], threshold)
    ]

def line_merging(lines_to_merge_indices):
    '''
    Group pairwise "same line" links into merged lines.

    lines_to_merge_indices is an iterable of (i, j) index pairs (the output of
    get_lines_to_merge). Returns a tuple (lines, follows):
      - lines maps the index that opened a group to the list of indices in that
        group (the opener first, then the members in the order they were linked)
      - follows maps every index seen as the second element of a pair to the list
        of indices it was paired after

    When i already follows another index, the chain of followers is walked back
    to the index that opened the group and j is added to that group. The append
    uses the same group that the membership check inspected; appending to
    lines[follows[i][0]] instead raised KeyError for chains such as
    (0, 1), (1, 2), (2, 3), where index 1 follows 0 and never opens a group.
    '''
    follows = dict()
    lines = dict()
    for i, j in lines_to_merge_indices:

        if i in lines:
            lines[i].append(j)
        elif i not in follows:
            # i has not been seen on either side of a pair yet: it opens a new group
            lines[i] = [i, j]
            follows[j] = [i]
        else:
            # walk back through the followers until the index that opened the group
            group_root = follows[i][0]
            while group_root in follows:
                group_root = follows[group_root][0]
            if j not in lines[group_root]:
                lines[group_root].append(j)

        # NOTE: when a new group was opened above, follows[j] was just seeded, so i is
        # recorded twice for that j. Kept as-is; the caller in this file only tests keys.
        if j not in follows:
            follows[j] = [i]
        else:
            follows[j].append(i)
    return lines, follows

In [62]:
def final_file_line_by_line(file_as_dict, threshold):
    '''
    Rebuild the document one visual line at a time.

    OCR lines that get_lines_to_merge / line_merging group together are emitted
    once, at the position of the index that opened the group, with their words
    and word bboxes concatenated in group order and their vertical extent widened
    to cover every member. Lines that only appear as followers of a group are
    skipped; all other lines are passed through unchanged.

    Returns a dict of parallel lists: 'full_line', 'page_id', 'ymax_max',
    'ymin_min' and 'bboxes'.
    '''
    merge_groups, merged_followers = line_merging(get_lines_to_merge(file_as_dict, threshold))

    all_words = file_as_dict['words']
    all_bboxes = file_as_dict['bboxes']
    all_pages = file_as_dict['page_id']
    all_spans = file_as_dict['ymin_ymax']

    full_lines, page_ids, bottoms, tops, line_bboxes = [], [], [], [], []

    for idx, words in enumerate(all_words):
        if idx in merge_groups:
            members = merge_groups[idx]
            full_lines.append([word for member in members for word in all_words[member]])
            line_bboxes.append([bbox for member in members for bbox in all_bboxes[member]])
            # the merged line spans from the highest top to the lowest bottom of its members
            tops.append(min(all_spans[member][0] for member in members))
            bottoms.append(max(all_spans[member][1] for member in members))
            page_ids.append(all_pages[idx])
        elif idx not in merged_followers:
            full_lines.append(words)
            line_bboxes.append(all_bboxes[idx])
            page_ids.append(all_pages[idx])
            tops.append(all_spans[idx][0])
            bottoms.append(all_spans[idx][1])
        # else: idx was absorbed into an earlier group and has already been emitted there

    print(len(full_lines), len(line_bboxes))
    return {'full_line': full_lines,
            'page_id': page_ids,
            'ymax_max': bottoms,
            'ymin_min': tops,
            'bboxes': line_bboxes}

def get_toc_page(preprocessed_output):
    '''
    Return the page id of the first line whose lower-cased text matches the
    table-of-contents pattern, or None when no line matches.
    '''
    regex_exp = r"(table of contents|tableof(?:contents)?|(?:table\s)?of*conten|contents?)"

    pages_and_lines = zip(preprocessed_output['page_id'], preprocessed_output['full_line'])
    return next(
        (page_id for page_id, words in pages_and_lines
         if re.search(regex_exp, " ".join(words).lower())),
        None,
    )

def match_line(section_info, preprocessed_output, idx1, idx2,
               subset_match_threshold, line_len_match_threshold,
               beg_line_match_threshold, first_line_match_threshold, toc_page):
    '''
    Try to match the section title section_info[0] against one OCR line (idx2 is
    None) or against two lines joined together (both indices given).

    Lines on the table-of-contents page never match, and a two-line match
    requires both lines to be on the same page. A candidate text matches when
      - its partial ratio against the title exceeds subset_match_threshold,
      - it is at least line_len_match_threshold times as long as the title,
      - its beginning (twice the title length) exceeds beg_line_match_threshold,
      - and, for two lines, the first line alone exceeds first_line_match_threshold.

    Returns ((line1, line2, title, ymin, ymax, page_id, bboxes), next_index) on a
    match, where line2 is None and bboxes is a single list for a one-line match,
    and bboxes is [bboxes_line1, bboxes_line2] for a two-line match. Returns
    (None, None) otherwise.
    '''
    pages = preprocessed_output['page_id']
    if pages[idx1] == toc_page:
        return None, None

    two_lines = idx1 is not None and idx2 is not None
    if two_lines and (pages[idx2] == toc_page or pages[idx1] != pages[idx2]):
        return None, None

    ocr_lines = preprocessed_output['full_line']
    first_text = " ".join(ocr_lines[idx1])
    if two_lines:
        candidate = " ".join(ocr_lines[idx1] + ocr_lines[idx2])
    else:
        candidate = first_text

    title = section_info[0]

    def resembles(text, threshold):
        return fuzz.partial_ratio(title.lower(), text.lower()) > threshold

    # only the start of the candidate is compared for the "begins with the title" check
    candidate_start = candidate[0: len(title) * 2]

    matched = (resembles(candidate, subset_match_threshold)
               and len(candidate) >= len(title) * line_len_match_threshold
               and resembles(candidate_start, beg_line_match_threshold)
               and (not two_lines or resembles(first_text, first_line_match_threshold)))
    if not matched:
        return None, None

    top = preprocessed_output['ymin_min'][idx1]
    page_id = pages[idx1]

    if two_lines:
        bottom = preprocessed_output['ymax_max'][idx2]  # bottom of the second line
        bboxes = [preprocessed_output['bboxes'][idx1], preprocessed_output['bboxes'][idx2]]
        second_text = " ".join(ocr_lines[idx2])
        return (first_text, second_text, title, top, bottom, page_id, bboxes), idx2 + 1

    bottom = preprocessed_output['ymax_max'][idx1]
    bboxes = preprocessed_output['bboxes'][idx1]
    return (first_text, None, title, top, bottom, page_id, bboxes), idx1 + 1
    
def find_start_new(section_info, preprocessed_output,
                   subset_match_threshold, line_len_match_threshold,
                   beg_line_match_threshold, first_line_match_threshold, 
                   last_line_pointer):
    '''
    Given a toc section title, scan the lines of the file forward from
    last_line_pointer (the line after the previous section's match) and return
    the first match.

    For every line the attempts are made in this order: the line on its own, the
    next line on its own, then both lines joined. Trying each line separately
    before the pair keeps an unrelated first line from being carried into a
    match that the second line satisfies by itself.

    Returns the (match, next_pointer) tuple produced by match_line on success,
    or (None, last_line_pointer) when nothing matches.

    The final line of the document is also tried on its own. Previously lines
    were only visited as (line, next line) pairs, so when exactly one line was
    left after the previous match it was never examined.
    '''
    n_lines = len(preprocessed_output['full_line'])
    if last_line_pointer >= n_lines:
        return None, last_line_pointer

    toc_page = get_toc_page(preprocessed_output)

    def attempt(idx1, idx2):
        return match_line(section_info, preprocessed_output, idx1, idx2,
                          subset_match_threshold, line_len_match_threshold,
                          beg_line_match_threshold, first_line_match_threshold,
                          toc_page)

    for idx1 in range(last_line_pointer, n_lines):
        # first the line alone ...
        candidates = [(idx1, None)]
        idx2 = idx1 + 1
        if idx2 < n_lines:
            # ... then the next line alone, and only then the two lines merged.
            # This code is extremely inefficient (each line is scored twice) but keeping for now.
            candidates += [(idx2, None), (idx1, idx2)]

        for first, second in candidates:
            match = attempt(first, second)
            if match[0]:
                return match
        ## if no match, move onto the next pair of lines

    return None, last_line_pointer

def get_starts_all(section_dict, preprocessed_output, *,
                   subset_match_threshold=80,
                   line_len_match_threshold=0.8,
                   beg_line_match_threshold=80,
                   first_line_match_threshold=20):
    '''
    Find the starting line of every section in section_dict, in order.

    Each section is searched for from the line after the previous match, so
    matches never go backwards in the document. Sections that cannot be matched
    are reported with a printed message and skipped.

    The thresholds are forwarded to find_start_new and default to the values
    that were previously hard-coded here:
      - subset_match_threshold: only allow subset match ratios above this (out of 100)
      - line_len_match_threshold: only match document lines that are not much
        shorter than the table of contents label (fraction of the label length)
      - beg_line_match_threshold: minimum ratio for the beginning of the line
      - first_line_match_threshold: for merged lines, make sure the top line is
        at least mildly relevant

    Returns the list of match tuples produced by find_start_new.
    '''
    last_line_pointer = 0
    starts = []

    for section_info in section_dict.values():

        start, last_line_pointer = find_start_new(section_info, preprocessed_output,
                                                  subset_match_threshold, line_len_match_threshold,
                                                  beg_line_match_threshold, first_line_match_threshold,
                                                  last_line_pointer)
        if start:
            starts.append(start)
        else:
            print(f"Couldn't match {section_info[0]} with a line. Moving onto next TOC section")

    return starts
def flatten_contract_dict(nested_dict):
    '''
    Flatten a two-level section dict into a single level numbered from 1.

    Every top-level entry is followed directly by its sub-entries; each output
    value is (title, {}) with the title taken from the first element of the entry.
    '''
    def titles_in_order():
        for entry in nested_dict.values():
            yield entry[0]
            for sub_entry in entry[1].values():
                yield sub_entry[0]

    return {number: (title, {}) for number, title in enumerate(titles_in_order(), start=1)}

In [63]:
def _merge_word_bboxes(word_bboxes):
    '''Return the [[x_min, y_min], [x_max, y_max]] box enclosing a run of word boxes.'''
    # x min is the x_left of the first bbox, x max is the x_right of the last bbox
    x_min = word_bboxes[0][0][0]
    x_max = word_bboxes[-1][1][0]
    # y_min is the min of the top left y's, y_max the max of the bottom right y's
    y_min = min(bbox[0][-1] for bbox in word_bboxes)
    y_max = max(bbox[1][-1] for bbox in word_bboxes)
    return [[x_min, y_min], [x_max, y_max]]


def merge_bboxes(words, query, bboxes, window_size, match_score):
    '''
    Locate the run of words that best matches query and merge its word bboxes.

    words and bboxes are parallel lists for one OCR line. The line is cut into
    consecutive, non-overlapping windows of window_size words; each window is
    scored against query with fuzz.token_set_ratio. The first window scoring at
    least match_score is returned immediately. If no window reaches match_score
    (the window text can score lower than the entire string did), the window
    with the highest score above 0 is used instead.

    Returns (merged_bbox, window_text), with merged_bbox as
    [[x_min, y_min], [x_max, y_max]]. Returns (None, None) when there is nothing
    to match: window_size is below 1, words is empty, or every window scores 0.
    Those cases previously failed with ValueError / UnboundLocalError.
    '''
    if window_size < 1:
        # range() cannot step by 0; a window of no words cannot match anything
        return None, None

    best_score, best_text, best_bboxes = 0, None, None
    for start_idx in range(0, len(words), window_size):
        window_text = " ".join(words[start_idx : start_idx + window_size])
        candidate_bboxes = bboxes[start_idx : start_idx + window_size]

        window_score = process.extractBests(window_text, [query], scorer=fuzz.token_set_ratio)[0][-1]

        if window_score >= match_score:
            return _merge_word_bboxes(candidate_bboxes), window_text

        if window_score > best_score:
            best_score, best_text, best_bboxes = window_score, window_text, candidate_bboxes

    if best_text is None:
        return None, None

    # no window reached match_score: fall back to the segment with the highest score
    return _merge_word_bboxes(best_bboxes), best_text

def extract_exact_match(row):
    '''
    For one result row, find the part of 'Line via OCR' that corresponds to
    'Section Title via HTML' and return (merged_bbox, matched_text).

    The title is scored against the whole OCR line first; that score and the
    number of words in the title are then handed to merge_bboxes as the target
    score and the window size. Rows without a title yield (None, None).
    '''
    query = row['Section Title via HTML']
    if query is None:
        return None, None

    ocr_line = row.loc['Line via OCR']
    best_text, best_score = process.extractBests(ocr_line, [query], scorer=fuzz.token_set_ratio)[0]

    bbox, text = merge_bboxes(ocr_line.split(),
                              query,
                              row['bboxes'],
                              len(best_text.split()),
                              best_score)
    return bbox, text

In [64]:
# Load the pickled TOC label dicts and flatten each one to a single numbered level.
# NOTE: pickle.load can execute arbitrary code from the file it reads -- only load a
# TOC_Labels_Set1.pkl that comes from a trusted source.
with open("TOC_Labels_Set1.pkl", "rb") as f:
    section_dicts = pickle.load(f)
section_dicts = [flatten_contract_dict(section_dict) for section_dict in section_dicts]

In [65]:
# OCR exports to process, keyed as 'pdf_<n>'; the run cells parse <n> out of the key
# and use it as the index into section_dicts.
json_files = {
    'pdf_0': dict(
        name="ZogenixInc_20190509_10-Q_EX-10.2_11663313_EX-10.2_Distributor Agreement",
        path="pdf_0_from_list_in_discord.json",
    ),
    'pdf_1': dict(
        name="PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement",
        path="pdf_1_from_list_in_discord.json",
    ),
    'pdf_2': dict(
        name="OTISWORLDWIDECORP_04_03_2020-EX-10.4-INTELLECTUAL PROPERTY AGREEMENT by and among UNITED TECHNOLOGIES CORPORATION, OTIS WORLDWIDE CORPORATION and CARRIER ~1",
        path="pdf_2_from_list_in_discord.json",
    ),
    'pdf_3': dict(
        name="NUVEEN - REMARKETING AGREEMENT",
        path="pdf_3_from_list_in_discord.json",
    ),
    'pdf_4': dict(
        # NOTE(review): the leading apostrophe in this name looks like a copy/paste artifact -- confirm
        name="'ParatekPharmaceuticalsInc_20170505_10-KA_EX-10.29_10323872_EX-10.29_Outsourcing Agreement",
        path="pdf_4_from_list_in_discord.json",
    ),
}

## Output using the non-multiline match:

In [16]:
# Pick the document to process; the number in the key is also the index into section_dicts.
file_key = 'pdf_0'
json_path = json_files[file_key]['path']
filename = json_files[file_key]['name']
pickle_key = int(file_key.split('_')[1])
section_dict = section_dicts[pickle_key]


# Load the OCR json, regroup it line by line, then order the lines top-to-bottom within each page.
with open(json_path, 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

# NOTE(review): this rename labels 6 columns (there is no 'Line2 via OCR'), while match_line as
# defined above returns 7-field tuples -- presumably this cell was last run against an earlier,
# single-line version of the matcher; confirm before re-running.
df = pd.DataFrame(get_starts_all(section_dict, preprocessed_output)).rename(columns={0:'Line via OCR',
                                     1:'Section Title via HTML',
                                     2:'ymin',
                                     3:'ymax',
                                     4:'page_id',
                                     5:'bboxes'})
# Narrow every matched line down to the words (and their merged bbox) that match the section title.
df['exact_match_bbox'], df['exact_match_text'] = zip(*df.apply(lambda row: extract_exact_match(row), axis=1))
df

1858 1858
Couldn't match Article 1. Definitions with a line. Moving onto next TOC section
Couldn't match Article 2. Grant of Rights with a line. Moving onto next TOC section
Couldn't match Article 3. Governance with a line. Moving onto next TOC section
Couldn't match Article 4. Development and Regulatory Activities with a line. Moving onto next TOC section
Couldn't match Article 5. Commercialization; Supply: Trademarks with a line. Moving onto next TOC section
Couldn't match Article 6. Payments with a line. Moving onto next TOC section
Couldn't match Article 7. Payments, Books and Records with a line. Moving onto next TOC section


Unnamed: 0,Line via OCR,Section Title via HTML,ymin,ymax,page_id,bboxes,exact_match_bbox,exact_match_text
0,2.1 Grant of Rights to Distributor. Subject to...,2.1 Grant of Rights to Distributor,0.185547,0.203125,13,"[[[0.1328125, 0.1865234375], [0.1572265625, 0....","[[0.1328125, 0.1865234375], [0.3837890625, 0.2...",2.1 Grant of Rights to Distributor.
1,2.2 Sub-distribution by Distributor. Distribut...,2.2 Sub-distribution by Distributor,0.296875,0.314453,13,"[[[0.1318359375, 0.296875], [0.16015625, 0.312...","[[0.1318359375, 0.296875], [0.3876953125, 0.31...",2.2 Sub-distribution by Distributor.
2,2.3 Supply of Product for Distributorship. Zog...,2.3 Supply of Product for Distributorship,0.507812,0.526367,13,"[[[0.1337890625, 0.509765625], [0.158203125, 0...","[[0.1337890625, 0.5087890625], [0.4228515625, ...",2.3 Supply of Product for Distributorship.
3,2.4 No Other Rights; Other Limitations. Except...,2.4 No Other Rights; Other Limitations,0.145508,0.163086,14,"[[[0.1337890625, 0.146484375], [0.1591796875, ...","[[0.1337890625, 0.1455078125], [0.4208984375, ...",2.4 No Other Rights; Other Limitations.
4,"2.5 Non-Compete Covenant. During the Term, wit...",2.5 Non-Compete Covenant,0.328125,0.345703,14,"[[[0.1328125, 0.3291015625], [0.1591796875, 0....","[[0.1328125, 0.3291015625], [0.3359375, 0.34375]]",2.5 Non-Compete Covenant.
5,2.6No Activities Outside the Territory or Fiel...,2.6 No Activities Outside the Territory or Field,0.396484,0.413086,14,"[[[0.1328125, 0.3974609375], [0.18359375, 0.41...","[[0.1328125, 0.3974609375], [0.5439453125, 0.4...",2.6No Activities Outside the Territory or Fiel...
6,3.1 Joint Steering Committee. The Parties shal...,3.1 Joint Steering Committee,0.576172,0.592773,14,"[[[0.1318359375, 0.576171875], [0.1572265625, ...","[[0.1318359375, 0.576171875], [0.3466796875, 0...",3.1 Joint Steering Committee.
7,3.2 Expenses. Each Party shall bear all its ow...,3.2 Expenses,0.185547,0.202148,17,"[[[0.1328125, 0.1865234375], [0.158203125, 0.1...","[[0.1328125, 0.1865234375], [0.2333984375, 0.2...",3.2 Expenses.
8,3.3 Alliance Managers. Promptly after the Effe...,3.3 Alliance Managers,0.225586,0.242188,17,"[[[0.1318359375, 0.2255859375], [0.1591796875,...","[[0.1318359375, 0.2255859375], [0.3017578125, ...",3.3 Alliance Managers.
9,3.4 Scope of Governance. Notwithstanding the c...,3.4 Scope of Governance,0.323242,0.339844,17,"[[[0.1318359375, 0.3232421875], [0.16015625, 0...","[[0.1318359375, 0.3232421875], [0.32421875, 0....",3.4 Scope of Governance.


## Output using the multiline match (`find_start_new` function)

In [39]:
# Pick the document to process; the number in the key is also the index into section_dicts.
file_key = 'pdf_0'
json_path = json_files[file_key]['path']
filename = json_files[file_key]['name']
pickle_key = int(file_key.split('_')[1])
section_dict = section_dicts[pickle_key]


# Load the OCR json, regroup it line by line, then order the lines top-to-bottom within each page.
with open(json_path, 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

# One row per matched section; the columns follow the 7-field tuple returned by match_line,
# where 'Line2 via OCR' is None for single-line matches.
df = pd.DataFrame(get_starts_all(section_dict, preprocessed_output)).rename(columns={0:'Line via OCR',
                                                                                     1:'Line2 via OCR',
                                     2:'Section Title via HTML',
                                     3:'ymin',
                                     4:'ymax',
                                     5:'page_id',
                                     6:'bboxes'})

1858 1858


In [40]:
# Display the section matches built in the previous cell.
df

Unnamed: 0,Line via OCR,Line2 via OCR,Section Title via HTML,ymin,ymax,page_id,bboxes
0,ARTICLE1.,DEFINITIONS,Article 1. Definitions,0.526367,0.552734,5,"[[[[0.453125, 0.5263671875], [0.548828125, 0.5..."
1,ARTICLE2.,GRANTOF RIGHTS,Article 2. Grant of Rights,0.145508,0.173828,13,"[[[[0.453125, 0.1455078125], [0.548828125, 0.1..."
2,2.1 Grant of Rights to Distributor. Subject to...,,2.1 Grant of Rights to Distributor,0.185547,0.203125,13,"[[[0.1328125, 0.1865234375], [0.1572265625, 0...."
3,2.2 Sub-distribution by Distributor. Distribut...,,2.2 Sub-distribution by Distributor,0.296875,0.314453,13,"[[[0.1318359375, 0.296875], [0.16015625, 0.312..."
4,compliance with this Agreement).,2.3 Supply of Product for Distributorship. Zog...,2.3 Supply of Product for Distributorship,0.482422,0.526367,13,"[[[[0.083984375, 0.4853515625], [0.158203125, ..."
5,2.4 No Other Rights; Other Limitations. Except...,,2.4 No Other Rights; Other Limitations,0.145508,0.163086,14,"[[[0.1337890625, 0.146484375], [0.1591796875, ..."
6,"2.5 Non-Compete Covenant. During the Term, wit...",,2.5 Non-Compete Covenant,0.328125,0.345703,14,"[[[0.1328125, 0.3291015625], [0.1591796875, 0...."
7,any product for [***).,2.6No Activities Outside the Territory or Fiel...,2.6 No Activities Outside the Territory or Field,0.370117,0.413086,14,"[[[[0.08203125, 0.3740234375], [0.1103515625, ..."
8,ARTICLE3.,GOVERNANCE,Article 3. Governance,0.536133,0.563477,14,"[[[[0.453125, 0.5361328125], [0.548828125, 0.5..."
9,3.1 Joint Steering Committee. The Parties shal...,,3.1 Joint Steering Committee,0.576172,0.592773,14,"[[[0.1318359375, 0.576171875], [0.1572265625, ..."


In [41]:
# Inspect a single match (row 19) where both 'Line via OCR' and 'Line2 via OCR' are filled in.
df.iloc[19]

Line via OCR              Information relevant to such] pharmacovigilanc...
Line2 via OCR             4.6 Drug Safety and Pharmacovigilance System i...
Section Title via HTML    4.6 Drug Safety and Pharmacovigilance System i...
ymin                                                                0.34668
ymax                                                               0.386719
page_id                                                                  22
bboxes                    [[[[0.083984375, 0.3466796875], [0.16015625, 0...
Name: 19, dtype: object

In [42]:
# Same row as a plain list, so the values are printed in full instead of truncated.
list(df.iloc[19])

['Information relevant to such] pharmacovigilance activities.',
 '4.6 Drug Safety and Pharmacovigilance System including Global Safety Database. Zogenix shall maintain a global',
 '4.6 Drug Safety and Pharmacovigilance System including Global Safety Database',
 0.3466796875,
 0.38671875,
 22,
 [[[[0.083984375, 0.3466796875], [0.16015625, 0.357421875]],
   [[0.1611328125, 0.3466796875], [0.21484375, 0.3583984375]],
   [[0.2138671875, 0.3466796875], [0.232421875, 0.359375]],
   [[0.23046875, 0.3466796875], [0.2646484375, 0.3583984375]],
   [[0.2626953125, 0.3466796875], [0.384765625, 0.3603515625]],
   [[0.3857421875, 0.3466796875], [0.447265625, 0.3583984375]]],
  [[[0.1328125, 0.3720703125], [0.1591796875, 0.384765625]],
   [[0.158203125, 0.3720703125], [0.19921875, 0.38671875]],
   [[0.19921875, 0.3720703125], [0.2470703125, 0.38671875]],
   [[0.24609375, 0.37109375], [0.2783203125, 0.3857421875]],
   [[0.279296875, 0.3720703125], [0.41796875, 0.38671875]],
   [[0.41796875, 0.37207031

## It looks like it's working, but there's now an edge case where the second line is a good match and the first line is irrelevant but is carried in anyway. Let's add a `first_line_match_threshold`

In [53]:
# Re-run of the multiline match after adding first_line_match_threshold.
# Pick the document to process; the number in the key is also the index into section_dicts.
file_key = 'pdf_0'
json_path = json_files[file_key]['path']
filename = json_files[file_key]['name']
pickle_key = int(file_key.split('_')[1])
section_dict = section_dicts[pickle_key]


# Load the OCR json, regroup it line by line, then order the lines top-to-bottom within each page.
with open(json_path, 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

# One row per matched section; the columns follow the 7-field tuple returned by match_line.
df = pd.DataFrame(get_starts_all(section_dict, preprocessed_output)).rename(columns={0:'Line via OCR',
                                                                                     1:'Line2 via OCR',
                                     2:'Section Title via HTML',
                                     3:'ymin',
                                     4:'ymax',
                                     5:'page_id',
                                     6:'bboxes'})
df

1858 1858


Unnamed: 0,Line via OCR,Line2 via OCR,Section Title via HTML,ymin,ymax,page_id,bboxes
0,ARTICLE1.,DEFINITIONS,Article 1. Definitions,0.526367,0.552734,5,"[[[[0.453125, 0.5263671875], [0.548828125, 0.5..."
1,ARTICLE2.,GRANTOF RIGHTS,Article 2. Grant of Rights,0.145508,0.173828,13,"[[[[0.453125, 0.1455078125], [0.548828125, 0.1..."
2,2.1 Grant of Rights to Distributor. Subject to...,,2.1 Grant of Rights to Distributor,0.185547,0.203125,13,"[[[0.1328125, 0.1865234375], [0.1572265625, 0...."
3,2.2 Sub-distribution by Distributor. Distribut...,,2.2 Sub-distribution by Distributor,0.296875,0.314453,13,"[[[0.1318359375, 0.296875], [0.16015625, 0.312..."
4,compliance with this Agreement).,2.3 Supply of Product for Distributorship. Zog...,2.3 Supply of Product for Distributorship,0.482422,0.526367,13,"[[[[0.083984375, 0.4853515625], [0.158203125, ..."
5,2.4 No Other Rights; Other Limitations. Except...,,2.4 No Other Rights; Other Limitations,0.145508,0.163086,14,"[[[0.1337890625, 0.146484375], [0.1591796875, ..."
6,"2.5 Non-Compete Covenant. During the Term, wit...",,2.5 Non-Compete Covenant,0.328125,0.345703,14,"[[[0.1328125, 0.3291015625], [0.1591796875, 0...."
7,any product for [***).,2.6No Activities Outside the Territory or Fiel...,2.6 No Activities Outside the Territory or Field,0.370117,0.413086,14,"[[[[0.08203125, 0.3740234375], [0.1103515625, ..."
8,ARTICLE3.,GOVERNANCE,Article 3. Governance,0.536133,0.563477,14,"[[[[0.453125, 0.5361328125], [0.548828125, 0.5..."
9,3.1 Joint Steering Committee. The Parties shal...,,3.1 Joint Steering Committee,0.576172,0.592773,14,"[[[0.1318359375, 0.576171875], [0.1572265625, ..."


In [55]:
# Score the section title of row 19 against that row's unrelated first OCR line, to see
# how it compares with first_line_match_threshold.
fuzz.partial_ratio('4.6 Drug Safety and Pharmacovigilance System including Global Safety Database'.lower(), 
                   'Information relevant to such] pharmacovigilance activities.'.lower())

47

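## Okay, so we will still suffer from this false-positive merged-line case: the unrelated first line scores 47, well above the `first_line_match_threshold` of 20. This occurs three times in this document; see dataframe rows 4, 7 and 19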

## Let's try a simple (inefficient) fix where we check both lines separately first, then try the merge

In [66]:
# Re-run with find_start_new trying each line on its own before trying the merged pair.
# Pick the document to process; the number in the key is also the index into section_dicts.
file_key = 'pdf_0'
json_path = json_files[file_key]['path']
filename = json_files[file_key]['name']
pickle_key = int(file_key.split('_')[1])
section_dict = section_dicts[pickle_key]


# Load the OCR json, regroup it line by line, then order the lines top-to-bottom within each page.
with open(json_path, 'r') as f:
    json_output = json.load(f)
file_as_dict = get_file_by_dict(json_output)
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.65)
df = pd.DataFrame(preprocessed_output)
preprocessed_output = df.sort_values(by=['page_id', 'ymin_min']).to_dict(orient='list')

# One row per matched section; the columns follow the 7-field tuple returned by match_line.
df = pd.DataFrame(get_starts_all(section_dict, preprocessed_output)).rename(columns={0:'Line via OCR',
                                                                                     1:'Line2 via OCR',
                                     2:'Section Title via HTML',
                                     3:'ymin',
                                     4:'ymax',
                                     5:'page_id',
                                     6:'bboxes'})
df

1858 1858


Unnamed: 0,Line via OCR,Line2 via OCR,Section Title via HTML,ymin,ymax,page_id,bboxes
0,ARTICLE1.,DEFINITIONS,Article 1. Definitions,0.526367,0.552734,5,"[[[[0.453125, 0.5263671875], [0.548828125, 0.5..."
1,ARTICLE2.,GRANTOF RIGHTS,Article 2. Grant of Rights,0.145508,0.173828,13,"[[[[0.453125, 0.1455078125], [0.548828125, 0.1..."
2,2.1 Grant of Rights to Distributor. Subject to...,,2.1 Grant of Rights to Distributor,0.185547,0.203125,13,"[[[0.1328125, 0.1865234375], [0.1572265625, 0...."
3,2.2 Sub-distribution by Distributor. Distribut...,,2.2 Sub-distribution by Distributor,0.296875,0.314453,13,"[[[0.1318359375, 0.296875], [0.16015625, 0.312..."
4,2.3 Supply of Product for Distributorship. Zog...,,2.3 Supply of Product for Distributorship,0.507812,0.526367,13,"[[[0.1337890625, 0.509765625], [0.158203125, 0..."
5,2.4 No Other Rights; Other Limitations. Except...,,2.4 No Other Rights; Other Limitations,0.145508,0.163086,14,"[[[0.1337890625, 0.146484375], [0.1591796875, ..."
6,"2.5 Non-Compete Covenant. During the Term, wit...",,2.5 Non-Compete Covenant,0.328125,0.345703,14,"[[[0.1328125, 0.3291015625], [0.1591796875, 0...."
7,2.6No Activities Outside the Territory or Fiel...,,2.6 No Activities Outside the Territory or Field,0.396484,0.413086,14,"[[[0.1328125, 0.3974609375], [0.18359375, 0.41..."
8,ARTICLE3.,GOVERNANCE,Article 3. Governance,0.536133,0.563477,14,"[[[[0.453125, 0.5361328125], [0.548828125, 0.5..."
9,3.1 Joint Steering Committee. The Parties shal...,,3.1 Joint Steering Committee,0.576172,0.592773,14,"[[[0.1318359375, 0.576171875], [0.1572265625, ..."
