## Requirements: 
- https://github.com/mindee/doctr: install it from a git clone (`pip install` was not working for me). On Windows, also install GTK (instructions are in the doctr repo); on Mac you should be able to install something similar.
- PyTorch
- pandas
- matplotlib

## We run OCR on CUAD_v1\full_contract_pdf\Part_I\Endorsement\PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement.pdf

In [None]:
import sys
import os
import json


In [None]:
# doctr is used from a local git clone rather than a pip install, so add the
# clone's directory (relative path ../doctr/) to the import search path first.
sys.path.append("../doctr/")
from doctr.models import ocr_predictor
from doctr.io import DocumentFile



In [None]:

# Build the pretrained doctr OCR predictor (detection arch + recognition arch).
model = ocr_predictor(
    det_arch='db_resnet50',
    reco_arch='crnn_vgg16_bn',
    pretrained=True,
)

# Machine-specific location of the contract PDF; backslashes are converted to
# forward slashes before the path is handed to doctr.
path = r"C:\Users\islam\Desktop\shortcutpaths\CUAD_v1\CUAD_v1\full_contract_pdf\Part_I\Endorsement\PerformanceSportsBrandsInc_20110909_S-1_EX-10.10_7220214_EX-10.10_Endorsement Agreement.pdf".replace("\\", "/")

# Load the PDF, run OCR on it, and export the prediction as the nested
# structure (pages -> blocks -> lines -> words) consumed by the cells below.
doc = DocumentFile.from_pdf(path)
result = model(doc)
json_output = result.export()

## Sukrit and maybe Shaan: since you didn't install doctr, I dumped the OCR output to a JSON file so you can just read the OCR output that way

In [None]:
# Load the previously dumped OCR export, so the rest of the notebook can run
# without doctr installed. The result replaces/defines `json_output`.
with open('example_json_output.json', 'r', encoding='utf-8') as f:
    json_output = json.load(f)

## Make sure to read the json file and store as json_output

## Below we define functions that parse the JSON structure produced by OCR and merge the lines that belong together. You can ignore the functions and simply focus on the output, which I store in the "preprocessed_output" variable. You can either copy-paste the output cell shown in this notebook when I run "preprocessed_output['full_line']", OR read in json_output and simply run the cells in order

In [None]:
def get_file_by_dict(json_output):
    '''
    Flatten the nested OCR export (pages -> blocks -> lines -> words) into
    parallel per-line lists.

    Returns a dict in which index k of every populated list describes the
    k-th OCR line, in traversal order:
      'words'           - list of the word strings of the line
      'ymin_ymax'       - (ymin, ymax) taken from the line geometry
      'page_id'         - page['page_idx'] of the page containing the line
      'page_dimensions' - page['dimensions'] of that page
      'bboxes'          - list of the geometry of each word of the line
    'line_merge_next' is created as well but is left empty.
    '''
    all_words, all_spans, all_page_ids, all_page_dims, all_boxes = [], [], [], [], []

    for page in json_output['pages']:
        for block in page['blocks']:
            for line in block['lines']:
                # Only the vertical extent of the line is kept.
                (_, ymin), (_, ymax) = line['geometry']

                all_words.append([word['value'] for word in line['words']])
                all_boxes.append([word['geometry'] for word in line['words']])
                all_spans.append((ymin, ymax))
                all_page_ids.append(page['page_idx'])
                all_page_dims.append(page['dimensions'])

    return {
        'words': all_words,
        'ymin_ymax': all_spans,
        'page_id': all_page_ids,
        'page_dimensions': all_page_dims,
        'line_merge_next': [],
        'bboxes': all_boxes,
    }

def getIOU(segment1, segment2, threshold):
    '''
    Merging test for line geometries. If lines are in different blocks but have similar line coordinates, we will be
    able to "merge" them as one line this way. Function needed because OCR model does not always treat the same line
    as an item within the same block.

    segment1, segment2: (ymin, ymax) vertical extents of two lines.
    threshold: the 1-D intersection-over-union of the two extents must be
               strictly greater than this value for the lines to match.

    Returns True when the segments overlap enough to be merged, else False
    (note: a boolean decision, not the IoU value itself).
    '''
    ymin1, ymax1 = segment1
    ymin2, ymax2 = segment2

    intersection = min(ymax1, ymax2) - max(ymin1, ymin2)

    # no overlap
    if intersection < 0:
        return False

    union = max(ymax1, ymax2) - min(ymin1, ymin2)

    # Two zero-height segments at the same y reach this point with union == 0;
    # the ratio is undefined there, so report "no match" instead of dividing by zero.
    if union <= 0:
        return False

    return bool(intersection / union > threshold)

def get_lines_to_merge(file_as_dict, threshold):
    '''
    Find every pair of OCR lines that should be merged into one visual line.

    Two lines qualify when they are on the same page and getIOU() accepts
    their (ymin, ymax) extents for the given threshold.

    Returns a list of (i, j) index pairs with i < j, ordered by i and then j.
    '''
    page_ids = file_as_dict['page_id']
    spans = file_as_dict['ymin_ymax']
    line_count = len(file_as_dict['words'])

    pairs = []
    for first in range(line_count):
        for second in range(first + 1, line_count):
            # Lines on different pages are never compared geometrically.
            if not (page_ids[first] == page_ids[second]):
                continue
            if getIOU(spans[first], spans[second], threshold):
                pairs.append((first, second))
    return pairs

def line_merging(lines_to_merge_indices):
    '''
    Group pairwise merge candidates into merged lines.

    lines_to_merge_indices: iterable of (i, j) index pairs; get_lines_to_merge
                            yields them with i < j, ordered by i and then j.

    Returns (lines, follows):
      lines   - {root index: [root, member, member, ...]} one entry per merged line
      follows - {index: [every i it was paired with as the j of an (i, j) pair]}
    '''
    follows = dict()
    lines = dict()
    for i, j in lines_to_merge_indices:
        if i in lines:
            # i already heads a merged line.
            lines[i].append(j)
        elif i not in follows:
            # First time i is seen at all: it starts a new merged line.
            lines[i] = [i, j]
        else:
            # i was absorbed into an earlier line; add j to that line. The index
            # i first followed may itself have been absorbed (non-transitive
            # chains such as (0,1),(1,2),(2,3)), so walk up until we reach an
            # index that actually heads a line. Every index that does not head
            # a line has a `follows` entry created before it was first used as
            # `i`, so this walk always ends at a key of `lines`.
            root = follows[i][0]
            while root not in lines:
                root = follows[root][0]
            if j not in lines[root]:
                lines[root].append(j)

        follows.setdefault(j, []).append(i)
    return lines, follows

## Let's get the whole file (with merged lines) as an array of lines

In [None]:
from pprint import pprint

In [None]:
# Flatten the OCR export into parallel per-line lists (words, bboxes, ymin/ymax, page info).
file_as_dict = get_file_by_dict(json_output)

In [None]:
def final_file_line_by_line(file_as_dict, threshold):
    '''
    Build the final line-by-line view of the document, merging OCR lines that
    get_lines_to_merge()/line_merging() group together.

    file_as_dict: per-line lists as returned by get_file_by_dict.
    threshold: overlap threshold forwarded to get_lines_to_merge.

    Returns a dict of parallel lists, one entry per output line:
      'full_line' - words of the line (merged members concatenated in index order)
      'page_id'   - page of the line
      'ymax_max'  - largest ymax among the members
      'ymin_min'  - smallest ymin among the members
      'bboxes'    - word geometries, parallel to 'full_line'
    Also prints the number of lines and of bbox lists as a quick sanity check.
    '''
    merge_pairs = get_lines_to_merge(file_as_dict, threshold)
    merged_groups, absorbed = line_merging(merge_pairs)

    words = file_as_dict['words']
    boxes = file_as_dict['bboxes']
    spans = file_as_dict['ymin_ymax']
    pages = file_as_dict['page_id']

    out_lines, out_pages, out_ymax, out_ymin, out_boxes = [], [], [], [], []
    for idx in range(len(words)):
        if idx in merged_groups:
            # idx heads a merged line: concatenate all members and take the
            # overall vertical extent.
            members = merged_groups[idx]
            line_words = [word for member in members for word in words[member]]
            line_boxes = [box for member in members for box in boxes[member]]
            top = min(spans[member][0] for member in members)
            bottom = max(spans[member][1] for member in members)
        elif idx in absorbed:
            # Already emitted as part of another merged line.
            continue
        else:
            # Stand-alone line: passed through unchanged.
            line_words = words[idx]
            line_boxes = boxes[idx]
            top = spans[idx][0]
            bottom = spans[idx][1]

        out_lines.append(line_words)
        out_boxes.append(line_boxes)
        out_pages.append(pages[idx])
        out_ymin.append(top)
        out_ymax.append(bottom)

    final_file_as_dict = {
        'full_line': out_lines,
        'page_id': out_pages,
        'ymax_max': out_ymax,
        'ymin_min': out_ymin,
        'bboxes': out_boxes,
    }
    print(len(final_file_as_dict['full_line']), len(final_file_as_dict['bboxes']))
    return final_file_as_dict

## Here is the final variable that will contain the info we need where each item in preprocessed_output['full_line'] is a line in the file

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 100)

In [None]:
# for i in range(len(file_as_dict['words'])):
#     if len(file_as_dict['words'][i]) != len(file_as_dict['bboxes'][i]):
#         pprint(file_as_dict['words'][i])
#         break

In [None]:
# for key in file_as_dict.keys():
#     print(len(file_as_dict[key]), key)

In [None]:
# Merge OCR fragments that share a visual line (overlap threshold 0.7).
preprocessed_output = final_file_line_by_line(file_as_dict, threshold=0.7)

# Order the lines by page and then by ascending ymin. The DataFrame itself is
# sorted (not only the exported dict) so that `df` and `preprocessed_output`
# share one row order and "first row"/"first match" later on means first in the
# document rather than first in raw OCR block order.
df = (
    pd.DataFrame(preprocessed_output)
    .sort_values(by=['page_id', 'ymin_min'])
    .reset_index(drop=True)
)
preprocessed_output = df.to_dict(orient='list')

In [None]:
# Inspect the words and the word boxes of the entry at index 1 of the sorted output.
preprocessed_output['full_line'][1], preprocessed_output['bboxes'][1]

In [None]:
# Preview the first rows of the per-line table.
df.head()

In [None]:
# Sanity check: make sure the number of words in each row matches the number of bboxes
(df["full_line"].str.len() == df['bboxes'].str.len()).all()

In [None]:
# Sanity check: the page-id list and the line list should have the same length.
len(preprocessed_output['page_id']), len(preprocessed_output['full_line'])

In [None]:
# Example output from Rohith that we need to match against.
# Maps section number -> (section title, dict). Only the title (element 0 of the
# tuple) is used by the matching code in this notebook.
# NOTE(review): the dict in every tuple is empty here — presumably a placeholder
# for nested sub-sections; confirm with the producer of this structure.

section_dict = {1: ('1. Definitions', {}),
 2: ('2. Term', {}),
 3: ('3. Grant of License and Exclusivity', {}),
 4: ('4. Retention of Rights', {}),
 5: ('5. Appearances', {}),
 6: ('6. Compensation', {}),
 7: ('7. Supply of Endorsed Products', {}),
 8: ('8. Approval of Advertising', {}),
 9: ('9. Ownership', {}),
 10: ('10. SAG and/or AFTRA', {}),
 11: ('11. Standards', {}),
 12: ('12. Events of Default', {}),
 13: ('13. Termination/Remedies', {}),
 14: ('14. Companys Debts', {}),
 15: ('15. Indemnification', {}),
 16: ('16. Insurance', {}),
 17: ('17. Waiver', {}),
 18: ('18. Notices', {}),
 19: ('19. Assignment', {}),
 20: ('20. Independent Contractor', {}),
 21: ('21. Joint Venture', {}),
 22: ('22. Governing Law', {}),
 23: ('23 Entire Agreement', {}),
 24: ('24. Amendments', {}),
 25: ('25. Authority', {}),
 26: ('26. Severability', {}),
 27: ('27. Compliance with Laws', {}),
 28: ('28. Attorneys Fees and Costs', {}),
 29: ('29. Force Majeure', {}),
 30: ('30. Confidentiality', {}),
 31: ('31. Counterparts', {})}

In [None]:
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzysearch import find_near_matches
import re

subset_match_threshold = 80    # Design decision to only allow subset match ratios of > 80/100
                               # (compared against fuzz.partial_ratio with a strict ">")
line_len_match_threshold = 0.8 # Design decision to potentially only match document lines
                               # that are not much smaller than table of content label:
                               # len(line) must be >= this fraction of the label's length


In [None]:
# Join each line's words with single spaces to get one string per line; this is
# the text the fuzzy matching below runs on.
df['preprocessed_output'] = df['full_line'].str.join(" ")
df.head()

In [None]:
def fuzzy_match(row, toc_page_id, section_dict, line_len_match_threshold, subset_match_threshold):
    '''
    Match one document line against the section titles in section_dict.

    row: mapping/Series with 'preprocessed_output' (the line text) and 'page_id'.
    toc_page_id: page to skip entirely (the table-of-contents page).
    section_dict: {key: (title, ...)}; only the title is used.
    line_len_match_threshold: the line must be at least this fraction of the
                              title's length.
    subset_match_threshold: fuzz.partial_ratio of the lowercased title and line
                            must be strictly greater than this value.

    Returns (line, title) for the first title (in dict order) that passes both
    tests, or (None, None) when the line is on the TOC page or nothing matches.
    '''
    text = row['preprocessed_output']
    page = int(row['page_id'])

    # Lines on the table-of-contents page are never matched.
    if page == toc_page_id:
        return None, None

    for section_info in section_dict.values():
        title = section_info[0]
        similar_enough = fuzz.partial_ratio(title.lower(), text.lower()) > subset_match_threshold
        if similar_enough and len(text) >= len(title) * line_len_match_threshold:
            return text, title

    return None, None

In [None]:
def _enclosing_bbox(word_bboxes):
    '''Return [[x_min, y_min], [x_max, y_max]], the smallest box containing every word box.'''
    # Each word box is ((x_left, y_top), (x_right, y_bottom)). The extremes are
    # taken over all boxes on both axes because the words of a merged line are
    # concatenated in OCR order, which is not necessarily left-to-right.
    x_min = min(box[0][0] for box in word_bboxes)
    y_min = min(box[0][-1] for box in word_bboxes)
    x_max = max(box[1][0] for box in word_bboxes)
    y_max = max(box[1][-1] for box in word_bboxes)
    return [[x_min, y_min], [x_max, y_max]]


def merge_bboxes(words, query, bboxes, window_size, match_score):
    '''
    Locate `query` inside a line and return one bounding box around the words
    that match it.

    The line is cut into consecutive, non-overlapping windows of `window_size`
    words. Each window is scored against `query` with fuzz.token_set_ratio.
    The first window scoring >= `match_score` wins; if none reaches that score,
    the best-scoring window is used instead.

    words: the words of the line.
    query: the text to look for (the matched section title).
    bboxes: word geometries, parallel to `words`.
    window_size: number of words per window; must be >= 1.
    match_score: score at which a window is accepted immediately.

    Returns (merged_bbox, window_text) with merged_bbox as
    [[x_min, y_min], [x_max, y_max]], or (None, None) when `words` is empty.
    Raises ValueError when window_size < 1.
    '''
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")
    if len(words) == 0:
        # Nothing to search in; mirror the "no match" result used elsewhere.
        return None, None

    # Start below any possible score so the first window is always recorded,
    # even when every window scores 0.
    best_score, best_text, best_bboxes = -1, None, None
    for start_idx in range(0, len(words), window_size):
        window_text = " ".join(words[start_idx : start_idx + window_size])
        candidate_bboxes = bboxes[start_idx : start_idx + window_size]

        window_score = process.extractBests(window_text, [query], scorer=fuzz.token_set_ratio)[0][-1]

        if window_score >= match_score:
            return _enclosing_bbox(candidate_bboxes), window_text

        if window_score > best_score:
            best_score = window_score
            best_text = window_text
            best_bboxes = candidate_bboxes

    # in case the window_text has a lower match score than with the entire string, match with the
    # segment with highest matching score
    return _enclosing_bbox(best_bboxes), best_text


In [None]:
def extract_exact_match(row):
    '''
    For a row that matched a section title, find the part of the line that
    corresponds to that title and its merged bounding box.

    row: Series with 'match_toc' (matched title or None), 'match_ocr' (the
         matched line text), 'full_line' (words) and 'bboxes' (word geometries).

    Returns (merged_bbox, exact_match_text) from merge_bboxes, or (None, None)
    when the row has no matched title.
    '''
    query = row['match_toc']
    if query is None:
        return None, None

    # Score the whole line against the title. The first element of the best
    # result supplies the window size (its word count) and the second the score
    # a window has to reach in merge_bboxes.
    best = process.extractBests(row.loc['match_ocr'], [query], scorer=fuzz.token_set_ratio)
    best_text, best_score = best[0]
    words_per_window = len(best_text.split())

    merged_bbox, exact_match_text = merge_bboxes(
        row['full_line'], query, row['bboxes'], words_per_window, best_score
    )
    return merged_bbox, exact_match_text

In [None]:
# get the page number of the TOC. Skip this during fuzzy matching
# Uses the first row whose text contains "table of contents" (case-insensitive);
# .iloc[0] raises IndexError when no row contains it.
toc_page_id = df.loc[df['preprocessed_output'].str.contains(r'table of contents', case=False), 'page_id'].iloc[0]


In [None]:
# Run fuzzy_match on every row. It returns a (line, title) pair per row, which
# zip(*...) splits into two columns: the matched line text and the matched title
# (both None when there is no match or the row is on the TOC page).
df['match_ocr'], df['match_toc'] = zip(*df.apply(lambda row: fuzzy_match(row, toc_page_id, section_dict, line_len_match_threshold, subset_match_threshold), axis=1))
df.loc[df['match_ocr'].notnull()].head()

In [None]:
# For every matched row, locate the title inside the line: store the merged
# bounding box and the text of the best-matching word window (None for rows
# without a match).
df['exact_match_bbox'], df['exact_match_text'] = zip(*df.apply(lambda row: extract_exact_match(row), axis=1))
df.loc[df['match_ocr'].notnull()].head(15)

In [None]:
# keep just the first match for each toc
# NOTE(review): rows without a match all have match_toc == None, so they
# presumably count as duplicates of each other and collapse to a single row —
# confirm that discarding the unmatched lines from df is intended.
df = df.drop_duplicates(subset=['match_toc'])
df.loc[df['match_ocr'].notnull()].head(15)

# TODO:
-  to ensure there's only one match per entry in the TOC (From HTML), we currently keep the first match and drop the rest
   -  compare performance of this with condition where we keep the match with the highest score
-  extend to more contracts

In [None]:

# Collect every (document line, section title) pair that passes both the fuzzy
# score threshold and the line-length threshold. Unlike fuzzy_match, there is no
# TOC-page filtering here and a line may be paired with several titles.
matches = []
for line in preprocessed_output['full_line']:
    line = " ".join(line) ## DESIGN DECISION TO SPLIT EVERYTHING BY A SPACE
    for key, section_info in section_dict.items():
        title = section_info[0]
        if not fuzz.partial_ratio(title.lower(), line.lower()) > subset_match_threshold:
            continue
        if not len(line) >= len(title) * line_len_match_threshold:
            continue
        matches.append((line, title))

## As you can see, the strings do somewhat match, but we need to add extra filters in order to remove or ignore matches that we don't want. There might be some combination of string subset matches, full matches, regex, line length, Levenshtein distance, etc. that we can use to get really good section titles for the TOC output that Rohith gave.

## Once we have better matches, we can tag each line with the coordinates since that info is stored here