A simple notebook to compare versions of the exchange control manual. Most updates are to the tables that contain lists of ADs and ALDAs, not to the text itself. The test here is to compare the text in the documents (excluding the ToC, because its page numbers change). This is done by selecting page ranges in the base and new documents and comparing those. Where there are mismatches, these still need to be investigated manually.

In [1]:
import fitz # pymupdf
import re

# Base (reference) edition of the manual; every page range and offset used
# later in the notebook is expressed relative to this document.
# (The file name suggests the 2023-05-23 edition.)
pdf_file = "./pdf/Currency and Exchanges Manual for Authorised Dealers_20230523.pdf"
base_doc = fitz.open(pdf_file)

# New edition whose text is checked against the base edition.
# `pdf_file` is deliberately reused, so after this cell it holds the NEW path.
# (The file name suggests the 2023-10-13 edition.)
pdf_file = "./pdf/Currency and Exchanges Manual for Authorised Dealers_20231013.pdf"
new_doc = fitz.open(pdf_file)

In [5]:
# Margins excluded from text extraction so that running headers and footers
# (e.g. page numbers) do not take part in the comparison.
# get_page_text_as_list adds header_size to the page rectangle's first
# y-coordinate and subtracts footer_size from its second y-coordinate.
# NOTE(review): units are presumably PDF points as used by PyMuPDF - confirm.
header_size=70
footer_size=80

def get_page_text_as_list(pymupdf_doc, page_number):
    """Return the body text of one page as a list of stripped lines.

    The page rectangle is shrunk by the module-level ``header_size`` at its
    y0 edge and by ``footer_size`` at its y1 edge before the text is
    extracted, so anything inside those margins is ignored. PyMuPDF puts
    the origin at the top-left of the page, so y0 is the top edge and y1
    the bottom edge.

    Args:
        pymupdf_doc: An opened PyMuPDF document.
        page_number: Index of the page within ``pymupdf_doc``.

    Returns:
        The extracted lines with surrounding whitespace removed from each
        one. Blank lines at the very start and very end are dropped; blank
        lines in the middle are kept.
    """
    page = pymupdf_doc[page_number]
    bounds = page.rect  # indexable as (x0, y0, x1, y1)
    body_area = fitz.Rect(
        bounds[0],
        bounds[1] + header_size,
        bounds[2],
        bounds[3] - footer_size,
    )

    raw_text = page.get_text('text', clip=body_area)
    stripped = [piece.strip() for piece in raw_text.split('\n')]

    # Locate the first and one-past-the-last non-empty entries, then slice once.
    first = 0
    stop = len(stripped)
    while first < stop and not stripped[first]:
        first += 1
    while stop > first and not stripped[stop - 1]:
        stop -= 1

    return stripped[first:stop]

I need some objects to define page ranges in the base and new documents, along with a page offset.

In [21]:
class PageInfo:
    """A page range in the base document together with its page offset.

    Attributes are stored exactly as given:
        start:  first page of the range.
        end:    end of the range; must be strictly greater than ``start``
                (``PageList.find_offset`` treats it as exclusive).
        offset: value added to a base page number to locate the matching
                page in the new document.
    """

    def __init__(self, start, end, offset):
        """Store the range, rejecting empty or reversed ranges.

        Raises:
            ValueError: if ``end`` is not greater than ``start``.
        """
        if start >= end:
            raise ValueError("End must be greater than Start.")
        self.start, self.end, self.offset = start, end, offset

class PageList:
    """Ordered collection of page ranges with a lookup from page to offset.

    Attributes:
        page_list:     ``PageInfo`` objects, ordered by their start page.
        missing_pages: page numbers that fall in a gap between one range's
                       end and the next range's start.
    """

    def __init__(self, raw_list):
        """Build the ranges from dicts with 'start', 'end' and 'offset' keys.

        The input is sorted by 'start'. Whenever a range does not begin
        exactly where the previous one ended, a notice is printed and the
        pages in between are recorded in ``missing_pages``.

        NOTE(review): overlapping ranges (start below the previous end) are
        not rejected; they trigger the same notice with an empty gap.

        Raises:
            ValueError: propagated from ``PageInfo`` when a range has
                ``end <= start``.
        """
        ordered = sorted(raw_list, key=lambda entry: entry['start'])
        self.page_list = []
        self.missing_pages = []

        previous_end = None  # end of the most recently appended range
        for entry in ordered:
            first, last, shift = entry['start'], entry['end'], entry['offset']

            if previous_end is not None and first != previous_end:
                print(f"There are some pages that are not being compared. They include the range [{previous_end}, {first})")
                self.missing_pages.extend(range(previous_end, first))

            self.page_list.append(PageInfo(first, last, shift))
            previous_end = last

    def find_offset(self, i):
        """Return the offset of the range containing page ``i``.

        Returns ``None`` when ``i`` is a recorded missing page or does not
        fall inside any range (ranges are half-open: start <= i < end).
        """
        if i in self.missing_pages:
            return None
        return next(
            (span.offset for span in self.page_list if span.start <= i < span.end),
            None,
        )



In [30]:
# NOTE: Only compare the document body
# Each entry maps a half-open range [start, end) of base-document pages to the
# offset that must be added to a base page number to reach the corresponding
# page in the new document. Pages left out of every range (here: 22) are
# reported by PageList and skipped below.
body_start = 13
raw_list = [
    {'start': body_start, 'end': 22, 'offset': 0},
    {'start': 23, 'end': len(base_doc), 'offset': 1}
]
page_list = PageList(raw_list)

# The most common update is to a table of Authorised Dealers.
mismatch_found = False
try:
    for i in range(body_start, len(base_doc)):
        page_offset = page_list.find_offset(i)
        # Only None means "page not covered". An offset of 0 is a valid
        # value, so a plain truthiness test would silently skip every page
        # of the first range.
        if page_offset is None:
            continue

        base_lines = get_page_text_as_list(base_doc, i)
        new_lines = get_page_text_as_list(new_doc, i + page_offset)

        if len(new_lines) != len(base_lines):
            print(f"Mismatched page number {i} (in base document)")
            mismatch_found = True
            # As before, a differing line count ends the whole scan.
            break

        for j, (new_line, base_line) in enumerate(zip(new_lines, base_lines)):
            if new_line.strip() != base_line.strip():
                print(f"Page {i}, line {j} does not match")
                mismatch_found = True
                # As before, only the first differing line of a page is
                # reported; the scan then moves on to the next page.
                break

    # Report success only when nothing was flagged above.
    if not mismatch_found:
        print("There were no mismatches for the input page ranges")
except IndexError:
    # Raised when a page index falls outside one of the documents, e.g.
    # i + page_offset past the end of the new document.
    print(f"error on page {i}")



There are some pages that are not being compared. They include the range [22, 23)
There were no mismatches for the input page ranges
