Note: I briefly switched to pdfplumber so that sections could be detected from text metadata (fontname: bold) instead of capitalization, because the title of section 3.1 does not follow the title formatting conventions used in the rest of the document: it is not capitalized, but it is bold, which indicates that it is a title.

In [466]:
import pandas as pd
import csv
import re
from PyPDF2 import PdfReader
import pdfplumber

Read the PDF, correct formatting errors in the extracted page text, convert the text to CSV-style data (one row per line, one cell per word), and initialize a DataFrame from that data together with each line's page number and row index.

In [467]:
def correct_format_issues(page_text, page_number):
    """Fix word-boundary artifacts in extracted page text and strip headers/footers.

    The text is scanned once with a combined regex. Every hit is then fixed
    with a literal, page-wide replacement of the matched text:

    * lowercase followed by uppercase -> a line break is inserted between them
    * digit followed by uppercase     -> a space is inserted between them
    * lowercase followed by a digit   -> a space is appended after the digit
    * header / footer text            -> removed

    Args:
        page_text: Text extracted from a single PDF page.
        page_number: Page number as it appears in the 'Page N' footer.

    Returns:
        The corrected page text.
    """
    # Combine all patterns into a single regex using named groups
    all_patterns_regex = (
        r'(?P<line_break_lower_upper>[a-z][A-Z])|'  # Lowercase followed by uppercase
        r'(?P<space_number_upper>[0-9][A-Z])|'  # Number followed by uppercase
        r'(?P<space_lower_number>[a-z][0-9])|'  # Lowercase followed by number
        r'(?P<header>R1-1\S+)|'                     # Header
        r'(?P<footer_1>City of Vancouver.*)|'       # Footer 1
        rf'(?P<footer_2>.*Page\s+{page_number})'    # Footer 2 with exact page number
    )

    # Lowercase+digit pairs that already received their trailing space. Each
    # substitution below is page-wide, so repeating it for every occurrence of
    # the same pair would append one more space per occurrence.
    spaced_pairs = set()

    # finditer walks the string it was handed (the uncorrected text); the fixes
    # rebind page_text to new strings and do not disturb the iteration.
    for match in re.finditer(all_patterns_regex, page_text):
        matched_text = match.group(0)
        pattern = re.escape(matched_text)

        if match.group('line_break_lower_upper'):
            # Two lines were fused together: split them again
            line_start, line_end = matched_text[0], matched_text[1]
            page_text = re.sub(pattern, f'{line_start}\n{line_end}', page_text)

        elif match.group('space_number_upper'):
            # A number was fused with the following word: separate them
            line_start, line_end = matched_text[0], matched_text[1]
            page_text = re.sub(pattern, f'{line_start} {line_end}', page_text)

        elif match.group('space_lower_number'):
            # Add a space after the lowercase+digit pair, once per distinct pair
            if matched_text not in spaced_pairs:
                spaced_pairs.add(matched_text)
                page_text = re.sub(pattern, f'{matched_text} ', page_text)

        else:
            # Delete headers and footers from the document.
            # NOTE: for a header match the second substitution normally finds
            # nothing, because the 'R1-1' prefix it starts with has just been
            # stripped; the text that was fused after the prefix is therefore
            # kept. Do not reorder these two lines without checking the output.
            page_text = re.sub(r'R1-1', '', page_text)
            page_text = re.sub(pattern, '', page_text)

    return page_text


In [468]:
pdf_file_name = 'zoning-by-law-district-schedule-r1-1.pdf'
# Read PDF & correct formatting issues
reader = PdfReader(pdf_file_name)
pdf_text = []

for page_number, page in enumerate(reader.pages, start=1):
    # Correct formatting issues before appending page text
    page_text = correct_format_issues(page.extract_text(), page_number)
    pdf_text.append(page_text)

# Store document text in cell format (one row per line, one cell per word)
# and record the page number associated with each kept line
csv_format_data = []
line_page_numbers = []
for page_number, page_text in enumerate(pdf_text, start=1):
    for line in page_text.splitlines():
        # Remove whitespace before punctuation, then collapse any whitespace
        # run after punctuation to a single space
        line = re.sub(r'\s*([.,!?;:])', r'\1', line)
        line = re.sub(r'([.,!?;:])\s+', r'\1 ', line)
        line_split = line.split()
        # Skip empty AND whitespace-only lines: a line without words would
        # become a row of None cells, which the section parser cannot handle
        if not line_split:
            continue
        csv_format_data.append(line_split)
        line_page_numbers.append(page_number)

# Rows have different word counts; pandas pads the shorter ones with None
df_org = pd.DataFrame(csv_format_data)
df_org.insert(0, 'Page Number', line_page_numbers)
df_org.insert(1, 'Row Index', range(1, len(df_org) + 1))
# Parquet keeps None values instead of replacing with NaN on read
df_org.to_parquet('csv_format_data.parquet', index=False)
df_org.to_csv('csv_format_data.csv', index=False)

In [469]:
# First word of every line: the cell that holds a section number on heading rows.
# df_org comes from the cell above; it can alternatively be reloaded with
#   df_org = pd.read_parquet('csv_format_data.parquet')
data_column_1 = df_org.iloc[:, 2]

# Plain Python lists collect the output columns while parsing, so the result
# DataFrame is built once at the end instead of being accessed repeatedly.
all_sections = list()
formatted_columns = {
    column_name: []
    for column_name in (
        'Parent',
        'Section',
        'Section Title',
        'Section Start Page',
        'Row Index',
    )
}

# Words that may appear lowercase in section titles.
# Despite the name, the list also holds articles and prepositions; each word
# is listed once.
conjunctions = [
    'and', 'the', 'of', 'to', 'in', 'a', 'for', 'on', 'with', 'as', 'at',
    'by', 'but', 'or', 'an', 'so', 'if', 'because', 'that', 'while', 'from',
    'after', 'when', 'before', 'where', 'nor', 'yet', 'once', 'although',
    'since', 'than', 'though', 'unless', 'until', 'up'
]

def get_section_title(section_level, row):
    """Build the title of a heading row from the cells after its section number.

    Cells are read until the first None cell. The title is rejected (empty
    string returned) as soon as a word breaks the rule for the given level:

    * level 0: every word must be upper case
    * level 1: no check is applied
    * level 2+: every word must start with an upper-case letter or be listed
      in ``conjunctions``

    Args:
        section_level: Nesting depth of the section (0 for main sections).
        row: Iterable of word cells following the section number.

    Returns:
        The words joined by single spaces, or '' when the row is not a title.
    """
    words = []

    for word in row:
        if word is None:
            # First None cell marks the end of the line's words
            break

        if section_level == 0:
            if not word.isupper():
                return ''
        elif section_level > 1:
            if not (word[0].isupper() or word in conjunctions):
                return ''

        words.append(f'{word} ')

    return ''.join(words).strip()
        
def add_section_row(parent_section, section_number, section_title, section_page_start, section_row_index):
    """Append one section record to the module-level ``formatted_columns`` lists.

    Args:
        parent_section: Value stored under 'Parent'.
        section_number: Value stored under 'Section'.
        section_title: Value stored under 'Section Title'.
        section_page_start: Value stored under 'Section Start Page'.
        section_row_index: Value stored under 'Row Index'.
    """
    record = {
        'Parent': parent_section,
        'Section': section_number,
        'Section Title': section_title,
        'Section Start Page': section_page_start,
        'Row Index': section_row_index,
    }
    for column_name, value in record.items():
        formatted_columns[column_name].append(value)
    
def get_section_range(sections, idx):
    """Return the (start, end) row span of the section at position ``idx``.

    The span ends where the next entry of ``sections`` begins; for the last
    entry it ends at the number of rows in ``df_org``.

    Args:
        sections: Row indices of the sections of one level.
        idx: Position within ``sections``.

    Returns:
        Tuple ``(section_start, section_end)``.
    """
    following = idx + 1
    if following < len(sections):
        return sections[idx], sections[following]
    return sections[idx], len(df_org)

def get_section_data(section_level, parent_section, section_start, section_end):
    """Find consecutively numbered sections within a row range and record them.

    Rows ``section_start`` (inclusive) to ``section_end`` (exclusive) are
    scanned for a first cell equal to the next expected section number:
    '1', '2', ... at level 0, otherwise '<parent>.1', '<parent>.2', ...
    Each hit is recorded through ``add_section_row`` and the counter advances.

    Args:
        section_level: Nesting depth being searched (0 for main sections).
        parent_section: Section number of the parent; unused at level 0.
        section_start: First row position to scan.
        section_end: Row position at which to stop (exclusive).

    Returns:
        List of row positions at which a section heading was found.
    """
    subsection = 1
    sections = []
    # Top-level numbers stand alone ('3'); deeper ones extend the parent ('3.1')
    prefix = '' if section_level == 0 else f'{parent_section}.'

    for row_idx in range(section_start, section_end):
        section_number = data_column_1[row_idx]
        # A row whose first cell is not text (e.g. None) cannot be a heading;
        # calling .strip() on it would raise
        if not isinstance(section_number, str):
            continue
        if section_number.strip() != f'{prefix}{subsection}':
            continue

        row = df_org.iloc[row_idx]
        # Positions 0-2 are page number, row index and section number;
        # the title words start at position 3
        section_title = get_section_title(section_level, row.iloc[3:])
        section_page_start = row['Page Number']
        add_section_row(parent_section, section_number, section_title, section_page_start, row_idx)
        sections.append(row_idx)
        subsection += 1

    return sections

def get_sections_end_page(df_final):
    """Add a 'Section End Page' column to ``df_final`` (modified in place).

    Rows are processed in their existing order, so a parent section must
    appear before its children for its end page to be available.

    For each row:
    * if the next row has a different parent, or there is no next row, the
      section ends where its parent ends (top-level sections end on the last
      page of the document);
    * else if the next section is the first line of its page, the section
      ends on the page before it;
    * otherwise it ends on the page where the next section starts.

    Args:
        df_final: DataFrame with 'Parent', 'Section', 'Section Start Page'
            and 'Row Index' columns.
    """
    df_temp = df_final[['Parent', 'Section', 'Section Start Page', 'Row Index']].copy()
    df_temp['Section End Page'] = None

    # First row of every page (df_org's 'Row Index' is 1-based)
    page_idxs = df_org[['Page Number', 'Row Index']].sort_values(by='Row Index')
    page_idxs = page_idxs.drop_duplicates(subset='Page Number', keep='first')
    page_row_idxs = set(page_idxs['Row Index'].tolist())
    last_page = page_idxs['Page Number'].iloc[-1]

    def parent_end_page(parent):
        # Top-level sections have no parent row to look up: they run to the
        # end of the document
        if parent is None:
            return last_page
        parent_row = df_temp.loc[df_temp['Section'] == parent]
        return parent_row.iloc[0]['Section End Page']

    row_count = len(df_temp)
    for idx in range(row_count):
        current_parent = df_temp.iloc[idx]['Parent']

        if idx + 1 < row_count:
            next_row = df_temp.iloc[idx + 1]
            next_start_page = next_row['Section Start Page']
            next_parent = next_row['Parent']
            # Section rows are stored 0-based; +1 matches df_org's 'Row Index'
            next_row_idx = next_row['Row Index'] + 1

            if current_parent != next_parent:
                # Last child of its parent: use the parent's end page
                end_page = parent_end_page(current_parent)
            elif next_row_idx in page_row_idxs:
                # The next section starts on a new page
                end_page = next_start_page - 1
            else:
                end_page = next_start_page
        else:
            # Last row overall: use the parent's end page
            end_page = parent_end_page(current_parent)

        df_temp.at[idx, 'Section End Page'] = end_page

    df_final['Section End Page'] = df_temp['Section End Page']

def get_sections_body(section_row_idxs, section_titles):
    """Collect the body text of every section from ``df_org``.

    A section's body runs from its heading row to the row before the next
    entry of ``section_row_idxs`` (or to the end of ``df_org`` for the last
    one). If the section has a title, the heading row is skipped entirely.
    If it has no title, the heading row is part of the body and only its
    first cell (the section number) is dropped.

    ``df_org`` is only read, never modified.

    Args:
        section_row_idxs: Row positions of the section headings, ascending.
        section_titles: Title of each section, parallel to
            ``section_row_idxs``; an empty title means "no title".

    Returns:
        List with one string per section, body lines joined by line breaks.
    """
    all_sections_body = []

    for section_idx, current_row_index in enumerate(section_row_idxs):
        has_title = bool(section_titles[section_idx])
        start_row = current_row_index + 1 if has_title else current_row_index

        if section_idx + 1 < len(section_row_idxs):
            end_row = section_row_idxs[section_idx + 1]
        else:
            end_row = len(df_org)

        # Columns 0-1 are page number and row index; the words start at column 2
        section_body_rows = df_org.iloc[start_row:end_row, 2:]

        section_body_lines = []
        for line_idx, cells in enumerate(section_body_rows.itertuples(index=False, name=None)):
            if not has_title and line_idx == 0:
                # Untitled heading row: leave out the section number
                cells = cells[1:]

            words = []
            for cell in cells:
                if cell is None:
                    # First None cell marks the end of the line's words
                    break
                words.append(f'{cell}')
            section_body_lines.append(' '.join(words).strip())

        # Combine all rows into a single string with line breaks
        all_sections_body.append('\n'.join(section_body_lines))

    return all_sections_body

def generate_csv_format():
    """Assemble the final sections table from the collected column lists.

    Builds a DataFrame from ``formatted_columns``, adds each section's end
    page, orders the sections by their row position, inserts the body text
    right after the 'Section Title' column and drops the helper 'Row Index'
    column.

    Returns:
        The resulting DataFrame.
    """
    sections_table = pd.DataFrame(formatted_columns)

    # End pages are derived from the start page of the following section
    get_sections_end_page(sections_table)

    # Body extraction needs the sections in document order
    ordered_sections = sections_table.sort_values(by='Row Index', ascending=True)
    body_texts = get_sections_body(
        ordered_sections['Row Index'].tolist(),
        ordered_sections['Section Title'].tolist(),
    )

    # Place 'Section Body Text' directly after 'Section Title'
    insert_position = ordered_sections.columns.get_loc('Section Title') + 1
    ordered_sections.insert(insert_position, 'Section Body Text', body_texts)

    return ordered_sections.drop(columns=['Row Index'])

Begin parsing the document by extracting only the main section headings and their row indices.  
Then iterate through the sections one level at a time to find each section's subsections, until no deeper level is found.

In [470]:
# Empty the accumulators first so that re-running this cell does not append
# a second copy of every section
for column_values in formatted_columns.values():
    column_values.clear()

all_sections = []
# Initial pass through the original data to extract main section row indices
main_sections = get_section_data(0, None, 0, len(df_org))
all_sections.append(main_sections)
# Position in formatted_columns of the section currently being expanded;
# sections are recorded in the same order in which they are visited here
all_sections_idx = 0

# all_sections grows while it is iterated: each pass appends the next deeper
# level (if any sections were found), and the loop ends once a level yields
# no subsections
for section_level, current_section in enumerate(all_sections, start=1):
    next_level_sections = []
    for section_idx in range(len(current_section)):
        parent_section = formatted_columns['Section'][all_sections_idx]
        section_start, section_end = get_section_range(current_section, section_idx)
        subsections = get_section_data(section_level, parent_section, section_start, section_end)
        next_level_sections.extend(subsections)
        all_sections_idx += 1

    if next_level_sections:
        all_sections.append(next_level_sections)

# Generate the final CSV format
converted_data = generate_csv_format()
converted_data.to_csv('final-csv-conversion.csv', index=False, encoding='utf-8-sig')
converted_data

Unnamed: 0,Parent,Section,Section Title,Section Body Text,Section Start Page,Section End Page
0,,1,INTENT AND OVERVIEW,,1,1
4,1,1.1,Intent,The intent of this Residential Inclusive distr...,1,1
5,1,1.2,Overview,The table below provides an overview of the ou...,1,1
1,,2,USE REGULATIONS,,2,6
6,2,2.1,Outright and Conditional Approval Uses,All outright and conditional approval uses are...,2,4
...,...,...,...,...,...,...
40,4.4,4.4.1,,No portion of the basement or cellar may proje...,16,16
41,4.4,4.4.2,,The surface of the ground adjoining a building...,16,17
42,4.4,4.4.3,,Exterior window s in a secondary suite or lock...,17,17
43,4.4,4.4.4,,"For multiple dwelling, no exterior stairway ca...",17,17
