Note: Briefly switched to pdfplumber to parse sections using text metadata (fontname: bold) instead because section 3.1 title does not agree with title formatting conventions in the rest of the document. The title is not capitalized but it is bold indicating that it is a title. 

In [13]:
import pandas as pd
import csv
import re
from PyPDF2 import PdfReader

Read PDF, correct format errors in extracted page text, convert to CSV format data (cell per word), and initialize dataframe with input data with the associated page number and row index by line.

In [14]:
def correct_format_issues(page_text, page_number):
    # Combine all patterns into a single regex using named groups
    all_patterns_regex = (
        r'(?P<line_break>[a-z][A-Z])|'  
        r'(?P<header>R1-1\S+)|'         
        r'(?P<footer_1>City of Vancouver.*)|'  
        rf'(?P<footer_2>.*Page\s+{page_number})'
    )
    
    # Correct all formatting issues found in a single pass
    for match in re.finditer(all_patterns_regex, page_text):
        if match.group('line_break'):
            # Add line breaks for line break issues
            page_text = re.sub(r'([a-z])([A-Z])', r'\1\n\2', page_text)
        else:
            # Delete headers and footers from the document
            page_text = re.sub(r'R1-1', '', page_text)
            page_text = re.sub(re.escape(match.group(0)), '', page_text)

    return page_text


In [15]:
pdf_file_name = 'zoning-by-law-district-schedule-r1-1.pdf'
# Read PDF & correct formatting issues
reader = PdfReader(pdf_file_name)
pdf_text = []

for page_number, page in enumerate(reader.pages, start=1):
    # Correct formatting issues before appending page text   
    page_text = correct_format_issues(page.extract_text(), page_number)
    #  print(f'Page Text Being Appended: {page_text}')
    pdf_text.append(page_text)

# Store document text in cell format, skip empty or blank lines
# Record page number associated with each line
csv_format_data = []
line_page_numbers = []
for page_number, page_text in enumerate(pdf_text, start=1):
    lines = page_text.splitlines()
    for line_index, line in enumerate(lines):
        if line:
            line_split = line.split()
            csv_format_data.append(line_split)
            line_page_numbers.append(page_number)

df_org = pd.DataFrame(csv_format_data)
df_org.insert(0, 'Page Number', line_page_numbers)
df_org.insert(1, 'Row Index', range(1, len(df_org) + 1))

In [16]:
# Initialize dictionary / vectors to hold DF column vectors to avoid repeat DF access.
all_sections = []
formatted_columns = {
    'Parent': [],
    'Section': [],
    'Section Title': [],
    'Section Start Page': [],
    'Row Index': []
}

# Text that may appear lowercase in section titles 
conjunctions = [
    'and', 'the', 'of', 'to', 'in', 'a', 'for', 'on', 'with', 'as', 'at', 
    'by', 'but', 'or', 'an', 'so', 'if', 'because', 'that', 'while', 'from', 
    'after', 'when', 'before', 'where', 'nor', 'yet', 'once', 'although',
    'since', 'than', 'though', 'unless', 'until', 'up', 'while'
]

def get_section_title(section_level, row):
    section_title = ''

    for cell in row:
        # End of section title text
        if cell is None:
            break
        # Not a section title, returns empty string
        if section_level == 0 and not cell.isupper(): 
            return ''
        if section_level > 1 and not cell[0].isupper() and cell not in conjunctions: 
            return ''
        section_title += f'{cell} '

    return section_title.strip()
        
def add_section_row(parent_section, section_number, section_title, section_page_start, section_row_index):
    formatted_columns['Parent'].append(parent_section)
    formatted_columns['Section'].append(section_number)
    formatted_columns['Section Title'].append(section_title)
    formatted_columns['Section Start Page'].append(section_page_start)
    formatted_columns['Row Index'].append(section_row_index)
    
def get_section_range(sections, idx):
    section_start = sections[idx]
    section_end = sections[idx + 1] if idx + 1 < len(sections) else len(df_org)
    return section_start, section_end

def get_section_data(section_level, parent_section, section_start, section_end):
    section_num_col = df_org.iloc[:, 2]
    subsection = 1
    sections = []

    for row_idx in range(section_start, section_end):
        if section_level == 0: 
            next_section = f'{subsection}'
        else: 
            next_section = f'{parent_section}.{subsection}'
        
        section_number = section_num_col[row_idx]
        if section_number.strip() == next_section:
            row = df_org.iloc[row_idx]
            section_title = get_section_title(section_level, row[3:])
            section_page_start = row['Page Number']
            add_section_row(parent_section, section_number, section_title, section_page_start, row_idx)
            sections.append(row_idx)
            subsection += 1
                
    return sections

def get_sections_body(section_row_idxs, section_row_titles):
    all_sections_body = []

    # Iterate through sections using enumerate on row_index
    for section_idx, current_row_index in enumerate(section_row_idxs):
        # Body text for subsections without a title begin from the same row as the Section itself 
        if section_row_titles[section_idx]:
            start_row = current_row_index + 1
        else: 
            start_row = current_row_index
        end_row = section_row_idxs[section_idx + 1] if section_idx + 1 < len(section_row_idxs) else len(df_org)

        # Extract rows between current and next row index
        section_body_rows = df_org.iloc[start_row:end_row, 2:]
        section_body_lines = []

        first_row_first_cell = True
        for _, row in section_body_rows.iterrows():
            row_text = ''
            for cell in row:
                if first_row_first_cell and not section_row_titles[section_idx]:
                    first_row_first_cell = False
                if cell is None:
                    break
                row_text += f'{cell} '
            section_body_lines.append(row_text.strip())

        # Combine all rows into a single string with newlines
        section_body_text = '\n'.join(section_body_lines)
        all_sections_body.append(section_body_text)
        
    return all_sections_body

def get_sections_end_page(df_final):
    # Identify the starting row index for each page
    page_start_row_idxs = df_org.groupby('Page Number')['Row Index'].min().tolist()
    # Identify the last page in the document and its row index
    last_page = int(df_org.iloc[-1]['Page Number']) 
    last_row_idx = int(df_org.iloc[-1]['Row Index']) 
    
    temp_df = df_final[['Row Index', 'Parent', 'Section', 'Section Start Page']].copy()
    temp_df['Level'] = temp_df['Section'].astype(str).apply(lambda x: x.count('.'))
    temp_df['Next Sequential Start Page'] = temp_df['Section Start Page'].shift(-1).fillna(last_page).astype(int)
    temp_df['Next Sequential Start Row'] = temp_df['Row Index'].shift(-1).fillna(last_row_idx).astype(int)
    temp_df['End Page'] = None
    
    # Sort temp_df by Level, Parent Section, and Section
    temp_df = temp_df.sort_values(by=['Level', 'Parent', 'Section'])

    # Iterate through temp_df up to len(temp_df) - 2
    for i in range(len(temp_df) - 1):  # Corrected loop range to include last section
        current_level = temp_df.iloc[i]['Level']
        next_level = temp_df.iloc[i + 1]['Level']
        current_parent = temp_df.iloc[i]['Parent']
        next_parent = temp_df.iloc[i + 1]['Parent']

        # Force integer type conversion
        next_start_page_sorted = int(temp_df.iloc[i + 1]['Section Start Page'])
        next_index_sorted = int(temp_df.iloc[i + 1]['Row Index']) + 1
        next_start_page_original = int(temp_df.iloc[i]['Next Sequential Start Page'])
        next_index_original = int(temp_df.iloc[i]['Next Sequential Start Row']) + 1

        print(f"\nProcessing Row {i}:")
        print(f"  - Current Level: {current_level}")
        print(f"  - Next Level: {next_level}")
        print(f"  - Current Parent: {current_parent}")
        print(f"  - Next Parent: {next_parent}")
        print(f"  - Next Start Page (Sorted): {next_start_page_sorted}")
        print(f"  - Next Index (Sorted): {next_index_sorted}")
        print(f"  - Next Start Page (Original): {next_start_page_original}")
        print(f"  - Next Index (Original): {next_index_original}")

        # Condition 1: Same level, same parent
        if current_level == next_level and current_parent == next_parent:
            if next_index_sorted in page_start_row_idxs:
                temp_df.at[i, 'End Page'] = next_start_page_sorted - 1
            else:
                temp_df.at[i, 'End Page'] = next_start_page_sorted
            print(f"  → Condition 1 executed: Same Level & Same Parent. End Page set to {temp_df.at[i, 'End Page']}.")

        # Condition 2: Same level, different parent
        elif current_level == next_level and current_parent != next_parent:
            if next_index_original in page_start_row_idxs:
                temp_df.at[i, 'End Page'] = next_start_page_original - 1
            else:
                temp_df.at[i, 'End Page'] = next_start_page_original
            print(f"  → Condition 1 executed: Same Level & Different Parent. End Page set to {temp_df.at[i, 'End Page']}.")
        # Condition 3: Different levels
        else:
            print(f"  → Condition 3 executed: Different Levels. End Page set to {last_page}.")
            temp_df.at[i, 'End Page'] = last_page
    
    temp_df.at[len(temp_df) - 1, 'End Page'] = last_page
    temp_df = temp_df.sort_values(by=['Row Index'])
    return temp_df['End Page'].astype(int).tolist()  # Ensure output is a list of integers

def generate_csv_format():
    # Create DataFrame from formatted_columns
    df_final = pd.DataFrame(formatted_columns)
    df_final = df_final.sort_values(by='Row Index', ascending=True)

    # Extract body text by section row indices
    section_row_idxs = df_final['Row Index'].tolist()
    section_row_titles = df_final['Section Title'].tolist()
    sections_body = get_sections_body(section_row_idxs, section_row_titles)

    # Insert 'Section Body Text' column after 'Section Title'
    body_column_idx = df_final.columns.get_loc('Section Title') + 1
    df_final.insert(body_column_idx, 'Section Body Text', sections_body)

    # Extract end pages from following section start page
    section_end_pages = get_sections_end_page(df_final)

    # Insert 'Section End Page' column after 'Section Start Page'
    end_page_idx = df_final.columns.get_loc('Section Start Page') + 1
    df_final.insert(end_page_idx, 'Section End Page', section_end_pages)

    df_final = df_final.drop(columns=['Row Index'])
    return df_final

In [17]:
# Identify the starting row index for each page
page_start_row_idxs = df_org.groupby('Page Number')['Row Index'].min().tolist()
page_start_row_idxs

[1,
 31,
 69,
 106,
 143,
 177,
 230,
 260,
 288,
 345,
 401,
 427,
 480,
 483,
 513,
 547,
 579]

Begin parsing document by extracting only main section heading values and their indices.  
Iterate through main sections to find all subsections.

In [18]:
all_sections = []
# Initial pass through the original data to extract main section row indices
main_sections = get_section_data(0, None, 0, len(df_org))
all_sections.append(main_sections)
all_sections_idx = 0

for section_level, current_section in enumerate(all_sections, start=1):
    next_level_sections = []
    for section_idx, row_idx in enumerate(current_section):
        parent_section = formatted_columns['Section'][all_sections_idx]
        section_start, section_end = get_section_range(current_section, section_idx)
        subsections = get_section_data(section_level, parent_section, section_start, section_end)
        next_level_sections.extend(subsections)
        all_sections_idx += 1

    if next_level_sections: 
        all_sections.append(next_level_sections)
        
# Generate the final CSV format
converted_data = generate_csv_format()
converted_data.to_csv('final-csv-conversion.csv', index=False)
converted_data


Processing Row 0:
  - Current Level: 0
  - Next Level: 0
  - Current Parent: None
  - Next Parent: None
  - Next Start Page (Sorted): 2
  - Next Index (Sorted): 31
  - Next Start Page (Original): 1
  - Next Index (Original): 3
  → Condition 1 executed: Same Level & Same Parent. End Page set to 1.

Processing Row 1:
  - Current Level: 0
  - Next Level: 0
  - Current Parent: None
  - Next Parent: None
  - Next Start Page (Sorted): 7
  - Next Index (Sorted): 230
  - Next Start Page (Original): 2
  - Next Index (Original): 32
  → Condition 1 executed: Same Level & Same Parent. End Page set to 6.

Processing Row 2:
  - Current Level: 0
  - Next Level: 0
  - Current Parent: None
  - Next Parent: None
  - Next Start Page (Sorted): 14
  - Next Index (Sorted): 483
  - Next Start Page (Original): 7
  - Next Index (Original): 232
  → Condition 1 executed: Same Level & Same Parent. End Page set to 13.

Processing Row 3:
  - Current Level: 0
  - Next Level: 1
  - Current Parent: None
  - Next Pare

Unnamed: 0,Parent,Section,Section Title,Section Body Text,Section Start Page,Section End Page
0,,1,INTENT AND OVERVIEW,,1,1
4,1,1.1,Intent,The intent of this Residential Inclusive distr...,1,1
5,1,1.2,Overview,The table below provides an overview of the ou...,1,1
1,,2,USE REGULATIONS,,2,6
6,2,2.1,Outright and Conditional Approval Uses,All outright and conditional approval uses are...,2,4
...,...,...,...,...,...,...
40,4.4,4.4.1,,4.4.1 No portion of the basement or cellar may...,16,16
41,4.4,4.4.2,,4.4.2 The surface of the ground adjoining a bu...,16,17
42,4.4,4.4.3,,4.4.3 Exterior window s in a secondary suite o...,17,17
43,4.4,4.4.4,,"4.4.4 For multiple dwelling , no exterior stai...",17,17
