In [43]:
import pandas as pd
import csv
import re
from PyPDF2 import PdfReader
import pdfplumber

In [None]:
# Input/output file names and the containers used while reading the PDF
pdf_file_name = 'zoning-by-law-district-schedule-r1-1.pdf'
csv_file_name = 'extracted-text.csv'
pdf_text, page_break_idxs, bold_line_idxs = [], [], []

# Collect the extracted text of every page, in page order
with pdfplumber.open(pdf_file_name) as pdf:
    pdf_text.extend(page.extract_text() for page in pdf.pages)

Read the PDF, correct header/table formatting errors in the text, convert the text to CSV-format data, and initialize DataFrames to hold the original input and the final output data.

Note: There is an issue with the title format of section 3.1 (the first character of each non-conjunctive word is not capitalized). Switched from the PyPDF2 library to pdfplumber in order to detect the 'Bold' fontname of lines and use it to extract section titles.

In [36]:
# Source PDF and CSV output file names
pdf_file_name = 'zoning-by-law-district-schedule-r1-1.pdf'
csv_file_name = 'extracted-text.csv'

# A lowercase letter directly followed by an uppercase letter is treated as a
# line break that was not registered - e.g. tables, diagram labels
new_line_pattern = r'([a-z])([A-Z])'
# 'R1-1' page header immediately followed by non-whitespace text
header_pattern = r'(R1-1)(\S+)'

# Read PDF & correct formatting issues, one page at a time
reader = PdfReader(pdf_file_name)
pdf_text = []
for page in reader.pages:
    page_text = page.extract_text()
    # re.sub leaves the text unchanged when the pattern does not occur
    page_text = re.sub(new_line_pattern, r'\1\n\2', page_text)
    # Drop the 'R1-1' header while keeping the text that follows it
    page_text = re.sub(header_pattern, r'\2', page_text)
    pdf_text.append(page_text)

# Split every line of every page into whitespace-separated cells
csv_format_data = [
    text_line.split()
    for page_text in pdf_text
    for text_line in page_text.splitlines()
]

# DF holding the raw tokenised lines for processing
df_org = pd.DataFrame(csv_format_data)

# Empty DF that will hold the formatted section data
columns = ['Parent Section', 'Section', 'Section Title', 'Section Body', 'Section Start Page', 'Section End Page']
df_final = pd.DataFrame(columns=columns)

# One independent list per output column (plus the original row indices),
# filled while parsing to avoid repeated DF access
parent, section, title, body, start_pg, end_pg, df_row_idxs = ([] for _ in range(7))

In [37]:
# Drop every row that precedes the first row whose first cell is exactly '1'
def find_start():
    """Trim the global ``df_org`` so that it begins at the first row whose first cell is '1'.

    The frame is re-indexed from 0 after trimming. When no such row exists,
    ``df_org`` is left untouched.
    """
    global df_org

    matching_positions = (
        position
        for position, value in enumerate(df_org[0])
        if value == '1'
    )
    first_match = next(matching_positions, None)
    if first_match is None:
        return

    df_org = df_org.iloc[first_match:].reset_index(drop=True)

# Trim df_org (find_start reassigns the global frame), then write the trimmed
# frame to CSV without the index column
find_start()
df_org.to_csv('org-data-extracted.csv', index=False)

Begin parsing the document by extracting only the main section headings and their row indices.

In [38]:
# Extract Parent Sections
curr_section = 0
next_section = 1
main_idxs = []
all_section_idxs = []

# Extract all main section headings: a heading row starts with the next
# expected section number, followed by uppercase title text
for idx, row in df_org.iterrows():
    first_cell = row.iloc[0]
    # Blank or short lines leave missing values in the frame; they cannot start a heading
    if not isinstance(first_cell, str):
        continue

    equals_next_section = first_cell.isdigit() and int(first_cell) == curr_section + 1
    if not equals_next_section:
        continue

    # Skip row if subsequent text is missing or not strictly uppercase.
    # Only this row is skipped; scanning continues so that the real heading
    # (and every later main section) can still be found further down.
    heading_word = row.iloc[1] if len(row) > 1 else None
    if not (isinstance(heading_word, str) and heading_word.isupper()):
        continue

    curr_section = int(first_cell)
    next_section = curr_section + 1

    # The title is every cell after the section number, up to the first
    # missing cell (None or NaN); each word is followed by a single space
    section_title = ''
    for cell in row.iloc[1:]:
        if pd.isna(cell):
            break
        section_title += f'{cell} '

    # Add section to the column lists; main sections have no parent
    parent.append(None)
    section.append(curr_section)
    title.append(section_title)
    body.append(None)
    start_pg.append(None)
    end_pg.append(None)
    df_row_idxs.append(idx)
    # Save section number and row index in original dataframe as tuple
    main_idxs.append((curr_section, idx))

# The main sections form the first "depth" level to be searched for subsections
all_section_idxs.append(main_idxs)

Parse subsections until each section's "depth" is exhausted.  

TODO: Combine the code below with the code in the previous code block to remove repeated code.

In [42]:
# Walk the section hierarchy breadth-first. all_section_idxs is extended while
# it is being iterated: every list of newly found subsections is appended and
# is itself searched for deeper subsections on a later pass of the outer loop.

# Section label -> row index at which the range of that section's parent ends.
# Used to bound the search range of the last sibling in each list.
scope_end = {}

for section_idxs in all_section_idxs:
    # Iterate through (section number, data index) tuples for all sibling sections in this list
    for position, (parent_section, section_start) in enumerate(section_idxs):
        if position + 1 < len(section_idxs):
            # The section ends where its next sibling starts
            section_end = section_idxs[position + 1][1]
        else:
            # Last sibling: stop at the end of the enclosing section
            # (main sections have no recorded bound and run to the end of the data)
            section_end = scope_end.get(parent_section, len(df_org))

        sub_section = 1
        sub_idxs = []  # subsections found directly under parent_section

        # Look for next subsection value match in first column
        for row_idx in range(section_start + 1, section_end):
            next_sub = f'{parent_section}.{sub_section}'
            first_cell = df_org.iloc[row_idx, 0]

            # Missing first cells (blank lines) can never be a subsection heading
            if not isinstance(first_cell, str) or first_cell.strip() != next_sub:
                continue

            sub_section += 1

            # The title is every cell after the section number, up to the
            # first missing cell (None or NaN); each word is followed by a space
            subsection_title = ''
            for cell in df_org.iloc[row_idx, 1:]:
                if pd.isna(cell):
                    break
                subsection_title += f'{cell} '

            parent.append(parent_section)
            section.append(next_sub)
            title.append(subsection_title)
            body.append(None)
            start_pg.append(None)
            end_pg.append(None)
            df_row_idxs.append(row_idx)
            sub_idxs.append((next_sub, row_idx))
            scope_end[next_sub] = section_end

        # Queue the new siblings so they are searched for their own subsections
        if sub_idxs:
            all_section_idxs.append(sub_idxs)

In [40]:
# Map each output column to the list collected for it during parsing
column_values = {
    'Parent Section': parent,
    'Section': section,
    'Section Title': title,
    'Section Body': body,
    'Section Start Page': start_pg,
    'Section End Page': end_pg,
    'Row Index Original': df_row_idxs,
}
for column_name, values in column_values.items():
    df_final[column_name] = values

# Order the sections by their position in the original data, then renumber the rows
df_final = df_final.sort_values(by='Row Index Original', ascending=True)
df_final = df_final.reset_index(drop=True)

df_final.to_csv('test-extracted-sections.csv')
df_final

Unnamed: 0,Parent Section,Section,Section Title,Section Body,Section Start Page,Section End Page,Row Index Original
0,,1,INTENT AND OVERVIEW,,,,0
1,1,1.1,Intent,,,,1
2,1,1.2,Overview,,,,12
3,,2,USE REGULATIONS,,,,31
4,2,2.1,Outright and Conditional Approval Uses,,,,32
...,...,...,...,...,...,...,...
72,4.4,4.4.1,No portion of the basement or cellar may proje...,,,,593
73,4.4,4.4.2,The surface of the ground adjoining a building...,,,,595
74,4.4,4.4.3,Exterior window s in a secondary suite or lock...,,,,614
75,4.4,4.4.4,"For multiple dwelling , no exterior stairway c...",,,,617
