In [126]:
import pandas as pd
import csv
import re
from PyPDF2 import PdfReader

Read the PDF, correct header/table formatting errors in the extracted text, convert the text to CSV-format data, and initialize DataFrames to hold the original input and the final output data.

In [127]:
# Input PDF; csv_file_name is defined here but not used in this cell
pdf_file_name = 'zoning-by-law-district-schedule-r1-1.pdf'
csv_file_name = 'extracted-text.csv'

# A lowercase letter directly followed by an uppercase letter is treated as a
# line break that was not registered - e.g. tables, diagram labels
new_line_pattern = r'([a-z])([A-Z])'
# 'R1-1' header fused onto the non-whitespace text that follows it
header_pattern = r'(R1-1)(\S+)'

# Read PDF & correct formatting issues, keeping one cleaned string per page
reader = PdfReader(pdf_file_name)
pdf_text = []
for page in reader.pages:
    # re.sub hands back its input untouched when nothing matches, so the
    # substitutions can be applied unconditionally
    page_text = re.sub(new_line_pattern, r'\1\n\2', page.extract_text())
    # Remove 'R1-1' header from each page, keeping the text that followed it
    page_text = re.sub(header_pattern, r'\2', page_text)
    pdf_text.append(page_text)

# Split every line of every page into whitespace-separated cells
csv_format_data = []
for page_text in pdf_text:
    csv_format_data += [line.split() for line in page_text.splitlines()]

# Initialize DF with CSV formatted data for processing
df_org = pd.DataFrame(csv_format_data)

# Initialize DF to hold formatted data
columns = [
    'Parent Section',
    'Section',
    'Section Title',
    'Section Body',
    'Section Start Page',
    'Section End Page',
]
df_final = pd.DataFrame(columns=columns)

In [128]:
# Remove lines appearing before Section 1
def find_start():
    """Drop every row of the global ``df_org`` that precedes Section 1.

    Column 0 is scanned from the top for the first cell equal to the string
    ``'1'``. When such a row exists, ``df_org`` is rebound to the slice that
    starts there, with the index renumbered from 0. When no cell matches,
    ``df_org`` is left as it is.
    """
    global df_org

    first_match = next(
        (position for position, value in enumerate(df_org[0]) if value == '1'),
        None,
    )
    if first_match is None:
        return

    df_org = df_org.iloc[first_match:].reset_index(drop=True)

# Trim df_org so it begins at Section 1 (find_start rebinds the global df_org)
find_start()
# Write the trimmed frame to disk without the index column
# (presumably for manual inspection of the extraction - confirm)
df_org.to_csv('extract-test.csv', index=False)
# Last expression of the cell: display the trimmed frame
df_org

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,INTENT,AND,OVERVIEW,,,,,,,...,,,,,,,,,,
1,1.1,Intent,,,,,,,,,...,,,,,,,,,,
2,The,intent,of,this,Residential,Inclusive,district,schedule,is,to,...,small-scale,housing,,,,,,,,
3,options,while,retaining,the,single,lot,character,of,the,area.,...,dwelling,s,,,,,,,,
4,(“multiplex”,up,to,6,dwelling,"units,",or,up,to,8,...,and,single,detached,houses.,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615,(a),10%,of,the,total,floor,area,of,the,"room,",...,"kitchen,",living,room,and,dining,room;,and,,,
616,(b),5%,of,the,total,floor,area,of,the,"room,",...,except,bathrooms,and,laundry,rooms.,,,,,
617,4.4.4,For,multiple,dwelling,",",no,exterior,stairway,can,exceed,...,m.,,,,,,,,,
618,4.4.5,For,multiple,dwelling,",",a,minimum,area,of,7.4,...,must,be,provided,in,the,form,of,,,


In [129]:
# One accumulator list per output column; values are collected here and
# assigned to the DataFrame afterwards to avoid repeat DF access.
# The generator yields a fresh list each time, so the six names never alias.
parent, section, title, body, start_pg, end_pg = ([] for _ in range(6))

Begin parsing document by extracting only main section heading values and their indices.

In [130]:
# Extract Parent Sections
# A row is a main section heading when its first cell is the next expected
# whole number (1, 2, 3, ...) and the cell after it is upper-case text.
curr_parent = 0
next_parent = 1
parent_idxs = []
all_section_idxs = []

# Extract all main section headings
for idx, row in df_org.iterrows():
    first_cell = row[0]

    # Short or blank source lines leave padded (None/NaN) cells; guard before
    # calling str methods on them.
    if not isinstance(first_cell, str):
        continue
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as '²' that int() cannot convert.
    if not (first_cell.isdecimal() and int(first_cell) == next_parent):
        continue

    # Skip row if subsequent text is not strictly uppercase. The scan must
    # continue here: a numbered row that is not a heading (e.g. a table cell)
    # must not hide the real headings further down.
    heading_start = row[1]
    if not isinstance(heading_start, str) or not heading_start.isupper():
        continue

    curr_parent = int(first_cell)
    next_parent = curr_parent + 1

    # The title is every cell after the number, up to the first padded cell.
    title_words = []
    for cell in row.iloc[1:]:
        if pd.isna(cell):
            break
        title_words.append(str(cell))
    # Joined with single spaces, so no trailing space is left on the title.
    section_title = ' '.join(title_words)

    # Add section to the column lists and record indices of (main) parent sections
    parent.append(None)
    section.append(curr_parent)
    title.append(section_title)
    body.append(None)
    start_pg.append(None)
    end_pg.append(None)
    parent_idxs.append(idx)

# First entry of all_section_idxs holds the row indices of the main sections
all_section_idxs.append(parent_idxs)

# Move this code segment to AFTER all sections and subsections have been added to vectors
# after completing function to extract subsections

# Fill the output frame column by column from the accumulated lists; the dict
# keeps insertion order, so columns are assigned in the order written here.
column_values = {
    'Parent Section': parent,
    'Section': section,
    'Section Title': title,
    'Section Body': body,
    'Section Start Page': start_pg,
    'Section End Page': end_pg,
    'Row Index Original': parent_idxs,
}
for column_name, values in column_values.items():
    df_final[column_name] = values

# Last expression of the cell: display the populated frame
df_final

Unnamed: 0,Parent Section,Section,Section Title,Section Body,Section Start Page,Section End Page,Row Index Original
0,,1,INTENT AND OVERVIEW,,,,0
1,,2,USE REGULATIONS,,,,31
2,,3,"DENSITY, FORM AND PLACEMENT REGULATIONS",,,,240
3,,4,GENERAL REGULATIONS,,,,507


In [131]:
# sub_idxs = []

# for section_idx, section in enumerate(all_section_idxs):
#     curr_section = all_section_idxs[section_idx]
#     curr_sub = 0
#     next_sub = 1
#     section_depth = section_idx
#     for idx in curr_section:
        # actual section number = df_final at idx + 1
        # start search at curr_section[idx], end search at curr_section[idx+1]
        # check for str == section number.{curr_sub + 1}
