In [7]:
import fitz  # PyMuPDF
import pandas as pd
import re


## First, we need to define a function which can extract Name, Addres, Email, Telephone, and Website

In [10]:
def extract_information(pdf_path, start_page, end_page):
    doc = fitz.open(pdf_path)
    data = []

    for page_num in range(start_page - 1, end_page):
        page = doc.load_page(page_num)
        text = page.get_text("text").splitlines()

        current_entry = {"Name": "", "Address": [], "Email": "", "Telephone": "", "Website": ""}
        capturing_address = False
        skip_next_line_for_address = False

        for i, line in enumerate(text):
            if skip_next_line_for_address:
                skip_next_line_for_address = False
                capturing_address = True
                continue

            if '[' in line and not capturing_address:
                if current_entry["Name"]:  # Save previous entry if exists
                    current_entry["Address"] = ' '.join(current_entry["Address"]).strip()
                    data.append(current_entry)
                    current_entry = {"Name": "", "Address": [], "Email": "", "Telephone": "", "Website": ""}
                
                current_entry["Name"] = line.split('[')[0].strip()
                if ']' not in line:  # Name continues on the next line
                    skip_next_line_for_address = True
                else:
                    capturing_address = True

            elif capturing_address:
                if re.search(r'\d{5}', line):  
                    current_entry["Address"].append(line)
                    capturing_address = False  # Stop capturing address
                    current_entry["Address"] = ' '.join(current_entry["Address"]).strip()
                else:
                    current_entry["Address"].append(line)

            elif ']' not in line and not capturing_address:  # Look for contact details after address capture
                if "Email" in line or "@" in line:  
                    current_entry["Email"] = line
                elif "Telephone" in line or "Tel" in line or "Phone" in line:  
                    current_entry["Telephone"] = line
                elif "Website" in line or line.startswith("http"): 
                    current_entry["Website"] = line

        # Add the last entry if exists
        if current_entry["Name"]:
            if isinstance(current_entry["Address"], list):  
                current_entry["Address"] = ' '.join(current_entry["Address"]).strip()
            data.append(current_entry)

    doc.close()
    df = pd.DataFrame(data)
    return df

## extract_information() takes three arguments: the path of the pdf file, start page, and end page

**We will run this function, iterating over pages 16 to 23 in the pdf, the pages containing State Senators and Assemblymembers**

**Then, we will add "Title" column, defining the first 39 rows as State Senators and the remaining rows as Assemlymembers, as observed in the pdf**

In [30]:
pdf_path = 'data.pdf'
start_page = 16
end_page = 23
df = extract_information(pdf_path, start_page, end_page)
df.loc[0:38, 'Title'] = 'State Senator'
df.loc[39:118, 'Title'] = 'Assemblymember'
df[35:40]

Unnamed: 0,Name,Address,Email,Telephone,Website,Title
35,"Umberg, Thomas J.","C a p i t o l O f f i c e , 1 0 2 1 O ...",Email: senator.umberg@senate.ca.gov,Telephone: (714) 558-3785,,State Senator
36,"Wieckowski, Bob","C a p i t o l O f f i c e , 1 0 2 1 O ...",Email: senator.wieckowski@senate.ca.gov,Telephone: (510) 794-3900,,State Senator
37,"Wiener, Scott D.","C a p i t o l O f f i c e , 1 0 2 1 O ...",Email: senator.wiener@senate.ca.gov,Telephone: (415) 557-1300,,State Senator
38,"Wilk, Scott","Capitol Office,1021 O Street, Suite 7640, Sacr...",Email: senator.wilk@senate.ca.gov,Telephone: (760) 843-8414,,State Senator
39,"Aguiar-Curry, Cecilia M.","C a p i t o l O f f i c e , R o o m 5 1 ...",Email: assemblymember.aguiar-curry@assembly.ca...,Telephone: (707)-576-0400,Website: www.assembly.ca.gov,Assemblymember


## For tabulating other officials, we will need to create a new function for a new format in the document, starting on page 51