In [2]:
import pandas as pd
from civic_scraper.platforms import CivicPlusSite

In [3]:
# Load the hearings sheet (skipping its first line), keep the first eight columns,
# narrow to the fields used below, and drop rows that have no agenda URL.
df = (
    pd.read_csv('hearings.csv', skiprows=1)
    .iloc[:, :8]
    .loc[:, ['County', 'Jurisdiction', 'RHNA', 'Agenda Page']]
    .dropna(subset=['Agenda Page'])
)

In [4]:
# Number of distinct, non-null agenda URLs containing the exact (case-sensitive) text 'AgendaCenter'.
sum('AgendaCenter' in url for url in df['Agenda Page'].dropna().unique())

57

The substring 'AgendaCenter' in a URL indicates that the city uses CivicPlus.

The CivicPlus scraper fails if the URL includes the legislative body name, like so:

In [5]:
# Example URLs that carry a legislative-body suffix after AgendaCenter, one per line.
print(
    'https://www.malibucity.org/AgendaCenter/Planning-Commission-4',
    'https://www.malibucity.org/AgendaCenter/City-Council-8',
    'https://ca-irwindale2.civicplus.com/AgendaCenter/City-Council-1',
    'https://www.irwindaleca.gov/AgendaCenter/Planning-Commission-3',
    sep='\n',
)

https://www.malibucity.org/AgendaCenter/Planning-Commission-4
https://www.malibucity.org/AgendaCenter/City-Council-8
https://ca-irwindale2.civicplus.com/AgendaCenter/City-Council-1
https://www.irwindaleca.gov/AgendaCenter/Planning-Commission-3


So we need to strip everything after `AgendaCenter` from each URL.

In [6]:
# Case-insensitive match: keep every row whose agenda URL mentions "agendacenter".
civicplus_cities = df.loc[df['Agenda Page'].str.lower().str.contains('agendacenter')]

In [7]:
# How many distinct jurisdictions have a CivicPlus agenda page?
civicplus_cities['Jurisdiction'].nunique()

40

In [8]:
# Trim every agenda URL down to its AgendaCenter root, e.g.
#   https://www.malibucity.org/AgendaCenter/City-Council-8 -> https://www.malibucity.org/AgendaCenter
# The match is case-insensitive and the part of the URL before "AgendaCenter" keeps its
# original casing, so every URL of a given site normalises to the same string whether or
# not it carried a suffix (the previous version lowercased only the URLs it truncated,
# which could defeat the de-duplication on 'Agenda Page' that follows).
# `assign` builds a new frame rather than writing into a slice of `df`, which avoids
# pandas' SettingWithCopyWarning and makes the cell safe to re-run.
civicplus_cities = civicplus_cities.assign(**{
    'Agenda Page': civicplus_cities['Agenda Page'].str.replace(
        r'agendacenter.*', 'AgendaCenter', case=False, regex=True
    )
})

In [9]:
# Keep only the first row for each distinct agenda URL.
civicplus_cities = civicplus_cities.drop_duplicates(subset=['Agenda Page'], keep='first')

In [10]:
# Keep only the first row for each (County, Jurisdiction) pair.
civicplus_cities = civicplus_cities.drop_duplicates(
    subset=['County', 'Jurisdiction'],
    keep='first',
)

In [11]:
# Sanity check: one row per (County, Jurisdiction) pair and the four columns selected at load time.
civicplus_cities.shape

(40, 4)

In [14]:
import concurrent.futures
import os

# Maps jurisdiction name -> scraped asset metadata (an empty list when scraping failed).
assets_map = {}

def process_row(row):
    """Scrape one jurisdiction's CivicPlus AgendaCenter and save its asset metadata.

    Args:
        row: A row of `civicplus_cities`; the 'Agenda Page' and 'Jurisdiction'
            fields are read.

    Returns:
        A `(jurisdiction, assets_metadata)` tuple. `assets_metadata` is whatever
        `CivicPlusSite.scrape` returned, or an empty list if anything raised
        while scraping or writing the metadata.
    """
    # Read the row fields before the try block so the error handler can always report them.
    jurisdiction = row['Jurisdiction']
    url = row['Agenda Page']
    try:
        site = CivicPlusSite(url)
        assets_metadata = site.scrape(
            start_date='2024-03-26',
            download=True,
            file_size=20,  # presumably a size cap for downloaded files -- confirm units in civic-scraper docs
            asset_list=['agenda', 'minutes', 'agenda_packet', 'captions']
        )
        # Write the metadata CSV into a per-jurisdiction directory, creating it first
        # so a missing directory is not misreported as a scraping failure.
        metadata_path = f"./civic-scraper/{jurisdiction}/"
        os.makedirs(metadata_path, exist_ok=True)
        assets_metadata.to_csv(metadata_path)
        return jurisdiction, assets_metadata
    except Exception as e:
        # Best effort: report the failing URL and keep going with the other sites.
        # A single print call keeps the URL and its error together when threads interleave.
        print(f"{url}\n{e}")
        return jurisdiction, []

total = civicplus_cities.shape[0]
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit each row for processing. Note: iterrows yields (index, row); only the row is needed.
    futures = [executor.submit(process_row, row) for _, row in civicplus_cities.iterrows()]

    # Use as_completed to print progress as each future finishes
    for i, future in enumerate(concurrent.futures.as_completed(futures), start=1):
        jurisdiction, assets_metadata = future.result()
        assets_map[jurisdiction] = assets_metadata
        print(round(100 * i / total, 2), '%')


2.5 %
5.0 %
7.5 %
10.0 %
12.5 %
15.0 %
17.5 %
20.0 %
22.5 %
25.0 %
27.5 %
30.0 %
https://www.lahabracity.com/AgendaCenter
('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))
32.5 %
35.0 %
37.5 %
40.0 %
42.5 %
45.0 %
47.5 %
50.0 %
52.5 %
55.0 %
57.5 %
60.0 %
62.5 %
65.0 %
67.5 %
70.0 %
72.5 %
75.0 %
77.5 %
80.0 %
82.5 %
85.0 %
87.5 %
90.0 %
92.5 %
95.0 %
97.5 %
100.0 %


In [13]:
# Number of jurisdictions for which the scrape returned at least one asset.
sum(1 for assets in assets_map.values() if len(assets) > 0)

34