In [13]:
import pandas as pd
import requests
import numpy as np
from urllib.parse import urlparse
import time
import os

This notebook retrieves the legistar client names for cities and towns in California. We get up to 100 free google search API calls per day, and after that it's \\$5 for a thousand calls - so about \\$2.50 to run through the whole list of cities + towns. Won't break us to run it ocassionally, but we shouldn't execute this script mindlessly.

If you need to run this script, you have to set up environment variables. I included them in the Vercel list of environment variables - even though they're not required for deployment of our website, it's still nice to have them in a shared place.

In [20]:
cities = [ # this list is generated by copy/pasting the wikipedia table here: 
    # https://en.wikipedia.org/wiki/List_of_municipalities_in_California
    # and asking chatgpt for a python literal list
    "Adelanto", "Agoura Hills", "Alameda", "Albany", "Alhambra", "Aliso Viejo", "Alturas", "Amador City",
    "American Canyon", "Anaheim", "Anderson", "Angels Camp", "Antioch", "Apple Valley", "Arcadia", "Arcata",
    "Arroyo Grande", "Artesia", "Arvin", "Atascadero", "Atherton", "Atwater", "Auburn", "Avalon", "Avenal",
    "Azusa", "Bakersfield", "Baldwin Park", "Banning", "Barstow", "Beaumont", "Bell", "Bell Gardens",
    "Bellflower", "Belmont", "Belvedere", "Benicia", "Berkeley", "Beverly Hills", "Big Bear Lake", "Biggs",
    "Bishop", "Blue Lake", "Blythe", "Bradbury", "Brawley", "Brea", "Brentwood", "Brisbane", "Buellton",
    "Buena Park", "Burbank", "Burlingame", "Calabasas", "Calexico", "California City", "Calimesa", "Calipatria",
    "Calistoga", "Camarillo", "Campbell", "Canyon Lake", "Capitola", "Carlsbad", "Carmel-by-the-Sea",
    "Carpinteria", "Carson", "Cathedral City", "Ceres", "Cerritos", "Chico", "Chino", "Chino Hills", "Chowchilla",
    "Chula Vista", "Citrus Heights", "Claremont", "Clayton", "Clearlake", "Cloverdale", "Clovis", "Coachella",
    "Coalinga", "Colfax", "Colma", "Colton", "Colusa", "Commerce", "Compton", "Concord", "Corcoran", "Corning",
    "Corona", "Coronado", "Corte Madera", "Costa Mesa", "Cotati", "Covina", "Crescent City", "Cudahy",
    "Culver City", "Cupertino", "Cypress", "Daly City", "Dana Point", "Danville", "Davis", "Del Mar",
    "Del Rey Oaks", "Delano", "Desert Hot Springs", "Diamond Bar", "Dinuba", "Dixon", "Dorris", "Dos Palos",
    "Downey", "Duarte", "Dublin", "Dunsmuir", "East Palo Alto", "Eastvale", "El Cajon", "El Centro", "El Cerrito",
    "El Monte", "El Segundo", "Elk Grove", "Emeryville", "Encinitas", "Escalon", "Escondido", "Etna", "Eureka",
    "Exeter", "Fairfax", "Fairfield", "Farmersville", "Ferndale", "Fillmore", "Firebaugh", "Folsom", "Fontana",
    "Fort Bragg", "Fort Jones", "Fortuna", "Foster City", "Fountain Valley", "Fowler", "Fremont", "Fresno",
    "Fullerton", "Galt", "Garden Grove", "Gardena", "Gilroy", "Glendale", "Glendora", "Goleta", "Gonzales",
    "Grand Terrace", "Grass Valley", "Greenfield", "Gridley", "Grover Beach", "Guadalupe", "Gustine",
    "Half Moon Bay", "Hanford", "Hawaiian Gardens", "Hawthorne", "Hayward", "Healdsburg", "Hemet", "Hercules",
    "Hermosa Beach", "Hesperia", "Hidden Hills", "Highland", "Hillsborough", "Hollister", "Holtville",
    "Hughson", "Huntington Beach", "Huntington Park", "Huron", "Imperial", "Imperial Beach", "Indian Wells",
    "Indio", "City of Industry", "Inglewood", "Ione", "Irvine", "Irwindale", "Isleton", "Jackson", "Jurupa Valley",
    "Kerman", "King City", "Kingsburg", "La Cañada Flintridge", "La Habra", "La Habra Heights", "La Mesa",
    "La Mirada", "La Palma", "La Puente", "La Quinta", "La Verne", "Lafayette", "Laguna Beach", "Laguna Hills",
    "Laguna Niguel", "Laguna Woods", "Lake Elsinore", "Lake Forest", "Lakeport", "Lakewood", "Lancaster",
    "Larkspur", "Lathrop", "Lawndale", "Lemon Grove", "Lemoore", "Lincoln", "Lindsay", "Live Oak", "Livermore",
    "Livingston", "Lodi", "Loma Linda", "Lomita", "Lompoc", "Long Beach", "Loomis", "Los Alamitos", "Los Altos",
    "Los Altos Hills", "Los Angeles", "Los Banos", "Los Gatos", "Loyalton", "Lynwood", "Madera", "Malibu",
    "Mammoth Lakes", "Manhattan Beach", "Manteca", "Maricopa", "Marina", "Martinez", "Marysville", "Maywood",
    "McFarland", "Mendota", "Menifee", "Menlo Park", "Merced", "Mill Valley", "Millbrae", "Milpitas",
    "Mission Viejo", "Modesto", "Monrovia", "Montague", "Montclair", "Monte Sereno", "Montebello", "Monterey",
    "Monterey Park", "Moorpark", "Moraga", "Moreno Valley", "Morgan Hill", "Morro Bay", "Mount Shasta",
    "Mountain House", "Mountain View", "Murrieta", "Napa", "National City", "Needles", "Nevada City", "Newark",
    "Newman", "Newport Beach", "Norco", "Norwalk", "Novato", "Oakdale", "Oakland", "Oakley", "Oceanside",
    "Ojai", "Ontario", "Orange", "Orange Cove", "Orinda", "Orland", "Oroville", "Oxnard", "Pacific Grove",
    "Pacifica", "Palm Desert", "Palm Springs", "Palmdale", "Palo Alto", "Palos Verdes Estates", "Paradise",
    "Paramount", "Parlier", "Pasadena", "Paso Robles", "Patterson", "Perris", "Petaluma", "Pico Rivera",
    "Piedmont", "Pinole", "Pismo Beach", "Pittsburg", "Placentia", "Placerville", "Pleasant Hill", "Pleasanton",
    "Plymouth", "Point Arena", "Pomona", "Port Hueneme", "Porterville", "Portola", "Portola Valley", "Poway",
    "Rancho Cordova", "Rancho Cucamonga", "Rancho Mirage", "Rancho Palos Verdes", "Rancho Santa Margarita",
    "Red Bluff", "Redding", "Redlands", "Redondo Beach", "Redwood City", "Reedley", "Rialto", "Richmond",
    "Ridgecrest", "Rio Dell", "Rio Vista", "Ripon", "Riverbank", "Riverside", "Rocklin", "Rohnert Park",
    "Rolling Hills", "Rolling Hills Estates", "Rosemead", "Roseville", "Ross", "Sacramento", "St. Helena",
    "Salinas", "San Anselmo", "San Bernardino", "San Bruno", "San Carlos", "San Clemente", "San Diego",
    "San Dimas", "San Fernando", "San Francisco", "San Gabriel", "San Jacinto", "San Joaquin", "San Jose",
    "San Juan Bautista", "San Juan Capistrano", "San Leandro", "San Luis Obispo", "San Marcos", "San Marino",
    "San Mateo", "San Pablo", "San Rafael", "San Ramon", "Sand City", "Sanger", "Santa Ana", "Santa Barbara",
    "Santa Clara", "Santa Clarita", "Santa Cruz", "Santa Fe Springs", "Santa Maria", "Santa Monica",
    "Santa Paula", "Santa Rosa", "Santee", "Saratoga", "Sausalito", "Scotts Valley", "Seal Beach", "Seaside"
]


In [21]:
def find_legistar_urls(cities, api_key, cx):
    """
    For each city in the provided list, uses Google Custom Search API to search for 
    'legistar {city} california' and extracts the first URL containing 'legistar.com' 
    from the top 5 search results.
    
    Args:
        cities (list): List of California city names
        api_key (str): Google Custom Search API key
        cx (str): Google Custom Search engine ID
        
    Returns:
        pd.DataFrame: DataFrame with cities and their corresponding Legistar URLs
                     (NaN if no matching URL found)
    """
    results = []
    
    for city in cities:
        query = f"legistar {city} california"
        legistar_url = np.nan
        
        try:
            # Construct the API request URL
            search_url = "https://www.googleapis.com/customsearch/v1"
            params = {
                'q': query,
                'key': api_key,
                'cx': cx,
                'num': 5  # Request 5 results
            }
            
            # Send the request
            response = requests.get(search_url, params=params)
            
            if response.status_code == 200:
                # Parse the JSON response
                search_data = response.json()
                
                # Check if we have search results
                if 'items' in search_data:
                    # Check the top 5 results for 'legistar.com'
                    for item in search_data['items']:
                        if 'link' in item and 'legistar.com' in item['link']:
                            # Extract the domain part
                            parsed_url = urlparse(item['link'])
                            legistar_domain = parsed_url.netloc
                            legistar_url = legistar_domain
                            break
            
            # Add a small delay to avoid hitting rate limits
            time.sleep(0.5)
        except Exception as e:
            print(f"Error searching for {city}: {e}")
        
        results.append({"city": city, "legistar_url": legistar_url})
    
    return pd.DataFrame(results)



In [22]:
API_KEY = os.environ.get('GOOGLE_SEARCH_API_KEY')
cx = os.environ.get('GOOGLE_CUSTOM_SEARCH_ENGINE_NAME')

In [23]:
df = find_legistar_urls(cities, api_key=API_KEY, cx=cx)

Error searching for Hemet: HTTPSConnectionPool(host='www.googleapis.com', port=443): Max retries exceeded with url: /customsearch/v1?q=legistar+Hemet+california&key=AIzaSyByDDlimG3zUG2ZXGENFvtAdAKB_zH47l0&cx=f240fdb24dddf4b77&num=5 (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1129)')))


In [24]:
client_names = df['legistar_url'].str.split('.').str[0]

In [25]:
client_names.values

array(['hesperia', 'santabarbara', 'alameda', 'oakland', 'sanbernardino',
       'octa', nan, 'hayward', 'napa', 'cityoforange', 'sanbernardino',
       'wauwatosacitywi', 'contra-costa', 'sanbernardino',
       'sanbernardino', 'humboldt', 'ci-ssf-ca', 'redondo', 'tol',
       'sonoma-county', 'manteca-ca', 'cityofmerced', 'monterey',
       'carson', 'oakland', 'sbcera', 'solano', 'riversideca',
       'sunnyvaleca', 'sanbernardino', 'temeculaca', 'monterey',
       'longbeach', 'cityofmerced', 'longbeach', 'sfgov', 'solano',
       'actransit', 'newportbeach', 'sanbernardino', 'sanbernardino',
       'cook-county', 'humboldt', 'fresno', 'longbeach', 'fresno',
       'cityoforange', 'contra-costa', 'sanmateocounty', 'santabarbara',
       'octa', 'metro', 'burlingameca', 'culver-city', nan, 'santaclara',
       nan, nan, 'santa-rosa', 'sanmateocounty', 'monterey',
       'riversideca', 'scvwd', 'sdcounty', 'monterey', 'santabarbara',
       'carson', 'culver-city', 'manteca-ca', 'met

In [26]:
client_names.notna().sum()

101

Just fyi, I have not set up billing yet, so we only get the free 100 cities per day. 

In [28]:

def test_sunnyvale_legistar(api_key, cx):
    """
    Unit test to verify that the function returns 'sunnyvaleca.legistar.com'
    for the city of 'Sunnyvale'.
    
    Args:
        api_key (str): Google Custom Search API key
        cx (str): Google Custom Search engine ID
    """
    results = find_legistar_urls(['Sunnyvale'], api_key, cx)
    assert results.loc[0, 'legistar_url'] == 'sunnyvaleca.legistar.com', \
        f"Expected 'sunnyvaleca.legistar.com', got {results.loc[0, 'legistar_url']}"
    print("Test passed!")


In [29]:
test_sunnyvale_legistar(api_key=API_KEY, cx=cx)

Test passed!
