In [None]:
import civicweb_scraper

import requests
from requests_cache import CachedSession

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException

import logging
import os
import json
from pprint import pprint
import time

In [None]:
# Create the notebook's logger through the project helper and set its level.
# INFO is the default here; change the level to logging.DEBUG to also see the
# logger.debug(...) messages emitted by the scraping cells below.
logger = civicweb_scraper.logger_setup()
logger.setLevel(logging.INFO)

# Information about the final subdomains.json file
To track the results from each website under the "civicweb.net" domain, we keep a dictionary of dictionaries called `subdomains_dict`. Each key of `subdomains_dict` is a website's subdomain, and `subdomains_dict[subdomain]` is a dictionary holding the information collected about that website. The only MANDATORY key within each inner dictionary is `"root_url"` (i.e. `subdomains_dict[subdomain]["root_url"]`), which should hold the stripped root URL of the site, `https://<subdomain>.civicweb.net`.

Make sure to clear all outputs before saving to GitHub!

# Using Google Search API

In [None]:
# Load existing subdomain information if available, so that new search results
# are merged into what previous runs already found.
subdomains_path = civicweb_scraper.OUT_FOLDER / "subdomains.json"
try:
    # The context manager closes the file handle even if parsing fails.
    with open(subdomains_path) as subdomains_file:
        google_scrape_results = json.load(subdomains_file)
    logger.info("Loaded in existing subdomains.json file.")
except FileNotFoundError:
    google_scrape_results = {}
    logger.info(f"No existing subdomains.json file found in the {civicweb_scraper.OUT_FOLDER} folder.")
except json.JSONDecodeError as e:
    # Still best-effort (start empty), but say what actually happened instead
    # of reporting a corrupt file as "not found".
    google_scrape_results = {}
    logger.warning(f"subdomains.json exists but is not valid JSON ({e}); starting from an empty dictionary.")

In [None]:
# Cached HTTP session created by the project helper.
# expire_after is 14 days expressed in seconds; allowable_codes restricts
# caching to successful (HTTP 200) responses only.
session = civicweb_scraper.create_cache(
    name="test_cache",
    expire_after=14 * 24 * 60 * 60,
    allowable_codes=[200],
)

# Setting constants

In [None]:
# Google Custom Search configuration.
# Create a custom Google search engine by following
# https://developers.google.com/custom-search/docs/tutorial/creatingcse
GOOGLE_SEARCH_ENDPOINT = "https://www.googleapis.com/customsearch/v1?"
# Credentials come from the environment so they are never stored in the notebook.
GOOGLE_API_KEY = os.getenv("GOOGLE_API")
GOOGLE_SEARCH_ENGINE_ID = os.getenv("GOOGLE_SEARCH_ENGINE_ID")

# os.getenv returns None for an unset variable, and requests silently omits
# None-valued query parameters, so a missing credential would otherwise only
# surface later as an opaque HTTP error. Warn about it up front instead.
for _env_name, _env_value in (
    ("GOOGLE_API", GOOGLE_API_KEY),
    ("GOOGLE_SEARCH_ENGINE_ID", GOOGLE_SEARCH_ENGINE_ID),
):
    if not _env_value:
        logger.warning(f"Environment variable {_env_name} is not set; Google search requests will fail.")

In [None]:
# Page through the Google Custom Search results for "site:civicweb.net" and
# record every subdomain that is not already present in google_scrape_results.
start_page = 0
max_page = 20
num_results_per_page = 10

# The Custom Search JSON API never returns more than 100 results for a query,
# so requests for later pages are guaranteed to fail. Cap the page range
# rather than spending quota on them.
GOOGLE_MAX_RESULTS = 100
last_page = min(max_page, GOOGLE_MAX_RESULTS // num_results_per_page)

CIVICWEB_SUFFIX = ".civicweb.net"

for page_index in range(start_page, last_page):
    time.sleep(1) # wait for 1 second between each search
    # Google's `start` parameter is 1-based.
    start_index = page_index * num_results_per_page + 1
    logger.info(f"Scraping page {page_index} with results from {start_index} to {start_index+num_results_per_page-1}...")

    google_params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_SEARCH_ENGINE_ID,
        "q": "site:civicweb.net",
        "num": num_results_per_page,
        "start": start_index,
        "gl": "ca",
    }

    # NOTE: the exception text is deliberately not logged. requests embeds the
    # full request URL in it, and that URL carries the API key as a parameter.
    try:
        response = session.get(GOOGLE_SEARCH_ENDPOINT, params=google_params)
        response.raise_for_status()
        search_results = response.json()
    except requests.exceptions.HTTPError:
        logger.error(f"Google search for page {page_index} returned HTTP {response.status_code}")
        continue
    except (requests.exceptions.RequestException, ValueError) as e:
        # Connection problems, timeouts, or a body that is not valid JSON.
        logger.error(f"Google search for page {page_index} failed: {type(e).__name__}")
        continue

    # A successful response without an "items" key means no (more) results.
    items = search_results.get("items", [])
    if not items:
        logger.info(f"No results returned for page {page_index}; stopping the search.")
        break

    for item in items:
        link = item["link"]
        logger.debug(f"Looking at {link}")

        # Match on the host only: this accepts http:// as well as https://
        # links and ignores ".civicweb.net" occurring in a path or query.
        host = link.split("://", 1)[-1].split("/", 1)[0].lower()
        if not host.endswith(CIVICWEB_SUFFIX):
            continue
        subdomain = host[: -len(CIVICWEB_SUFFIX)]

        if subdomain in google_scrape_results:
            logger.debug(f"Already seen {subdomain}")
            continue

        logger.info(f"Adding {subdomain} to results list")
        google_scrape_results[subdomain] = {
            "root_url": f"https://{subdomain}.civicweb.net",
            "google_search_url": link,
            # Optional fields: fall back to "" rather than raising KeyError.
            "title": item.get("title", ""),
            "description": item.get("snippet", ""),
        }

logger.info(f"Found {len(google_scrape_results)} unique subdomains from the Google Search API.")

In [None]:
# Inspect the subdomain records accumulated so far (existing file + Google results).
pprint(google_scrape_results)

In [None]:
# Write the combined subdomain records back to subdomains.json.
google_output_path = civicweb_scraper.OUT_FOLDER / "subdomains.json"
with open(google_output_path, "w") as out_file:
    json.dump(google_scrape_results, out_file, indent=4)

In [None]:
# Reload subdomains.json and work out which subdomains still need scraping.
subdomains_path = civicweb_scraper.OUT_FOLDER / "subdomains.json"
try:
    # The context manager closes the file handle even if parsing fails.
    with open(subdomains_path) as subdomains_file:
        subdomains_dict = json.load(subdomains_file)
    logger.info("Loaded in existing subdomains.json file.")
except FileNotFoundError:
    subdomains_dict = {}
    logger.info(f"No existing subdomains.json file found in the {civicweb_scraper.OUT_FOLDER} folder.")
except json.JSONDecodeError as e:
    # Still best-effort (start empty), but report the real cause.
    subdomains_dict = {}
    logger.warning(f"subdomains.json exists but is not valid JSON ({e}); starting from an empty dictionary.")

# A subdomain is pending when its "complete" flag is missing or falsy
# (false / null in the JSON file).
subdomains_to_scrape = [
    subdomain
    for subdomain, info in subdomains_dict.items()
    if not info.get("complete")
]

subdomains_to_scrape

# Using Selenium with Bing

In [None]:
# Collect the text of every <cite> element from the first pages of Bing
# results for "site:civicweb.net".
bing_links = []

max_page_number = 10 # max number of pages to scrape
num_results_per_page = 10 # default number

driver = webdriver.Firefox()
try:
    for page_index in range(max_page_number):
        # Bing's `first` parameter is the 1-based index of the first result.
        start_index = 1 + page_index * num_results_per_page

        logger.info(f"Scraping page {page_index+1} with results from {start_index} to {start_index+num_results_per_page-1}...")

        bing_url = f'https://www.bing.com/search?q=site%3acivicweb.net&first={start_index}'

        # get url of each page result
        try:
            driver.get(bing_url)
            links = driver.find_elements(by=By.TAG_NAME, value="cite")
            logger.debug(links)
            bing_links.extend([link.text for link in links])
        except WebDriverException as e:
            # Skip this page but keep going. The message is an f-string because
            # passing str(e) as an extra positional argument without a %s
            # placeholder makes the logging call itself fail.
            logger.error(f"Error opening Bing page for scraping: {e}")
finally:
    # Close the browser exactly once, after all pages have been visited.
    # Quitting inside the loop would kill the session after the first page.
    driver.quit()

logger.info(f"Bing scraping finished. Found {len(bing_links)} links.")

In [None]:
# Display the raw <cite> texts collected from Bing.
bing_links

In [None]:
# Hard-coded snapshot of the <cite> texts gathered by previous Bing scrapes.
# NOTE(review): this assignment replaces whatever the Selenium cell collected
# into `bing_links`; skip this cell to work with freshly scraped links instead.
# Entries are kept exactly as scraped: some lack the "https://" scheme, some
# are truncated by Bing ("..." / "…"), and several point to unrelated domains.
bing_links = ['https://victoria.civicweb.net/Portal/Welcome.aspx',
 'governmentjobs.com',
 'tender.victoria.ca',
 'melbourneflorida.org',
 'victoriatx.gov',
 'ci.victoria.mn.us',
 'https://victoria.civicweb.net/Portal',
 'victoria.civicweb.net',
 'victoria.civicweb.net',
 'https://tay.civicweb.net/Portal',
 'https://terrace.civicweb.net/Portal/Default.aspx',
 'https://cityofshawnee.civicweb.net/portal',
 'https://loyalist.civicweb.net/portal',
 'https://victoria.civicweb.net/user/signin',
 'https://wetaskiwin.civicweb.net/Portal/Default.aspx',
 'https://mclendon-chisholm.civicweb.net/Portal',
 'https://revelstoke.civicweb.net/portal',
 'https://victoria.civicweb.net/Portal',
 'https://victoria.civicweb.net/Portal/Welcome.aspx',
 'https://kamloops.civicweb.net/Portal/Default.aspx',
 'https://cityofjerseycity.civicweb.net/Portal',
 'https://wifn.civicweb.net/portal',
 'https://nngov.civicweb.net/portal/members.aspx?id=10',
 'https://centralelgin.civicweb.net',
 'https://revelstoke.civicweb.net/portal',
 'https://millcreek.civicweb.net/Portal',
 'https://hartford.civicweb.net/Portal',
 'https://otonabeesouthmonaghan.civicweb.net/Portal',
 'https://greatermadawaska.civicweb.net/Portal/Welcome.aspx',
 'https://springwater.civicweb.net/Portal',
 'https://westnewyorknj.civicweb.net/Portal/Welcome.aspx',
 'https://centralelgin.civicweb.net',
 'https://lewes.civicweb.net/Portal',
 'https://cityofshawnee.civicweb.net/portal',
 'https://opkansas.civicweb.net/Portal',
 'https://nngov.civicweb.net/portal/members.aspx?id=10',
 'https://orillia.civicweb.net/Portal/Default.aspx',
 'https://sequimwa.civicweb.net',
 'https://loyalist.civicweb.net/portal',
 'https://lawrenceks.civicweb.net/portal/members.aspx?id=10',
 'https://powellriver.civicweb.net/Portal/MeetingSchedule.aspx',
 'https://mclendon-chisholm.civicweb.net/Portal',
 'https://sammamishwa.civicweb.net/Portal',
 'https://peoriagov.civicweb.net/Portal/Video.aspx',
 'https://marmoraandlake.civicweb.net/portal',
 'https://cityofalice.civicweb.net/Portal/Default.aspx',
 'https://wifn.civicweb.net/Portal/MeetingTypeList.aspx',
 'https://cityofjerseycity.civicweb.net/Portal',
 'https://countygp.civicweb.net/Portal',
 'https://selkirk.civicweb.net/Portal',
 'https://southfrontenac.civicweb.net/Portal',
 'https://cloquet.civicweb.net',
 'https://pinecitygovoffice.civicweb.net/Portal',
 'https://wifn.civicweb.net/portal',
 'https://sphosp.civicweb.net/user',
 'https://centrewellington.civicweb.net/Portal',
 'https://dallascounty.civicweb.net/Portal/VirtualLibrary.aspx',
 'https://highriver.civicweb.net/filepro/document/48474/2024- spring-summer_ community...',
 'https://citwentynine-palmsca.civicweb.net/portal',
 'https://dallascounty.civicweb.net/portal',
 'https://osoyoos.civicweb.net/filepro/documents/135633',
 'https://ramara.civicweb.net/portal',
 'https://terrace.civicweb.net/portal/members.aspx?id=11',
 'https://petrolia.civicweb.net/Portal',
 'https://timmins.civicweb.net/document/167053/ADM-2024-07-09-Admin Report-Housing …',
 'https://williamsnd.civicweb.net/Portal',
 'https://lacenter.civicweb.net/Portal',
 'https://lawrenceks.civicweb.net/Portal/MeetingInformation.aspx?Org=Cal&Id=5599',
 'https://codb.civicweb.net/Portal',
 'https://woonsocketri.civicweb.net/portal',
 'https://revelstoke.civicweb.net/portal',
 'https://powellriver.civicweb.net/Portal',
 'https://covinaca.civicweb.net/Portal',
 'https://stlouisco.civicweb.net/portal/members.aspx?id=10',
 'https://cityofholland.civicweb.net/Portal',
 'https://tay.civicweb.net/Portal',
 'https://cityofalice.civicweb.net/user',
 'https://mindenhills.civicweb.net/Portal/MeetingTypeList.aspx',
 'https://waverly.civicweb.net/Portal',
 'https://englewoodgov.civicweb.net/Portal',
 'https://winnipegsdca.civicweb.net/Portal/Welcome.aspx',
 'https://washingtoncounty.civicweb.net/portal/members.aspx?id=10',
 'https://timmins.civicweb.net/document/165147/PLN-2024-06-18-Admin Report-Tiny Hom…',
 'https://hearst.civicweb.net/portal',
 'https://kamloops.civicweb.net/portal',
 'https://cityofbelmont.civicweb.net/portal',
 'https://severn.civicweb.net/Portal/MeetingTypeList.aspx',
 'https://lillooettribalcouncil.civicweb.net',
 'https://stonemills.civicweb.net/portal',
 'https://muskoka.civicweb.net/filepro/documents',
 'https://codb.civicweb.net/Portal/Default.aspx',
 'https://cityofrehoboth.civicweb.net/portal',
 'https://walton.civicweb.net/Portal/MeetingSchedule.aspx',
 'https://peachland.civicweb.net/filepro/documents/89371/?preview=89372',
 'https://honeybrooktwp.civicweb.net/user/signin',
 'https://cocookmn.civicweb.net/Portal',
 'https://delta.civicweb.net/filepro/documents/224970',
 'https://terrace.civicweb.net/Portal/Default.aspx',
 'https://ponoka.civicweb.net/filepro/documents',
 'https://kamloops.civicweb.net/Portal/MeetingSchedule.aspx',
 'https://powellriver.civicweb.net/Portal/MeetingTypeList.aspx',
 'https://cityofrehoboth.civicweb.net/filepro/documents',
 'https://victoria.civicweb.net/Portal/Welcome.aspx',
 'https://oakbay.civicweb.net/portal',
 'https://hemetca.civicweb.net',
 'https://victoria.civicweb.net/Portal',
 'https://nr.civicweb.net/Portal',
 'https://voluntown.civicweb.net/Portal/Default.aspx',
 'https://strathmore.civicweb.net',
 'https://cityofbowietx.civicweb.net',
 'https://stlouisco.civicweb.net/Portal/Welcome.aspx',
 'https://delavan.civicweb.net/portal',
 'https://covinaca.civicweb.net/Portal/Welcome.aspx',
 'https://indio.civicweb.net/Portal',
 'https://camrose.civicweb.net/Portal/Default.aspx',
 'https://hartford.civicweb.net/Portal',
 'https://bracebridge.civicweb.net/document/15808']

In [None]:
def find_subdomain(url):
    """Return the civicweb.net subdomain named by a scraped link.

    Parameters
    ----------
    url : str
        Link text as scraped, with or without a scheme, e.g.
        "https://victoria.civicweb.net/Portal" or "victoria.civicweb.net".

    Returns
    -------
    str or None
        The host part in front of ".civicweb.net" ("victoria" for both
        examples above), or None when the link's host is not under
        civicweb.net.
    """
    suffix = ".civicweb.net"
    # Drop any scheme (http://, https://, or none) and everything after the
    # host, so that only the host name is matched against the suffix.
    host = url.split("://", 1)[-1].split("/", 1)[0].lower()
    if not host.endswith(suffix):
        return None
    return host[: -len(suffix)]


# Build one record per subdomain. When several links share a subdomain, the
# last one in bing_links provides "bing_search_url".
bing_scrape_results = {}
for url in bing_links:
    subdomain = find_subdomain(url)
    if subdomain is None:
        # Not a civicweb.net site (e.g. an unrelated domain Bing returned).
        continue
    bing_scrape_results[subdomain] = {
        "root_url": f"https://{subdomain}.civicweb.net",
        "bing_search_url": url,
    }

In [None]:
# Number of distinct civicweb.net subdomains found through Bing.
len(bing_scrape_results)

In [None]:
# Inspect the per-subdomain records built from the Bing links.
pprint(bing_scrape_results)

# Merge the links found from Bing into the existing subdomains.json file

In [None]:
# Reload the current subdomains.json so the Bing results can be merged into it.
# The context manager closes the file handle as soon as the JSON is parsed.
# A missing file is intentionally not handled here: the merge needs it.
with open(civicweb_scraper.OUT_FOLDER / "subdomains.json") as subdomains_file:
    subdomains_dict = json.load(subdomains_file)
logger.info("Loaded in existing subdomains.json file.")

In [None]:
# Number of subdomains found through Bing that subdomains.json does not contain yet.
len(set(bing_scrape_results.keys()) - set(subdomains_dict.keys()))

In [None]:
# Keep only the Bing records whose subdomain is not already in subdomains.json.
# Filtering the dict directly (instead of iterating a set difference) keeps
# the entries in a stable order, so the JSON file written afterwards does not
# get reshuffled from one run to the next.
additional_bing_scrapes = {
    subdomain: info
    for subdomain, info in bing_scrape_results.items()
    if subdomain not in subdomains_dict
}

In [None]:
# Start from a copy of the existing records, then add the Bing-only entries.
out_scrape_info = dict(subdomains_dict)
out_scrape_info.update(additional_bing_scrapes)

In [None]:
# Inspect the merged records before writing them to disk.
pprint(out_scrape_info)

In [None]:
# Total number of subdomains after merging the existing file with the Bing results.
len(out_scrape_info)

In [None]:
# Overwrite subdomains.json with the merged (existing + Bing) records.
merged_output_path = civicweb_scraper.OUT_FOLDER / "subdomains.json"
with open(merged_output_path, "w") as out_file:
    json.dump(out_scrape_info, out_file, indent=4)