In [75]:
import civicweb_scraper

import requests
from requests_cache import CachedSession

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException

import pandas as pd
import json

import time
from datetime import datetime
import logging
import traceback

import os
from dotenv import load_dotenv
load_dotenv()

True

# Create/Modify Cache

In [4]:
session = civicweb_scraper.create_cache(
    name="test_cache", 
    expire_after=3600*24*14, 
    allowable_codes=[200] # only save successful requests
    )

2024-07-29 01:58:09 - INFO:Getting cache with name test_cache which will expire after 1209600 seconds.


In [24]:
# session = CachedSession(
#     "civicweb_scraper_cache", 
#     expire_after=3600*24*14, 
#     allowable_codes=[200] # only save successful requests
#     )

In [25]:
# uncomment and run line below if you want to scrape more recent data
# civicweb_scraper.clear_cache(name="scraper_cache")
# session.cache.delete_url("")

In [29]:
##### LOGGER SETUP
logging.basicConfig(format='%(asctime)s - %(levelname)s:%(message)s', datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

In [30]:
logger.setLevel(logging.DEBUG)
# logger.setLevel(logging.INFO)

# Get all websites with domain `civicweb.net`

In [6]:
subdomains = []
scraped_links = []

In [7]:
# read csv as a giant dictionary
# get the existing list of scraped subdomains
# read in dataframe

try:
    existing_subdomains_df = pd.read_csv(civicweb_scraper.OUT_FOLDER / "civicweb_subdomains.csv", index_col=[0])

    subdomains_dict = {
        subdomain["subdomain"]: {
            key: subdomain[key] 
            for key in existing_subdomains_df.columns
            if key != "subdomain"
        } for i, subdomain in existing_subdomains_df.iterrows()
    }
except FileNotFoundError:
    existing_subdomains_df = pd.DataFrame()
    subdomains_dict = {}

In [8]:
subdomains_dict

{'victoria': {'root_url': 'https://victoria.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
  'error': nan},
 'tay': {'root_url': 'https://tay.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
  'error': nan},
 'terrace': {'root_url': 'https://terrace.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
  'error': nan},
 'cityofshawnee': {'root_url': 'https://cityofshawnee.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
  'error': nan},
 'loyalist': {'root_url': 'https://loyalist.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
  'error': nan},
 'wetaskiwin': {'root_url': 'https://wetaskiwin.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
  'error': nan},
 'mclendon-chisholm': {'root_url': 'https://mclendon-chisholm.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
  'error': nan},
 'revelstoke': {'root_url': 'https://revelstoke.civicweb.net',
  'bing_retrieval_date': '2024-07-27 16:39:23',
 

## Using Google Search API

In [11]:
# create a custom google search engine by following https://developers.google.com/custom-search/docs/tutorial/creatingcse
GOOGLE_SEARCH_ENDPOINT = "https://www.googleapis.com/customsearch/v1?"
GOOGLE_API_KEY = os.getenv('GOOGLE_API')
GOOGLE_SEARCH_ENGINE_ID = os.getenv('GOOGLE_SEARCH_ENGINE_ID')

google_params = {
    "key": GOOGLE_API_KEY, 
    "cx":GOOGLE_SEARCH_ENGINE_ID,
    "q":"site:civicweb.net",
    "num":10,
    "gl":"ca"
}

response = session.get(GOOGLE_SEARCH_ENDPOINT, params=google_params)

In [12]:
response.raise_for_status()
search_results = response.json()

In [13]:
search_results

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': 'Google Custom Search - site:civicweb.net',
    'totalResults': '18000',
    'searchTerms': 'site:civicweb.net',
    'count': 10,
    'startIndex': 1,
    'inputEncoding': 'utf8',
    'outputEncoding': 'utf8',
    'safe':

In [22]:
google_scrape_results = []

google_scrape_results.extend([
    {key:item[key] for key in ['title', 'link', 'snippet']} 
    for item in search_results["items"]])

google_scrape_results

[{'title': 'Diligent Community',
  'link': 'https://civicweb.net/',
  'snippet': 'Modernize governance management with Diligent Community. Diligent Community is the next-generation board management solution designed to help public and elected\xa0...'},
 {'title': 'Login DominoQQ Link Alternatif Bermain Domino QQ Online ...',
  'link': 'https://testus.civicweb.net/dominoqq/',
  'snippet': 'Nikmati kemudahan dan kepercayaan dalam bermain DominoQQ dengan menggunakan link alternatif yang disediakan. Proses login yang cepat dan aman memungkinkan\xa0...'},
 {'title': 'City of Winchester - Home',
  'link': 'https://winchesterva.civicweb.net/',
  'snippet': 'WELCOME. Welcome to our Agenda and Minutes Portal. This portal provides easy access to meeting agendas, minutes, videos (or audio) and more for City Council and\xa0...'},
 {'title': 'City of Indio - Home',
  'link': 'https://indio.civicweb.net/',
  'snippet': 'City of Indio. We are pleased to introduce this user-friendly site providing acc

In [81]:
# load existing subdomain information if available
try:
    google_scrape_results = json.load(open(f"{civicweb_scraper.OUT_FOLDER}/subdomains.json"))
except:
    google_scrape_results = {}

In [71]:
start_page = 0
max_page = 3
num_results_per_page = 10
for page_index in range(start_page, max_page):
    # time.sleep(1) # wait for 1 second between each search
    start_index = page_index*10+1
    # search with query and result page
    logger.info(f"Scraping page {page_index} with results from {start_index} to {start_index+num_results_per_page-1}...")

    google_params = {
        "key": GOOGLE_API_KEY, 
        "cx":GOOGLE_SEARCH_ENGINE_ID,
        "q":"site:civicweb.net",
        "num":num_results_per_page,
        'start': start_index,
        "gl":"ca"
    }

    response = session.get(GOOGLE_SEARCH_ENDPOINT, params=google_params)
    # response = requests.get(GOOGLE_SEARCH_ENDPOINT, params=google_params)
    response.raise_for_status()
    search_results = response.json()

    for item in search_results["items"]:
        logger.debug(f"Looking at {item['link']}")
        if ".civicweb.net" in item["link"]:
            subdomain = item["link"].split(".civicweb.net")[0].split("https://")[-1]
            if subdomain in google_scrape_results:
                logger.debug(f"Already seen {subdomain}")
                continue
            else:
                logger.debug(f"Adding {subdomain} to results list")
                google_scrape_results[subdomain] ={
                    "root_url": f"https://{subdomain}.civicweb.net",
                    "google_search_url": item["link"],
                    "title": item["title"],
                    "description": item["snippet"]}
                
            
    
    # google_scrape_results.extend([
    #     {key:item[key] for key in ['title', 'link','snippet']} 
    #     for item in search_results["items"]])

# google_scrape_results
        

2024-07-29 03:41:10 - INFO:Scraping page 0 with results from 1 to 10...
2024-07-29 03:41:10 - DEBUG:Looking at https://civicweb.net/
2024-07-29 03:41:10 - DEBUG:Looking at https://winchesterva.civicweb.net/
2024-07-29 03:41:10 - DEBUG:Adding winchesterva to results list
2024-07-29 03:41:10 - DEBUG:Looking at https://indio.civicweb.net/
2024-07-29 03:41:10 - DEBUG:Adding indio to results list
2024-07-29 03:41:10 - DEBUG:Looking at https://missouricitytx.civicweb.net/
2024-07-29 03:41:10 - DEBUG:Adding missouricitytx to results list
2024-07-29 03:41:10 - DEBUG:Looking at https://nngov.civicweb.net/
2024-07-29 03:41:10 - DEBUG:Adding nngov to results list
2024-07-29 03:41:10 - DEBUG:Looking at https://coshawanowi.civicweb.net/
2024-07-29 03:41:10 - DEBUG:Adding coshawanowi to results list
2024-07-29 03:41:10 - DEBUG:Looking at https://dev.civicweb.net/
2024-07-29 03:41:10 - DEBUG:Adding dev to results list
2024-07-29 03:41:10 - DEBUG:Looking at https://rcocr.civicweb.net/
2024-07-29 03:41

In [72]:
google_scrape_results

{'winchesterva': {'root_url': 'https://winchesterva.civicweb.net',
  'google_search_url': 'https://winchesterva.civicweb.net/',
  'title': 'City of Winchester - Home',
  'description': 'WELCOME. Welcome to our Agenda and Minutes Portal. This portal provides easy access to meeting agendas, minutes, videos (or audio) and more for City Council and\xa0...'},
 'indio': {'root_url': 'https://indio.civicweb.net',
  'google_search_url': 'https://indio.civicweb.net/',
  'title': 'City of Indio - Home',
  'description': 'City of Indio. We are pleased to introduce this user-friendly site providing access to information about the City Council and City Commissions. You will find\xa0...'},
 'missouricitytx': {'root_url': 'https://missouricitytx.civicweb.net',
  'google_search_url': 'https://missouricitytx.civicweb.net/',
  'title': 'Missouri City, Texas - Home',
  'description': 'We pride ourselves on being accessible to all our citizens, and providing information to people as easily as possible. Th

In [74]:
google_scrape_results.keys()

dict_keys(['winchesterva', 'indio', 'missouricitytx', 'nngov', 'coshawanowi', 'dev', 'rcocr', 'sbcag', 'lakeway-tx', 'patchtestus.dev', 'victoriatx', 'issaquah', 'harrisonhotsprings', 'opkansas', 'douglasville', 'bellairetx'])

In [73]:
logger.info(f"Found {len(google_scrape_results)} unique subdomains from the Google Search API.")

2024-07-29 03:42:36 - INFO:Found 16 unique subdomains from the Google Search API.


In [78]:
# save the subdomains
with open(civicweb_scraper.OUT_FOLDER / f"subdomains.json", "w") as f:
    json.dump(google_scrape_results, f, indent=4)

# Scrape documents from each subdomain

In [None]:
for subdomain in google_scrape_results.keys()[0]:
    # breadth-first search to find all subfolders and documents
    folders = deque([]) 
    done_folders = deque([])
    documents = []
    
    # add the folders at the root to be processed
    try:
        response = session.get(url=root_url+"/filepro/documents/")
        response.raise_for_status()
        
        folders.extend(civicweb_scraper.get_items(response, parent_url="/filepro/documents/",parent=[], is_folder=True))
    except Exception as e:
        logger.error(e)
        continue

    while len(folders)>0:
        time.sleep(1)
        logger.debug("folders to visit: %s", "\n".join([str(folder) for folder in folders]))
        logger.debug("completed folders: %s", "\n".join([str(folder) for folder in done_folders]))
        
        curr_folder = folders.popleft()
        logger.info(f"\nSearching folder {curr_folder['name']} at location /{'/'.join(curr_folder['parent'])}...") 
        try:
            response = session.get(url=root_url+curr_folder["url"])

            # update current folder
            curr_path = curr_folder["parent"].copy()
            curr_path.append(curr_folder["name"])
            logger.debug("currrent folder's parents:%s", curr_folder["parent"])
            logger.debug("current folder's name:%s", curr_folder["name"])
            logger.debug("parent path to add to children folders/documents: %s", curr_path)
            
            # add subfolders to visit from this folder to the folders deque
            # initial_folders = len(folders)
            children_folders = civicweb_scraper.get_items(response, parent_url=curr_folder["url"], parent=curr_path, is_folder=True)
            folders.extend(children_folders)

            # add documents to download from this folder to the documents list
            # initial_documents = len(documents)
            children_documents = civicweb_scraper.get_items(response, parent_url=curr_folder["url"], parent=curr_path, is_folder=False)
            documents.extend(children_documents)
            
            logger.info(f"Found {len(children_folders)} folders and {len(children_documents)} documents at this location.")
        except Exception as e:
            tb_str = ''.join(traceback.format_exception(e))
            logger.error(tb_str)
            continue

        done_folders.append(curr_folder)
        curr_path = []
        
        logger.info(f"Completed folder {curr_folder['name']}\n")

In [54]:
# def get_cache_creation_time(url, session:CachedSession):
#     response = session.get(url=url)
#     return response.created_at

## Using Selenium

In [49]:
bing_scrape_results = []

In [50]:
driver = webdriver.Firefox()

max_page_number = 10 # max number of pages to scrape
num_results_per_page = 10 # default number

for page_index in range(max_page_number):
    time.sleep(1) # wait for 1 second between each search

    # search with query and result page
    start_index = 1+page_index*num_results_per_page
    logger.info(f"Scraping page {page_index+1} with results from {start_index} to {start_index+num_results_per_page-1}...")

    bing_url = f'https://www.bing.com/search?q=site%3acivicweb.net&first={start_index}' 

    # get url of each page result
    try:
        driver.get(bing_url)
        # titles = driver.find_elements(by=By.CLASS_NAME, value="tptt")
        links = driver.find_elements(by=By.TAG_NAME, value="cite")
        # snippets = driver.find_elements(by=By.CLASS_NAME, value="b_lineclamp4")

        scraped_links.append(links.text)

        # scraped_links.extend([
        #     {
        #         "title": title.text, 
        #         "link": link.text,
        #         "snippet": snippet.text,
        #     } for (title, link, snippet) in zip(titles, links, snippets)
        # ])
    except WebDriverException as e:
        logger.error("Error opening Bing page for scraping: ", e)
        continue
    finally:
        driver.quit()
    
logger.info(f"Bing scraping finished. Found {len(scraped_links)} links.")

2024-07-29 03:08:26 - INFO:Scraping page 1 with results from 1 to 10...
2024-07-29 03:08:31 - INFO:Scraping page 2 with results from 11 to 20...


MaxRetryError: HTTPConnectionPool(host='localhost', port=55939): Max retries exceeded with url: /session/b081662c-64b5-4ae1-bc6f-03d9ad446abc/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1692bd540>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [None]:
subdomain_rows = [] # for the dataframe
subdomains_with_documents = [] # valid site to be scraped

for subdomain in subdomain_list:
    root_url = f"https://{subdomain}.civicweb.net"
    error = ""
    # subdomain = root_url.split("https://")[-1].split(".civicweb.net")[0]
    # if subdomain returns code 200 and text is not empty, add to list of sites to scrape
    try:
        response = civicweb_scraper.fetch_webpage(url=root_url+"/filepro/documents/") # get the url of the documents hub
        logger.debug(f"{root_url} has a documents page, adding to valid subdomains list")
        subdomains_with_documents.append(subdomain)
    except requests.HTTPError as e: 
        # error from requests module for url
        tb_str = ''.join(traceback.format_exception(e))
        logger.error(f"in {root_url}: {tb_str}")
        error += str(f"{response.status_code}: {e}")
        continue
    except TypeError as e: # error from requests-cache 
        tb_str = ''.join(traceback.format_exception(e))
        logger.error(f"in {root_url}: {tb_str}")
        error += str(f"{response.status_code}: {e}")
        continue
    except Exception as e:
        tb_str = ''.join(traceback.format_exception(e))
        logger.error(f"in {root_url}: {tb_str}")
        error += str(e)
        continue
    finally:
        subdomain_info = {
            "subdomain": subdomain, 
            "root_url": root_url, 
            "bing_retrieval_date": "",
            "'error'": error,
        }
        subdomain_rows.append(subdomain_info)

In [None]:
bing_scrape_results

In [None]:
# test scraped results without running selenium
scraped_links = ['https://severn.civicweb.net', 'https://tucumcari.civicweb.net', 'https://williamsnd.civicweb.net', 'https://cocookmn.civicweb.net', 'https://greatermadawaska.civicweb.net', 'https://otonabeesouthmonaghan.civicweb.net', 'https://centrewellington.civicweb.net', 'https://google.com']

In [None]:
overwrite_cache = False

In [None]:
civicweb_scraper.remove_url_from_cache(session, url="google.com")

In [None]:
# check if the website is valid subdomain of civicweb.net
for link in scraped_links:
    if '.civicweb.net' in link:
        # get basic site information
        subdomain = link.split(".civicweb.net")[0].split("https://")[-1]
        root_url = f"https://{subdomain}.civicweb.net"
        documents_url = root_url+"/filepro/documents/"
        error = ""

        if overwrite_cache is True:
            pass
        else:
            pass
        
        
        try:
            # response = civicweb_scraper.fetch_webpage(url=documents_url) 
            if session.cache.has_url(documents_url):
                

            response = civicweb_scraper.fetch_webpage_with_cache(url=documents_url, session=session) 
            # if subdomain in subdomains_dict:
            #     # update existing subdomain with new scraping details
            #     logger.debug(f"{documents_url} has been retrieved before, adding to valid subdomains list")

        except requests.exceptions.ConnectionError as e:
            logger.error(f"Check your connection. Unable to fetch {documents_url}: {e}")
            error += str(e)
            continue

        finally:
            # update subdomains_dict with new scraping details
            if subdomain in subdomains_dict:
                subdomains_dict[subdomain].update(
                    {
                        "bing_retrieval_date": datetime.now().strftime("%m/%d/%Y %H:%M:%S"),"error" : str(error), 
                        "cached_date": datetime.now().strftime("%m/%d/%Y %H:%M:%S")
                    })
            else:
                # add new subdomain with scraping details to the dictionary
                subdomains_dict[subdomain] = {
                    "root_url": root_url,
                    "bing_retrieval_date": datetime.now().strftime("%m/%d/%Y %H:%M:%S"),
                    "error": str(error),
                    "cached_date": datetime.now().strftime("%m/%d/%Y %H:%M:%S")
                }
        

        # update with new details
        # if subdomain in subdomains_dict:
        #     subdomains[subdomain]["bing_retrieval_date"] = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        #     subdomains[subdomain]["error"] = e
        # else:
        #     subdomains[subdomain] = {
        #         "root_url": root_url,
        #         "bing_retrieval_date": datetime.now().strftime("%m/%d/%Y %H:%M:%S"),
        #         "error": error
        #     }
        
        #     subdomains_with_documents.append(subdomain)
        #     scraped_links.append(subdomain)
        

In [None]:
scraped_links

## Filter for subdomains with a documents portal

In [None]:
subdomain_rows = [] # for the dataframe
subdomains_with_documents = [] # valid site to be scraped

for subdomain in subdomain_list:
    root_url = f"https://{subdomain}.civicweb.net"
    error = ""
    # subdomain = root_url.split("https://")[-1].split(".civicweb.net")[0]
    # if subdomain returns code 200 and text is not empty, add to list of sites to scrape
    try:
        response = civicweb_scraper.fetch_webpage(url=root_url+"/filepro/documents/") # get the url of the documents hub
        logger.debug(f"{root_url} has a documents page, adding to valid subdomains list")
        subdomains_with_documents.append(subdomain)
    except requests.HTTPError as e: 
        # error from requests module for url
        tb_str = ''.join(traceback.format_exception(e))
        logger.error(f"in {root_url}: {tb_str}")
        error += str(f"{response.status_code}: {e}")
        continue
    except TypeError as e: # error from requests-cache 
        tb_str = ''.join(traceback.format_exception(e))
        logger.error(f"in {root_url}: {tb_str}")
        error += str(f"{response.status_code}: {e}")
        continue
    except Exception as e:
        tb_str = ''.join(traceback.format_exception(e))
        logger.error(f"in {root_url}: {tb_str}")
        error += str(e)
        continue
    finally:
        subdomain_info = {
            "subdomain": subdomain, 
            "root_url": root_url, 
            "bing_retrieval_date": "",
            "'error'": error,
        }
        subdomain_rows.append(subdomain_info)

In [None]:
subdomain_rows

# Track valid subdomains

In [None]:
# get the existing list of scraped subdomains
try:
    existing_subdomains_df = pd.read_csv(civicweb_scraper.OUT_FOLDER / "scraped_subdomains new.csv", index_col=[0])
except FileNotFoundError:
    existing_subdomains_df = pd.DataFrame()

In [None]:
new_subdomains_df = pd.DataFrame(subdomain_rows)
new_subdomains_df

In [None]:
old_subdomains_df = old_subdomains_df.reset_index(drop=True)
new_subdomains_df = new_subdomains_df.reset_index(drop=True)

updated_subdomains_df = pd.concat([old_subdomains_df, new_subdomains_df], ignore_index=True)

In [None]:
updated_subdomains_df

In [None]:
updated_subdomains_df.to_csv(civicweb_scraper.OUT_FOLDER / "scraped_subdomains_test_updated.csv")