In [None]:
import civicweb_scraper

import requests
from requests_cache import CachedSession

from collections import deque

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException

import pandas as pd
import json
from csv import DictReader

import time
from datetime import datetime
import logging
import traceback
from tqdm import tqdm

import os
from dotenv import load_dotenv
load_dotenv()

# Create/Modify Cache

In [None]:
# Build the cached HTTP session used for every request in this notebook.
# expire_after is 14 days expressed in seconds (presumably forwarded to
# requests-cache by the helper — confirm in civicweb_scraper.create_cache).
session = civicweb_scraper.create_cache(
    name="test_cache",
    expire_after=14 * 24 * 3600,
    allowable_codes=[200],  # only save successful requests
)

In [None]:
# Logger setup: every record is rendered as "<timestamp> - <LEVEL>:<message>".
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s:%(message)s',
)
# Notebook-wide logger; its level is set in the next cell.
logger = logging.getLogger(__name__)

In [None]:
# Verbosity switch for the notebook logger: INFO is active; swap in the
# commented DEBUG line to also see the logger.debug(...) messages.
# logger.setLevel(logging.DEBUG)
logger.setLevel(logging.INFO)

# Get all websites with domain `civicweb.net`

In [None]:
# read csv as a giant dictionary
# get the existing list of scraped subdomains
# read in dataframe

# try:
#     existing_subdomains_df = pd.read_csv(civicweb_scraper.OUT_FOLDER / "civicweb_subdomains.csv", index_col=[0])
    
#     subdomains_dict = {
#         subdomain["subdomain"]: {
#             key: subdomain[key] 
#             for key in existing_subdomains_df.columns
#             if key != "subdomain"
#         } for i, subdomain in existing_subdomains_df.iterrows()
#     }
# except FileNotFoundError:
#     existing_subdomains_df = pd.DataFrame()
#     subdomains_dict = {}

# subdomains_dict

In [None]:
# Load previously discovered subdomains (if any) so results accumulate across runs.
# google_scrape_results maps subdomain -> details dict and is extended by the
# Google search loop below.
subdomains_json_path = f"{civicweb_scraper.OUT_FOLDER}/subdomains.json"
try:
    # A context manager guarantees the handle is closed; the bare open() call leaked it.
    with open(subdomains_json_path) as subdomains_file:
        google_scrape_results = json.load(subdomains_file)
    logger.info(f"Loaded in existing subdomains.json file.")
except FileNotFoundError:
    google_scrape_results = {}
    logger.info(f"No existing subdomains.json file found in the {civicweb_scraper.OUT_FOLDER} folder.")
except (OSError, ValueError) as e:
    # Unreadable or corrupt file (json.JSONDecodeError subclasses ValueError):
    # keep the best-effort behaviour of starting empty, but say why instead of
    # reporting the file as missing.
    google_scrape_results = {}
    logger.warning(f"Could not read {subdomains_json_path}; starting with no known subdomains: {e}")

## Using Google Search API

In [None]:
# Custom Search JSON API settings. Create a custom search engine first by following
# https://developers.google.com/custom-search/docs/tutorial/creatingcse
# Both credentials are read from the environment (load_dotenv() runs in the
# imports cell) and are None when the variable is not set.
GOOGLE_SEARCH_ENDPOINT = "https://www.googleapis.com/customsearch/v1?"
GOOGLE_API_KEY = os.environ.get('GOOGLE_API')
GOOGLE_SEARCH_ENGINE_ID = os.environ.get('GOOGLE_SEARCH_ENGINE_ID')

# google_params = {
#     "key": GOOGLE_API_KEY, 
#     "cx":GOOGLE_SEARCH_ENGINE_ID,
#     "q":"site:civicweb.net",
#     "num":10,
#     "gl":"ca"
# }

# response = session.get(GOOGLE_SEARCH_ENDPOINT, params=google_params)

In [None]:
# response.raise_for_status()
# search_results = response.json()

In [None]:
# search_results

In [None]:
# google_scrape_results = []

# google_scrape_results.extend([
#     {key:item[key] for key in ['title', 'link', 'snippet']} 
#     for item in search_results["items"]])

# google_scrape_results

In [None]:
# Page through the Custom Search results for `site:civicweb.net` and record every
# subdomain not already present in google_scrape_results.
start_page = 0
max_page = 15
num_results_per_page = 10
# NOTE(review): the Custom Search JSON API documentation caps a query at 100
# results, so with 10 results per page any page index above 9 is expected to be
# rejected — confirm and consider lowering max_page.
for page_index in range(start_page, max_page):
    # time.sleep(1) # wait for 1 second between each search
    # `start` is 1-based; derive it from the page size rather than a hard-coded 10
    # so the two stay in sync if num_results_per_page changes.
    start_index = page_index * num_results_per_page + 1
    # search with query and result page
    logger.info(f"Scraping page {page_index} with results from {start_index} to {start_index+num_results_per_page-1}...")

    google_params = {
        "key": GOOGLE_API_KEY,
        "cx": GOOGLE_SEARCH_ENGINE_ID,
        "q": "site:civicweb.net",
        "num": num_results_per_page,
        "start": start_index,
        "gl": "ca",
    }

    try:
        response = session.get(GOOGLE_SEARCH_ENDPOINT, params=google_params)
        response.raise_for_status()
        search_results = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # Covers HTTP errors, connection failures and undecodable bodies; skip
        # this page and keep going instead of aborting the whole cell.
        logger.error(e)
        continue

    # A page without results has no "items" key; later pages will not have any either.
    items = search_results.get("items")
    if not items:
        logger.info(f"No results returned for page {page_index}; stopping search.")
        break

    for item in items:
        link = item["link"]
        logger.debug(f"Looking at {link}")
        if ".civicweb.net" not in link:
            continue

        # Strip whatever scheme precedes the host (http or https), then the domain suffix.
        subdomain = link.split(".civicweb.net")[0].split("://")[-1]
        if subdomain in google_scrape_results:
            logger.debug(f"Already seen {subdomain}")
            continue

        logger.info(f"Adding {subdomain} to results list")
        google_scrape_results[subdomain] = {
            "root_url": f"https://{subdomain}.civicweb.net",
            "google_search_url": link,
            # title/snippet are not guaranteed on every result; default to empty text
            "title": item.get("title", ""),
            "description": item.get("snippet", ""),
        }

In [None]:
# Inspect the accumulated mapping of subdomain -> {root_url, google_search_url, title, description}.
google_scrape_results

In [None]:
# Just the subdomain names collected so far.
google_scrape_results.keys()

In [None]:
# The count covers every entry in google_scrape_results, including entries loaded
# from an existing subdomains.json, not only those added by this run's search.
logger.info(f"Found {len(google_scrape_results)} unique subdomains from the Google Search API.")

In [None]:
# Persist the full subdomain mapping (previously known plus newly found) as
# pretty-printed JSON in the output folder.
subdomains_out_path = civicweb_scraper.OUT_FOLDER / "subdomains.json"
with open(subdomains_out_path, "w") as f:
    f.write(json.dumps(google_scrape_results, indent=4))

# Scrape documents from each subdomain

In [None]:
# Keep INFO verbosity for the document scrape; the per-document messages in the
# loop below are logged at DEBUG and stay hidden at this level.
logger.setLevel(logging.INFO)

In [None]:
# def download_document(session, document, root_url, subdomain) -> dict:
#     ''' 
#     Download according to a document information dictionary, and return a dictionary of the download details.
#     '''
#     error = ""
#     file_extension = ""
#     file_type = ""

#     try:
#         response = session.get(url=root_url+document["url"], headers=civicweb_scraper.HEADERS)

#         out_path = civicweb_scraper.OUT_FOLDER.joinpath(subdomain, *document["parent"])

#         file_extension, file_type = civicweb_scraper.get_filetype(response)
#         logger.debug((f"File: {document['name']}{file_extension}"))

#         civicweb_scraper.download_file(
#         response, 
#         filename=document["name"]+file_extension,
#         out_path=out_path)
#     except Exception as e:
#         logger.error(e)
#         error += f"{e}"
#     finally:
#         download_dict = {
#                     "name": document["name"]+file_extension,
#                     "file_type": file_type,
#                     "subdomain": subdomain,
#                     "parent_path": "/".join(document["parent"]),
#                     "root_url": root_url, 
#                     "url": document["url"], 
#                     "parent_url": document["parent_url"],
#                     "date_scraped": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
#                     "error": error
#                 }
#         return download_dict

In [None]:
# for subdomain in google_scrape_results.keys():

# Crawl each subdomain's document portal breadth-first: every folder is fetched,
# its subfolders are queued, its documents are downloaded, and progress is
# checkpointed to <subdomain>_documents.csv / <subdomain>_folders.json after each
# folder so an interrupted run can resume.
for subdomain in ['dev', 'victoria', 'notl']:
    logger.info(f"Processing {subdomain}...")
    # Fall back to the conventional URL when this hard-coded subdomain was not
    # among the search results (plain indexing raised KeyError in that case).
    root_url = google_scrape_results.get(subdomain, {}).get(
        "root_url", f"https://{subdomain}.civicweb.net")
    documents_csv_path = civicweb_scraper.OUT_FOLDER / f"{subdomain}_documents.csv"
    folders_json_path = civicweb_scraper.OUT_FOLDER / f"{subdomain}_folders.json"

    # Load the existing document tracking file if there is one. The original read
    # from an unrelated handle and logged variables that did not exist yet; the
    # bare except hid both errors, so earlier download records were always dropped.
    try:
        with open(documents_csv_path, 'r', newline='') as data:
            documents = list(DictReader(data))
        logger.info(f"Found existing document tracking file for {subdomain}: {len(documents)} documents recorded")
    except FileNotFoundError:
        documents = []

    try:
        # load existing folder json deques if they exist
        with open(folders_json_path, "r") as f:
            json_data = json.load(f)
        folders = deque(json_data["folders"])
        done_folders = deque(json_data["done_folders"])

        logger.info(f"Found existing folder tracking file for {subdomain}: {len(done_folders)} folders completed and {len(folders)} to go")
    except FileNotFoundError:
        logger.debug(f"No existing folder tracking file for {subdomain}. Adding root folder to folders as a start.")

        folders = deque()
        done_folders = deque()

        # Seed the queue with the folders listed at the root of the documents portal.
        try:
            response = session.get(url=root_url+"/filepro/documents/")
            response.raise_for_status()
            folders.extend(civicweb_scraper.get_items(response, parent_url="/filepro/documents/", parent=[], is_folder=True))
        except requests.exceptions.HTTPError as e:
            logger.error(f"Unable to fetch website information for {root_url+'/filepro/documents/'}: {e}")
            continue
        except Exception as e:
            logger.error(f"Exception occurred for URL {root_url+'/filepro/documents/'}: {e}")
            continue

    # process all documents on site
    while folders:
        time.sleep(1)  # be polite to the server between folder requests
        logger.debug("num folders to visit: %s", len(folders))
        logger.debug("\nnum completed folders: %s", len(done_folders))

        curr_folder = folders.popleft()
        location = '/'.join(curr_folder['parent'])
        logger.info(f"\nEntering folder {curr_folder['name']} at location '/{location}'...")

        # Reset per folder so the summary in `finally` never reads unbound names
        # or counts left over from the previous folder when the fetch fails.
        children_folders, children_documents = [], []
        try:
            response = session.get(url=root_url+curr_folder["url"])
            # Same check as the root fetch: do not parse an error page as a listing.
            response.raise_for_status()

            # path of this folder, used as the parent path of its children
            curr_path = [*curr_folder["parent"], curr_folder["name"]]
            logger.debug("parent path to add to children folders/documents: %s", curr_path)

            # add subfolders to visit from this folder to the folders deque
            children_folders = civicweb_scraper.get_items(response, parent_url=curr_folder["url"], parent=curr_path, is_folder=True)
            folders.extend(children_folders)

            # documents to download from this folder
            children_documents = civicweb_scraper.get_items(response, parent_url=curr_folder["url"], parent=curr_path, is_folder=False)

            # download documents
            for document in tqdm(children_documents):
                logger.debug(f"downloading document {document['name']} (URL: {document['url']})")
                download_dict = civicweb_scraper.download_document(session=session, document=document, root_url=root_url, subdomain=subdomain)
                documents.append(download_dict)

            logger.debug("%s", documents)
        except Exception:
            # format_exc() works on every Python 3 version, unlike the
            # single-argument format_exception(e) (3.10+). A failed folder is
            # logged and, as before, neither re-queued nor marked as done.
            logger.error(traceback.format_exc())
        else:
            # Mark the folder done before checkpointing so the saved JSON includes it.
            done_folders.append(curr_folder)
        finally:
            # save document information to a csv
            logger.info(f"Updating tracking files for {curr_folder['name']} with {len(children_folders)} new folders and {len(children_documents)} new documents.")

            pd.DataFrame(documents).to_csv(documents_csv_path, index=False)
            # save current folder deques to a json file
            with open(folders_json_path, "w") as f:
                json.dump({"folders": list(folders),
                           "done_folders": list(done_folders)}, f, indent=4)

        logger.info(f"... Exiting folder {curr_folder['name']}.\n")
    logger.info(f"... Found all documents for {subdomain}.\n\n")

In [None]:
# def get_cache_creation_time(url, session:CachedSession):
#     response = session.get(url=url)
#     return response.created_at

## Using Selenium

In [None]:
# Placeholder for Bing results.
# NOTE(review): no cell visible in this notebook appends to this list; the
# Selenium loop below collects into scraped_links instead.
bing_scrape_results = []

In [None]:
# Scrape Bing result pages for `site:civicweb.net` with a real Firefox session and
# collect the text of every <cite> element (the displayed result URL).
max_page_number = 10 # max number of pages to scrape
num_results_per_page = 10 # default number

# Defined here so the loop and the summary below work on a fresh kernel.
scraped_links = []

driver = webdriver.Firefox()
try:
    for page_index in range(max_page_number):
        time.sleep(1) # wait for 1 second between each search

        # search with query and result page
        start_index = 1+page_index*num_results_per_page
        logger.info(f"Scraping page {page_index+1} with results from {start_index} to {start_index+num_results_per_page-1}...")

        bing_url = f'https://www.bing.com/search?q=site%3acivicweb.net&first={start_index}'

        # get url of each page result
        try:
            driver.get(bing_url)
            # Titles (class "tptt") and snippets (class "b_lineclamp4") were
            # explored earlier but are not collected.
            links = driver.find_elements(by=By.TAG_NAME, value="cite")
            # find_elements returns a list, so read .text from each element.
            scraped_links.extend([link.text for link in links])
        except WebDriverException as e:
            logger.error("Error opening Bing page for scraping: %s", e)
            continue
finally:
    # Quit once, after the last page. Quitting inside the loop closed the
    # browser after the first page and made every later driver.get() fail.
    driver.quit()

logger.info(f"Bing scraping finished. Found {len(scraped_links)} links.")

In [None]:
# Probe each subdomain's documents portal and record the outcome.
# NOTE(review): this cell is a copy of the one under the heading
# "Filter for subdomains with a documents portal" further down; consider
# keeping only one of them.
subdomain_rows = [] # for the dataframe
subdomains_with_documents = [] # valid site to be scraped

# NOTE(review): `subdomain_list` was not defined anywhere in this notebook, so the
# cell raised NameError on a fresh kernel. The Google results are the only
# subdomain collection guaranteed to exist at this point — confirm this is the
# intended input.
subdomain_list = list(google_scrape_results)

for subdomain in subdomain_list:
    root_url = f"https://{subdomain}.civicweb.net"
    documents_url = root_url + "/filepro/documents/"  # url of the documents hub
    error = ""
    try:
        civicweb_scraper.fetch_webpage(url=documents_url)
    except requests.HTTPError as e:
        # Read the status from the exception itself: `response` is never assigned
        # when the fetch raises, so the old code used an unbound or stale object.
        status_code = getattr(e.response, "status_code", None)
        error = f"{status_code}: {e}" if status_code is not None else str(e)
        logger.error(f"in {root_url}: {traceback.format_exc()}")
    except Exception as e:
        # Everything else, including the TypeError raised via requests-cache.
        error = str(e)
        logger.error(f"in {root_url}: {traceback.format_exc()}")
    else:
        logger.debug(f"{root_url} has a documents page, adding to valid subdomains list")
        subdomains_with_documents.append(subdomain)

    # One row per subdomain, successful or not. The key is "error" (the original
    # "'error'" had literal quotes embedded in the column name).
    subdomain_rows.append({
        "subdomain": subdomain,
        "root_url": root_url,
        "bing_retrieval_date": "",
        "error": error,
    })

In [None]:
# NOTE(review): nothing visible in this notebook appends to bing_scrape_results,
# so this displays the empty list created above.
bing_scrape_results

In [None]:
# Fixed sample of links for exercising the validation loop without running
# Selenium. The final entry is deliberately not a civicweb.net address.
# Note: running this cell replaces whatever scraped_links held before.
scraped_links = [
    'https://severn.civicweb.net',
    'https://tucumcari.civicweb.net',
    'https://williamsnd.civicweb.net',
    'https://cocookmn.civicweb.net',
    'https://greatermadawaska.civicweb.net',
    'https://otonabeesouthmonaghan.civicweb.net',
    'https://centrewellington.civicweb.net',
    'https://google.com',
]

In [None]:
# Flag consulted by the link-validation loop below; presumably decides whether
# cached responses should be refetched — confirm against that loop.
overwrite_cache = False

In [None]:
# Presumably evicts google.com from the response cache (helper lives in civicweb_scraper).
# NOTE(review): the URL is passed without a scheme — confirm the helper matches
# it against the keys actually stored in the cache.
civicweb_scraper.remove_url_from_cache(session, url="google.com")

In [None]:
# check if the website is valid subdomain of civicweb.net
# For every scraped link on civicweb.net, fetch its documents page (through the
# cache) and record when it was retrieved and any error, keyed by subdomain.

# NOTE(review): the cell that built subdomains_dict from the saved CSV is
# commented out earlier in the notebook, which left the name undefined here;
# start from an empty mapping so this cell runs on a fresh kernel.
subdomains_dict = {}

for link in scraped_links:
    if '.civicweb.net' not in link:
        continue

    # get basic site information (strip any scheme, then the domain suffix)
    subdomain = link.split(".civicweb.net")[0].split("://")[-1]
    root_url = f"https://{subdomain}.civicweb.net"
    documents_url = root_url+"/filepro/documents/"
    error = ""

    try:
        # NOTE(review): the original had an empty `if session.cache.has_url(...)`
        # (a syntax error) and a no-op overwrite_cache branch. Dropping the cached
        # copy before refetching is the presumed intent — confirm.
        if overwrite_cache and session.cache.has_url(documents_url):
            civicweb_scraper.remove_url_from_cache(session, url=documents_url)

        response = civicweb_scraper.fetch_webpage_with_cache(url=documents_url, session=session)
    except requests.exceptions.ConnectionError as e:
        logger.error(f"Check your connection. Unable to fetch {documents_url}: {e}")
        error = str(e)
    except requests.exceptions.RequestException as e:
        # Other request failures (HTTP errors, timeouts, ...) are recorded too
        # instead of aborting the loop.
        logger.error(f"Unable to fetch {documents_url}: {e}")
        error = str(e)

    # Create the entry on first sight, then refresh the scraping details.
    # NOTE(review): "cached_date" is simply the current time, not the time the
    # response entered the cache — confirm that is what is wanted.
    timestamp = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
    subdomains_dict.setdefault(subdomain, {"root_url": root_url}).update({
        "bing_retrieval_date": timestamp,
        "error": error,
        "cached_date": timestamp,
    })
        

        # update with new details
        # if subdomain in subdomains_dict:
        #     subdomains[subdomain]["bing_retrieval_date"] = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        #     subdomains[subdomain]["error"] = e
        # else:
        #     subdomains[subdomain] = {
        #         "root_url": root_url,
        #         "bing_retrieval_date": datetime.now().strftime("%m/%d/%Y %H:%M:%S"),
        #         "error": error
        #     }
        
        #     subdomains_with_documents.append(subdomain)
        #     scraped_links.append(subdomain)
        

In [None]:
# Inspect the list of links that fed the validation loop above.
scraped_links

## Filter for subdomains with a documents portal

In [None]:
# Probe each subdomain's documents portal; build one summary row per subdomain
# and a list of the subdomains whose portal could be fetched.
# NOTE(review): an identical copy of this cell sits in the "Using Selenium"
# section above; consider keeping only this one.
subdomain_rows = [] # for the dataframe
subdomains_with_documents = [] # valid site to be scraped

# NOTE(review): `subdomain_list` was not defined anywhere in this notebook, so the
# cell raised NameError on a fresh kernel. The Google results are the only
# subdomain collection guaranteed to exist — confirm this is the intended input.
subdomain_list = list(google_scrape_results)

for subdomain in subdomain_list:
    root_url = f"https://{subdomain}.civicweb.net"
    documents_url = root_url + "/filepro/documents/"  # url of the documents hub
    error = ""
    try:
        civicweb_scraper.fetch_webpage(url=documents_url)
    except requests.HTTPError as e:
        # Take the status from the exception: when the fetch raises, `response`
        # is never assigned, so reading it gave a NameError or a stale value.
        status_code = getattr(e.response, "status_code", None)
        error = f"{status_code}: {e}" if status_code is not None else str(e)
        logger.error(f"in {root_url}: {traceback.format_exc()}")
    except Exception as e:
        # Any other failure, including the TypeError raised via requests-cache.
        error = str(e)
        logger.error(f"in {root_url}: {traceback.format_exc()}")
    else:
        logger.debug(f"{root_url} has a documents page, adding to valid subdomains list")
        subdomains_with_documents.append(subdomain)

    # Recorded for every subdomain. The column is named "error"; the original
    # key "'error'" carried literal quote characters.
    subdomain_rows.append({
        "subdomain": subdomain,
        "root_url": root_url,
        "bing_retrieval_date": "",
        "error": error,
    })

In [None]:
# Inspect the per-subdomain summary rows built above.
subdomain_rows

# Track valid subdomains

In [None]:
# Load the previously saved subdomain table, using its first column as the
# index; fall back to an empty frame when the file does not exist yet.
existing_csv_path = civicweb_scraper.OUT_FOLDER / "scraped_subdomains new.csv"
try:
    existing_subdomains_df = pd.read_csv(existing_csv_path, index_col=[0])
except FileNotFoundError:
    existing_subdomains_df = pd.DataFrame()

In [None]:
# Turn this run's per-subdomain rows (a list of dicts) into a DataFrame and display it.
new_subdomains_df = pd.DataFrame(subdomain_rows)
new_subdomains_df

In [None]:
# Append this run's rows to the previously saved table.
# `old_subdomains_df` was never defined (it was assigned from itself); the frame
# loaded from disk is `existing_subdomains_df`. ignore_index=True already yields
# a fresh 0..n-1 index, so no separate reset_index calls are needed and neither
# input frame is modified.
updated_subdomains_df = pd.concat(
    [existing_subdomains_df, new_subdomains_df],
    ignore_index=True,
)

In [None]:
# Inspect the combined (existing + new) subdomain table.
updated_subdomains_df

In [None]:
# Write the combined table to the output folder (the index is written as the first column).
# NOTE(review): this file name differs from the "scraped_subdomains new.csv" read
# earlier, so the next run will not pick this output up — confirm that is intended.
updated_subdomains_df.to_csv(civicweb_scraper.OUT_FOLDER / "scraped_subdomains_test_updated.csv")