In [48]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from collections import deque
import logging
from joblib import Memory
import time
from pathlib import Path

logging.basicConfig(format='%(asctime)s - %(levelname)s:%(message)s', datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

In [35]:
# import requests_cache

# requests_cache.install_cache('scraper_cache', backend='sqlite', expire_after=3600*24*14)

In [49]:
cachedir = './cache' 
memory = Memory(cachedir, verbose=0)

In [50]:
root_url = "https://pleasantontx.civicweb.net"
subdomain = root_url.split("https://")[-1].split(".civicweb.net")[0]
HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'}

In [209]:
@memory.cache
def fetch_webpage(url_append, root_url=root_url, headers=HEADERS):
    url = root_url + url_append
    try:
        response = requests.get(url, headers=headers)
        return response
    except Exception as e:
        print("Error: ", e)
        return None
    
# def get_soup(url_append, root_url=root_url, headers=HEADERS):
#     url = root_url + url_append
#     # TODO: implement caching using joblib
#     # cached_result = cache.get('html:%s' % url)
#     # if cached_result:
#     #     return cached_result
#     print(f"Getting BeautifulSoup object from {url}")
#     try:
#         response = requests.get(url, headers=headers)
#         # cache.set('html:%s' % url, page)
#         bs = BeautifulSoup(response.content,'html.parser')
#         return bs
#     except Exception as e:
#         print("Error: ", e)
#         return None
    
# def get_item(bs:BeautifulSoup, items, parent_path="/", is_folder=True):
#     ''' 
#     parent_path (str):
#         The path to the current folder, relative to the root of the website. Must start with a slash. At least one slash is required to indicate the root of the website.
#     '''
#     if is_folder:
#         class_str = 'folder-link'
#     else:
#         class_str = 'file-link'

#     for item in bs.find_all("a", class_=class_str):
#         if item.get('href') not in [i["url"] for i in items]:
#             item_info  = {
#                 "name": item.text, 
#                 "url": item.get('href'), 
#                 "parent_path": parent_path
#             }
#             items.append(item_info)
#     return items
    
def get_folders(bs:BeautifulSoup, folders:deque, parents):
    ''' 
    parents : list(str)
        List of the folders on the path to the current folder, relative to the root of the website. parent_path[0] should be the first folder on the path, and parent_path[-1] is the current folder. parent_path=[] indicates that the folder belongs to the root.
    '''
    logger.debug("folder parents: %s", parents)
    for folder in bs.find_all("a", class_='folder-link'):
        folder_info = {
                "name": folder.text.strip(), 
                "url": folder.get('href'),
                "parents": parents,
            }
        logger.debug(f"Found folder with name {folder_info['name']} in {folder_info['parents']}, fetched from {folder_info['url']}")
        if folder.get('href') not in [f["url"] for f in folders]:
            logger.debug("Adding to folders deque to visit")
            folders.append(folder_info)
    return folders

def get_documents(bs:BeautifulSoup, documents:list, parents=[]):
    for doc in bs.find_all("a", class_='document-link'):
        document_info = {
                "name": doc.text.strip(),
                "url": doc["href"],
                "parents": parents,
            }
        logger.debug(f"Found document with name {document_info['name']} in {document_info['parents']}, fetched from {document_info['url']}")
        if doc.get('href') not in [d["url"] for d in documents]:
            logger.debug("Adding to documents list to download later")
            documents.append(document_info)
    return documents

def download_pdf(response, name, parents, subdomain="", out_path="out"):
    ''' 
    Downloads a PDF file from the given response.
    
    Parameters
    ----------
    response : requests.Response
        The response object to download from.
    name : str
        The name of the file being downloaded.
    parents : list(str)
        The list of directories that the file is in.
    subdomain : str, optional
        The subdomain of the website. Defaults to "".
    out_path : str, optional
        The path where the file should be saved on the disk. Defaults to "out".
    '''
    # out_path = Path(__file__).parent / out_path / subdomain / Path(*parents) / name + ".pdf"
    out_path = Path.cwd() / out_path / subdomain # use Path.cwd() for Jupyter Notebook
    out_path.joinpath(*parents) 
    out_path = out_path / name + ".pdf"
    out_path.mkdir(parents=True, exist_ok=True)
    logger.debug(f"Saving to {out_path}")
    with open(out_path) as pdf:
        pdf.write(response.content)
        pdf.close()

In [166]:
folders = deque([]) 
done_folders = deque([])

In [167]:
# add the folders at the root to be processed
response = fetch_webpage(url_append="/filepro/documents/", root_url=root_url)
bs = BeautifulSoup(response.content,'html.parser')
folders = get_folders(bs, folders, parents=[])

DEBUG:folder parents: []
DEBUG:Found folder with name Agendas in [], fetched from /filepro/documents/1009
DEBUG:Adding to folders deque to visit
DEBUG:Found folder with name Ordinances in [], fetched from /filepro/documents/108
DEBUG:Adding to folders deque to visit
DEBUG:Found folder with name Policies in [], fetched from /filepro/documents/109
DEBUG:Adding to folders deque to visit


In [168]:
folders

deque([{'name': 'Agendas', 'url': '/filepro/documents/1009', 'parents': []},
       {'name': 'Ordinances', 'url': '/filepro/documents/108', 'parents': []},
       {'name': 'Policies', 'url': '/filepro/documents/109', 'parents': []}])

In [169]:
documents = []

In [170]:
documents

[]

In [109]:
# memory.clear()

In [171]:
# breadth-first search to find all subfolders and documents
while len(folders)>0:
    time.sleep(1)
    logger.debug("folders to visit: %s", "\n".join([str(folder) for folder in folders]))
    logger.debug("completed folders: %s", "\n".join([str(folder) for folder in done_folders]))
    
    curr_folder = folders.popleft()
    logger.info(f"Searching folder {curr_folder['name']} with parents {curr_folder['parents']}...")
    try:
        response = fetch_webpage(url_append=curr_folder["url"], root_url=root_url)
        bs = BeautifulSoup(response.content,'html.parser')
        curr_path = curr_folder["parents"].copy()
        curr_path.append(curr_folder["name"])
        logger.debug("curr_folder parents:%s", curr_folder["parents"])
        logger.debug("curr_folder name:%s", curr_folder["name"])
        logger.debug("curr_path (the new parent): %s", curr_path)
        
        # add subfolders to visit from this folder to the folders deque
        initial_folders = len(folders)
        folders = get_folders(bs, folders, parents=curr_path)

        # add documents to download from this folder to the documents list
        initial_documents = len(documents)
        documents = get_documents(bs, documents, parents=curr_path)
        
        logger.info(f"Found {len(folders)-initial_folders} new folders and {len(documents)-initial_documents} new documents.")
    except Exception as e:
        logger.error(f"Scraping Incomplete for folder {curr_folder} with Error: {e}")
        break

    done_folders.append(curr_folder)
    curr_path = []
    
    logger.info(f"Completed folder {curr_folder['name']}")

DEBUG:folders to visit: {'name': 'Agendas', 'url': '/filepro/documents/1009', 'parents': []}
{'name': 'Ordinances', 'url': '/filepro/documents/108', 'parents': []}
{'name': 'Policies', 'url': '/filepro/documents/109', 'parents': []}
DEBUG:completed folders: 
INFO:Searching folder Agendas with parents []...
DEBUG:curr_folder parents:[]
DEBUG:curr_folder name:Agendas
DEBUG:curr_path (the new parent): ['Agendas']
DEBUG:folder parents: ['Agendas']
DEBUG:Found folder with name Workshop Session in ['Agendas'], fetched from /filepro/documents/5682
DEBUG:Adding to folders deque to visit
DEBUG:Found folder with name Special Session in ['Agendas'], fetched from /filepro/documents/3879
DEBUG:Adding to folders deque to visit
DEBUG:Found folder with name Regular Council in ['Agendas'], fetched from /filepro/documents/1021
DEBUG:Adding to folders deque to visit
INFO:Found 3 new folders and 0 new documents.
INFO:Completed folder Agendas
DEBUG:folders to visit: {'name': 'Ordinances', 'url': '/filepro/

In [172]:
print(folders)
print(done_folders)

deque([])
deque([{'name': 'Agendas', 'url': '/filepro/documents/1009', 'parents': []}, {'name': 'Ordinances', 'url': '/filepro/documents/108', 'parents': []}, {'name': 'Policies', 'url': '/filepro/documents/109', 'parents': []}, {'name': 'Workshop Session', 'url': '/filepro/documents/5682', 'parents': ['Agendas']}, {'name': 'Special Session', 'url': '/filepro/documents/3879', 'parents': ['Agendas']}, {'name': 'Regular Council', 'url': '/filepro/documents/1021', 'parents': ['Agendas']}, {'name': '2017', 'url': '/filepro/documents/1688', 'parents': ['Ordinances']}, {'name': '2022', 'url': '/filepro/documents/5869', 'parents': ['Agendas', 'Workshop Session']}, {'name': '2021', 'url': '/filepro/documents/5690', 'parents': ['Agendas', 'Workshop Session']}, {'name': '2020', 'url': '/filepro/documents/5688', 'parents': ['Agendas', 'Workshop Session']}, {'name': '2019', 'url': '/filepro/documents/5686', 'parents': ['Agendas', 'Workshop Session']}, {'name': '2018', 'url': '/filepro/documents/56

In [173]:
documents

[{'name': 'Workshop Session - Jul 21 2022 - Agenda - Html',
  'url': '/document/5871',
  'parents': ['Agendas', 'Workshop Session', '2022']},
 {'name': 'Workshop Session - Jul 21 2022 - Agenda - Pdf',
  'url': '/document/5870',
  'parents': ['Agendas', 'Workshop Session', '2022']},
 {'name': '06-17-21 Workshop',
  'url': '/document/5654',
  'parents': ['Agendas', 'Workshop Session', '2021']},
 {'name': '08-19-21 Workshop',
  'url': '/document/5658',
  'parents': ['Agendas', 'Workshop Session', '2021']},
 {'name': '02-04-21 Workshop',
  'url': '/document/5642',
  'parents': ['Agendas', 'Workshop Session', '2021']},
 {'name': '07-16-20 Workshop',
  'url': '/document/5626',
  'parents': ['Agendas', 'Workshop Session', '2020']},
 {'name': '08-06-20 Workshop',
  'url': '/document/5628',
  'parents': ['Agendas', 'Workshop Session', '2020']},
 {'name': '06-18-20 Workshop',
  'url': '/document/5624',
  'parents': ['Agendas', 'Workshop Session', '2020']},
 {'name': '02-20-20 Workshop',
  'url':

In [10]:
# # https://pleasantontx.civicweb.net/filepro/documents/5688/ #has 4 documents
# bs = get_soup(url_append='/filepro/documents/5688/', root_url=root_url)

DEBUG:Starting new HTTPS connection (1): pleasantontx.civicweb.net:443


Getting BeautifulSoup object from https://pleasantontx.civicweb.net/filepro/documents/5688/


DEBUG:https://pleasantontx.civicweb.net:443 "GET /filepro/documents/5688/ HTTP/1.1" 200 None


In [9]:
# documents = []
# documents = get_documents(bs, documents)

In [10]:
# documents

[<a class="document-link" href="/document/5626">
 																	07-16-20 Workshop
 																</a>,
 <a class="document-link" href="/document/5628">
 																	08-06-20 Workshop
 																</a>,
 <a class="document-link" href="/document/5624">
 																	06-18-20 Workshop
 																</a>,
 <a class="document-link" href="/document/5613">
 																	02-20-20 Workshop
 																</a>]

In [12]:
documents = []
for doc in bs.find_all("a", class_='document-link'):
    document_info = {
        "name": doc.text.strip(),
        "url": doc["href"],
        "subdomain": subdomain,
        }
    documents.append(document_info)
print(documents)

[{'name': '07-16-20 Workshop', 'url': '/document/5626', 'subdomain': 'pleasantontx'}, {'name': '08-06-20 Workshop', 'url': '/document/5628', 'subdomain': 'pleasantontx'}, {'name': '06-18-20 Workshop', 'url': '/document/5624', 'subdomain': 'pleasantontx'}, {'name': '02-20-20 Workshop', 'url': '/document/5613', 'subdomain': 'pleasantontx'}]


In [175]:
len(documents)

139

In [193]:
documents[2]

{'name': '06-17-21 Workshop',
 'url': '/document/5654',
 'parents': ['Agendas', 'Workshop Session', '2021']}

In [210]:
# TODO: find the pathlib bug
for document in documents[:3]:
    t1 = time.time()
    response = fetch_webpage(url_append=document["url"], root_url=root_url, headers=HEADERS)
    t2 = time.time()
    logger.info((f"Took {round((t2-t1),3)} seconds to get page."))
    download_pdf(response, name=document["name"], parents=document["parents"], subdomain="", out_path="out")

DEBUG:Starting new HTTPS connection (1): pleasantontx.civicweb.net:443
DEBUG:https://pleasantontx.civicweb.net:443 "GET /document/5871 HTTP/1.1" 301 None
DEBUG:https://pleasantontx.civicweb.net:443 "GET /document/5871/ HTTP/1.1" 200 None
INFO:Took 3.361 seconds to get page.
DEBUG:Saving to /Users/rainacao/Documents/GitHub/civicweb-document-downloader/out


IsADirectoryError: [Errno 21] Is a directory: '/Users/rainacao/Documents/GitHub/civicweb-document-downloader/out'

In [13]:
# t1 = time.time()
# response = requests.get(root_url+documents[0]["url"], headers=HEADERS)
# t2 = time.time()
# print(f"Took {round((t2-t1),3)} seconds to get page.")

# t3 = time.time()
# with open("./out/"+documents[0]["name"]+".pdf", 'wb') as pdf:
#     pdf.write(response.content)
#     pdf.close()
# t4 = time.time()
# print(f"Took {round((t4-t3),3)} seconds to save pdf.")

DEBUG:Starting new HTTPS connection (1): pleasantontx.civicweb.net:443
DEBUG:https://pleasantontx.civicweb.net:443 "GET /document/5626 HTTP/1.1" 301 None
DEBUG:https://pleasantontx.civicweb.net:443 "GET /document/5626/ HTTP/1.1" 200 4184384


Took 168.979 seconds to get page.
Took 0.001 seconds to save pdf.
