In [1]:
import undetected_chromedriver as uc
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin,quote

In [2]:
def get_rz_values(url, timeout=30):
    """Download a page and collect the ``RZ.<name> = '<value>'`` assignments in its scripts.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        A dict mapping each matched ``RZ`` attribute name to its string value
        (empty when no script contains such an assignment; a later match for
        the same name overwrites an earlier one), or ``None`` when the request
        fails or the response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Report network failures the same way as a non-200 response so that
        # callers only have to handle ``None``.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Pattern to match the desired script lines
    pattern = re.compile(r"RZ\.(\w+) = '(.*?)'")

    rz_values = {}
    for script in soup.find_all('script'):
        # Skip script tags for which BeautifulSoup exposes no text content.
        if not script.string:
            continue
        for match in pattern.finditer(script.string):
            rz_values[match.group(1)] = match.group(2)

    return rz_values

In [3]:
def extract_users(html_string):
    """Extract the input values listed in ``ul#page_users_list`` ahead of the first ``hr``.

    Args:
        html_string: HTML markup of the permissions page.

    Returns:
        A list with the ``value`` attribute of the first ``input`` found in each
        ``li`` of the list, in document order, stopping at the first ``li`` that
        contains an ``hr``. Rows without an ``input`` (or whose input has no
        ``value``) are skipped. An empty list is returned when the page has no
        ``ul`` with id ``page_users_list``.
    """
    soup = BeautifulSoup(html_string, 'html.parser')

    user_list = soup.find('ul', {'id': 'page_users_list'})
    if user_list is None:
        # The expected list is absent from the markup; there is nothing to extract.
        return []

    checked = []
    for row in user_list.find_all('li'):
        # An 'hr' inside a list item marks the end of the entries to collect.
        if row.find('hr') is not None:
            break

        checkbox = row.find('input')
        if checkbox is None:
            continue

        # Per the original comments the value holds the user's e-mail address.
        user_email = checkbox.get('value')
        if user_email is None:
            continue

        checked.append(user_email)

    return checked

In [4]:
def getPagePermissions(page_url):
    """Return the users listed on the permissions form for ``page_url``.

    The page's ``RZ`` values (``webspace``, ``page_key``, ``parent_key``) are read
    with ``get_rz_values`` and used to build the permissions-editform URL, which
    is then loaded in the module-level ``driver`` (it must already exist when
    this function is called) and parsed with ``extract_users``.

    Args:
        page_url: Address of the page whose permissions should be looked up.

    Returns:
        The list produced by ``extract_users`` for the permissions form, or an
        empty list (after printing ``ERROR WITH PAGE``) when the page could not
        be fetched or does not expose all of the required ``RZ`` values.
    """
    page_info = get_rz_values(page_url)
    required_keys = ('webspace', 'page_key', 'parent_key')
    # get_rz_values returns None on request failure and may return a dict that
    # lacks some keys when the page defines no matching RZ assignments.
    if page_info is None or any(key not in page_info for key in required_keys):
        print("ERROR WITH PAGE")
        return []

    # Percent-encode every value placed in the query string so that reserved
    # characters cannot corrupt the URL.
    webspace = quote(page_info['webspace'])
    page_key = quote(page_info['page_key'])
    parent_key = quote(page_info['parent_key'])

    permissions_url = (
        "https://cms3.revize.com/revize/util/permissions-editform.jsp"
        f"?webspace={webspace}&page_key={page_key}&parent_key={parent_key}"
        f"&adminpanel&set=webspace_page_permissions.page_key%3D{page_key}"
    )
    driver.get(permissions_url)
    return extract_users(driver.page_source)

In [5]:
def get_links_recursive(url, base_url, visited, depth, timeout=30):
    """Depth-limited recursive crawl that records every page it scans in ``visited``.

    Args:
        url: Page to scan on this call.
        base_url: URL that hrefs are resolved against; its scheme and host also
            define which absolute links count as belonging to the same site.
        visited: List of URLs already scanned. It is appended to in place, so
            the caller reads the crawl result from it.
        depth: Remaining number of levels to follow; nothing is done at 0.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        None. Results are accumulated in ``visited``.
    """
    if depth == 0 or url in visited:
        return
    print("Scanning:", url)
    visited.append(url)

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return
    if response.status_code != 200:
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    # Compare origins (scheme + host) only: base_url may carry a path and a
    # trailing slash, so comparing it directly against "scheme://netloc" of a
    # link would never match and absolute same-site links would be dropped.
    parsed_base = urlparse(base_url)
    base_origin = f"{parsed_base.scheme}://{parsed_base.netloc}".lower()

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue

        parsed_href = urlparse(href)
        href_origin = f"{parsed_href.scheme}://{parsed_href.netloc}".lower()
        # Links without a host are followed; absolute links only when they
        # point at the same origin as base_url.
        if parsed_href.netloc and href_origin != base_origin:
            continue

        # Drop the fragment and normalise case so the same page is not visited twice.
        next_url = urljoin(base_url, href).split("#")[0].lower()
        if next_url.endswith('php'):
            get_links_recursive(next_url, base_url, visited, depth - 1, timeout)

def crawl_subdomain(start_url, depth):
    """Crawl from ``start_url`` and return the scanned URLs in visiting order.

    The base URL handed to ``get_links_recursive`` is the start URL's scheme and
    host followed by its path with the last segment removed and a trailing ``/``.

    Args:
        start_url: Page where the crawl begins.
        depth: Maximum number of levels passed on to ``get_links_recursive``.

    Returns:
        The list of URLs that ``get_links_recursive`` recorded as visited.
    """
    parts = urlparse(start_url)
    # Everything before the final '/' of the path, i.e. the containing "directory".
    parent_path = parts.path.rsplit('/', 1)[0]
    base_url = f"{parts.scheme}://{parts.netloc}{parent_path}/"

    visited = []
    get_links_recursive(start_url, base_url, visited, depth)
    return visited

In [6]:
# Entry page of the site to crawl; also reused later to open the browser session.
start_url = "https://sh.ingham.org/"

# Collect every reachable page, following links at most 5 levels deep.
urls = crawl_subdomain(start_url, depth=5)

# Number of pages found (shown as the cell output).
len(urls)

Scanning: https://sh.ingham.org/
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/index.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/jobs.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/emergencymanagement.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/corrections.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/prea_complaint_line.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/corr_assessment_and_treatment.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/corrections_medical_department.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/forgotten_man_ministries.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/education.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/the_preseason.php
Scanning: https://sh.ingham.org/courts_and_sheriff/sheriffs_office/jail_information.

47

In [7]:
# Browser configuration for the Selenium session used to read the permission forms.
options = uc.ChromeOptions()
options.add_argument('--window-size=700,700')
# NOTE(review): presumably turns off image loading (Chrome content setting 2) — confirm.
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

try:
    driver = uc.Chrome(options=options)
except TypeError as e:
    # This specific TypeError message is treated as "no Chrome binary found";
    # translate it into a clearer error while keeping the original as the cause.
    if str(e) == 'expected str, bytes or os.PathLike object, not NoneType':
        raise ValueError('Chrome installation not found') from e
    raise

# Open the start page in the freshly created browser session.
driver.get(start_url)

In [8]:
# Map every crawled URL to the users returned by getPagePermissions for it.
out = {}
for url in urls:
    print(url)
    out[url] = permissions = getPagePermissions(url)
    # Print the result followed by a blank separator line.
    print(permissions, end="\n\n")

https://sh.ingham.org/
['lmiles@ingham.org*', 'mcounterman@ingham.org', 'shughes@ingham.org*', 'srudloff@ingham.org*']

https://sh.ingham.org/courts_and_sheriff/sheriffs_office/index.php
['lmiles@ingham.org*', 'mcounterman@ingham.org', 'shughes@ingham.org*', 'srudloff@ingham.org*']

https://sh.ingham.org/courts_and_sheriff/sheriffs_office/jobs.php
['lmiles@ingham.org*', 'shughes@ingham.org*', 'srudloff@ingham.org*']

https://sh.ingham.org/courts_and_sheriff/sheriffs_office/emergencymanagement.php
['lmiles@ingham.org*', 'shughes@ingham.org*', 'srudloff@ingham.org*']

https://sh.ingham.org/courts_and_sheriff/sheriffs_office/corrections.php
['lmiles@ingham.org*', 'shughes@ingham.org*', 'srudloff@ingham.org*']

https://sh.ingham.org/courts_and_sheriff/sheriffs_office/prea_complaint_line.php
['lmiles@ingham.org*', 'shughes@ingham.org*', 'srudloff@ingham.org*']

https://sh.ingham.org/courts_and_sheriff/sheriffs_office/corr_assessment_and_treatment.php
['lmiles@ingham.org*', 'shughes@ingham.o

In [9]:
# quit() ends the whole WebDriver session (all windows and the driver process);
# close() would only close the current window.
driver.quit()

In [10]:
# Display the collected URL -> permitted-users mapping as the cell output.
out

{'https://sh.ingham.org/': ['lmiles@ingham.org*',
  'mcounterman@ingham.org',
  'shughes@ingham.org*',
  'srudloff@ingham.org*'],
 'https://sh.ingham.org/courts_and_sheriff/sheriffs_office/index.php': ['lmiles@ingham.org*',
  'mcounterman@ingham.org',
  'shughes@ingham.org*',
  'srudloff@ingham.org*'],
 'https://sh.ingham.org/courts_and_sheriff/sheriffs_office/jobs.php': ['lmiles@ingham.org*',
  'shughes@ingham.org*',
  'srudloff@ingham.org*'],
 'https://sh.ingham.org/courts_and_sheriff/sheriffs_office/emergencymanagement.php': ['lmiles@ingham.org*',
  'shughes@ingham.org*',
  'srudloff@ingham.org*'],
 'https://sh.ingham.org/courts_and_sheriff/sheriffs_office/corrections.php': ['lmiles@ingham.org*',
  'shughes@ingham.org*',
  'srudloff@ingham.org*'],
 'https://sh.ingham.org/courts_and_sheriff/sheriffs_office/prea_complaint_line.php': ['lmiles@ingham.org*',
  'shughes@ingham.org*',
  'srudloff@ingham.org*'],
 'https://sh.ingham.org/courts_and_sheriff/sheriffs_office/corr_assessment_and_