In [1]:
import requests
from lxml import html

import unicodecsv as csv
import pandas as pd
import numpy as np

import time
import random
import re
import json

from IPython.display import display, Markdown, HTML
import IPython.display as dsp

pd.set_option('display.width', None)
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_info_rows', 120)
pd.set_option('display.max_info_columns', 120)
pd.options.display.max_colwidth = 200  # pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', lambda fp: f'{fp:,.3f}')


import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.preprocessing
import sklearn.impute
import sklearn.neighbors
import sklearn.cluster

import scipy.stats

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
from time import sleep

LINE_UP, LINE_CLEAR = None, None


In [2]:
def write(name, data):
    with open(name, 'w') as f:
        f.write(data)

In [3]:
def random_ua():
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.3',
        #'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
        #'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
    ]
    selected_ua = random.choice(user_agents)
    print(f'Selected UA: {selected_ua}')
    return selected_ua

In [4]:
def countdown( min, max=None, buffer=None, prefix='Sleeping', postfix='...'):
    '''Print a countdown to screen.  Erase it when it is done.  
    Accepts either the time to countdown OR a min and max for randint for random sleeps.'''
    if max is None:
        max = min
    
    delay = random.randint(min, max)
    for i in range(delay):
        msg = f'{prefix} {delay-i:<2}s'
        msg = f'{msg:<14}s / {delay:>2}s {postfix}'
        if buffer is None:
            print(msg, end='')
        else:
            buffer.update(Markdown(msg))

        time.sleep(1)
        if buffer is None:
            print(f'\r                      ', end='\r')
        #else:
            #buffer.update(Markdown(f' '))
    if buffer is None:
        print(LINE_UP, LINE_CLEAR, end='')
    else:
        buffer.update(Markdown('Sleeping complete ...'))
    

In [5]:
def write_csv(data, target, fieldnames=None):
    # Save data to csv

    with open(f'properties-{target}.csv', 'wb') as csvfile:
        if fieldnames is None:
            fieldnames = list(data[0].keys())
        print(f'Fieldnames: {fieldnames}')
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in data:
            writer.writerow(row)

In [6]:
def pr(msg, buffer=None, end='\n', auto_format=True):
    #format = dict(top='## {}', mid='{}', bot='{}')  # buffers are IPython.display.display objects with a 'name' attribute added

    if not buffer:
        print(msg, end=end)
    
    else:
       #print('PR: msg type: ', type(msg))
       if isinstance(msg, str):
        if auto_format:
            try:
              msg = buffer.format.format(msg)
            except Exception as e:
              #print('In pr(): buffer formatting failed')
              pass
        buffer.update(Markdown(msg))
       
       else:
          #print('PR: msg type: ',type(msg), 'auto_format: ', auto_format)
          buffer.update(msg)


In [None]:
def get_display_handles():
    dh_top = display('...', display_id=True)
    dh_mid = display('...', display_id=True)
    dh_bot = display('...', display_id=True)

    dh_top.name = 'top'
    dh_mid.name = 'mid'
    dh_bot.name = 'bot'

    dh_top.format = '#### {}'
    dh_bot.format = '{}'
    dh_mid.format = '{}'
    
    return (dh_top, dh_mid, dh_bot)

dh_top, dh_mid, dh_bot = get_display_handles()

'...'

'...'

'...'

In [None]:
def ALCO_fetch_data(target=None, filter=''):
    base_url = 'https://southfayettepa.com/AgendaCenter/Search/?term=&CIDs=5,&startDate=&endDate=&dateRange=&dateSelector='
    if target is None:
        target = '/Default.aspx'
        #url = 'https://www.redfin.com/county/2362/PA/Allegheny-County/'
    else:
        if target[0] != '/':
            target = '/' + target
        if target[-1] != '/':
            target += '/'
        if filter != '':
            if filter[0] == '/':
                filter = filter[1:]
            if filter[-1] != '/':
                filter += '/'
    # url = base_url.format(target+filter)
    url = base_url
    
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        'sec-ch-ua-mobile':'?',
        'sec-ch-ua-platform':'"Windows"',
        'sec-fetch-dest':'document',
        'sec-fetch-mode':'navigate',
        'sec-fetch-site':'none',
        'sec-fetch-user':'?1',
        'upgrade-insecure-requests':'1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0',
    }
    s = requests.Session()
    s.headers.update(headers)

    #options = webdriver.FirefoxOptions()
    #service = webdriver.FirefoxService(executable_path='/usr/bin/firefox')
    #driver = webdriver.Firefox(service=service, options=options)
    driver = webdriver.Firefox()
    df = None
    dh_top, dh_mid, dh_bot = get_display_handles()
    print(dh_top, dh_mid, dh_bot)
    #input()
    # Loop
    while url is not None:
        response = s.get(url=url)
        driver.get(url)
        parser = html.fromstring(response.text)
        return s, driver, parser
        pr(f'**{response.status_code}**: {url}', buffer=dh_top)
        
        new_data = RF_get_df(parser, index='propertyId')
        if df is None:
            df = new_data
        else:
            df = pd.concat((df, new_data))
            pr(f'Page {url[-1]}: Added {df.shape[0]:>5} new rows.', buffer=dh_bot)

        last_url = url
        next_suffix = RF_next_url_suffix(parser)
        url = base_url.format(next_suffix) if next_suffix is not None else None
        if url is None or last_url == url:
            pr(f'**BREAKING**: No more URLs. Last URL: {last_url}', buffer=dh_mid)
            break

        countdown(min=3, max=7, buffer=dh_mid, postfix=f'Next URL: {url}')   # Sleep
        pr(f'Looping from beginning.  Next URL: {url}', buffer=dh_mid)
    
    return df

In [2]:
def get_driver(url='https://southfayettepa.com/AgendaCenter/Search/?term=&CIDs=5,&startDate=&endDate=&dateRange=&dateSelector='):
    options = webdriver.FirefoxOptions()
    service = webdriver.FirefoxService(executable_path=r'/snap/bin/geckodriver')
    driver = webdriver.Firefox(options=options, service=service)
    driver.get(url)
    return driver

In [10]:
def setup_page(driver):
    # Determine if this is the disclaimer page
    try:
        if driver.find_element(By.TAG_NAME, value='h1').text.lower() == 'legal disclaimer':
            # We need to click to go forward
            btn_continue = driver.find_element(by='id', value='btnContinue')
            btn_continue.click()
    except NoSuchElementException as e:
        print('No h1 tag found to compare for legal disclaimer')
        pass

    try:
        driver.find_element(By.CLASS_NAME, 'beta-button-2').click()
    except NoSuchElementException as e:
        # Not found ... move on
        print('beta popup close button was not found; failing silently')
        pass

In [11]:
def address_search(driver, selectm='Bethel Park'):
    #street_name = driver.find_element(By.ID, 'txtStreetName')
    #street_name.send_keys(' ')
    
    select_muni = Select(driver.find_element(By.ID, 'ddlMuniCode'))
    select_muni.select_by_visible_text(selectm)

    btn_search = driver.find_element(By.ID, 'btnSearch')
    btn_search.click()
    sleep(3)

    pager = None
    while pager is None:
        try:
            pager = driver.find_element(By.ID, 'pnlPageNums')
        except Exception as e:
            pager = None
        driver.find_element(By.ID, 'txtStreetName').send_keys(' ')  
        driver.find_element(By.ID, 'btnSearch').click()
        sleep(10)

    print('loaded')

    link = driver.find_element(By.XPATH, '//table[@id="dgSearchResults"]//tr[2]/td/a')
    link.click()
    
#address_search(driver)

In [82]:
def grab_page(driver, VERBOSE=False, dh=None):
    data = [td.text for td in driver.find_elements(By.XPATH, '//table[@id="Table3"]//td/span')]
    
    next_key = True
    assessments = False
    assessment_no = 0
    assessment_first = True
    
    data1 = []
    data2 = []
    
    for idx, d in enumerate(data):
        if assessments:
            if assessment_no < 4:
                if assessment_first:  # It's a key
                    #print(d, 'is assessment key.')
                    data1.append(d)    
                else:  # It's a value
                    #print(d, 'is assessment value.')
                    d = d.replace('$', '').replace(',', '')

                    data2.append(d)
                    assessment_no += 1
                    
                assessment_first = not assessment_first

        elif next_key:  # Looking for a key
            if d[-1] == ':':  # It's a key
                #print(d, 'is a key')
                d = d[:-2]  # Strip trailing ' :'
                data1.append(d)
                next_key = False
            else:  # Not actually a key
                if d[4:] == ' County Assessed Value (Projected)':
                    #print(d, 'is last header for assessments.')
                    assessments = True
                else:
                    if data[idx-1] == 'Sale Price':
                        d = d.replace('$', '').replace(',', '')
                    elif data[idx-1] == 'Lot Area':
                        d = d.replace(' Acres', '')
                    #print(d, 'is not actually a key.  Skipping ...')
                    next_key = True
        else: # expecting a value
            #print(d, 'is this a value?')
            data2.append(d)
            next_key = True

    
    as_dict = dict(zip(data1, data2))
    try:
        as_dict['PIN'] = driver.find_element(By.ID, 'BasicInfo1_lblParcelID').text
        as_dict['Address'] = driver.find_element(By.ID, 'BasicInfo1_lblAddress').text
        as_dict['Owner'] = driver.find_element(By.ID, 'BasicInfo1_lblOwner').text
    except NoSuchElementException as e:
        dh.update('PIN/Address/Owner not found.  Probably got timed out.')
        
    if VERBOSE:
        if dh is None:
            print(as_dict.get('PIN', 'NO_PIN_FOUND'), end='    ')
        else:
            dh.update(f'Record {i}:\t{as_dict['PIN']}')
    
    record_number = int(driver.find_element(By.ID, 'Header1_lblRecNum').text)
    
    return as_dict, record_number
#grab_page(driver)

In [13]:
def next_page(driver):
    driver.find_element(By.ID, 'Header1_btnNextRec').click()
#next_page(driver)

In [20]:
driver = get_driver()
type(driver)

selenium.webdriver.firefox.webdriver.WebDriver

In [70]:
def goto_page(driver, start_record_num, dh):
    current_record = int(driver.find_element(By.ID, 'Header1_lblRecNum').text)
    pages_crossed = 0

    while current_record != start_record_num:
        if pages_crossed == 30:
            pages_crossed = 0
            countdown(min=5, max=7, buffer=dh, prefix=f'On page {current_record}.  Sleeping')
        pages_crossed += 1
        next_page(driver)
        current_record = int(driver.find_element(By.ID, 'Header1_lblRecNum').text)


In [83]:
def ALCO_fetch(muni='Bethel Park', start_record_num=1, num=30):
    driver = get_driver()
    setup_page(driver)
    address_search(driver, selectm=muni)
    dh_top, dh_mid, dh_bot = get_display_handles()

    #  Handle data
    i = 0
    max_records = 30  # int(driver.find_element(By.ID, 'Header1_lblTotRec').text)
    record_nos = range(max_records)
    current_record = int(driver.find_element(By.ID, 'Header1_lblRecNum').text)
    df = None
    if current_record != start_record_num:
        goto_page(driver, start_record_num, dh_bot)
    current_record = int(driver.find_element(By.ID, 'Header1_lblRecNum').text)
    dh_top.update(f'On record **{current_record}**.  Beginning extraction.')
    for i in record_nos:
        vb = (i % 10) == 0
        if vb:
            countdown(min=5, max=30, buffer=dh_bot)
        else:
            countdown(min=1, max=6, buffer=dh_bot)

        data, num = grab_page(driver, VERBOSE=vb, dh=dh_mid)

        if df is None:
            df = pd.DataFrame(data, index=[i])
        else:
            df = pd.concat([df, pd.DataFrame(data, index=[i])])
        next_page(driver)

    return df

df = ALCO_fetch(start_record_num=90)
df

loaded


'On record **90**.  Beginning extraction.'

'PIN/Address/Owner not found.  Probably got timed out.'

Sleeping complete ...

NoSuchElementException: Message: Unable to locate element: [id="Header1_lblRecNum"]; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:511:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16


In [71]:
first_df = df.copy()

In [None]:
# enter 0 in fifth parcel id textbox
textbox5 = driver.find_element(By.ID, 'txtParcelID5')
textbox5.send_keys('0')

NoSuchElementException: Message: Unable to locate element: [id="txtParcelID5"]; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:511:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16


In [None]:
btn_search = driver.find_element(By.ID, 'btnSearch')
btn_search.click()
sleep(3)

In [None]:
pager = None
while pager is None:
    try:
        pager = driver.find_element(By.ID, 'pnlPageNums')
    except Exception as e:
        pager = None
    driver.find_element(By.ID, 'btnSearch').click()
    sleep(10)

print('loaded')

loaded


In [None]:
link = driver.find_element(By.XPATH, '//table[@id="dgSearchResults"]//tr[2]/td/a')
link.text

'0001-A-00100-0000-00'

In [147]:
driver = get_driver()

In [148]:
tabs = driver.find_elements(By.XPATH, '//a[@role="tab"]')
for tab in tabs:
    print(tab.get_attribute('href'))
print(tabs)

None
javascript:changeYear(2024, 5, 'a1')
javascript:changeYear(2023, 5, 'a2')
javascript:changeYear(2022, 5,'anchYearDD3')
javascript:changeYear(2021, 5,'anchYearDD4')
javascript:changeYear(2020, 5,'anchYearDD5')
javascript:changeYear(2019, 5,'anchYearDD6')
javascript:changeYear(2018, 5,'anchYearDD7')
javascript:changeYear(2017, 5,'anchYearDD8')
javascript:changeYear(2016, 5,'anchYearDD9')
javascript:changeYear(2015, 5,'anchYearDD10')
javascript:changeYear(2014, 5,'anchYearDD11')
[<selenium.webdriver.remote.webelement.WebElement (session="4a1d16a6-3a95-4e62-9cd8-d62b364bdc45", element="3a38a124-86ed-4852-adeb-68702c4af2f3")>, <selenium.webdriver.remote.webelement.WebElement (session="4a1d16a6-3a95-4e62-9cd8-d62b364bdc45", element="2a0c3192-426a-4d18-bf05-6fad251e93f3")>, <selenium.webdriver.remote.webelement.WebElement (session="4a1d16a6-3a95-4e62-9cd8-d62b364bdc45", element="fc89e065-c7ad-4ceb-86a4-bf98ea0dc3dd")>, <selenium.webdriver.remote.webelement.WebElement (session="4a1d16a6-3

In [None]:
from selenium.common.exceptions import NoSuchElementException

driver = get_driver()

tabs = driver.find_elements(By.XPATH, '//a[@role="tab"]')
for tab in tabs:
    print(tab.get_attribute('href'))
print(tabs)

df = None
def record_data(df_in):
    if type(df_in) is list:
        if len(df_in) == 1:
            df_in = df_in[0].dropna(subset=('Permit No'), axis='rows', how='any').set_index=('Permit No')
        else:
            d = df_in[0].dropna(subset=('Permit No'), axis='rows', how='any').set_index('Permit No')
            for dfi in df_in[1:]:
                dfi = dfi.dropna(subset=('Permit No'), axis='rows', how='any').set_index=('Permit No')
                d = pd.concat([d, dfi])
            df_in = d

    if df is None:
        df = df_in.copy()
    else:
        pd.concat([df, df_in])

#print(tabs)
years =['2023', '2024']
for idx in reversed(range(len(tabs))):
    if tabs[idx].text in years:
        print(tabs[idx].text)
        print(len(tabs[idx].text))
        tabs[idx].click()
        links = driver.find_elements(By.XPATH, '//td/p/a')

        for link in links:
            link.click()
            driver.switch_to.window(driver.window_handles[-1])
            try:
                link_to_pdf = driver.find_element(By.XPATH, '//li/a[contains(text(), "Building Permit Details")]').click()
            except NoSuchElementException:
                break
            pdf_table_df = tabula.io.read_pdf(link_to_pdf, pages='all')

            record_data(pdf_table_df)
            



[]


In [119]:
driver = get_driver()
tabs = driver.find_elements(By.XPATH, '//a[@role="tab"]')
print(tabs)
print('TABS:\n',tabs,'\n\n')
original_window = driver.current_window_handle
for idx in reversed(range(len(tabs))):
    t = driver.find_element(By.XPATH, '//a[@role="tab"]')
    print(t[idx].text)
    if t[idx].text.find('2024') != -1:
        print('found 2025 in', t[idx].text)
        print(t[idx])
        t[idx].click()
        for link in driver.find_elements(By.XPATH, '//td/p/a'):
            link.click()
            



[]
TABS:
 [] 




In [61]:
driver.find_elements(By.XPATH, '//td/p/a')[0].text

'January 8, 2025 Board of Commissioners Combined Workshop/Voting Meeting'

In [63]:
original_window = driver.current_window_handle

In [65]:
driver.switch_to.window(driver.window_handles[-1])

In [75]:
driver.find_element(By.XPATH, '//li/a[contains(text(), "Building Permit Details")]').click()

In [76]:
driver.switch_to.window(driver.window_handles[-1])

In [109]:
import tabula.io
tab_df = tabula.io.read_pdf(driver.current_url, pages='all')

In [144]:
t = tab_df [0].dropna(axis='rows', how='any', subset=('Permit No')).set_index('Permit No')
df = None
if df is None:
    df = t.copy()
else:
    df = pd.concat((df, tab_df))
df


Unnamed: 0_level_0,Date,Tax ID Owner Name,Unnamed: 0,Property Address,Category/Type,Estimated Cost,Fee,Memo
Permit No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-336,12/04/2024,TEMP-HAST-504 CHARTER HOMES,,4015 WINDSFIELD WAY,SINGLE FAMILY,"$264,000.00","$1,117.50",NEW HOME - ELEVATION 20
2024-337,12/04/2024,403-A-7 CHASE & MARTINIQUE JOHNSON,,4211 BATTLE RIDGE RD,SINGLE FAMILY,"$40,000.00",$80.00,DETACHED GARAGE
2024-338,12/05/2024,489-R-12 RYAN HOMES,,4503 BRANDYWINE CIR,SINGLE FAMILY,"$2,150.00",$40.00,SHED
2024-339,12/11/2024,TEMP-HAST-583 CHARTER HOMES,,4038 WINDSFIELD WAY,SINGLE FAMILY,"$224,400.00",$933.00,NEW HOME - ELEVATION 20
2024-340,12/11/2024,480-R-404 CHARTER HOMES,,1683 HASTINGS PARK DR,SINGLE FAMILY,"$40,000.00",$136.00,ROOF & DECK
2024-341,12/12/2024,405-E-108 FOXLANE HOMES OF STONEGATE LLC,,1021 STONEGATE DR,SINGLE FAMILY,"$305,000.00","$1,503.25",NEW HOME - CHARLESTON
2024-342,12/16/2024,322-E-11 WASHINGTON PIKE LLC 3049,,3049 WASHINGTON PKE,SINGLE FAMILY,"$25,000.00",$250.00,SIGN
2024-343,12/16/2024,484-A-4 DANNEEN R CELLONE,,804 MILLERS RUN RD,COMMERCIAL,$500.00,$50.00,SIGN INSTALLATION
2024-344,12/18/2024,405-E-110 KAELON C EGAN,,1027 STONEGATE DR,SINGLE FAMILY,"$3,000.00",$40.00,SHED
2024-345,12/18/2024,325-N-10 ROBERT L MARIANI,,673 MILLERS RUN RD,COMMERCIAL,"$20,000.00",$468.00,DEMOLITION


In [None]:
driver.close()

NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:679:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:515:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:147:4
GeckoDriver.prototype.close@chrome://remote/content/marionette/driver.sys.mjs:2433:15
despatch@chrome://remote/content/marionette/server.sys.mjs:318:40
execute@chrome://remote/content/marionette/server.sys.mjs:289:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:262:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:263:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:494:20


In [118]:
driver.switch_to.window(driver.window_handles[-1])
driver.close()
driver.swithc_to.window(original_window)

AttributeError: 'WebDriver' object has no attribute 'swithc_to'