# Tax Appeal
### Created by: Ryan Jinnette

#### This file uses selenium and pandas to scrape county website for tax records that will be used for analysis to fair price valuation

## Set Up and Imports

In [316]:
# !pip install selenium
# !pip install pandas

import pandas as pd
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_autoinstaller
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException

#import timeit

## Constants and Class Definitions

In [317]:
lots = []

class lot:
    '''This class provides a framework to store data more easily instead of multiple lists.
    I can reference just the instance of the class'''
    def __init__(self, prop_id=None, geo_id=None, name=None, appraised=None, land_value=None, land_size=None, improved_value=None, sqft=None, year=None, bldg_class=None):
        self.prop_id = prop_id
        self.geo_id = geo_id
        self.name = name
        self.appraised = appraised
        self.land_value = land_value
        self.land_size = land_size
        self.improved_value = improved_value
        self.imp_sqft = sqft
        self.year = year
        self.bldg_class = bldg_class
        self.dollar_sqft = None
    
    def set_dtypes(self):
        '''used to keep things consistent so that math can be done on all cells and not throw errors'''
        self.improved_value = float(self.improved_value.replace('(+)','').replace(',','').replace('$',''))
        self.appraised = float(self.appraised.replace(',','').replace('$',''))
        self.land_value = float(self.land_value.replace(',','').replace('$',''))
        self.land_size = float(self.land_size.replace(',',''))
        if self.imp_sqft != 0:
            self.imp_sqft = float(self.imp_sqft[13:-4].strip().replace(',',''))
            


In [318]:
web_url = r'https://esearch.galvestoncad.org/Search/Result?keywords=Johnson%20Crawford#'


## Data Collection
#### Iterates over the webpage and grabs all necessary data, adds to the class and uses the XPATH of certain items to track them on the page for easy identification and repeatability

In [319]:
def collect(web_url):
    driver = webdriver.Chrome()
    driver.get(web_url)
    driver.implicitly_wait(2) # dynamic waiting to see when any certain searched for object becomes available
    time.sleep(2)
    
    #change the list view to 100 instead of default 25
    btn = driver.find_element(By.ID, 'btnFilterByPage')
    btn.click()
    
    btn = driver.find_element(By.XPATH, '/html/body/div[4]/div[3]/div/span[3]/ul/li[3]')
    btn.click()
    
    time.sleep(2)

    rows = driver.find_elements(By.XPATH, '/html/body/div[4]/div[4]/div[2]/table/tbody/tr')
    start = time.time()
    
    for i in range(2, len(rows)+1):
        
        
        # this grabs all initially avail info, will have to go into each property next
        prop_id = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[1]').text
        geo_id = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[3]').text
        name = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[6]').text
        appraised = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[10]').text
        
        #go into each property and get more details
        click_property = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]')
        click_property.click()
        
        land_size = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[5]/div[2]/table/tbody/tr[2]/td[4]').text
        land_value = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[5]/div[2]/table/tbody/tr[2]/td[7]').text
        
        # in case any of the following don't exist. (I.E. the lot is land only)
        try:
            improvement = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[2]/div[2]/div/table/tbody/tr[3]/td').text
        except NoSuchElementException:
            improvement = 0
        if improvement != '$0 (+)':
            try:
                imp_sqft = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[4]/div[2]/div[1]/span[3]').text
                if 'State' in imp_sqft: #some of the pages are not consistent formatting, handles that
                    imp_sqft = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[4]/div[2]/div[1]/span[4]').text
            except NoSuchElementException:
                imp_sqft = 0

            try:
                year = driver.find_element(By.XPATH,f'/html/body/div[4]/div[1]/div[4]/div[2]/table/tbody/tr[2]/td[5]').text
            except NoSuchElementException:
                year = 'Nan'

            try:
                bldg_class = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[4]/div[2]/table/tbody/tr[2]/td[3]').text
            except NoSuchElementException:
                bldg_class = 'Nan'
        else:
            imp_sqft = 0
            year = 'Nan'
            bldg_class = 'Nan'

        
        lot_info = lot(prop_id, geo_id, name, appraised, land_value,land_size, improvement, imp_sqft, year, bldg_class)
        lots.append(lot_info)
        driver.back()
    finish = time.time()
    print(f"took {finish-start} seconds")
        
collect(web_url)




$0 (+)
$606,500 (+)
$373,860 (+)
$608,300 (+)
$0 (+)
$566,810 (+)
$0 (+)
$451,550 (+)
$319,140 (+)
$0 (+)
$0 (+)
$357,550 (+)
$0 (+)
$15,580 (+)
$0 (+)
$0 (+)
$0 (+)
$5,000 (+)
$0 (+)
$0 (+)
$0 (+)
$271,318 (+)
$189,580 (+)
$542,736 (+)
$333,120 (+)
$0 (+)
$0 (+)
$0 (+)
$0 (+)
$540,622 (+)
$26,710 (+)
$436,160 (+)
$250,937 (+)
$356,420 (+)
$356,420 (+)
$246,227 (+)
$0 (+)
$12,000 (+)
$412,302 (+)
$0 (+)
$0 (+)
$0 (+)
$163,683 (+)
$206,803 (+)
$2,000 (+)
$430,257 (+)
$10,000 (+)
$28,640 (+)
$440,480 (+)
$14,540 (+)
$0 (+)
$482,545 (+)
$0 (+)
$0 (+)
$3,010 (+)
$109,870 (+)
$0 (+)
$0 (+)
$0 (+)
$122,050 (+)
took 75.75846290588379 seconds


In [320]:
print(len(lots))


60


In [321]:
for lot in lots:
    lot.set_dtypes()
    print(lot.imp_sqft, type(lot.imp_sqft))

0 <class 'int'>
2577.0 <class 'float'>
1564.0 <class 'float'>
2209.0 <class 'float'>
0 <class 'int'>
1408.0 <class 'float'>
0 <class 'int'>
1368.0 <class 'float'>
1216.0 <class 'float'>
0 <class 'int'>
0 <class 'int'>
1321.0 <class 'float'>
0 <class 'int'>
0.0 <class 'float'>
0 <class 'int'>
0 <class 'int'>
0 <class 'int'>
0.0 <class 'float'>
0 <class 'int'>
0 <class 'int'>
0 <class 'int'>
1271.0 <class 'float'>
1043.0 <class 'float'>
1722.0 <class 'float'>
748.0 <class 'float'>
0 <class 'int'>
0 <class 'int'>
0 <class 'int'>
0 <class 'int'>
2060.0 <class 'float'>
0.0 <class 'float'>
1446.0 <class 'float'>
1040.0 <class 'float'>
1040.0 <class 'float'>
1040.0 <class 'float'>
1040.0 <class 'float'>
0 <class 'int'>
0.0 <class 'float'>
1284.0 <class 'float'>
0 <class 'int'>
0 <class 'int'>
0 <class 'int'>
1138.0 <class 'float'>
1008.0 <class 'float'>
300.0 <class 'float'>
1296.0 <class 'float'>
0.0 <class 'float'>
0.0 <class 'float'>
1500.0 <class 'float'>
0.0 <class 'float'>
0 <class 'int