# Tax Appeal
### Created by: Ryan Jinnette

#### This notebook uses Selenium and pandas to scrape the county website for tax records, which will be used to analyze fair price valuation.

## Set Up and Imports

In [106]:
# !pip install selenium
# !pip install pandas
# !pip install rpy2
# !pip install tqdm

In [107]:
import pandas as pd
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_autoinstaller
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm
import rpy2
%load_ext rpy2.ipython

#import timeit

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## Constants and Class Definitions

In [108]:
class lot:
    '''Record holding everything collected for a single property.

    Using one instance per property avoids keeping several parallel lists in
    sync. Fields are stored exactly as passed in (raw scraped text, or a numeric
    placeholder when a value was missing) until ``set_dtypes`` converts the
    numeric ones to floats.
    '''
    def __init__(self, prop_id=None, geo_id=None, name=None, appraised=None, land_value=None, land_size=None, improved_value=None, sqft=None, year=None, bldg_class=None):
        self.prop_id = prop_id
        self.geo_id = geo_id
        self.name = name
        self.appraised = appraised
        self.land_value = land_value
        self.land_size = land_size
        self.improved_value = improved_value
        self.imp_sqft = sqft
        self.year = year
        self.bldg_class = bldg_class
        self.dollar_sqft = None  # placeholder; nothing in this class fills it in

    @staticmethod
    def _to_float(value, tokens):
        '''Return ``value`` as a float after deleting every substring in ``tokens``.

        ``None`` is returned unchanged. Values that are not strings (e.g. the
        numeric placeholder used for a missing field, or a float produced by an
        earlier conversion) are cast directly, so converting twice is harmless.
        '''
        if value is None:
            return None
        if isinstance(value, str):
            for token in tokens:
                value = value.replace(token, '')
        return float(value)

    def set_dtypes(self):
        '''Convert the monetary and size fields to floats so math works on every record.

        Safe to call more than once: fields that are already numeric are left
        numeric instead of raising.
        '''
        self.improved_value = self._to_float(self.improved_value, ('(+)', ',', '$'))
        self.appraised = self._to_float(self.appraised, (',', '$'))
        self.land_value = self._to_float(self.land_value, (',', '$'))
        self.land_size = self._to_float(self.land_size, (',',))
        # Only scraped text needs parsing; a numeric imp_sqft (the 0 placeholder or
        # an already-converted float) is kept as is.
        if isinstance(self.imp_sqft, str):
            # Drops a 13-character prefix and a 4-character suffix around the number.
            # NOTE(review): presumably a label such as 'Living Area: ' and a 'sqft'
            # unit -- confirm against the page text.
            self.imp_sqft = float(self.imp_sqft[13:-4].strip().replace(',',''))
        
            


In [109]:
# Shared accumulator: collect() appends one record per scraped property.
lots = list()
# Search-results page that the scraper opens.
web_url = r'https://esearch.galvestoncad.org/Search/Result?keywords=Johnson%20Crawford#'
# Records year, asked for interactively; also used in the exported CSV file name.
year = int(input('What year of records would you like to grab? '))


What year of records would you like to grab? 2023


## Data Collection
#### Iterates over the results page and grabs all necessary data, storing each property in an instance of the `lot` class. Items are located by their XPath so they can be identified easily and repeatably.

In [110]:
def _element_text_or(driver, xpath, default):
    '''Return the text of the element at ``xpath``, or ``default`` if it is not on the page.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def collect(web_url,year):
    '''Scrape every property on the search-results page into the global ``lots`` list.

    Opens ``web_url`` in Chrome, filters the results to ``year``, switches the
    list to 100 rows per page, then visits each result row in turn and appends
    one ``lot`` built from the summary row and the property's detail page.

    Parameters
    ----------
    web_url : str
        Address of the search-results page.
    year : int
        Records year passed to the page's ``filterByYear`` script.

    Returns nothing; results are appended to ``lots`` and the elapsed time of the
    row loop is printed. The browser is always closed, even if scraping fails.
    '''
    results_rows = '/html/body/div[4]/div[4]/div[2]/table/tbody/tr'
    detail = '/html/body/div[4]/div[1]'
    land_row = f'{detail}/div[5]/div[2]/table/tbody/tr[2]'
    imp_panel = f'{detail}/div[4]/div[2]'
    imp_row = f'{imp_panel}/table/tbody/tr[2]'

    driver = webdriver.Chrome()
    try:
        driver.get(web_url)
        driver.implicitly_wait(2) # dynamic waiting to see when any certain searched for object becomes available
        time.sleep(2)

        driver.execute_script(f"filterByYear({year});")
        # change the list view to 100 instead of default 25
        driver.execute_script("handlePageSizeChange(100);")
        time.sleep(2)

        rows = driver.find_elements(By.XPATH, results_rows)
        start = time.time()

        # Starts at tr[2]; tr[1] is skipped (presumably the header row -- confirm).
        for i in tqdm(range(2, len(rows)+1), desc="Processing rows", unit="row"):
            row_xpath = f'{results_rows}[{i}]'

            # this grabs all initially avail info, will have to go into each property next
            prop_id = driver.find_element(By.XPATH, f'{row_xpath}/td[1]').text
            geo_id = driver.find_element(By.XPATH, f'{row_xpath}/td[3]').text
            name = driver.find_element(By.XPATH, f'{row_xpath}/td[6]').text
            appraised = driver.find_element(By.XPATH, f'{row_xpath}/td[10]').text

            # go into each property and get more details
            driver.find_element(By.XPATH, row_xpath).click()

            land_size = driver.find_element(By.XPATH, f'{land_row}/td[4]').text
            land_value = driver.find_element(By.XPATH, f'{land_row}/td[7]').text

            # In case any of the following don't exist (i.e. the lot is land only).
            # A missing improvement value is recorded as the string '$0' rather
            # than the integer 0 so lot.set_dtypes() can clean it like any other
            # scraped value; it still takes the same branch below as before.
            improvement = _element_text_or(
                driver, f'{detail}/div[2]/div[2]/div/table/tbody/tr[3]/td', '$0')

            if improvement != '$0 (+)':
                try:
                    imp_sqft = driver.find_element(By.XPATH, f'{imp_panel}/div[1]/span[3]').text
                    if 'State' in imp_sqft: #some of the pages are not consistent formatting, handles that
                        imp_sqft = driver.find_element(By.XPATH, f'{imp_panel}/div[1]/span[4]').text
                except NoSuchElementException:
                    imp_sqft = 0

                # Named year_built so the ``year`` argument is not overwritten.
                year_built = _element_text_or(driver, f'{imp_row}/td[5]', 'Nan')
                bldg_class = _element_text_or(driver, f'{imp_row}/td[3]', 'Nan')
            else:
                imp_sqft = 0
                year_built = 'Nan'
                bldg_class = 'Nan'

            lot_info = lot(prop_id, geo_id, name, appraised, land_value,land_size, improvement, imp_sqft, year_built, bldg_class)
            lots.append(lot_info)
            driver.back()

        finish = time.time()
        print(f"Took {round(finish-start,2)} seconds")
    finally:
        # Close the browser and end the WebDriver session even when scraping fails.
        driver.quit()

collect(web_url,year)




Processing rows: 100%|█████████████████████████| 60/60 [03:36<00:00,  3.60s/row]

Took 216.27 seconds





### Clean Data and Set Dtypes


In [111]:
# The loop variable must not be called `lot`: that would rebind the global name
# from the class to an instance, and any later collect() call would then fail
# with "'lot' object is not callable".
for parcel in lots:
    parcel.set_dtypes()

### Converting gathered data into csv export

In [112]:
def convert_to_csv():
    '''Export every record in the global ``lots`` list to ``<year>_data.csv``.

    Column names are the instance attributes of the first record, in definition
    order; each record contributes one row. The file is written without the
    DataFrame index. Raises IndexError when ``lots`` is empty.
    '''
    columns = list(vars(lots[0]))
    records = []
    for entry in lots:
        records.append({column: getattr(entry, column) for column in columns})
    pd.DataFrame(records).to_csv(f'{year}_data.csv', index = False)

convert_to_csv()

# Main aggregation of functions in main()

In [113]:
def main():
    '''Run the full pipeline: scrape, normalise the numeric fields, export to CSV.

    Mirrors the steps the notebook runs cell by cell. ``lots`` is emptied first
    so that calling main() after the cells above does not duplicate rows or mix
    already-converted records with raw text, and every freshly scraped record is
    passed through ``set_dtypes`` before the export.
    '''
    lots.clear()
    collect(web_url,year)
    for entry in lots:
        entry.set_dtypes()
    convert_to_csv()