# Tax Appeal
### Created by: Ryan Jinnette

#### This file uses selenium and pandas to scrape county website for tax records that will be used for analysis to fair price valuation

## Set Up and Imports

In [30]:
# !pip install selenium
# for actual scraping/browser interaction

# !pip install pandas
# for df analysis and writing to a csv

# !pip install rpy2
# for running r code if needed

# !pip install tqdm
# to create a graphic look at the progress of the program

# ! pip install chromedriver_autoinstaller
# check to make our webdriver can work with the current version of chrome

In [14]:
import pandas as pd
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_autoinstaller
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException

# allow for tracking of the collection process
from tqdm import tqdm

# allow R processing of data
import rpy2
%load_ext rpy2.ipython

#import timeit
# to time our program

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


## Constants and Class Definitions

In [15]:
# creates class to save code
class lot:
    '''This class provides a framework to store data more easily instead of multiple lists.
    I can reference just the instance of the class'''
    def __init__(self, prop_id=None, geo_id=None, name=None, appraised=None, land_value=None, land_size=None, improved_value=None, sqft=None, year=None, bldg_class=None):
        self.prop_id = prop_id
        self.geo_id = geo_id
        self.name = name
        self.appraised = appraised
        self.land_value = land_value
        self.land_size = land_size
        self.improved_value = improved_value
        self.imp_sqft = sqft
        self.year = year
        self.bldg_class = bldg_class
        self.dollar_sqft = None
    
    def set_dtypes(self):
        '''used to keep things consistent so that math can be done on all cells and not throw errors'''
        self.improved_value = float(self.improved_value.replace('(+)','').replace(',','').replace('$',''))
        self.appraised = float(self.appraised.replace(',','').replace('$',''))
        self.land_value = float(self.land_value.replace(',','').replace('$',''))
        self.land_size = float(self.land_size.replace(',',''))
        if self.imp_sqft != 0:
            self.imp_sqft = float(self.imp_sqft[13:-4].strip().replace(',',''))
            #self.dollar_sqft = round(self.improved_value / self.imp_sqft,2)
        
            


In [16]:
# instantiates the lots var used to store the class instances
lots = []

# url of apparaisal district with my 
web_url = r'https://esearch.galvestoncad.org/Search/Result?keywords=Johnson%20Crawford#'

# make it dynamic, grabs the 
year = int(input('What year of records would you like to grab? '))


What year of records would you like to grab?  2023


## Data Collection
#### Iterates over the webpage and grabs all necessary data, adds to the class and uses the XPATH of certain items to track them on the page for easy identification and repeatability

In [17]:
def collect(web_url,year):
    '''used to actually collect the data through scraping'''
    driver = webdriver.Chrome()
    driver.get(web_url)
    driver.implicitly_wait(2) # dynamic waiting to see when any certain searched for object becomes available
    time.sleep(2)
    
    #change the list view to 100 instead of default 25
    #btn = driver.find_element(By.ID, 'btnFilterByPage')
    #btn.click()
    year_script = f"filterByYear({year});"
    driver.execute_script(year_script)
    
    page_count_script = "handlePageSizeChange(100);"
    driver.execute_script(page_count_script)
    
    
    
    #btn = driver.find_element(By.XPATH, '/html/body/div[4]/div[3]/div/span[3]/ul/li[3]')
    #btn.click()
    
    time.sleep(2)

    rows = driver.find_elements(By.XPATH, '/html/body/div[4]/div[4]/div[2]/table/tbody/tr')
    start = time.time()
    
    for i in tqdm(range(2, len(rows)+1), desc="Processing rows", unit="row"):
        # this grabs all initially avail info, will have to go into each property next
        prop_id = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[1]').text
        geo_id = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[3]').text
        name = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[6]').text
        appraised = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]/td[10]').text
        
        #go into each property and get more details
        click_property = driver.find_element(By.XPATH, f'/html/body/div[4]/div[4]/div[2]/table/tbody/tr[{i}]')
        click_property.click()
        
        land_size = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[5]/div[2]/table/tbody/tr[2]/td[4]').text
        land_value = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[5]/div[2]/table/tbody/tr[2]/td[7]').text
        
        # in case any of the following don't exist. (I.E. the lot is land only)
        try:
            improvement = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[2]/div[2]/div/table/tbody/tr[3]/td').text
        except NoSuchElementException:
            improvement = 0
        if improvement != '$0 (+)':
            try:
                imp_sqft = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[4]/div[2]/div[1]/span[3]').text
                if 'State' in imp_sqft: #some of the pages are not consistent formatting, handles that
                    imp_sqft = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[4]/div[2]/div[1]/span[4]').text
            except NoSuchElementException:
                imp_sqft = 0

            try:
                year = driver.find_element(By.XPATH,f'/html/body/div[4]/div[1]/div[4]/div[2]/table/tbody/tr[2]/td[5]').text
            except NoSuchElementException:
                year = 'Nan'

            try:
                bldg_class = driver.find_element(By.XPATH, f'/html/body/div[4]/div[1]/div[4]/div[2]/table/tbody/tr[2]/td[3]').text
            except NoSuchElementException:
                bldg_class = 'Nan'
        else:
            imp_sqft = 0
            year = 'Nan'
            bldg_class = 'Nan'

        
        lot_info = lot(prop_id, geo_id, name, appraised, land_value,land_size, improvement, imp_sqft, year, bldg_class)
        lots.append(lot_info)
        driver.back()
    finish = time.time()
    print(f"Took {round(finish-start,2)} seconds")




### Clean Data and Set Dtypes


In [18]:
# clean data through class methods
for entry in lots:
    entry.set_dtypes()

### Converting gathered data into csv export

In [19]:
def convert_to_csv():
    '''convert and export to csv for further data handling'''
    attribute_names = list(vars(lots[0]).keys())
    data = [{attr: getattr(lot, attr) for attr in attribute_names} for lot in lots]
    df = pd.DataFrame(data)
    df.to_csv(f'{year}_data.csv', index = False)

# Main aggregation of functions in main()

In [20]:
def main():
    '''invoke the main function to run all the other code'''
    collect(web_url,year)
    convert_to_csv()

In [21]:
if __name__ == '__main__':
    main()

Processing rows: 100%|█████████████████████████| 60/60 [01:18<00:00,  1.31s/row]

Took 78.79 seconds





# Testing Output Data

In [23]:
df = pd.read_csv(f'{year}_data.csv')

In [26]:
df.head(10)

Unnamed: 0,prop_id,geo_id,name,appraised,land_value,land_size,improved_value,imp_sqft,year,bldg_class,dollar_sqft
0,221853,4275-0000-0001-000,CASON GREGORY A & JONICA A,"$289,040","$289,040",17986.0,$0 (+),0,Nan,Nan,
1,221854,4275-0000-0003-000,CASON GREGORY A & JONICA A,"$770,000","$163,500",8175.0,"$606,500 (+)","Living Area: 2,577.00sqft",2016,BH14,
2,221855,4275-0000-0004-000,MORRIS GARRETT & JENNIFER,"$535,520","$161,660",8083.0,"$373,860 (+)","Living Area: 1,564.00sqft",2014,BH12,
3,221856,4275-0000-0005-000,COLLINS WILLIAM DAVID,"$769,960","$161,660",8083.0,"$608,300 (+)","Living Area: 2,209.00sqft",2014,BH14,
4,221859,4275-0000-0008-000,IVY KEITH,"$140,000","$140,000",9872.0,$0 (+),0,Nan,Nan,
5,221860,4275-0000-0009-000,STANDLEE JENNIFER SUTTON & PATRICIA BESSOLO,"$605,520","$38,710",5530.0,"$566,810 (+)","Living Area: 1,408.00sqft",2007,BH14,
6,221861,4275-0000-0010-000,STEPCHINSKI RANDY K & EMILY G,"$368,904","$38,710",5530.0,$0 (+),0,Nan,Nan,
7,221863,4275-0000-0012-000,ETOYAN MARINE,"$512,390","$30,420",5530.0,"$451,550 (+)","Living Area: 1,368.00sqft",2012,BH14,
8,221866,4275-0000-0015-000,WIMBERLY WARREN R & KEL LEE,"$349,560","$30,420",5530.0,"$319,140 (+)","Living Area: 1,216.00sqft",2016,BH12,
9,221867,4275-0000-0016-000,FERRETT FRANCES,"$30,420","$30,420",5530.0,$0 (+),0,Nan,Nan,


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   prop_id         60 non-null     int64  
 1   geo_id          60 non-null     object 
 2   name            60 non-null     object 
 3   appraised       60 non-null     object 
 4   land_value      60 non-null     object 
 5   land_size       60 non-null     object 
 6   improved_value  60 non-null     object 
 7   imp_sqft        60 non-null     object 
 8   year            59 non-null     object 
 9   bldg_class      60 non-null     object 
 10  dollar_sqft     0 non-null      float64
dtypes: float64(1), int64(1), object(9)
memory usage: 5.3+ KB
