In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from sqlalchemy import create_engine
# Load the variables defined in the .env file into this notebook's environment
load_dotenv()
# Read the Postgres connection settings from environment variables.
# os.getenv returns None for any variable that is not set.
# NOTE(review): DB_NAME is read here, but the connection URL built later in
# this notebook hard-codes the database name -- confirm which one is intended.

DB_HOST = os.getenv('DB_HOST')
DB_NAME = os.getenv('DB_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASS = os.getenv('DB_PASS')
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import re
import time 

In [2]:

# Load the tab-separated account file. Records are split on '\r', so the acct
# values arrive with a leading '\n' that a later cell strips off.
real_acct_df = pd.read_csv(
    '../real_estate_data/real_acct_owner/real_acct.txt',
    sep='\t',
    lineterminator='\r',
    header=0,
    encoding='latin1',
    low_memory=False,
)

In [3]:
# List every column of the raw frame so the ones worth keeping can be chosen
columns = real_acct_df.columns
columns

Index(['acct', 'yr', 'mailto', 'mail_addr_1', 'mail_addr_2', 'mail_city',
       'mail_state', 'mail_zip', 'mail_country', 'undeliverable', 'str_pfx',
       'str_num', 'str_num_sfx', 'str', 'str_sfx', 'str_sfx_dir', 'str_unit',
       'site_addr_1', 'site_addr_2', 'site_addr_3', 'state_class',
       'school_dist', 'map_facet', 'key_map', 'Neighborhood_Code',
       'Neighborhood_Grp', 'Market_Area_1', 'Market_Area_1_Dscr',
       'Market_Area_2', 'Market_Area_2_Dscr', 'econ_area', 'econ_bld_class',
       'center_code', 'yr_impr', 'yr_annexed', 'splt_dt', 'dsc_cd', 'nxt_bld',
       'bld_ar', 'land_ar', 'acreage', 'Cap_acct', 'shared_cad', 'land_val',
       'bld_val', 'x_features_val', 'ag_val', 'assessed_val', 'tot_appr_val',
       'tot_mkt_val', 'prior_land_val', 'prior_bld_val',
       'prior_x_features_val', 'prior_ag_val', 'prior_tot_appr_val',
       'prior_tot_mkt_val', 'new_construction_val', 'tot_rcn_val',
       'value_status', 'noticed', 'notice_dt', 'protested', 'certif

In [4]:
# new_cols = ['acct', 'str_pfx', 'str_num',
#        'str_num_sfx', 'str', 'str_sfx', 'str_sfx_dir', 'str_unit',
#        'site_addr_1', 'site_addr_2', 'site_addr_3','bld_val', 'x_features_val', 'ag_val', 
#        'assessed_val', 'tot_appr_val',
#        'tot_mkt_val', 'prior_land_val', 'prior_bld_val',
#        'prior_x_features_val', 'prior_ag_val', 'prior_tot_appr_val',
#        'prior_tot_mkt_val', 'new_construction_val', 'tot_rcn_val',
#        'value_status', 'certified_date']

In [5]:
# Subset of raw-file columns that the rest of the notebook keeps
new_cols = [
    'acct',
    'str_num',
    'str_num_sfx',
    'str',
    'str_sfx',
    'str_sfx_dir',
    'str_unit',
    'site_addr_1',
    'site_addr_2',
    'site_addr_3',
    'bld_ar',
    'land_ar',
    'tot_appr_val',
    'tot_mkt_val',
]

In [6]:
# Narrow the frame to the columns named in new_cols, then preview the result
real_acct_df = real_acct_df.loc[:, new_cols]
real_acct_df.head()

Unnamed: 0,acct,str_num,str_num_sfx,str,str_sfx,str_sfx_dir,str_unit,site_addr_1,site_addr_2,site_addr_3,bld_ar,land_ar,tot_appr_val,tot_mkt_val
0,\n0010010000013,0.0,,COMMERCE,ST,,,0 COMMERCE ST,HOUSTON,77002,0.0,44431.0,0.0,0.0
1,\n0010020000001,907.0,,COMMERCE,ST,,,907 COMMERCE ST,HOUSTON,77002,0.0,5001.0,309120.0,309120.0
2,\n0010020000003,0.0,,COMMERCE,ST,,,0 COMMERCE ST,HOUSTON,77002,0.0,18121.0,889398.0,889398.0
3,\n0010020000004,0.0,,COMMERCE,ST,,,0 COMMERCE ST,HOUSTON,77002,0.0,9061.0,444699.0,444699.0
4,\n0010020000013,921.0,,COMMERCE,ST,,,921 COMMERCE ST,HOUSTON,77002,0.0,3001.0,0.0,0.0


In [7]:
# remove the \n from the account column
def clean(x):
    """Return ``x`` with every newline character removed.

    Args:
        x: A cell value, normally a string.

    Returns:
        The string without newline characters. Values that are not strings
        (for example a missing value such as NaN or None) are returned
        unchanged, so the function can be applied over a whole column
        without raising on empty cells.
    """
    if not isinstance(x, str):
        return x
    return x.replace('\n', '')

In [8]:
# Run clean() over every account number to drop the newline picked up at load time
real_acct_df['acct'] = real_acct_df['acct'].map(clean)

In [9]:
# Spot-check the first two rows after stripping the newline from acct
real_acct_df.head(2)

Unnamed: 0,acct,str_num,str_num_sfx,str,str_sfx,str_sfx_dir,str_unit,site_addr_1,site_addr_2,site_addr_3,bld_ar,land_ar,tot_appr_val,tot_mkt_val
0,10010000013,0.0,,COMMERCE,ST,,,0 COMMERCE ST,HOUSTON,77002,0.0,44431.0,0.0,0.0
1,10020000001,907.0,,COMMERCE,ST,,,907 COMMERCE ST,HOUSTON,77002,0.0,5001.0,309120.0,309120.0


In [10]:
# Connect to the local Postgres database; user, password and host come from
# the environment variables read at the top of the notebook.
engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:5432/Houston_real_estate')
# Replace the real_acct table with the cleaned frame (millions of rows).
# chunksize writes the rows in fixed-size batches instead of all at once,
# which keeps memory use bounded for a frame this large.
real_acct_df.to_sql('real_acct', engine, index=False, if_exists='replace', chunksize=50_000)

In [11]:
# Show the dtype of each column in the frame that was written to Postgres
real_acct_df.dtypes

acct             object
str_num         float64
str_num_sfx      object
str              object
str_sfx          object
str_sfx_dir      object
str_unit         object
site_addr_1      object
site_addr_2      object
site_addr_3      object
bld_ar          float64
land_ar         float64
tot_appr_val    float64
tot_mkt_val     float64
dtype: object

## Start Stan's web scraping from HAR

In [12]:
def fix_num(num):
    """Parse a float out of text by discarding everything except digits and periods.

    Args:
        num: Text containing a number, e.g. "$3,900,000" or "5,497 Sqft".

    Returns:
        The remaining characters converted with ``float``.

    Raises:
        ValueError: If what is left cannot be parsed (no digits at all, or
            more than one period).
        TypeError: If ``num`` is not a string.
    """
    # Raw-string pattern: '.' needs no backslash inside a character class, and
    # '\.' in a plain string literal is an invalid escape that newer Pythons
    # warn about.
    return float(re.sub(r'[^0-9.]', "", num))

def fix_num_int(num):
    """Parse an integer out of text such as "MLS# 62981339".

    Everything except digits and periods is discarded. Periods at either end
    are treated as punctuation and dropped; if a decimal point remains, the
    value is truncated toward zero. Text without any period is converted
    directly with ``int``, so large identifiers keep every digit.

    Args:
        num: Text containing a number.

    Returns:
        The parsed value as an ``int``.

    Raises:
        ValueError: If no digits are left, or more than one period remains.
        TypeError: If ``num`` is not a string.
    """
    cleaned = re.sub(r'[^0-9.]', "", num).strip('.')
    if '.' in cleaned:
        # int() cannot parse a decimal point, so go through float() first
        return int(float(cleaned))
    return int(cleaned)

In [26]:
# Read the Harris County zip codes and keep them as a list of strings
zip_code_data = pd.read_csv("harris_county_zip_codes.csv")
zip_code_list = [str(code) for code in zip_code_data['zip_code']]

In [27]:
# Open a visible (non-headless) Chrome window driven through splinter.
# ChromeDriverManager().install() supplies the path of the chromedriver binary.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



INFO:WDM:

Current google-chrome version is 94.0.4606
INFO:WDM:Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
INFO:WDM:Get LATEST driver version for 94.0.4606
Driver [/Users/henrywycislo/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache
INFO:WDM:Driver [/Users/henrywycislo/.wdm/drivers/chromedriver/mac64/94.0.4606.61/chromedriver] found in cache


In [None]:
# Empty frame that receives one row per scraped listing
har_df = pd.DataFrame(
    columns=['mls', 'address', 'bld_sft', 'lot_sft', 'price', 'zip']
)
# Patterns matching the label text whose preceding sibling element holds the
# building / lot square-footage value
bldgsft_pattern = re.compile(r'Building Sqft')
lotsft_pattern = re.compile(r'Lot Sqft')

#define a subroutine to extract data from house structure and append it to a dataframe
def extract_from_house(house, zip_code=None):
    """Pull one listing's fields out of its markup and add them to ``har_df``.

    Args:
        house: Parsed element for a single listing (one of the
            ``div.prop_item`` elements collected by the scraping loop).
        zip_code: Zip code to record for the listing. When omitted, the
            module-level ``current_zip`` set by the scraping loop is used.

    Side effects:
        Rebinds the global ``har_df`` to a new frame with one extra row.

    Raises:
        AttributeError: If the price, address or MLS element is missing.
        ValueError: If the price or MLS text contains no parseable number.
    """
    global har_df
    asking_price = fix_num(house.find('div', class_="price").text)
    bldg_sft = _sqft_or_zero(house, bldgsft_pattern)
    lot_sft = _sqft_or_zero(house, lotsft_pattern)
    addr = house.find('a', class_="address").text
    mls_no = fix_num_int(house.find('div', class_="mpi_mls").text)
    property_zip = current_zip if zip_code is None else zip_code
    listing = {
        'mls': mls_no,
        'address': addr,
        'bld_sft': bldg_sft,
        'lot_sft': lot_sft,
        'price': asking_price,
        'zip': property_zip,
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    har_df = pd.concat([har_df, pd.DataFrame([listing])], ignore_index=True)


def _sqft_or_zero(house, label_pattern):
    """Return the square footage displayed next to ``label_pattern``, or 0.

    0 is returned when the features block, the label, or the element before
    the label is missing, or when that element's text holds no number.
    """
    try:
        label = house.find('div', class_="mp_features").find(text=label_pattern)
        return fix_num(label.previous_sibling.text)
    except (AttributeError, ValueError):
        return 0

# cycle through all Harris county zip codes
for current_zip in zip_code_list:
    # pause between zip codes before requesting the next search page
    time.sleep(10)
    print(current_zip)
    #define starting url for current zip code
    url=f'https://www.har.com/zipcode_{current_zip}/realestate/for_sale'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    #Scrape the houses for the current search page
    houses=soup.find_all('div', class_="prop_item")

    for house in houses:
        extract_from_house(house)

    #figure out how many pages we will have to click through;
    #a results page without a pagination list only has the page already scraped
    navigation=soup.find_all('div', class_="pagination")
    if navigation and navigation[0].ul is not None:
        number_of_pages=len(navigation[0].ul.find_all('li'))
    else:
        number_of_pages=1

    #click through pages 2 through the last page
    for i in range(2, number_of_pages+1):
        nextlink=browser.links.find_by_text(str(i))
        #no link carries this page number: nothing further to visit for this
        #zip code, so move on instead of failing on an empty link list
        if len(nextlink)==0:
            break
        time.sleep(5)
        #figure out ambiguity to what is the next search page link
        if len(nextlink)==1:
            nextlink.click()
        else:
            nextlink[1].click()

        #extract houses from the current page and append to a data frame
        # NOTE(review): the page is read immediately after the click -- confirm
        # the new results have finished loading by then
        html = browser.html
        soup = bs(html, 'html.parser')
        houses=soup.find_all('div', class_="prop_item")
        for house in houses:
            extract_from_house(house)

#save scraped results for later use
har_df.to_csv('har_data.csv',index=False)

77002
77003
77004


In [14]:
# Parse the listings currently held in `houses` and record them under zip 77401.
# NOTE(review): this cell depends on `houses`, `har_df`, fix_num and fix_num_int
# already existing in the kernel from earlier cells.
current_zip='77401'
bldgsft_pattern = re.compile(r'Building Sqft')
lotsft_pattern = re.compile(r'Lot Sqft')
scraped_rows = []
for house in houses:
    asking_price=fix_num(house.find('div', class_="price").text)
    # the square-footage value is the element just before its label text
    bldg_sft=fix_num(house.find('div', class_="mp_features").find(text=bldgsft_pattern).previous_sibling.text)
    lot_sft=fix_num(house.find('div', class_="mp_features").find(text=lotsft_pattern).previous_sibling.text)
    addr=house.find('a', class_="address").text
    mls_no=fix_num_int(house.find('div', class_="mpi_mls").text)
    property_zip=current_zip
    scraped_rows.append({'mls' : mls_no, 'address' : addr, 'bld_sft' : bldg_sft, 'lot_sft': lot_sft, 'price': asking_price, 'zip': property_zip })

# DataFrame.append was removed in pandas 2.0. Collect the rows first and add
# them with a single pd.concat, so har_df only changes once every listing parsed.
if scraped_rows:
    har_df = pd.concat([har_df, pd.DataFrame(scraped_rows)], ignore_index=True)
    

In [15]:
# Display the scraped listings collected in har_df
har_df

Unnamed: 0,mls,address,bld_sft,lot_sft,price,zip
0,62981339,"5318 Valerie Street, Bellaire, TX 77401",5497.0,20475.0,3900000.0,77401
1,82184508,"220 Mulberry Lane, Bellaire, TX 77401",5869.0,34800.0,2550000.0,77401
2,11743323,"5301 Pine Street, Bellaire, TX 77401",6508.0,18900.0,2199000.0,77401
3,27145525,"4913 Elm Street, Bellaire, TX 77401",5367.0,12628.0,1995000.0,77401
4,41138611,"502 Chelsea Street, Bellaire, TX 77401",5403.0,9150.0,1995000.0,77401
5,49358906,"5308 Braeburn Drive, Bellaire, TX 77401",5659.0,20475.0,1950000.0,77401
6,35679760,"4900 Wedgewood Drive, Bellaire, TX 77401",5397.0,8750.0,1947000.0,77401
7,32194385,"4570 Elm Street, Bellaire, TX 77401",5318.0,8775.0,1899000.0,77401
8,3203955,"4819 Saxon Street, Bellaire, TX 77401",5806.0,9100.0,1875000.0,77401
9,87167029,"112 Marrakech Court, Bellaire, TX 77401",5140.0,7440.0,1845000.0,77401


In [16]:
# Write the scraped listings to the Postgres table "har", replacing it if present
har_df.to_sql(name='har', con=engine, if_exists='replace', index=False)

In [25]:
# Split each HAR address ("<number> <street>, <city>, ...") into separate
# columns so the listings can later be queried against HCAD.
#   str_num     - leading house number; NULL when the address does not start
#                 with a number, so one odd address no longer aborts the statement
#   street_name - text before the first comma with the house number removed
#   city        - text between the first and second comma
# street_name and city are trimmed so they carry no leading space.
# engine.begin() commits on success and works on SQLAlchemy 1.4 and 2.0
# (Engine.execute was removed in 2.0).
with engine.begin() as conn:
    conn.exec_driver_sql('''
drop table if exists har2;
create table har2 as
select mls,
       case when address ~ '^[0-9]{1,9} '
            then cast(substring(address from '^[0-9]+') as integer)
       end as str_num,
       trim(regexp_replace(split_part(address, ',', 1), '^[0-9]+ +', '')) as street_name,
       trim(split_part(address, ',', 2)) as city
from har;
''')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fde934064a8>

In [23]:
# Add the parsed-address columns to har. "if not exists" lets this cell be
# re-run without failing because the columns are already there.
# engine.begin() commits on success and works on SQLAlchemy 1.4 and 2.0
# (Engine.execute was removed in 2.0).
with engine.begin() as conn:
    conn.exec_driver_sql('''
alter table har
add column if not exists house_number integer,
add column if not exists str_sfx text,
add column if not exists city text;
''')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fde93406128>

In [24]:
# Copy the parsed address parts from har2 into har, matching rows on mls.
# NOTE(review): the column named str_sfx receives har2.street_name -- confirm
# the column name is intended.
# engine.begin() commits on success and works on SQLAlchemy 1.4 and 2.0
# (Engine.execute was removed in 2.0).
with engine.begin() as conn:
    conn.exec_driver_sql('''
update har
set house_number = har2.str_num,
str_sfx=har2.street_name,
city=har2.city
from har2
where har.mls = har2.mls;
''')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x7fde93406320>

In [None]:
# End the browser session and close the Chrome window opened for scraping
browser.quit()