In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [2]:
# Start with two empty frames: zillow_df receives one row of scraped fields per address,
# address_df holds the address strings to search for.
zillow_df = pd.DataFrame(
    columns=[
        "SITE_ADDRE", "LINK",
        "BEDS", "BATH", "SQFT", "PARKING",
        "WALK_SCORE", "TRANSIT_SCORE", "BIKE_SCORE",
        "SCHOOL_1_NAME", "SCHOOL_1_SCORE",
        "SCHOOL_2_NAME", "SCHOOL_2_SCORE",
        "SCHOOL_3_NAME", "SCHOOL_3_SCORE",
    ]
)
address_df = pd.DataFrame(columns=["ADDRESS"])

In [3]:
# Build one "street address + city" search string per row of the Wake County dataset
wake_df = pd.read_csv("resources/WAKE_COUNTY_HOUSING_DATA.csv")

# fillna("") runs before astype(str) so a missing street address stays empty instead of
# becoming the literal text "nan". The regex collapses every run of whitespace (not just
# one pair of spaces) and strip() removes the trailing space left when CITY is missing.
address_df["ADDRESS"] = (
    wake_df["SITE_ADDRE"].fillna("").astype(str)
    + " "
    + wake_df["CITY"].fillna("").astype(str)
).str.replace(r"\s+", " ", regex=True).str.strip()

In [4]:
# Work on the first 14,000 addresses only (full dataset is ~360,000 rows)
address_df_part = address_df.head(14000)
address_df_part

Unnamed: 0,ADDRESS
0,2457 BERTIE DR RALEIGH
1,2848 PROVIDENCE RD RALEIGH
2,409 S LAKESIDE DR RALEIGH
3,540 MARSHBURN RD WENDELL
4,1612 BENNETT ST RALEIGH
...,...
13995,1217 POPLAR AVE GARNER
13996,10326 BOYCE RD CREEDMOOR
13997,401 JONES FRANKLIN RD RALEIGH
13998,716 KIMBROUGH ST RALEIGH


In [5]:
# Install chromedriver through webdriver_manager and keep the returned path as the
# keyword arguments that are later unpacked into Browser(...)
executable_path = dict(executable_path=ChromeDriverManager().install())

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.81M/6.81M [00:00<00:00, 15.8MB/s]


In [6]:
# Scrape Zillow listing details for every address in address_df_part.
#
# Each address gets its own browser session (a fresh session per address is used to
# reduce captchas). One row is recorded per address; fields that cannot be found on the
# page are stored as None.


def _tag_text(tag):
    """Return tag.text, or None when the tag was not found."""
    return None if tag is None else tag.text


def _first_word(tag):
    """Return the first space-separated token of a tag's text, or None when the tag is missing."""
    text = _tag_text(tag)
    return None if text is None else text.split(" ", 1)[0]


def _text_between(markup, start_marker, end_marker="<"):
    """Return the text that follows start_marker in str(markup), up to the next end_marker.

    Returns None when markup is None or start_marker does not occur in it.
    """
    if markup is None:
        return None
    _, found, tail = str(markup).partition(start_marker)
    if not found:
        return None
    return tail.split(end_marker, 1)[0]


def _item_or_none(items, position):
    """Return items[position], or None when the list is too short."""
    return items[position] if position < len(items) else None


def _parse_listing(soup):
    """Extract the listing fields from a parsed Zillow page.

    Parameters:
        soup: BeautifulSoup document for one listing page.

    Returns:
        dict keyed by the zillow_df column names (everything except SITE_ADDRE and LINK).
        Each field is looked up independently, so one missing element no longer blanks
        the whole row; a field that cannot be found is None.
    """
    fields = dict.fromkeys([
        "BEDS", "BATH", "SQFT", "PARKING", "WALK_SCORE", "TRANSIT_SCORE", "BIKE_SCORE",
        "SCHOOL_1_NAME", "SCHOOL_1_SCORE", "SCHOOL_2_NAME", "SCHOOL_2_SCORE",
        "SCHOOL_3_NAME", "SCHOOL_3_SCORE",
    ])

    container = soup.find(id="detail-container-column")
    if container is None:
        # Detail column absent (e.g. no listing page was served): keep every field None.
        return fields

    # NOTE(review): previously displayed results show "-" for most BEDS values, so the
    # first token of the beds text may not always be the bedroom count - confirm.
    fields["BEDS"] = _first_word(container.find("div", {"data-testid": "property-card-beds"}))
    fields["BATH"] = _first_word(container.find("div", {"data-testid": "property-card-baths"}))

    size_div = container.find("div", {"data-testid": "property-card-property-size"})
    if size_div is not None:
        fields["SQFT"] = _tag_text(size_div.find("strong"))

    # Parking is read from the fifth feature span, as text between the first ">" and next "<".
    features_list = container.find_all(
        "span", {"class": "Text-c11n-8-84-0__sc-aiai24-0 dpf__sc-2arhs5-3 qxgaF kOlNqB"}
    )
    fields["PARKING"] = _text_between(_item_or_none(features_list, 4), ">")

    fields["WALK_SCORE"] = _tag_text(container.find("a", {"aria-describedby": "walk-score-text"}))
    fields["TRANSIT_SCORE"] = _tag_text(container.find("a", {"aria-describedby": "transit-score-text"}))
    fields["BIKE_SCORE"] = _tag_text(container.find("a", {"aria-describedby": "bike-score-text"}))

    # Up to three schools; names and scores are cut out of the element markup after the
    # same markers the notebook has always used.
    school_names = container.find_all("div", {"class": "hdp__sc-12m9m4o-2 htLBjq"})
    school_scores = container.find_all("div", {"class": "hdp__sc-12m9m4o-1 dDDZBk"})
    for position in range(3):
        prefix = "SCHOOL_" + str(position + 1)
        fields[prefix + "_NAME"] = _text_between(_item_or_none(school_names, position), 'blank">')
        fields[prefix + "_SCORE"] = _text_between(_item_or_none(school_scores, position), 'eIEmla">')

    return fields


# Rows are collected in a list and converted once; DataFrame.append per row is quadratic
# and no longer exists in pandas 2.x.
records = []
try:
    for address_to_search in address_df_part["ADDRESS"]:
        # Normalise whitespace, then build the Zillow search URL from the address
        address_to_search = " ".join(str(address_to_search).split())
        url = "https://www.zillow.com/homes/" + address_to_search.replace(" ", "-").lower()

        # Start remote browser
        browser = Browser('chrome', **executable_path, headless=False)
        try:
            browser.visit(url)

            # Wait 3 seconds for browser to load
            time.sleep(3)
            html = browser.html
        finally:
            # Always end the browser session, even when the visit fails or the cell is
            # interrupted, so no Chrome window is left behind.
            browser.quit()

        # Load html and soup using LXML parser
        record = {"SITE_ADDRE": address_to_search, "LINK": url}
        record.update(_parse_listing(BeautifulSoup(html, "lxml")))
        records.append(record)
finally:
    # Runs on normal completion and on interruption, so rows scraped so far are kept.
    if records:
        scraped = pd.DataFrame(records, columns=zillow_df.columns)
        zillow_df = scraped if zillow_df.empty else pd.concat([zillow_df, scraped], ignore_index=True)

KeyboardInterrupt: 

In [9]:
# Manual browser quit
# Fallback in case the scraping loop stopped without closing its browser window.
browser.quit()

In [7]:
# See how many rows were scraped
# .shape is (row count, column count); the loop records one row per address visited,
# including addresses whose fields could not be read.
zillow_df.shape

(37, 15)

In [8]:
# Only scraped 32 rows of data before unpassable captcha (says press and hold button but doesn't remove captcha)
# NOTE(review): zillow_df.shape reported 37 rows, so the "32" above may count only the
# rows that actually contain data - confirm.
zillow_df.head(37)

Unnamed: 0,SITE_ADDRE,LINK,BEDS,BATH,SQFT,PARKING,WALK_SCORE,TRANSIT_SCORE,BIKE_SCORE,SCHOOL_1_NAME,SCHOOL_1_SCORE,SCHOOL_2_NAME,SCHOOL_2_SCORE,SCHOOL_3_NAME,SCHOOL_3_SCORE
0,2457 BERTIE DR RALEIGH,https://www.zillow.com/homes/2457-bertie-dr-ra...,-,2.0,1828.0,0 spaces,20.0,41.0,23.0,Hunter Elementary School,5.0,Ligon Middle,4.0,William G Enloe High,7.0
1,2848 PROVIDENCE RD RALEIGH,https://www.zillow.com/homes/2848-providence-r...,-,1.0,1240.0,Garage,4.0,22.0,28.0,Walnut Creek Elementary School,3.0,West Lake Middle,3.0,Southeast Raleigh High,2.0
2,409 S LAKESIDE DR RALEIGH,https://www.zillow.com/homes/409-s-lakeside-dr...,-,2.0,1037.0,0 spaces,31.0,32.0,19.0,Adams Elementary,5.0,Lufkin Road Middle,5.0,Athens Drive High,5.0
3,540 MARSHBURN RD WENDELL,https://www.zillow.com/homes/540-marshburn-rd-...,,,,,,,,,,,,,
4,1612 BENNETT ST RALEIGH,https://www.zillow.com/homes/1612-bennett-st-r...,,,,,,,,,,,,,
5,8712 W LAKE CT RALEIGH,https://www.zillow.com/homes/8712-w-lake-ct-ra...,-,5.5,3770.0,Garage,19.0,0.0,10.0,Leesville Road Elementary,5.0,Leesville Road Middle,7.0,Leesville Road High,6.0
6,605 WOODLAND RD RALEIGH,https://www.zillow.com/homes/605-woodland-rd-r...,-,1.0,996.0,Garage,9.0,0.0,22.0,Smith Elementary,6.0,North Garner Middle,4.0,Garner High,3.0
7,209 DIXIE TRL RALEIGH,https://www.zillow.com/homes/209-dixie-trl-ral...,4,2.0,2174.0,0 spaces,68.0,44.0,84.0,Olds Elementary,6.0,Martin Middle,5.0,Needham Broughton High,6.0
8,6512 BRANDYWINE RD RALEIGH,https://www.zillow.com/homes/6512-brandywine-r...,,,,,,,,,,,,,
9,6508 BRANDYWINE RD RALEIGH,https://www.zillow.com/homes/6508-brandywine-r...,-,3.0,2389.0,0 spaces,12.0,0.0,23.0,Reedy Creek Elementary,2.0,Reedy Creek Middle,6.0,Athens Drive High,5.0


In [None]:
# Export to CSV without the index
# Not used yet
# index=False is required for the file to omit the index: to_csv writes the row index
# as an extra unnamed first column by default.
zillow_df.to_csv('resources/zillow_pieces/pt_01.csv', index=False)