In [1]:
from bs4 import BeautifulSoup
import time
import json
import pymongo
import requests

In [2]:
# Open a client against the MongoDB server on this machine (port 27017).
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [3]:
# Select the target database and collection by name.
db = client["housing_db"]
collection = db["zillow_listings"]

# Start from an empty collection so re-running the notebook does not
# accumulate duplicate listings from earlier runs.
collection.drop()

In [4]:
# Browser-like headers sent with every Zillow request; the intent is to keep
# the site from flagging the traffic as automated and answering with a captcha.
# NOTE(review): advertising 'br' presumes a Brotli decoder is available to
# requests/urllib3 in this environment -- confirm, otherwise drop it.
request_headers = dict([
    ('accept',
     'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
    ('accept-encoding', 'gzip, deflate, br'),
    ('accept-language', 'en-US,en;q=0.8'),
    ('upgrade-insecure-requests', '1'),
    ('user-agent',
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'),
])

# Scrape every page of the Johns Creek, GA search results into MongoDB.
#
# Zillow HTML contains a large script element with a JSON of every listing on
# the page. It includes all attributes needed, thereby eliminating the need to
# visit individual detail listing pages.

BASE_URL = 'https://www.zillow.com'
START_PATH = '/johns-creek-ga/'
REQUEST_TIMEOUT_SECONDS = 30   # fail instead of hanging forever on a stalled request
PAGE_DELAY_SECONDS = 1         # short pause between successive page requests

# Stored document field -> key it is read from in each Zillow listing dict.
LISTING_FIELDS = {
    "addrStreet": "addressStreet",
    "addrCity": "addressCity",
    "addrState": "addressState",
    "addrZip": "addressZipcode",
    "estPrice": "zestimate",
    "listPrice": "unformattedPrice",
    "noOfBeds": "beds",
    "noOfBaths": "baths",
    "sqFootage": "area",
    "listingUrl": "detailUrl",
}


def _strip_html_comment(text):
    """Return ``text`` without a surrounding ``<!-- ... -->`` wrapper.

    The markers are removed as literal prefix/suffix. ``str.strip("<!--")``
    would instead strip any run of the characters ``<``, ``!`` and ``-`` from
    both ends, which only works by accident.
    """
    text = text.strip()
    if text.startswith("<!--"):
        text = text[len("<!--"):]
    if text.endswith("-->"):
        text = text[:-len("-->")]
    return text


def _extract_listings(page_soup):
    """Return the list of listing dicts embedded in a parsed results page.

    Raises:
        RuntimeError: if the page does not contain the expected second
            ``<script type="application/json">`` element (for example when a
            captcha page was served or the page layout changed).
        KeyError / json.JSONDecodeError: if the embedded JSON does not have the
            ``cat1 -> searchResults -> listResults`` structure.
    """
    script_tags = page_soup.find_all("script", type="application/json")
    if len(script_tags) < 2 or not script_tags[1].contents:
        raise RuntimeError(
            "Listing data script not found on page "
            "(possibly a captcha page or a changed layout)"
        )
    payload = _strip_html_comment(script_tags[1].contents[0].string)
    return json.loads(payload)["cat1"]["searchResults"]["listResults"]


def _listing_to_document(listing):
    """Map one Zillow listing dict to the document stored in MongoDB.

    Fields absent from ``listing`` are stored as ``None`` so that a single
    incomplete listing does not abort the whole scrape with ``KeyError``.
    """
    document = {field: listing.get(source_key)
                for field, source_key in LISTING_FIELDS.items()}
    document["listingSource"] = "Zillow"
    return document


# Paths already requested; a "next" link pointing at one of them means the
# last page has been reached (or the pager loops), so paging stops.
visited_paths = {START_PATH}
next_path = START_PATH

# One session for the whole crawl so cookies and connections persist between
# page requests; the browser-like headers are attached to every request.
with requests.Session() as session:
    session.headers.update(request_headers)

    while next_path is not None:
        url = BASE_URL + next_path
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # surface HTTP errors instead of parsing an error page

        # Create BeautifulSoup object; parse with 'html.parser'
        soup = BeautifulSoup(response.content, 'html.parser')

        documents = [_listing_to_document(listing)
                     for listing in _extract_listings(soup)]
        if documents:  # insert_many rejects an empty list
            collection.insert_many(documents)

        # Find url extension for next page; the link may be missing entirely
        # or lack an href, in which case there is nothing further to fetch.
        next_link = soup.find('a', rel='next')
        candidate = next_link.get('href') if next_link is not None else None

        if not candidate or candidate in visited_paths:
            next_path = None
        else:
            visited_paths.add(candidate)
            next_path = candidate
            time.sleep(PAGE_DELAY_SECONDS)

