In [1]:
from bs4 import BeautifulSoup
import time
import json
import pymongo
import requests

In [2]:
# Connection string for the MongoDB server running on this machine
conn = 'mongodb://localhost:27017'
# Open a client against that server
client = pymongo.MongoClient(host=conn)

In [3]:
# Select the target database and collection (dictionary-style access)
db = client['housing_db']
collection = db['redfin_listings']

# Start from an empty collection: any previously stored documents are removed
collection.drop()

In [4]:
# Browser-like request headers sent with every request, so Redfin is less
# likely to flag the scraper as automated traffic and answer with a captcha.
# NOTE(review): advertising 'br' presumably requires brotli support in the
# installed HTTP stack to decode such responses -- confirm.
request_headers = {
    'accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# Create session and grab first page of listings
with requests.Session() as session:

    url = 'https://www.redfin.com/city/33537/GA/Johns-Creek'
    response = session.get(url, headers=request_headers, timeout=REQUEST_TIMEOUT)
    # Every later cell depends on this page, so stop here on an HTTP error
    # status instead of silently parsing an error page.
    response.raise_for_status()

# Create BeautifulSoup object; parse with 'html.parser'
soup = BeautifulSoup(response.content, 'html.parser')

# Find the list of direct page buttons (links to the other result pages)
next_pages = soup.find_all('a', class_='clickable goToPage')




In [5]:
# This block of code scrapes all listings from the initial page.
# The code block below repeats most of this code in a loop for the subsequent pages.

# Seconds to wait for a detail page before giving up; without a timeout a
# stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# Find all script elements of type JSON-LD on the results page
listing_jsons = soup.find_all('script', type='application/ld+json')

# One session for every detail request so the connection is reused
with requests.Session() as session:
    # The first JSON-LD element is skipped; parsing starts at the second one
    for listing in listing_jsons[1:]:
        try:
            # Note: On the redfin main page, listings are lists of dictionaries.
            # Elements intermixed that aren't subscriptable that way are not
            # listings; usually they are ads, etc.
            # Decoding happens inside the try so one malformed or empty script
            # tag skips that element instead of aborting the whole loop.
            json_dtl = json.loads(listing.contents[0])[0]
            address = json_dtl['address']
            street = address['streetAddress']
            city = address['addressLocality']
            state = address['addressRegion']
            zipcode = address['postalCode']
            listing_url = 'https://www.redfin.com' + json_dtl['url']

            # Visit listing detail page to gather additional information.
            # Separate names are used so the results-page `soup`/`response`
            # are not overwritten (keeps this cell safe to re-run).
            detail_response = session.get(listing_url, headers=request_headers,
                                          timeout=REQUEST_TIMEOUT)
            detail_response.raise_for_status()
            detail_soup = BeautifulSoup(detail_response.content, 'html.parser')

            # Prices are read from fixed positions (9 and 11) among the
            # right-aligned detail spans; this depends on the page layout.
            detail_data = detail_soup.find_all('span', class_='content text-right')
            list_price = float(detail_data[9].contents[0].strip('$,').replace(',', ''))
            est_price = float(detail_data[11].contents[0].strip('$,').replace(',', ''))

            # Square footage is the first <span class="statsValue">
            span_stats = detail_soup.find_all('span', class_='statsValue')
            area = float(span_stats[0].contents[0].replace(',', ''))

            # Beds and baths are the second and third <div class="statsValue">
            div_stats = detail_soup.find_all('div', class_='statsValue')
            beds = float(div_stats[1].contents[0])
            baths = float(div_stats[2].contents[0])

            # Write record to mongoDB collection
            list_dict = {"addrStreet": street,
                         "addrCity": city,
                         "addrState": state,
                         "addrZip": zipcode,
                         "estPrice": est_price,
                         "listPrice": list_price,
                         "noOfBeds": beds,
                         "noOfBaths": baths,
                         "sqFootage": area,
                         "listingUrl": listing_url,
                         "listingSource": "Redfin"}
            collection.insert_one(list_dict)

        except requests.RequestException as err:
            # Network/HTTP failures are reported as such, not as "not a listing"
            print('request failed:', listing_url, err)
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # Only structure/parsing failures mean "not a listing"; database
            # errors and KeyboardInterrupt are no longer swallowed.
            print('not a listing')

        


not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing


In [6]:
# Scrape all listings from remaining pages

# Seconds to wait for any page before giving up; without a timeout a
# stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# One session for all result pages and detail pages so the connection is reused
with requests.Session() as session:
    for page in next_pages:
        url = 'https://www.redfin.com' + page['href']
        print(url)

        # Fetch the results page; a failed page is reported and skipped so the
        # remaining pages are still scraped.
        try:
            response = session.get(url, headers=request_headers,
                                   timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            print('request failed:', url, err)
            continue

        # Create BeautifulSoup object; parse with 'html.parser'
        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all script elements of type JSON-LD on this results page
        listing_jsons = soup.find_all('script', type='application/ld+json')

        # The first JSON-LD element is skipped; parsing starts at the second one
        for listing in listing_jsons[1:]:
            try:
                # Note: On the redfin main page, listings are lists of dictionaries.
                # Elements intermixed that aren't subscriptable that way are not
                # listings; usually they are ads, etc.
                # Decoding happens inside the try so one malformed or empty
                # script tag skips that element instead of aborting the loop.
                json_dtl = json.loads(listing.contents[0])[0]
                address = json_dtl['address']
                street = address['streetAddress']
                city = address['addressLocality']
                state = address['addressRegion']
                zipcode = address['postalCode']
                listing_url = 'https://www.redfin.com' + json_dtl['url']

                # Visit listing detail page to gather additional information.
                # Separate names keep the results-page `soup`/`response` intact.
                detail_response = session.get(listing_url, headers=request_headers,
                                              timeout=REQUEST_TIMEOUT)
                detail_response.raise_for_status()
                detail_soup = BeautifulSoup(detail_response.content, 'html.parser')

                # Prices are read from fixed positions (9 and 11) among the
                # right-aligned detail spans; this depends on the page layout.
                detail_data = detail_soup.find_all('span', class_='content text-right')
                list_price = float(detail_data[9].contents[0].strip('$,').replace(',', ''))
                est_price = float(detail_data[11].contents[0].strip('$,').replace(',', ''))

                # Square footage is the first <span class="statsValue">
                span_stats = detail_soup.find_all('span', class_='statsValue')
                area = float(span_stats[0].contents[0].replace(',', ''))

                # Beds and baths are the second and third <div class="statsValue">
                div_stats = detail_soup.find_all('div', class_='statsValue')
                beds = float(div_stats[1].contents[0])
                baths = float(div_stats[2].contents[0])

                # Write record to mongoDB collection
                list_dict = {"addrStreet": street,
                             "addrCity": city,
                             "addrState": state,
                             "addrZip": zipcode,
                             "estPrice": est_price,
                             "listPrice": list_price,
                             "noOfBeds": beds,
                             "noOfBaths": baths,
                             "sqFootage": area,
                             "listingUrl": listing_url,
                             "listingSource": "Redfin"}
                collection.insert_one(list_dict)

            except requests.RequestException as err:
                # Network/HTTP failures are reported as such, not as "not a listing"
                print('request failed:', listing_url, err)
            except (KeyError, IndexError, TypeError, ValueError, AttributeError):
                # Only structure/parsing failures mean "not a listing"; database
                # errors and KeyboardInterrupt are no longer swallowed.
                print('not a listing')


https://www.redfin.com/city/33537/GA/Johns-Creek/page-2
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
https://www.redfin.com/city/33537/GA/Johns-Creek/page-3
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
not a listing
