# Scrape Realtor.com Listings

In [1]:
import pandas as pd
import requests
from time import sleep
import concurrent.futures
import json
from datetime import datetime
import re

## Setup Headers and Other Constants

In [2]:
URL = "https://www.realtor.com/api/v1/hulk_main_srp"

QUERYSTRING = {"client_id": "rdc-x", "schema": "vesta"}

HEADERS = {
    "authority": "www.realtor.com",
    "accept": "application/json",
    "accept-language": "en-US,en;q=0.5",
    "content-type": "application/json",
    "origin": "https://www.realtor.com",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "sec-gpc": "1",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
}

QUERY = """
    query ConsumerSearchMainQuery(
        $query: HomeSearchCriteria!
        $limit: Int
        $offset: Int
        $sort: [SearchAPISort]
        $sort_type: SearchSortType
        $client_data: JSON
        $bucket: SearchAPIBucket
        ) {
        home_search: home_search(
            query: $query
            sort: $sort
            limit: $limit
            offset: $offset
            sort_type: $sort_type
            client_data: $client_data
            bucket: $bucket
        ) {
            count
            total
            results {
            property_id
            list_price
            primary
            rent_to_own {
                rent
                right_to_purchase
                provider
            }
            primary_photo(https: true) {
                href
            }
            source {
                id
                agents {
                office_name
                }
                type
                spec_id
                plan_id
            }
            community {
                property_id
                permalink
                description {
                name
                }
                advertisers {
                office {
                    hours
                    phones {
                    type
                    number
                    }
                }
                builder {
                    fulfillment_id
                }
                }
            }
            products {
                brand_name
                products
            }
            listing_id
            matterport
            virtual_tours {
                href
                type
            }
            status
            permalink
            price_reduced_amount
            other_listings {
                rdc {
                listing_id
                status
                listing_key
                primary
                }
            }
            description {
                beds
                baths
                baths_full
                baths_half
                baths_1qtr
                baths_3qtr
                garage
                stories
                type
                sub_type
                lot_sqft
                sqft
                year_built
                sold_price
                sold_date
                name
            }
            location {
                street_view_url
                address {
                line
                postal_code
                state
                state_code
                city
                coordinate {
                    lat
                    lon
                }
                }
                county {
                name
                fips_code
                }
            }
            tax_record {
                public_record_id
            }
            lead_attributes {
                show_contact_an_agent
                opcity_lead_attributes {
                cashback_enabled
                flip_the_market_enabled
                }
                lead_type
                ready_connect_mortgage {
                show_contact_a_lender
                show_veterans_united
                }
            }
            open_houses {
                start_date
                end_date
                description
                methods
                time_zone
                dst
            }
            flags {
                is_coming_soon
                is_pending
                is_foreclosure
                is_contingent
                is_new_construction
                is_new_listing(days: 14)
                is_price_reduced(days: 30)
                is_plan
                is_subdivision
            }
            list_date
            last_update_date
            coming_soon_date
            photos(limit: 2, https: true) {
                href
            }
            tags
            branding {
                type
                photo
                name
            }
            }
        }
    }
"""

## Pull from Realtor

In [3]:
None and 1 < None

In [4]:
results = []

offset = 0
# total gets updated, just need a value greater than offset for first iteration
total = offset + 1

# for the purpose of this example, creating a max
max_results = 200

while offset < total:

    print(f"handling offset {offset} in a total of {total}      ", end='\r')

    payload = {
        "query": QUERY,
        "variables": {
            "query": {
                "status": ["for_sale", "ready_to_build"],
                "primary": True,
                "search_location": {"location": "Burke, VA"}
            },
            "limit": 42,
            "offset": offset,
            "sort_type": "relevant",
            "by_prop_type": ["home"]
        },
        "operationName": "ConsumerSearchMainQuery",
        "callfrom": "SRP",
        "nrQueryType": "MAIN_SRP",
        "isClient": True,
    }

    response = requests.request(
        "POST", URL, json=payload, headers=HEADERS, params=QUERYSTRING)

    if response.status_code != 200:
        raise ValueError(f"Bad status code on response: {response.status_code}")

    try:
        data = response.json()['data']['home_search']
    except:
        print("Failed to read data, something went wrong with the request")
        raise

    total = data['total']

    response_results = data['results']
    offset += len(response_results)

    results += response_results

    if max_results and offset >= max_results:
        print("\n")
        print(f"Hit max: {max_results}")
        break

print("Done!                                            ")

Done!                                            


## Parsing Data

We now have our listings data, my preference is to extract the parts I'm interested in and structure them within a pandas `DataFrame`

In [5]:
def parse_flags(flags):
    # flags example value
    # {'is_coming_soon': None, 'is_new_listing': False, 'is_price_reduced': None, 'is_foreclosure': None, 'is_new_construction': None, 'is_pending': True, 'is_contingent': None}

    status = []
    if flags.get('is_coming_soon') is True:
        status.append("coming soon")
    if flags.get('is_new_listing') is True:
        status.append("new listing")
    if flags.get('is_price_reduced') is True:
        status.append("price reduced")
    if flags.get('is_foreclosure') is True:
        status.append("foreclosure")
    if flags.get('is_new_construction') is True:
        status.append("new construction")
    if flags.get('is_pending') is True:
        status.append("pending")
    if flags.get('is_contingent') is True:
        status.append("contingent")

    return ", ".join(status)

In [6]:
df = pd.DataFrame()

for result in results:

    if result['location']['address'].get('coordinate'):
        lat = result['location']['address'].get('coordinate', {}).get('lat')
        lon = result['location']['address'].get('coordinate', {}).get('lon')
    else:
        lat, lon = None, None

    df = pd.concat([df, pd.DataFrame([{
            'id': result['property_id'],
            'list date': pd.to_datetime(result['list_date']),
            'status': result['status'],
            'flags': parse_flags(result['flags']),
            'home type': result['description']['type'],
            'year built': result['description']['year_built'],
            'price': result['list_price'],
            'hoa fee': result.get('hoa', {}).get('fee', None),
            'beds': result['description']['beds'],
            'baths': result['description']['baths'],
            'interior sqft': result['description']['sqft'],
            'lot sqft': result['description']['lot_sqft'],
            'address': f"{result['location']['address']['line']} {result['location']['address']['city']}, {result['location']['address']['state_code']} {result['location']['address']['postal_code']}",
            'addresss line': result['location']['address']['line'],
            'city': result['location']['address']['city'],
            'state': result['location']['address']['state_code'],
            'zipcode': result['location']['address']['postal_code'],
            'latitude': lat,
            'longitude': lon,
            'url': f"https://www.realtor.com/realestateandhomes-detail/{result['permalink']}",
        }])])

print(df.shape)
df.head()

(46, 20)


Unnamed: 0,id,list date,status,flags,home type,year built,price,hoa fee,beds,baths,interior sqft,lot sqft,address,addresss line,city,state,zipcode,latitude,longitude,url
0,6754890289,2023-11-18 18:11:12+00:00,for_sale,new listing,single_family,1985,830000,,4,3,2857,10338.0,"7598 Seabrook Ln Springfield, VA 22153",7598 Seabrook Ln,Springfield,VA,22153,38.74936,-77.25086,https://www.realtor.com/realestateandhomes-det...
0,6002022197,2023-11-18 05:14:23+00:00,for_sale,new listing,townhomes,1973,530000,,3,4,1408,1650.0,"6856 Dina Leigh Ct Springfield, VA 22153",6856 Dina Leigh Ct,Springfield,VA,22153,38.769756,-77.260306,https://www.realtor.com/realestateandhomes-det...
0,6992687417,2023-11-16 05:13:25+00:00,for_sale,new listing,single_family,1993,1399900,,5,5,5931,26609.0,"6486 Lake Meadow Dr Burke, VA 22015",6486 Lake Meadow Dr,Burke,VA,22015,38.778356,-77.289498,https://www.realtor.com/realestateandhomes-det...
0,5057900396,2023-11-17 02:04:50+00:00,for_sale,new listing,single_family,1977,850000,,4,4,3528,10498.0,"8902 Grass Valley Ct Springfield, VA 22153",8902 Grass Valley Ct,Springfield,VA,22153,38.759675,-77.254183,https://www.realtor.com/realestateandhomes-det...
0,6635525029,2023-11-10 05:15:07+00:00,for_sale,"new listing, contingent",condos,1981,372500,,3,2,1079,,"5804 Cove Landing Rd Apt 202 Burke, VA 22015",5804 Cove Landing Rd Apt 202,Burke,VA,22015,38.795661,-77.306284,https://www.realtor.com/realestateandhomes-det...


## Additional Data

From this point we can look towards incorporating additional information, what that information is depends on the end goal of this data.

### Get Wildfire and Flood Risk

Realtor.com has a Wildfire and Flood Risk scoring system that isn't included in their regular API. Let's incoporate it into our data.

In [5]:
def get_risks(property_id: str):
        headers = {
            "authority": "www.realtor.com",
            "accept": "*/*",
            "accept-language": "en-US,en;q=0.9",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "sec-gpc": "1",
            "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
        }

        url = "https://www.realtor.com/api/v1/hulk"

        querystring = {"client_id": "rdc-x", "schema": "vesta"}

        payload = {
            "query": """
            query GetLocalData($propertyId: ID!) {
                home(property_id: $propertyId) {
                    local {
                        flood {
                            fsid
                            flood_factor_score
                            flood_factor_severity
                            flood_cumulative_30
                            flood_trend
                            flood_trend_paragraph
                            fema_zone
                            firststreet_url
                            flood_insurance_text
                            environmental_risk
                            trend_direction
                            insurance_requirement
                            insurance_rates {
                            provider_logo
                            provider_url
                            providers
                            }
                            insurance_quotes {
                            provider_name
                            provider_url
                            provider_logo
                            expires
                            price
                            home_coverage
                            contents_coverage
                            disclaimer
                            }
                        }
                        wildfire {
                            fsid
                            fire_factor_score
                            fire_factor_severity
                            fire_cumulative_30
                            fire_trend
                            fire_trend_paragraph
                            usfs_relative_risk
                            firststreet_url
                            fire_insurance_text
                            insurance_rates {
                            provider_logo
                            provider_url
                            providers
                            }
                        }
                    }
                }
            }
            """,
            "variables": {"propertyId": property_id}
        }
        try:
            response = requests.request(
                "POST", url, json=payload, headers=headers, params=querystring)
            data = response.json()
        except:
            return 'ERROR'

        if data is None:
            return ''
        else:
            flood = data.get('data', {}).get('home', {}).get('local', {}).get('flood')
            wildfire = data.get('data', {}).get('home', {}).get('local', {}).get('wildfire')
            return {
                "flood": flood if flood else {},
                "wildfire": wildfire if wildfire else {}, 
            }

In [8]:
get_risks(6635525029)

{'flood': {'fsid': '512328756',
  'flood_factor_score': 1,
  'flood_factor_severity': 'minimal',
  'flood_cumulative_30': 'This property has no flood risk in the Flood Factor™ model.',
  'flood_trend': 'This property’s flood risk is not changing.',
  'flood_trend_paragraph': 'This property’s risk of flood is <b>not changing</b>.',
  'fema_zone': ['X (unshaded)'],
  'firststreet_url': 'https://riskfactor.com/property/5804-cove-landing-rd-burke-va-22015/512328756_fsid/flood?utm_source=realtor',
  'flood_insurance_text': 'As this property is located in FEMA Zone X (unshaded), flood insurance is not federally required to obtain a mortgage. You may want to purchase flood insurance to protect your home. Explore quotes for flood insurance from <b>$435</b> to <b>$917</b> per year.',
  'environmental_risk': 1,
  'trend_direction': None,
  'insurance_requirement': 'recommended',
  'insurance_rates': [{'provider_logo': 'https://assets.floodfactor.com/insurance/neptune_flood.png',
    'provider_ur

In [20]:
id_risk_dict = {}

for i, realtor_id in enumerate(df['id'].values, 1):

    print(f"handling {i} of {len(df['id'].values)}", end='\r')
    id_risk_dict[realtor_id] = get_risks(realtor_id)

pd.unique([type(x) for x in id_risk_dict.values()])

handling 45 of 45

  pd.unique([type(x) for x in id_risk_dict.values()])


array([<class 'dict'>], dtype=object)

That last line shows that all of the values are dicts, meaning that there weren't any error or empty response data

Also as you can see, these requests would strongly benefit from multithreading to increase the speed of data fetching.

### Adding Risks to Dataframe

In [21]:
df['Flood Risk'] = df['id'].apply(lambda x: id_risk_dict.get(x, {}).get('flood', {}).get('flood_factor_severity'))
df['Wildfire Risk'] = df['id'].apply(lambda x: id_risk_dict.get(x, {}).get('wildfire', {}).get('fire_factor_severity'))

df.head()

Unnamed: 0,id,list date,status,flags,home type,year built,price,hoa fee,beds,baths,...,address,addresss line,city,state,zipcode,latitude,longitude,url,Flood Risk,Wildfire Risk
0,6635525029,2023-11-10 05:15:07+00:00,for_sale,new listing,condos,1981,372500,,3,2,...,"5804 Cove Landing Rd Apt 202 Burke, VA 22015",5804 Cove Landing Rd Apt 202,Burke,VA,22015,38.795661,-77.306284,https://www.realtor.com/realestateandhomes-det...,minimal,Minor
0,5957870279,2023-11-09 19:17:11+00:00,for_sale,pending,single_family,1977,784900,,4,3,...,"9210 Rockefeller Ln Springfield, VA 22153",9210 Rockefeller Ln,Springfield,VA,22153,38.758991,-77.260197,https://www.realtor.com/realestateandhomes-det...,minimal,Minor
0,5959122647,2023-11-07 00:39:17+00:00,for_sale,"new listing, contingent",single_family,1978,649900,,4,3,...,"9814 Pebble Weigh Ct Burke, VA 22015",9814 Pebble Weigh Ct,Burke,VA,22015,38.793154,-77.259604,https://www.realtor.com/realestateandhomes-det...,minimal,Moderate
0,6494123889,2023-11-09 16:50:41+00:00,for_sale,new listing,townhomes,1980,560000,,3,3,...,"5701 Walnut Wood Ln Burke, VA 22015",5701 Walnut Wood Ln,Burke,VA,22015,38.79649,-77.295153,https://www.realtor.com/realestateandhomes-det...,minimal,Moderate
0,5157781713,2023-11-08 17:12:35+00:00,for_sale,pending,single_family,1977,769900,,4,3,...,"5624 Signal Point Ct Burke, VA 22015",5624 Signal Point Ct,Burke,VA,22015,38.797391,-77.249788,https://www.realtor.com/realestateandhomes-det...,minimal,Minor
