In [12]:
import datetime
import os
import re

import pandas as pd
import requests
import usaddress
from bs4 import BeautifulSoup

In [13]:
# Prefix of every search URL; the location criteria and a '/pg-N' page suffix
# are appended to it when a results page is requested.
URL_BASE = 'https://www.realtor.com/realestateandhomes-search/'

In [14]:
# Captures one run of digits, dots and commas (e.g. "1,250,000" or "2.5").
# NOTE: a run made only of punctuation, such as the "." in "Est.", matches too.
NUMBER_RE = re.compile(r'([0-9.,]+)')

In [15]:
def convert_int(s):
    """Parse an integer from text that may contain commas (e.g. "1,234")."""
    without_commas = "".join(s.split(","))
    return int(without_commas)

def convert_float(s):
    """Parse a float from text that may contain commas (e.g. "1,234.5")."""
    kept_chars = [ch for ch in s if ch != ","]
    return float("".join(kept_chars))

In [16]:
def append_tag(tag, row, extract_number=False, convert=lambda s: s):
    """Append one value taken from ``tag`` to ``row``.

    Args:
        tag: Object exposing ``get_text()``, or a falsy value when the element
            was not found.
        row: List that receives exactly one new item.
        extract_number: When true, append the first number found in the tag's
            text (passed through ``convert``) instead of the raw text.
        convert: Callable applied to the matched number string.

    ``None`` is appended when the tag is missing, when its text holds no
    number, or when ``convert`` rejects the candidate with ``ValueError``.
    The last two cases also print a ``Not Match`` diagnostic.
    """
    if not tag:
        row.append(None)
        return

    txt = tag.get_text()
    if not extract_number:
        row.append(txt)
        return

    # NUMBER_RE also matches punctuation-only runs (the "." in "Est. $2,000"),
    # so use the first candidate that actually contains a digit.
    candidate = None
    for match in re.finditer(NUMBER_RE, txt):
        if any(ch.isdigit() for ch in match.group(1)):
            candidate = match.group(1)
            break

    if candidate is not None:
        try:
            value = convert(candidate)
        except ValueError:
            # Malformed token such as "1.2.3": report it as unmatched instead
            # of letting one bad card abort the whole scrape.
            pass
        else:
            row.append(value)
            return

    print(f'Not Match: {txt}')
    row.append(None)


In [17]:
def append_lotsize(tag, row):
    """Append the lot size found in ``tag`` to ``row``, expressed in acres.

    Args:
        tag: Object exposing ``get_text()``, or a falsy value when the element
            was not found.
        row: List that receives exactly one new item.

    When the text mentions "sqft" (case-insensitive) the number is converted
    from square feet to acres and rounded to two decimals; otherwise it is
    appended unchanged. ``None`` is appended when the tag is missing or no
    usable number is found (the latter also prints a ``Not Match`` diagnostic).
    """
    SQFT_PER_ACRE = 43560

    if not tag:
        row.append(None)
        return

    txt = tag.get_text()

    # NUMBER_RE also matches punctuation-only runs (e.g. the "." in "approx."),
    # so use the first candidate that actually contains a digit.
    candidate = None
    for match in re.finditer(NUMBER_RE, txt):
        if any(ch.isdigit() for ch in match.group(1)):
            candidate = match.group(1)
            break

    num = None
    if candidate is not None:
        try:
            num = convert_float(candidate)
        except ValueError:
            # Malformed token such as "1.2.3": fall through to "Not Match"
            # instead of aborting the whole scrape.
            num = None

    if num is None:
        print(f'Not Match: {txt}')
        row.append(None)
        return

    if 'sqft' in txt.lower():
        num = round(num / SQFT_PER_ACRE, 2)
    row.append(num)

In [18]:
def split_address(address_str, row):
    """Append the ZIP code and the state parsed from ``address_str`` to ``row``.

    Args:
        address_str: Free-form address text handed to ``usaddress.parse``.
        row: List that receives exactly two new items: the ZIP code as an
            ``int`` (``None`` when absent) followed by the state token
            (``'N/A'`` when absent).
    """
    # usaddress.parse yields (token, label) pairs; when a label occurs more
    # than once the last token wins.
    parsed = {label: token for (token, label) in usaddress.parse(address_str)}

    # Keep only the leading digit run so "02139-4307" or "02139," does not
    # make int() raise.
    # NOTE(review): storing the ZIP as an int drops leading zeros
    # ("02139" -> 2139); kept as-is for compatibility with existing output.
    zip_digits = re.search(r'\d+', parsed.get('ZipCode', ''))
    row.append(int(zip_digits.group(0)) if zip_digits else None)

    row.append(parsed.get('StateName', 'N/A'))

Schema:
1. Price
2. Zip Code
3. State
4. Number of beds
5. Number of baths
6. House Area (sq.ft.)
7. Lot Size (acres)


In [19]:
# Column names for the scraped rows, listed in the order the values are
# appended to each row.
header_columns = [
    "price",
    "zip",
    "state",
    "bedrooms",
    "bathrooms",
    "house_area",
    "lot_size",
]

In [20]:
# The ZenRows API key is read from the ZENROWS_API_KEY environment variable so
# that it never lives in the notebook.
# NOTE: the key that used to be hardcoded here was stored in plain text and
# should be rotated.
ZENROWS_API_KEY = os.environ.get("ZENROWS_API_KEY", "")
if not ZENROWS_API_KEY:
    print("ZENROWS_API_KEY is not set; requests sent through the proxy will not be authenticated")

# The key is the proxy user name; the password part is left empty.
proxy = f"http://{ZENROWS_API_KEY}:@proxy.zenrows.com:8001"
proxies = {"http": proxy, "https": proxy}

In [29]:
# Number of consecutive failed requests tolerated before a location is abandoned.
MAX_RETRIES = 3


def _parse_property_card(card):
    """Build one output row from a property card, or None if it has no summary."""
    summary_tag = card.find('div', class_='summary-wrap')
    if not summary_tag:
        return None

    price_tag = summary_tag.find('span', {'data-label': 'pc-price'})
    address_tag = summary_tag.find('div', {'data-label': 'pc-address'})
    beds_tag = summary_tag.find('li', {'data-label': 'pc-meta-beds'})
    baths_tag = summary_tag.find('li', {'data-label': 'pc-meta-baths'})
    sqft_tag = summary_tag.find('li', {'data-label': 'pc-meta-sqft'})
    lotsize_tag = summary_tag.find('li', {'data-label': 'pc-meta-sqftlot'})

    row = []
    # Price
    append_tag(price_tag, row, extract_number=True, convert=convert_float)
    # Zip code and state
    if address_tag:
        split_address(address_tag.get_text(), row)
    else:
        # Same placeholders split_address uses for a missing zip / state, so
        # the row keeps its column alignment.
        row.extend([None, 'N/A'])
    # Beds
    append_tag(beds_tag, row, extract_number=True, convert=convert_int)
    # Baths
    append_tag(baths_tag, row, extract_number=True, convert=convert_float)
    # House area
    append_tag(sqft_tag, row, extract_number=True, convert=convert_float)
    # Lot size
    append_lotsize(lotsize_tag, row)
    return row


def process_location_zenrows(criteria, num_pages=2):
    """Scrape up to ``num_pages`` result pages for one search location.

    Args:
        criteria: Location slug appended to ``URL_BASE`` (e.g. "New-York").
            Surrounding whitespace, such as the newline left by
            ``readlines()``, is removed.
        num_pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of rows, one per property card that has a summary section.
        Each row holds price, zip, state, beds, baths, house area and lot size.

    Scraping of the location stops early when a page answers 404 or when
    ``MAX_RETRIES`` consecutive requests return any other non-OK status.
    """
    # A trailing newline would otherwise end up inside the URL and the log.
    criteria = criteria.strip()
    ret = []
    pg = 1
    attempt = 1
    while pg <= num_pages:
        print(f'Processing {criteria}, Page #{pg}')
        url = URL_BASE + criteria + f'/pg-{pg}'
        # Certificate verification is switched off for the proxied request.
        # NOTE(review): presumably required by the proxy - confirm.
        response = requests.get(url, proxies=proxies, verify=False)
        if response.status_code == requests.codes.ok:
            attempt = 1
            soup = BeautifulSoup(response.text, 'html.parser')
            for card in soup.find_all('div', attrs={'data-label': 'property-card'}):
                row = _parse_property_card(card)
                if row is not None:
                    ret.append(row)
            pg += 1
        elif response.status_code == requests.codes.not_found:
            # Retrying the same URL cannot succeed; without this break the
            # loop would request the missing page forever.
            print("NOT_FOUND")
            break
        else:
            print(f'Got {response.status_code} on attempt {attempt} ...')
            attempt += 1
            if attempt > MAX_RETRIES:
                break
    return ret


In [30]:
if __name__ == '__main__':
    # One location slug per line; drop the trailing newline and skip blank
    # lines so they are not scraped as bogus locations.
    with open('locations.txt') as lf:
        locs = [line.strip() for line in lf if line.strip()]

    # Gather every row first and build the frame once, instead of growing a
    # DataFrame with pd.concat on every iteration.
    records = []
    for location in locs:
        records.extend(process_location_zenrows(location, num_pages=5))
    final_df = pd.DataFrame(data=records, columns=header_columns)

Processing Massachusetts
, Page #1




Processing Massachusetts
, Page #2




Processing Massachusetts
, Page #3




Processing Massachusetts
, Page #4




Processing Massachusetts
, Page #5




Processing New-York
, Page #1




Got 422 on attempt 1 ...
Processing New-York
, Page #1




Processing New-York
, Page #2




Not Match: Studio
Processing New-York
, Page #3




Processing New-York
, Page #4




Got 422 on attempt 1 ...
Processing New-York
, Page #4




Not Match: Studio
Not Match: Studio
Processing New-York
, Page #5




Processing New-Jersey
, Page #1




Got 422 on attempt 1 ...
Processing New-Jersey
, Page #1




Processing New-Jersey
, Page #2




Processing New-Jersey
, Page #3




Processing New-Jersey
, Page #4




Got 422 on attempt 1 ...
Processing New-Jersey
, Page #4




Processing New-Jersey
, Page #5




Got 422 on attempt 1 ...
Processing New-Jersey
, Page #5




Got 422 on attempt 2 ...
Processing New-Jersey
, Page #5




Processing Connecticut
, Page #1




Got 422 on attempt 1 ...
Processing Connecticut
, Page #1




Processing Connecticut
, Page #2




Processing Connecticut
, Page #3




Got 422 on attempt 1 ...
Processing Connecticut
, Page #3




Processing Connecticut
, Page #4




Got 422 on attempt 1 ...
Processing Connecticut
, Page #4




Processing Connecticut
, Page #5




Got 422 on attempt 1 ...
Processing Connecticut
, Page #5




Processing Rhode-Island
, Page #1




Processing Rhode-Island
, Page #2




Processing Rhode-Island
, Page #3




Got 422 on attempt 1 ...
Processing Rhode-Island
, Page #3




Processing Rhode-Island
, Page #4




Processing Rhode-Island
, Page #5




Processing New-Hampshire
, Page #1




Got 422 on attempt 1 ...
Processing New-Hampshire
, Page #1




Got 422 on attempt 2 ...
Processing New-Hampshire
, Page #1




Not Match: Studio
Processing New-Hampshire
, Page #2




Processing New-Hampshire
, Page #3




Got 422 on attempt 1 ...
Processing New-Hampshire
, Page #3




Got 422 on attempt 2 ...
Processing New-Hampshire
, Page #3




Processing New-Hampshire
, Page #4




Processing New-Hampshire
, Page #5




Got 422 on attempt 1 ...
Processing New-Hampshire
, Page #5




In [31]:
# Name the export after the current local time so separate runs get separate files.
timestamp = datetime.datetime.now()
filename = timestamp.strftime('%Y%m%d-%H%M%S')
final_df.to_csv(f'{filename}.csv', index=False)