In [107]:
import requests
import time
import pandas as pd
import usaddress
import pickle
import os.path
import glob

In [142]:
# RapidAPI-hosted Zillow endpoints: the first is queried with search criteria
# (location, status, page), the second with a single property's `zpid`.
SEARCH_URL, DETAIL_URL = (
    "https://zillow-com1.p.rapidapi.com/propertyExtendedSearch",
    "https://zillow-com1.p.rapidapi.com/property",
)

In [109]:
# Headers sent with every RapidAPI call.
# The key is read from the RAPIDAPI_KEY environment variable so the secret is
# never stored in the notebook; export it before starting the kernel.
# Any key that was ever committed to this file should be treated as leaked
# and rotated.
API_HEADERS = {
    "X-RapidAPI-Key": os.environ.get("RAPIDAPI_KEY", ""),
    "X-RapidAPI-Host": "zillow-com1.p.rapidapi.com"
}

if not API_HEADERS["X-RapidAPI-Key"]:
    print("Warning: RAPIDAPI_KEY is not set; requests will be sent without an API key.")

In [110]:
# Number of failed attempts (non-200/non-429 status or network error) tolerated
# before giving up on a request.
MAX_RETRIES = 3
# Upper bound on HTTP 429 back-offs for one call, so a permanently throttled
# key cannot keep the loop spinning forever.
MAX_RATE_LIMIT_WAITS = 60
# Default number of seconds to wait on the server for a single attempt.
REQUEST_TIMEOUT = 30


def _warn_if_quota_low(response_headers):
    """Print a warning when under 10% of the quota reported in the headers is left."""
    try:
        limit = int(response_headers.get('X-RateLimit-Requests-Limit', -1))
        remaining = int(response_headers.get('X-RateLimit-Requests-Remaining', -1))
    except (TypeError, ValueError):
        # Malformed quota headers must not turn a successful call into a failure.
        return
    # -1 means "header missing"; 0 remaining is the case most worth reporting.
    if limit > 0 and remaining >= 0 and remaining / limit < 0.10:
        print(f'Warning: {remaining} out of {limit} requests remaining!')


def send_request(url, headers, params, timeout=REQUEST_TIMEOUT):
    """Send a GET request with retries and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        headers: HTTP headers to send.
        params: Query-string parameters.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        The parsed JSON payload of the first HTTP 200 response, or None when
        MAX_RETRIES attempts in a row failed or the request was throttled
        (HTTP 429) MAX_RATE_LIMIT_WAITS times.

    A 200 response whose body is not valid JSON propagates the error raised by
    `response.json()`.
    """
    failures = 0
    throttled = 0
    while failures < MAX_RETRIES and throttled < MAX_RATE_LIMIT_WAITS:
        try:
            response = requests.request(
                "GET", url, headers=headers, params=params, timeout=timeout
            )
        except requests.RequestException as err:
            # Connection errors and timeouts consume the same retry budget as
            # unexpected status codes.
            failures += 1
            print(f'Request to {url} failed: {err}')
            continue

        if response.status_code == requests.codes.ok:
            _warn_if_quota_low(response.headers)
            return response.json()

        if response.status_code == requests.codes.too_many_requests:
            # Throttling is not counted as a failure: wait briefly and start
            # the failure budget afresh, but only a bounded number of times.
            throttled += 1
            failures = 0
            time.sleep(0.5)
            continue

        failures += 1
    return None

In [148]:
# TEST CELL
# Sample query parameters for manually exercising the two endpoints:
# a rental search in Boston and a lookup of one listing by its zpid.
search_criteria = dict(
    location="Boston, MA",
    status_type="ForRent",
    home_type="Houses, Townhomes, Condos, Multi-Family, Apartments",
    page=1,
)

detail_params = dict(zpid="2099672322")

In [None]:
# Run the sample search and keep the decoded payload (None if the call failed).
query_response = send_request(url=SEARCH_URL, headers=API_HEADERS, params=search_criteria)

In [149]:
# TEST CELL
# Look up the sample listing described by `detail_params` (None if the call failed).
property_detail = send_request(url=DETAIL_URL, headers=API_HEADERS, params=detail_params)

In [128]:
def get_address_data(address_str):
    """Extract state, place name and ZIP code from a US address string.

    The address is tokenised with `usaddress.parse`; commas are stripped from
    the tokens tagged PlaceName, StateName and ZipCode.

    Args:
        address_str: Free-form address text.

    Returns:
        A `(state, place, zip_code)` tuple. `state` and `place` are the
        space-joined tokens of their tag, `zip_code` is the first ZipCode token
        (without any "-XXXX" extension) converted to int. Each element is None
        when the component is absent; `zip_code` is also None when the token is
        not numeric. Because the ZIP is an int, leading zeros are not kept
        (e.g. "02134" becomes 2134).
    """
    if not address_str:
        return None, None, None

    parsed = usaddress.parse(address_str)

    def _tokens(wanted_tag):
        return [val.replace(',', '') for (val, tag) in parsed if tag == wanted_tag]

    places = _tokens('PlaceName')
    states = _tokens('StateName')
    zip_codes = _tokens('ZipCode')

    r_place = ' '.join(places) if places else None
    r_state = ' '.join(states) if states else None

    r_zip = None
    if zip_codes:
        # Keep only the 5-digit part of a ZIP+4 code.
        base_zip = zip_codes[0].split('-')[0]
        try:
            r_zip = int(base_zip)
        except ValueError:
            # A mis-tagged or malformed token should drop this address, not
            # abort processing of the whole location.
            r_zip = None
    return r_state, r_place, r_zip


In [124]:
HEADER_COLUMNS = ['home_type', 'state', 'borough', 'zip', 'lot_size', 'house_area', 'bedrooms', 'bathrooms', 'price']
# Lot areas reported in square feet are converted to acres with this factor.
SQFT_PER_ACRE = 43560


def _property_to_row(prop):
    """Turn one search-result entry into a row ordered like HEADER_COLUMNS, or None to skip it."""
    # Address, living area and price are mandatory; entries without them are dropped.
    if 'address' not in prop or 'livingArea' not in prop or 'price' not in prop:
        return None
    state, borough, zip_code = get_address_data(prop['address'])
    if not (state and borough and zip_code):
        return None

    # The unit defaults to sqft when the entry does not state one; any other
    # unit is stored unchanged. A null area stays None instead of crashing
    # the division.
    lot_size = prop.get('lotAreaValue')
    if lot_size is not None and prop.get('lotAreaUnit', 'sqft') == 'sqft':
        lot_size = round(lot_size / SQFT_PER_ACRE, 4)

    return [
        prop.get('propertyType', 'SINGLE_FAMILY'),
        state,
        borough,
        zip_code,
        lot_size,
        prop['livingArea'],
        prop.get('bedrooms', 1),
        prop.get('bathrooms', 1),
        prop['price'],
    ]


def process_location(location, status_type):
    """Download every result page for a location and tabulate the listings.

    Args:
        location: Search location, e.g. 'Boston, MA'.
        status_type: Listing status passed to the search endpoint
            (the notebook uses 'ForSale' and 'ForRent').

    Returns:
        `(df, outfile)` where `df` is a DataFrame with HEADER_COLUMNS and
        `outfile` is a CSV file name derived from `location` (spaces removed,
        commas replaced by underscores). `(None, None)` is returned when the
        first request fails or no usable listing was found.

    If a later page cannot be fetched, a warning is printed and the rows
    collected so far are returned.
    """
    qstring = {
        'location': location,
        'status_type': status_type,
        'home_type': 'Houses, Townhomes, Condos, Multi-Family, Apartments',
        'page': 1
    }
    response = send_request(SEARCH_URL, API_HEADERS, params=qstring)
    if not response:
        # Always hand back a 2-tuple so callers can unpack the result.
        return None, None

    # A payload without these keys is treated as a single empty page
    # rather than raising KeyError.
    total_pages = response.get('totalPages') or 1
    print(f'Total pages: {total_pages}')

    rows = []
    pg = 1
    while True:
        print(f'Processing page {pg} of {total_pages} ...')
        properties = response.get('props') or []
        print(f'Properties in page: {len(properties)}')
        for prop in properties:
            row = _property_to_row(prop)
            if row is not None:
                rows.append(row)

        # Stop before requesting a page past the last one: every request
        # counts against the API quota.
        if pg >= total_pages:
            break
        pg += 1
        qstring['page'] = pg
        response = send_request(SEARCH_URL, API_HEADERS, params=qstring)
        if not response:
            print(f'Warning: page {pg} of {total_pages} could not be fetched; '
                  f'keeping the {len(rows)} rows collected so far.')
            break

    if not rows:
        return None, None
    outfile = location.replace(' ', '').replace(',', '_') + '.csv'
    return pd.DataFrame(data=rows, columns=HEADER_COLUMNS), outfile

In [125]:
def generate_data(statefile, inputfile, targetdir, status):
    """Fetch listings for every location in a query file, one CSV per location.

    Locations that were written successfully are remembered in a pickled set
    stored in `statefile`, so re-running the function skips them.

    Args:
        statefile: Path of the pickle file holding the set of finished locations.
        inputfile: Text file with one search location per line; blank lines
            are ignored.
        targetdir: Directory that receives the per-location CSV files; created
            if it does not exist.
        status: Listing status forwarded to `process_location`.

    Locations that yield no data or raise an error are reported and left out
    of the state, so they are attempted again on the next run.
    """
    state = set()
    if os.path.isfile(statefile):
        # NOTE: pickle.load can execute arbitrary code from the file; only use
        # state files written by this notebook.
        with open(statefile, 'rb') as f:
            state = pickle.load(f)

    with open(inputfile) as f:
        queries = [line.strip() for line in f]

    if targetdir:
        os.makedirs(targetdir, exist_ok=True)

    try:
        for q in queries:
            if not q:
                continue
            if q in state:
                print(f'Skipping location {q} as it is already processed!')
                continue
            print(f'Processing location: {q}')
            try:
                df, outf = process_location(q, status)
                if df is None:
                    print(f'No data retrieved for {q}; it will be retried on the next run.')
                    continue
                df.to_csv(os.path.join(targetdir, outf), index=False)
                state.add(q)
            except Exception as err:
                print(f'Error processing {q}: {err}')
    finally:
        # Persist progress even when the run is interrupted (e.g. the kernel is
        # stopped mid-download), so finished locations are not fetched again.
        with open(statefile, 'wb') as f:
            pickle.dump(state, f)

In [126]:
def merge_csv(sourcedir, targetfile):
    """Concatenate every CSV file in a directory into a single CSV.

    Args:
        sourcedir: Directory whose `*.csv` files are merged, in sorted
            file-name order.
        targetfile: Path of the merged CSV; its parent directory is created if
            needed. If it lies inside `sourcedir` it is not used as an input.

    The output starts with the HEADER_COLUMNS columns, followed by any extra
    columns found in the inputs. With no input files a header-only CSV is
    written.
    """
    target_path = os.path.abspath(targetfile)
    files = sorted(
        path for path in glob.glob(os.path.join(sourcedir, '*.csv'))
        if os.path.abspath(path) != target_path
    )
    frames = [pd.read_csv(path) for path in files]

    if frames:
        # One concat instead of repeated DataFrame.append, which was removed in
        # pandas 2.0 and copied the accumulated frame on every iteration.
        merged = pd.concat(frames, ignore_index=True)
        extras = [col for col in merged.columns if col not in HEADER_COLUMNS]
        merged = merged.reindex(columns=HEADER_COLUMNS + extras)
    else:
        merged = pd.DataFrame(columns=HEADER_COLUMNS)

    target_parent = os.path.dirname(targetfile)
    if target_parent:
        os.makedirs(target_parent, exist_ok=True)
    merged.to_csv(targetfile, index=False)

In [129]:
# Download for-sale listings for every location in queries.txt into data/sale.
generate_data(
    statefile='state-sale.bin',
    inputfile='queries.txt',
    targetdir='data/sale',
    status='ForSale',
)

Skipping location Bronx, NY as it is already processed!
Skipping location Brooklyn, NY as it is already processed!
Skipping location Manhattan, NY as it is already processed!
Skipping location Queens, NY as it is already processed!
Skipping location Staten Island, NY as it is already processed!
Skipping location Buffalo, NY as it is already processed!
Skipping location Rochester, NY as it is already processed!
Skipping location Yonkers, NY as it is already processed!
Skipping location Syracuse, NY as it is already processed!
Skipping location Albany, NY as it is already processed!
Skipping location New Rochelle, NY as it is already processed!
Skipping location Mount Vernon, NY as it is already processed!
Skipping location Schenectady, NY as it is already processed!
Skipping location Utica, NY as it is already processed!
Processing location: Newark, NJ
Total pages: 9
Processing page 1 of 9 ...
Properties in page: 40
Processing page 2 of 9 ...
Properties in page: 40
Processing page 3 of 

In [130]:
# Download rental listings for every location in queries.txt into data/rent.
generate_data(
    statefile='state-rent.bin',
    inputfile='queries.txt',
    targetdir='data/rent',
    status='ForRent',
)

Processing location: Bronx, NY
Total pages: 4
Processing page 1 of 4 ...
Properties in page: 40
Processing page 2 of 4 ...
Properties in page: 40
Processing page 3 of 4 ...
Properties in page: 40
Processing page 4 of 4 ...
Properties in page: 3
Processing location: Brooklyn, NY
Total pages: 3
Processing page 1 of 3 ...
Properties in page: 40
Processing page 2 of 3 ...
Properties in page: 40
Processing page 3 of 3 ...
Properties in page: 4
Processing location: Manhattan, NY
Total pages: 1
Processing page 1 of 1 ...
Properties in page: 24
Processing location: Queens, NY
Total pages: 6
Processing page 1 of 6 ...
Properties in page: 40
Processing page 2 of 6 ...
Properties in page: 40
Processing page 3 of 6 ...
Properties in page: 40
Processing page 4 of 6 ...
Properties in page: 40
Processing page 5 of 6 ...
Properties in page: 40
Processing page 6 of 6 ...
Properties in page: 25
Processing location: Staten Island, NY
Total pages: 2
Processing page 1 of 2 ...
Properties in page: 40
Proces

In [131]:
# Combine the per-location CSV files into one data set per listing type.
merge_csv(sourcedir='data/sale', targetfile='data/merged/sales-data.csv')
merge_csv(sourcedir='data/rent', targetfile='data/merged/rent-data.csv')

In [100]:
# Trial run against the short test query list; it shares the state file and
# output directory with the full rental run.
generate_data(
    statefile='state-rent.bin',
    inputfile='test_queries.txt',
    targetdir='data/rent',
    status='ForRent',
)

Processing location: Brooklyn, NY
Total pages: 3
Processing page 1 of 3 ...
Properties in page: 40
Processing page 2 of 3 ...
Properties in page: 40
Processing page 3 of 3 ...
Properties in page: 4
Processing location: Manhattan, NY
Total pages: 1
Processing page 1 of 1 ...
Properties in page: 24


In [84]:
# Manual check of a single location without touching the state file or disk.
dataframe, outfile = process_location(location='Boston, MA', status_type='ForSale')

Total pages: 20
Processing page 1 of 20 ...
Properties in page: 40
Processing page 2 of 20 ...
Properties in page: 40
Processing page 3 of 20 ...
Properties in page: 40
Processing page 4 of 20 ...
Properties in page: 40
Processing page 5 of 20 ...
Properties in page: 40
Processing page 6 of 20 ...
Properties in page: 40
Processing page 7 of 20 ...
Properties in page: 40
Processing page 8 of 20 ...
Properties in page: 40
Processing page 9 of 20 ...
Properties in page: 40
Processing page 10 of 20 ...
Properties in page: 40
Processing page 11 of 20 ...
Properties in page: 40
Processing page 12 of 20 ...
Properties in page: 40
Processing page 13 of 20 ...
Properties in page: 40
Processing page 14 of 20 ...
Properties in page: 40
Processing page 15 of 20 ...
Properties in page: 40
Processing page 16 of 20 ...
Properties in page: 40
Processing page 17 of 20 ...
Properties in page: 40
Processing page 18 of 20 ...
Properties in page: 40
Processing page 19 of 20 ...
Properties in page: 40
Proce