In [1]:
import requests
import time
import pandas as pd
from api import SearchListings, ListingDetails

In [107]:
toplevel_props = ['ListingID', 'DefaultParentArea', 'Url', 'Building', 'BathsFull', 'BathsHalf', 'Beds', 'CurrentPrice',\
    'Days', 'FullStreetAddress', 'Latitude', 'Longitude', 'ListDate', 'LotSize', 'OriginalPrice', 'Ownership',\
    'PropertyStyle', 'PropertyType', 'Remarks', 'SqFt', 'TownhouseType', 'UnitCount', 'WalkScore', 'WalkScoreDescription', 'YearBuilt',\
    'Zip', 'Details', 'Amenities', 'BuildingAmenities']
secondlevel_props = {'DefaultParentArea': [['Name', 'Neighborhood']],\
    'Building': [['UnitCount', 'BuildingUnitCount'],['Name', 'BuildingName']]}
details_props = {
    'Interior Features': ['Accessibility Features', 'AllRoom Features', 'Elevator', 'Interior Features',\
        'Kitchen Appliances', 'Master Bedroom', 'Bedroom 2', 'Dining Room', 'Living Room'],\
    'Utilities': ['Fuel Description', 'Hot Water', 'Sewer', 'Water Description', 'Cooling Description',\
        'Heating Description'],\
    'Green Features': ['Green Energy Supplement', 'Green Verification HES Metric', 'Green Verification HES Year', "Green Verification HES "]
}

In [57]:
class HomeScraper():
    def __init__(self, city):
        self.city = city
        self.search_api = SearchListings(city)
        self.listings_api = ListingDetails()
        self.listing_detail_cookies = None
        self.total_listings = 0
        self.listings = None

    def download_listings(self):
        url = self.search_api.url()
        headers = self.search_api.headers()

        #use get to pull cookies
        get_response = requests.get(url, headers=headers)
        assert get_response.status_code == 200, f"failed get request with status code: {get_response.status_code}"


        #get the count of active listings
        time.sleep(1)
        payload = self.search_api.payload(maximumListings=10)
        post_response = requests.post(url, cookies=get_response.cookies, headers=headers, json=payload)
        assert post_response.status_code == 200, f'post request to search listings returned w/ status code: {post_response.status_code}'
        response_json = post_response.json()
        self.total_listings = response_json['d']['Count']

        # get all active listings
        time.sleep(1)
        payload = self.search_api.payload(maximumListings=self.total_listings)
        post_response = requests.post(url, cookies=get_response.cookies, headers=headers, json=payload)
        assert post_response.status_code == 200, f'post request to search listings returned w/ status code: {post_response.status_code}'
        response_json = post_response.json()
        self.listings = response_json['d']['Listings']
        return
    
    def download_listing_details(self, listing_id):
        # download detail of a single listing
        url = self.listings_api.url()
        headers = self.listings_api.headers()
        payload = self.listings_api.payload(listing_id)
        
        # TODO: use get to pull cookies
        # get_response = requests.get(url, headers=headers)
        # assert get_response.status_code == 200, f"failed get request with status code: {get_response.status_code}"

        # get details
        post_response = requests.post(url, cookies=self.listing_detail_cookies, headers=headers, json=payload)
        assert post_response.status_code == 200, f'post request to listing details returned w/ status code: {post_response.status_code}'
        if not self.listing_detail_cookies:
            self.listing_detail_cookies = post_response.cookies
        response_json = post_response.json()
        return response_json['d']
    
    def download_details(self, count=None):
        if not count:
            count = self.count
        # download details of all listings
        all_listings_details = []
        for listing in self.listings[:count]:
            details = self.download_listing_details(listing['Listing']['ID'])
            time.sleep(1)
            all_listings_details.append(details)
        return all_listings_details
    
    

In [58]:
home_scraper = HomeScraper('Portland, OR')
home_scraper.download_listings()

In [97]:
all_listings_details = home_scraper.download_details(10)
all_listings_details[1]

{'__type': 'Homesnap.API.HSListingDetail',
 'DefaultParentAreaID': 109819,
 'DefaultParentArea': {'AreaID': 109819,
  'Name': 'Downtown Portland',
  'ShortName': 'Downtown Portland',
  'Subhead': 'Portland, OR Neighborhood',
  'State': 'OR',
  'USPSCity': 'Portland',
  'AreaTypeID': 6,
  'SubTypeID': 2,
  'TopLevelAreaID': 372,
  'CountyID': 3234,
  'Polygon': None,
  'Levels': None,
  'North': 45.523115,
  'South': 45.505193,
  'East': -122.668805,
  'West': -122.689466,
  'HasValidCoordinates': True,
  'MostRecentListDate': '/Date(1648391440000)/',
  'Url': '/sitemap/OR/Portland/Downtown-Portland',
  'ImageUrl': 'https://s3.amazonaws.com/homesnap.areas/0/ni/5/original.jpg',
  'UrlForSale': '/homes/for_sale/OR/Portland/Downtown-Portland/p_21,109819',
  'UrlForSaleOpenHouse': '/homes/for_sale/open_house/OR/Portland/Downtown-Portland/p_21,109819/f_1',
  'UrlForRent': '/homes/for_rent/OR/Portland/Downtown-Portland/p_21,109819',
  'Status': 15,
  'CityModeID': 0,
  'AreaTypeFlag': 32,
  '

In [103]:
def clean_listing_details(listings_details):
    cleaned_listings_details = [] 
    for details in listings_details:
        details_dict = {}
        for prop in toplevel_props:
            inner_details = details[prop]
            if not inner_details:
                details_dict[prop] = None
            elif prop in secondlevel_props.keys():
                # for nested properties, extract the values
                # and store in a renamed property key
                for inner_prop in secondlevel_props[prop]:
                    details_dict[inner_prop[1]] = inner_details[inner_prop[0]]
            elif prop == 'Details':
                # extract more deeply nested values
                for inner_prop in inner_details:
                    inner_prop_name = inner_prop['Name']
                    if inner_prop_name in details_props.keys():
                        for innermost_prop in inner_prop['Fields']:
                            innermost_prop_name = innermost_prop['Name']
                            if innermost_prop_name in details_props[inner_prop_name]:
                                details_dict[innermost_prop_name] = innermost_prop['Value']
            else:
                details_dict[prop] = inner_details 
            
        cleaned_listings_details.append(details_dict)
    return pd.json_normalize(cleaned_listings_details)

In [104]:
cleaned_details = clean_listing_details(all_listings_details)

In [106]:
cleaned_details.columns

Index(['ListingID', 'Neighborhood', 'Url', 'Building', 'BathsFull',
       'BathsHalf', 'Beds', 'CurrentPrice', 'Days', 'FullStreetAddress',
       'Latitude', 'Longitude', 'ListDate', 'LotSize', 'OriginalPrice',
       'Ownership', 'PropertyStyle', 'PropertyType', 'Remarks', 'SqFt',
       'TownhouseType', 'UnitCount', 'WalkScore', 'WalkScoreDescription',
       'YearBuilt', 'Zip', 'Amenities', 'BuildingAmenities',
       'BuildingUnitCount', 'BuildingName', 'Accessibility Features',
       'AllRoom Features', 'Elevator', 'Interior Features',
       'Kitchen Appliances', 'Master Bedroom', 'Bedroom 2', 'Dining Room',
       'Living Room', 'Fuel Description', 'Hot Water', 'Sewer',
       'Water Description', 'Cooling Description', 'Heating Description',
       'Green Energy Supplement', 'Green Verification HES Metric',
       'Green Verification HES Year'],
      dtype='object')