In [1]:
import requests
import time
import pandas as pd
from api import SearchListings, ListingDetails

In [35]:
toplevel_props = ['ListingID', 'DefaultParentArea', 'Url', 'Building', 'BathsFull', 'BathsHalf', 'Beds', 'CurrentPrice',\
    'Days', 'FullStreetAddress', 'Latitude', 'Longitude', 'ListDate', 'LotSize', 'OriginalPrice', 'Ownership',\
    'PropertyStyle', 'PropertyType', 'Remarks', 'SqFt', 'TownhouseType', 'UnitCount', 'WalkScore', 'WalkScoreDescription', 'YearBuilt',\
    'Zip', 'Details', 'Amenities', 'BuildingAmenities']
secondlevel_props = {'DefaultParentArea': ['Name', 'Neighborhood'],\
    'Building': [['UnitCount', 'BuildingUnitCount'],['Name', 'BuildingName']]}
details_props = {
    'Interior Features': ['Accessibility Features', 'AllRoom Features', 'Elevator', 'Interior Features',\
        'Kitchen Appliances', 'Master Bedroom', 'Bedroom 2', 'Dining Room', 'Living Room'],\
    'Utiliyies': ['Fuel Description', 'Hot Water', 'Sewer', 'Water Description', 'Cooling Description',\
        'Heating Description'],\
    'Green Features': ['Green Energy Supplement']
}

In [52]:

class HomeScraper():
    def __init__(self, city):
        self.city = city
        self.search_api = SearchListings(city)
        self.listings_api = ListingDetails()
        self.listing_detail_cookies = None
        self.total_listings = 0
        self.listings = None

    def download_listings(self):
        url = self.search_api.url()
        headers = self.search_api.headers()

        #use get to pull cookies
        get_response = requests.get(url, headers=headers)
        assert get_response.status_code == 200, f"failed get request with status code: {get_response.status_code}"


        #get the count of active listings
        time.sleep(1)
        payload = self.search_api.payload(maximumListings=10)
        post_response = requests.post(url, cookies=get_response.cookies, headers=headers, json=payload)
        assert post_response.status_code == 200, f'post request to search listings returned w/ status code: {post_response.status_code}'
        response_json = post_response.json()
        self.total_listings = response_json['d']['Count']

        # get all active listings
        time.sleep(1)
        payload = self.search_api.payload(maximumListings=self.total_listings)
        post_response = requests.post(url, cookies=get_response.cookies, headers=headers, json=payload)
        assert post_response.status_code == 200, f'post request to search listings returned w/ status code: {post_response.status_code}'
        response_json = post_response.json()
        self.listings = response_json['d']['Listings']
        return
    
    def download_listing_details(self, listing_id):
        # download detail of a single listing
        url = self.listings_api.url()
        headers = self.listings_api.headers()
        payload = self.listings_api.payload(listing_id)
        
        # TODO: use get to pull cookies
        # get_response = requests.get(url, headers=headers)
        # assert get_response.status_code == 200, f"failed get request with status code: {get_response.status_code}"

        # get details
        post_response = requests.post(url, cookies=self.listing_detail_cookies, headers=headers, json=payload)
        assert post_response.status_code == 200, f'post request to listing details returned w/ status code: {post_response.status_code}'
        if not self.listing_detail_cookies:
            self.listing_detail_cookies = post_response.cookies
        response_json = post_response.json()
        return response_json['d']
    
    def download_details(self, count):
        # download details of all listings
        all_listings_details = []
        for listing in self.listings[:count]:
            details = self.download_listing_details(listing['Listing']['ID'])
            time.sleep(1)
            all_listings_details.append(details)
        return all_listings_details
    
    def clean
    
    @staticmethod
    def clean_listing_details(listings_details):
        cleaned_listings_details = [] 
        for details in listings_details:
            details_dict = {}
            for prop in toplevel_props:
                details_dict[prop] = details[prop]
            cleaned_listings_details.append(details_dict)
        return cleaned_listings_details


    

In [53]:
home_scraper = HomeScraper('Portland, OR')
home_scraper.download_listings()
all_listings_details = home_scraper.download_details(2)

In [54]:
all_listings_details[1]

{'__type': 'Homesnap.API.HSListingDetail',
 'DefaultParentAreaID': 109819,
 'DefaultParentArea': {'AreaID': 109819,
  'Name': 'Downtown Portland',
  'ShortName': 'Downtown Portland',
  'Subhead': 'Portland, OR Neighborhood',
  'State': 'OR',
  'USPSCity': 'Portland',
  'AreaTypeID': 6,
  'SubTypeID': 2,
  'TopLevelAreaID': 372,
  'CountyID': 3234,
  'Polygon': None,
  'Levels': None,
  'North': 45.523115,
  'South': 45.505193,
  'East': -122.668805,
  'West': -122.689466,
  'HasValidCoordinates': True,
  'MostRecentListDate': '/Date(1648391440000)/',
  'Url': '/sitemap/OR/Portland/Downtown-Portland',
  'ImageUrl': 'https://s3.amazonaws.com/homesnap.areas/0/ni/1/original.jpg',
  'UrlForSale': '/homes/for_sale/OR/Portland/Downtown-Portland/p_21,109819',
  'UrlForSaleOpenHouse': '/homes/for_sale/open_house/OR/Portland/Downtown-Portland/p_21,109819/f_1',
  'UrlForRent': '/homes/for_rent/OR/Portland/Downtown-Portland/p_21,109819',
  'Status': 15,
  'CityModeID': 0,
  'AreaTypeFlag': 32,
  '

In [55]:
cleaned_details = home_scraper.clean_listing_details(all_listings_details)

In [56]:
cleaned_details[0]

{'ListingID': 118112620,
 'DefaultParentArea': {'AreaID': 109835,
  'Name': 'Powellhurst - Gilbert',
  'ShortName': 'Powellhurst - Gilbert',
  'Subhead': 'Portland, OR Neighborhood',
  'State': 'OR',
  'USPSCity': 'Portland',
  'AreaTypeID': 6,
  'SubTypeID': 2,
  'TopLevelAreaID': 372,
  'CountyID': 3234,
  'Polygon': None,
  'Levels': None,
  'North': 45.504699,
  'South': 45.476046,
  'East': -122.514586,
  'West': -122.578702,
  'HasValidCoordinates': True,
  'MostRecentListDate': '/Date(1648388361000)/',
  'Url': '/sitemap/OR/Portland/Powellhurst---Gilbert',
  'ImageUrl': 'https://s3.amazonaws.com/homesnap.areas/0/ni/11/original.jpg',
  'UrlForSale': '/homes/for_sale/OR/Portland/Powellhurst---Gilbert/p_21,109835',
  'UrlForSaleOpenHouse': '/homes/for_sale/open_house/OR/Portland/Powellhurst---Gilbert/p_21,109835/f_1',
  'UrlForRent': '/homes/for_rent/OR/Portland/Powellhurst---Gilbert/p_21,109835',
  'Status': 15,
  'CityModeID': 0,
  'AreaTypeFlag': 32,
  'HasMLSCoverage': True,
  

In [38]:
print(list(cleaned_details[0].keys()) == toplevel_props)
for prop in toplevel_props:
    if prop not in cleaned_details[0].keys():
        print(prop)

False


In [49]:
a = list(cleaned_details[0].keys()) 
print(type(a))
a == toplevel_props
a, toplevel_props


<class 'list'>


(['DefaultParentArea',
  'Building',
  'BathsFull',
  'BathsHalf',
  'Beds',
  'CurrentPrice',
  'Days',
  'FullStreetAddress',
  'Latitude',
  'ListDate',
  'ListingID',
  'Longitude',
  'LotSize',
  'OriginalPrice',
  'Ownership',
  'PropertyStyle',
  'PropertyType',
  'Remarks',
  'SqFt',
  'TownhouseType',
  'UnitCount',
  'Url',
  'WalkScore',
  'WalkScoreDescription',
  'YearBuilt',
  'Zip',
  'Details',
  'Amenities',
  'BuildingAmenities'],
 ['ListingID',
  'DefaultParentArea',
  'Url',
  'Building',
  'BathsFull',
  'BathsHalf',
  'Beds',
  'CurrentPrice',
  'Days',
  'FullStreetAddress',
  'Latitude',
  'Longitude',
  'ListDate',
  'LotSize',
  'OriginalPrice',
  'Ownership',
  'PropertyStyle',
  'PropertyType',
  'Remarks',
  'SqFt',
  'TownhouseType',
  'UnitCount',
  'WalkScore',
  'WalkScoreDescription',
  'YearBuilt',
  'Zip',
  'Details',
  'Amenities',
  'BuildingAmenities'])

In [51]:
a = ['a', 'b']
b = ['b', 'a']
a == b

False

In [15]:
pd.set_option('display.max_columns', None)
home_scraper.listings[0]
details = home_scraper.download_listing_details(home_scraper.listings[0]['Listing']['ID'])

In [16]:
details

{'__type': 'Homesnap.API.HSListingDetail',
 'DefaultParentAreaID': 109835,
 'DefaultParentArea': {'AreaID': 109835,
  'Name': 'Powellhurst - Gilbert',
  'ShortName': 'Powellhurst - Gilbert',
  'Subhead': 'Portland, OR Neighborhood',
  'State': 'OR',
  'USPSCity': 'Portland',
  'AreaTypeID': 6,
  'SubTypeID': 2,
  'TopLevelAreaID': 372,
  'CountyID': 3234,
  'Polygon': None,
  'Levels': None,
  'North': 45.504699,
  'South': 45.476046,
  'East': -122.514586,
  'West': -122.578702,
  'HasValidCoordinates': True,
  'MostRecentListDate': '/Date(1648388361000)/',
  'Url': '/sitemap/OR/Portland/Powellhurst---Gilbert',
  'ImageUrl': 'https://s3.amazonaws.com/homesnap.areas/0/ni/6/original.jpg',
  'UrlForSale': '/homes/for_sale/OR/Portland/Powellhurst---Gilbert/p_21,109835',
  'UrlForSaleOpenHouse': '/homes/for_sale/open_house/OR/Portland/Powellhurst---Gilbert/p_21,109835/f_1',
  'UrlForRent': '/homes/for_rent/OR/Portland/Powellhurst---Gilbert/p_21,109835',
  'Status': 15,
  'CityModeID': 0,
 

In [None]:
listings.columns

In [None]:
listings['Listing.ID'][0]

In [None]:
listings.head(2)

In [None]:
import requests

url = "https://www.homesnap.com/service/Listings/GetDetails"

payload = {
    "listingID": 117692827,
    "parts": 506,
    "schoolsPolygonType": 1,
    "promo": {
        "promoCampaign": None,
        "promoSource": None,
        "promoMedium": "web-xs",
        "promoTerm": None,
        "promoContent": None,
        "promoDate": None
    }
}
headers = {
    "cookie": "ASP.NET_SessionId=yppkigaisg5islm2h142pcqe; SERVERID=web7; User=ID%3D770047381%26Hash%3Dee7d2e74ccd95b24c1a9b1963755545f845602e7",
    "Connection": "keep-alive",
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99", "Google Chrome";v="99"',
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Content-Type": "application/json; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest",
    "sec-ch-ua-mobile": "?0",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36",
    "sec-ch-ua-platform": '"Windows"',
    "Origin": "https://www.homesnap.com",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Dest": "empty",
    "Referer": "https://www.homesnap.com/OR/Portland/136-SE-74th-Avenue",
    "Accept-Language": "en-US,en;q=0.9",
    "Cookie": "_fbp=fb.1.1647876752391.1624983748; _gcl_au=1.1.473135812.1647876752; User=ID=770047381&Hash=ee7d2e74ccd95b24c1a9b1963755545f845602e7; _gid=GA1.2.384448963.1647996657; ASP.NET_SessionId=caudawsn5ys1gq4ldawuqcjd; SERVERID=web7; _gat=1; _ga_SJ09CZBE62=GS1.1.1648098188.7.1.1648099108.0; _ga=GA1.1.1645780538.1647876752"
}

response = requests.request("POST", url, json=payload, headers=headers)

print(response.json())

In [None]:
description = response.json()['d']['Remarks']

In [None]:
details = response.json()['d']['Details']

In [None]:
json_normalize(details) 