In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import pickle
import json

Individual listing pages are rendered with JavaScript: the data is loaded by calling a REST API. Wait for the page to load, then extract the information using Selenium.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException

# Path of the chromedriver executable passed to Selenium below.
# Run `which chromedriver` in a shell to find the path on your machine.
#chromedriver = "/Volumes/Files/homebrew/bin/chromedriver"
chromedriver = "/usr/local/bin/chromedriver"
# Create the Chrome WebDriver; `driver` is the handle the parsing functions and
# the scraping loop further down operate on.
driver = webdriver.Chrome(chromedriver)

In [3]:
def parseAddress(driver):
    """Extract the address parts from the listing header.

    The header ``<h1>`` text is split on newlines. With more than two lines
    the first one is taken as the building name; the next line is the street
    address and the one after it is parsed as ``City, ST 12345``.

    Args:
        driver: object exposing ``find_element_by_xpath`` (a Selenium
            WebDriver in this notebook), positioned on a listing page.

    Returns:
        dict with keys ``buildingName``, ``streetAddress``, ``city``,
        ``state`` and ``zipCode``. ``buildingName`` is None when the header
        has no building line; ``state`` / ``zipCode`` are None when the
        city line does not contain them.

    Raises:
        IndexError: if the header has fewer than two lines.
        Whatever the driver raises when the header element is missing.
    """
    headerText = driver.find_element_by_xpath(".//h1[@class='SmallText HdpAddress-title']").text
    addressLines = headerText.split('\n')

    # More than two lines means the header leads with the building name.
    if len(addressLines) > 2:
        buildingName = addressLines[0].strip()
        offset = 1
    else:
        buildingName = None
        offset = 0

    streetAddress = addressLines[offset].strip()

    # "City, ST 12345" -> split once and reuse the pieces.
    cityLineParts = addressLines[offset + 1].split(',')
    city = cityLineParts[0].strip()
    # split() without an argument tolerates repeated spaces between state and zip.
    stateAndZip = cityLineParts[1].split() if len(cityLineParts) > 1 else []
    state = stateAndZip[0] if len(stateAndZip) > 0 else None
    zipCode = stateAndZip[1] if len(stateAndZip) > 1 else None

    return {'buildingName': buildingName,
            'streetAddress': streetAddress,
            'city': city,
            'state': state,
            'zipCode': zipCode}

In [4]:
def parseIsMultipleListingMode(driver):
    """Return True when the page contains a multi-model header or group div.

    The check is a single element lookup: finding a ``MultiModelHdpHeader``
    or ``MultiModelsGroup`` div yields True, a NoSuchElementException from
    the lookup yields False.
    """
    multiModelXpath = "//div[@class='MultiModelHdpHeader' or @class='MultiModelsGroup']"
    try:
        driver.find_element_by_xpath(multiModelXpath)
    except NoSuchElementException:
        return False
    return True

In [5]:
def parsePropertyType(driver):
    """Return the text of the ``PropertyTypeIcon`` div on the current page."""
    iconElement = driver.find_element_by_xpath("//div[@class='PropertyTypeIcon']")
    return iconElement.text

In [6]:
#this one is tricky. Input address then get results
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def inputCommuteAddress(driver, addressStr = '79 Madison Avenue, New York, NY, United States'):
    """Enter a commute destination in the page's commute-times section.

    Clicks the "add destination" link, types ``addressStr`` into the address
    autocomplete box, picks the first suggestion and confirms with the add
    button. Per the original author's note this only has to happen once per
    browser session.

    Args:
        driver: Selenium WebDriver positioned on a listing page.
        addressStr: destination address typed into the autocomplete input.
    """
    def clickWhenClickable(xpath):
        # Wait up to 20 seconds for the element to become clickable, then click it.
        target = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        target.click()

    # Open the destination form.
    addDestinationLink = driver.find_element_by_xpath("//div[@class='CommuteTimes-hdp-commute-add-dest']//a")
    addDestinationLink.click()

    # Type the address.
    addressInput = driver.find_element_by_xpath("//input[@class='AddressAutocomplete-input']")
    addressInput.send_keys(addressStr)

    # Pick the first suggestion from the drop-down.
    clickWhenClickable("//div[@class='AddressAutocomplete-suggestion']")

    # The add button is drawn only after the transit times come back, hence the wait.
    clickWhenClickable("//button[@class='Button Button-md Button-primary Button-full']")

    

In [7]:
def getElemByXPathWithDelay(driver, xpath, delay=5):
    """Wait until an element matching ``xpath`` is present and return it.

    Args:
        driver: Selenium WebDriver to search in.
        xpath: XPath expression locating the element.
        delay: maximum number of seconds to wait. This value is now passed to
            ``WebDriverWait``; previously it was ignored and 4 was used.

    Returns:
        Whatever ``WebDriverWait(...).until(...)`` returns for the presence
        condition (the located element).

    Raises:
        Whatever ``WebDriverWait.until`` raises when the wait expires.
    """
    return WebDriverWait(driver, delay).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )

def parseCommuteTimesAndDistance(driver):
    """Read the commute distance and the four commute times from the page.

    If the distance element is not on the page yet, the commute destination
    is entered via ``inputCommuteAddress`` and the function waits up to
    5 seconds for the distance element to appear.

    The first ``CommuteTimes-hdp-transit-info`` div and its next three
    following-sibling divs are read, in that order, as the car, transit,
    foot and bike times. Newlines are removed from every returned string.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        dict with keys ``commuteDistance``, ``commuteTimeByCar``,
        ``commuteTimeByTransit``, ``commuteTimeByFoot`` and
        ``commuteTimeByBike``.
    """
    distanceXpath = "//div[@class='CommuteTimes-hdp-commute-distance']"
    try:
        distanceElem = driver.find_element_by_xpath(distanceXpath)
    except NoSuchElementException:
        # No distance shown yet: enter the destination, then wait for the result.
        inputCommuteAddress(driver)
        distanceElem = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, distanceXpath))
        )
    # Strip newlines on both paths so the value has the same shape either way.
    commuteDistance = distanceElem.text.replace('\n', '')

    transitXpath = "//div[@class='CommuteTimes-hdp-transit-info']"
    siblingStep = "/following-sibling::div"
    timeKeys = ('commuteTimeByCar',
                'commuteTimeByTransit',
                'commuteTimeByFoot',
                'commuteTimeByBike')

    commute = {'commuteDistance': commuteDistance}
    for hops, key in enumerate(timeKeys):
        # hops == 0 is the transit-info div itself; each further hop moves one sibling on.
        elem = getElemByXPathWithDelay(driver, transitXpath + siblingStep * hops)
        commute[key] = elem.text.replace('\n', '')
    return commute



In [8]:
def _parseAmenitiesList(driver, heading):
    """Return the non-blank ``<li>`` texts found under the span titled ``heading``."""
    xpath = "//span[text()='" + heading + "']/ancestor::div/div/ul//li"
    # Read each element's text exactly once, then filter on the value already in hand.
    texts = [elem.text for elem in driver.find_elements_by_xpath(xpath)]
    return [text for text in texts if text.strip()]


def parsePropertyAmmenities(driver):
    """Return the list of entries under the 'Property amenities' heading.

    Entries whose text is empty or whitespace-only are dropped.
    """
    return _parseAmenitiesList(driver, 'Property amenities')


def parseUnitAmmenities(driver):
    """Return the list of entries under the 'Unit amenities' heading.

    Entries whose text is empty or whitespace-only are dropped.
    """
    return _parseAmenitiesList(driver, 'Unit amenities')

In [10]:
def parseBuildingName(driver):
    """Return the building name from the listing header, or None if absent.

    Only a missing element (NoSuchElementException) is treated as "no
    building name"; any other error, including KeyboardInterrupt, now
    propagates instead of being silently swallowed.
    """
    try:
        return driver.find_element_by_xpath(".//h1[@class='SmallText HdpAddress-title']/div[@class='Utils-text-overflow']").text
    except NoSuchElementException:
        return None


In [2]:
def parseBuildingDescription(driver):
    """Return the text of the ``HdpDescriptionContent`` div, or '' if absent.

    When the element is missing (NoSuchElementException) a notice is printed
    and an empty string is returned. Other errors, including
    KeyboardInterrupt, propagate to the caller rather than being hidden.
    """
    try:
        return driver.find_element_by_xpath("//div[@id='HdpDescriptionContent']").text
    except NoSuchElementException:
        print('No Building Description')
        return ''



In [12]:
def clickAllShowMores(driver):
    """Click every element whose text contains 'Show more', in document order."""
    showMoreElems = driver.find_elements_by_xpath("//*[contains(text(),'Show more')]")
    for showMore in showMoreElems:
        showMore.click()

In [3]:
def parseSchoolsText(driver):
    """Return the text of the first child div of the ``HdpSchools`` div.

    Raises:
        IndexError: if the page has no matching div.
    """
    schoolSections = driver.find_elements_by_xpath("//div[@class='HdpSchools']/div")
    firstSection = schoolSections[0]
    return firstSection.text


In [14]:
def parseLatLong(driver):
    """Read latitude and longitude from the page's first JSON-LD script.

    The first ``application/ld+json`` script in ``<head>`` is parsed as JSON
    and the coordinates are taken from ``['@graph'][0]['geo']``.

    Returns:
        dict with keys ``latitude`` and ``longitude``.
    """
    ldJsonScripts = driver.find_elements_by_xpath("/html/head/script[@type='application/ld+json']")
    payload = json.loads(ldJsonScripts[0].get_attribute("innerHTML"))
    geo = payload['@graph'][0]['geo']
    return dict(latitude=geo['latitude'], longitude=geo['longitude'])

In [4]:
def parseSingleUnit(driver):
    """Scrape one single-unit listing page into a flat dict.

    Reads the price (waiting for the pricing header to be present), the
    beds / baths / square-footage items, the description, address, commute
    data, amenities, schools text and coordinates.

    Args:
        driver: Selenium WebDriver positioned on a single-unit listing page.

    Returns:
        dict with keys buildingName, city, state, streetAddress, zipCode,
        price, bedrooms, baths, sqFt, commuteDistance, commuteTimeByBike,
        commuteTimeByCar, commuteTimeByFoot, commuteTimeByTransit,
        propertyAmenities, unitAmenities, buildingDesc, schoolText,
        latitude and longitude.

    Raises:
        IndexError: if fewer than three ``BedsBathsSqft-item`` divs exist.
        Anything raised by the helper parsers or the driver.
    """
    price = getElemByXPathWithDelay(driver, "//div[@class='SingleModelHdpHeader-pricing']").text

    # Look the beds / baths / sq ft items up once instead of once per field.
    bedsBathsSqft = driver.find_elements_by_xpath("//div[@class='BedsBathsSqft-item']")
    bedrooms = bedsBathsSqft[0].text
    baths = bedsBathsSqft[1].text
    sqFt = bedsBathsSqft[2].text

    unitDesc = parseBuildingDescription(driver)
    address = parseAddress(driver)
    commute = parseCommuteTimesAndDistance(driver)
    # "Show more" links are clicked before the amenity lists are read.
    clickAllShowMores(driver)
    propertyAmenities = parsePropertyAmmenities(driver)
    unitAmenities = parseUnitAmmenities(driver)
    schoolText = parseSchoolsText(driver)
    latLong = parseLatLong(driver)

    return {
        'buildingName': address['buildingName'],
        'city': address['city'],
        'state': address['state'],
        'streetAddress': address['streetAddress'],
        'zipCode': address['zipCode'],
        'price': price,
        'bedrooms': bedrooms,
        'baths': baths,
        'sqFt': sqFt,
        'commuteDistance': commute['commuteDistance'],
        'commuteTimeByBike': commute['commuteTimeByBike'],
        'commuteTimeByCar': commute['commuteTimeByCar'],
        'commuteTimeByFoot': commute['commuteTimeByFoot'],
        'commuteTimeByTransit': commute['commuteTimeByTransit'],
        'propertyAmenities': propertyAmenities,
        'unitAmenities': unitAmenities,
        'buildingDesc': unitDesc,
        'schoolText': schoolText,
        'latitude': latLong['latitude'],
        'longitude': latLong['longitude'],
    }

In [16]:
#print(driver.find_elements_by_xpath("//div[@class='BuildingSummaryView']")[0].find_element_by_xpath("//span[@class='BuildingSummaryView-price']").text)
#print(driver.find_elements_by_xpath("//div[@class='BuildingSummaryView']")[0].find_element_by_xpath(".//div[@class='BuildingSummaryView-title']").text)
#print(driver.find_elements_by_xpath("//div[@class='BuildingSummaryView']")[0].find_elements_by_xpath("//div[@class='BedsBathsSqft-item']")[0].text)
#print(driver.find_elements_by_xpath("//div[@class='BuildingSummaryView']")[0].find_elements_by_xpath("//div[@class='BedsBathsSqft-item']")[1].text)
#print(driver.find_elements_by_xpath("//div[@class='BuildingSummaryView']")[0].find_elements_by_xpath("//div[@class='BedsBathsSqft-item']")[2].text)

#Test webpages:
#driver.get('https://hotpads.com/exo-astoria-apartments-astoria-ny-11102-smwv9a/pad')
#driver.get('https://hotpads.com/2506-21st-st-astoria-ny-11102-tgdnfk/building')

def parseListingInfo(driver):
    """Scrape every unit shown on the page the driver is currently on.

    Two page layouts are handled:

    * Not multi-model (``parseIsMultipleListingMode`` is False): if the page
      lists ``BuildingSummaryView`` entries, each one is opened through its
      'View more info' link, parsed with ``parseSingleUnit`` and the browser
      navigates back; otherwise the page itself is parsed as a single unit.
    * Multi-model: building-level data is read once and one record is
      produced per ``ModelFloorplanItem``, with that floor plan's own title,
      price, beds, baths and square footage.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        list of dicts (one per unit or floor plan), or None when scraping
        failed. On failure the URL and the exception are printed.
    """
    if not parseIsMultipleListingMode(driver):
        try:
            lst = []
            xpath = "//div[@class='BuildingSummaryView active' or @class='BuildingSummaryView']"
            n_elems = len(driver.find_elements_by_xpath(xpath))
            if n_elems > 0:
                print('Multiple listings in the same building')
                for idx in range(n_elems):
                    print('  Listing' + str(idx))
                    # The summaries are looked up again on every pass, since the
                    # browser leaves this page and comes back between iterations.
                    summary = driver.find_elements_by_xpath(xpath)[idx]
                    summary.find_element_by_xpath(".//span[text()='View more info']").click()
                    lst.append(parseSingleUnit(driver))
                    driver.back()
                    time.sleep(.25)
            else:
                print('Single Unit')
                lst.append(parseSingleUnit(driver))
            return lst
        except Exception as e:
            print('single mode, parseListingInfo failed for ' + driver.current_url)
            print(e)
            return None

    try:
        print('Building')
        address = parseAddress(driver)
        commuteDistance = parseCommuteTimesAndDistance(driver)
        buildingDesc = parseBuildingDescription(driver)
        schoolText = parseSchoolsText(driver)
        clickAllShowMores(driver)
        propertyAmenities = parsePropertyAmmenities(driver)
        unitAmenities = parseUnitAmmenities(driver)
        latLong = parseLatLong(driver)

        lst = []
        for elem in driver.find_elements_by_xpath("//div[@class='ModelFloorplanItem']"):
            title = elem.find_element_by_xpath(".//div[contains(@class,'ModelFloorplanItem-content')]").text
            price = elem.find_element_by_xpath(".//div[@class='ModelFloorplanItem-price']").text
            # './/' restricts the search to this floor plan. A leading '//' searches
            # the whole document, which gave every plan the first plan's values.
            bedsBathsSqft = elem.find_elements_by_xpath(".//div[@class='BedsBathsSqft-item']")
            lst.append({
                'buildingName': address['buildingName'],
                'city': address['city'],
                'state': address['state'],
                'streetAddress': address['streetAddress'],
                'zipCode': address['zipCode'],
                'price': price,
                'title': title,
                'bedrooms': bedsBathsSqft[0].text,
                'baths': bedsBathsSqft[1].text,
                'sqFt': bedsBathsSqft[2].text,
                'commuteDistance': commuteDistance['commuteDistance'],
                'commuteTimeByBike': commuteDistance['commuteTimeByBike'],
                'commuteTimeByCar': commuteDistance['commuteTimeByCar'],
                'commuteTimeByFoot': commuteDistance['commuteTimeByFoot'],
                'commuteTimeByTransit': commuteDistance['commuteTimeByTransit'],
                'propertyAmenities': propertyAmenities,
                'unitAmenities': unitAmenities,
                'buildingDesc': buildingDesc,
                'schoolText': schoolText,
                'latitude': latLong['latitude'],
                'longitude': latLong['longitude'],
            })
        return lst
    except Exception as e:
        # Report the cause here too, matching the single-mode branch.
        print('multiple mode, parseListingInfo failed for ' + driver.current_url)
        print(e)
        return None
    
 

Go through each summary-listing pickle and navigate to every individual listing in it. Scrape the information on each individual listing and pickle it.

In [None]:
import glob
import datetime

# Scrape every listing referenced by the summary pickles. Each summary frame
# gets a 'Status' column ('Pending' / 'Success' / 'Failed') that is written
# back after every attempted listing, so an interrupted run can be resumed
# and listings already marked 'Success' are skipped.
# NOTE: pd.read_pickle can execute code embedded in the file; only load
# pickles produced by this project.
listingSummaryPklsFiles = glob.glob('data/summaryListings/summary*.pkl')

for lstSummaryPklFn in listingSummaryPklsFiles:
    lstSumPkl = pd.read_pickle(lstSummaryPklFn)

    print('PROCESSING: ' + lstSummaryPklFn + ' Listings: ' + str(len(lstSumPkl)))

    if 'Status' not in lstSumPkl.columns:
        lstSumPkl['Status'] = 'Pending'

    # enumerate keeps the progress counter correct on every path through the loop.
    for counter, (listingIdx, listingRow) in enumerate(lstSumPkl.iterrows(), start=1):
        print(datetime.datetime.now())
        if lstSumPkl.loc[listingIdx, 'Status'] == 'Success':
            print('Previously scraped successfully: ' + listingRow.link)
            continue

        # 'summary' -> 'unit' rewrites both the directory and the file prefix,
        # e.g. data/summaryListings/summaryX.pkl -> data/unitListings/unitX_<idx>.pkl
        unitLisingFn = lstSummaryPklFn.replace('summary', 'unit').replace('.pkl', '_' + str(listingIdx) + '.pkl')
        print('Scraping apartment ' + str(counter) + ' of ' + str(len(lstSumPkl)) + ' in ' + lstSummaryPklFn)

        # Assume failure until the unit pickle has actually been written.
        status = 'Failed'
        try:
            listingCompleteUrl = 'https://hotpads.com' + listingRow.link
            print('url:' + listingCompleteUrl)

            driver.get(listingCompleteUrl)
            time.sleep(.25)

            listingInfoDict = parseListingInfo(driver)

            if listingInfoDict is None:
                print('******SCRAPING ERROR with :' + listingCompleteUrl)
            else:
                listingInfoDf = pd.DataFrame(listingInfoDict)

                print('Pickling: ' + unitLisingFn)
                listingInfoDf.to_pickle(unitLisingFn)
                status = 'Success'

        except Exception as e:
            print(e)

        # Persist the outcome for every attempted listing. Previously a None
        # result skipped this save, the counter update and the pause below.
        lstSumPkl.loc[listingIdx, 'Status'] = status
        lstSumPkl.to_pickle(lstSummaryPklFn)

        # Pause between requests.
        time.sleep(.75)
        print('')



PROCESSING: data/summaryListings/summary10001_0.pkl Listings: 67
2017-04-16 21:33:55.002545
Previously scraped successfully: /avalon-west-chelsea-new-york-ny-10001-9y8/pad
2017-04-16 21:33:55.003423
Previously scraped successfully: /777-6th-avenue-new-york-ny-10001-sj7zqk/pad
2017-04-16 21:33:55.003823
Previously scraped successfully: /beatrice-new-york-ny-10001-sjn11n/pad
2017-04-16 21:33:55.006803
Previously scraped successfully: /800-sixth-new-york-ny-10001-skg1de/pad
2017-04-16 21:33:55.007759
Previously scraped successfully: /ava-high-line-new-york-ny-10001-7q4k/pad
2017-04-16 21:33:55.008390
Previously scraped successfully: /abington-house-new-york-ny-10001-9z9/pad
2017-04-16 21:33:55.009866
Previously scraped successfully: /308-w-30th-st-new-york-ny-10001-atg/building
2017-04-16 21:33:55.011256
Previously scraped successfully: /360-w-34th-st-new-york-ny-10001-wdbcuq/building
2017-04-16 21:33:55.013110
Previously scraped successfully: /125-w-31st-st-new-york-ny-10001-eftz/buildin

Combine all individual listing pickles into a single DataFrame and pickle it for analysis.

In [None]:
import glob 

# Collect the per-listing pickles; sorted so the row order of `df` does not
# depend on the order in which the filesystem happens to list the files.
listingUnitLisings = sorted(glob.glob('data/unitListings/*.pkl'))

# Read every frame first and concatenate once, rather than re-copying the
# growing frame on each iteration. `df` stays None when there are no files.
unitFrames = [pd.read_pickle(f) for f in listingUnitLisings]
df = pd.concat(unitFrames) if unitFrames else None