Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import pickle

Scrape the NYC neighborhood-to-ZIP-code mapping from the NY State Health Department website. These neighborhoods might differ slightly from the ones used by realtors.

In [5]:
# Page holding the neighborhood -> ZIP code table.
url = 'https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm'

# read_html returns one DataFrame per <table>; the first one is used here.
# Row label 0 is dropped (presumably the header row - confirm against the page).
# .copy() makes the frame independent so the assignments below do not act on a
# slice of the parsed table (avoids SettingWithCopyWarning / lost writes).
df = pd.read_html(url)[0].loc[1:].copy()

# Rows whose third column is empty hold their values one column too far left
# (presumably because the borough cell spans several rows - confirm).
df['shift'] = df.iloc[:, 2].isnull()
shifted = df['shift']

# Move the values of the shifted rows one column to the right and blank out the
# borough cell so it can be forward-filled below.
df.loc[shifted, 2] = df.loc[shifted, 1]
df.loc[shifted, 1] = df.loc[shifted, 0]
df.loc[shifted, 0] = None

# Keep only the three data columns (this also discards the helper 'shift' column).
df = df.iloc[:, 0:3].copy()

df.columns = ['borough', 'neighborhood', 'zips']

# Propagate each borough name down to the rows that were blanked above.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df['borough'] = df['borough'].ffill()
df.head()

Unnamed: 0,borough,neighborhood,zips
1,Bronx,Central Bronx,"10453, 10457, 10460"
2,Bronx,Bronx Park and Fordham,"10458, 10467, 10468"
3,Bronx,High Bridge and Morrisania,"10451, 10452, 10456"
4,Bronx,Hunts Point and Mott Haven,"10454, 10455, 10459, 10474"
5,Bronx,Kingsbridge and Riverdale,"10463, 10471"


Process dataframe to get it into format: Borough, Neighborhood, ZipCode

In [6]:
# Turn each comma-separated zip string into a list of zip strings.
df.zips = df.zips.str.split(',')

# Build one single-column frame per neighborhood row; every zip of that row
# carries the row's original index label so it can be joined back on the index.
zipFrames = [
    pd.DataFrame({'zipCode': zipList}, index=np.repeat(rowLabel, len(zipList)))
    for rowLabel, zipList in df.zips.to_dict().items()
]
tempZipDF = pd.concat(zipFrames)

# Index-on-index join fans each neighborhood row out to one row per zip code.
joinedDF = df.merge(tempZipDF, left_index=True, right_index=True)
neighborhoodsDF = joinedDF.loc[:, ['borough', 'neighborhood', 'zipCode']]

# Splitting on ',' leaves the blank that followed each comma; remove it.
neighborhoodsDF.zipCode = neighborhoodsDF.zipCode.str.strip()

print(neighborhoodsDF.head())
print(neighborhoodsDF.tail())

  borough            neighborhood zipCode
1   Bronx           Central Bronx   10453
1   Bronx           Central Bronx   10457
1   Bronx           Central Bronx   10460
2   Bronx  Bronx Park and Fordham   10458
2   Bronx  Bronx Park and Fordham   10467
          borough              neighborhood zipCode
40  Staten Island               South Shore   10312
41  Staten Island  Stapleton and St. George   10301
41  Staten Island  Stapleton and St. George   10304
41  Staten Island  Stapleton and St. George   10305
42  Staten Island                Mid-Island   10314


Derive links for zip code landing pages

In [53]:
# Landing-page URL pattern; %s is replaced by the zip code.
zipUrlTemplate = 'https://hotpads.com/%s/apartments-for-rent'

url = zipUrlTemplate % ('1')

# One landing-page URL per (neighborhood, zip) row.
neighborhoodsDF['zipUrlLink'] = neighborhoodsDF.zipCode.map(lambda zipCode: zipUrlTemplate % zipCode)


Scrape zip code landing page to figure out the number of results pages

In [54]:
def getNumberOfListings(url, timeout=30):
    """Return the number of listings reported on an area landing page.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    int
        The sixth whitespace-separated token of the text of the element with
        class ``AreaPage-listings-count``, converted to ``int``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the count element is not present on the page, or the sixth token
        is not an integer.
    IndexError
        If the count element's text has fewer than six tokens.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    countElement = soup.find(class_='AreaPage-listings-count')
    if countElement is None:
        raise ValueError('Listings count element not found on page: ' + str(url))

    # The count is read positionally from the element text (token index 5),
    # so this depends on the exact wording of the page.
    noOfListings = int(countElement.text.split()[5])
    return noOfListings

In [55]:
def getSummaryUrlList(url, noOfListings, pageSize=20):
    """Build the list of result-page URLs for one landing page.

    Parameters
    ----------
    url : str
        Base landing-page URL; returned unchanged as the first entry.
    noOfListings : int
        Total number of listings reported for the landing page.
    pageSize : int, optional
        Step between consecutive ``start`` offsets. Defaults to 20, the value
        that was previously hard-coded.

    Returns
    -------
    list of str
        ``url`` for offset 0, then ``url + '?start=<offset>'`` for every
        further multiple of ``pageSize`` up to and including ``noOfListings``.
        An offset equal to ``noOfListings`` is therefore included when the
        count is an exact multiple of ``pageSize``. The list is empty when
        ``noOfListings`` is negative.

    Raises
    ------
    ValueError
        If ``pageSize`` is smaller than 1.
    """
    if pageSize < 1:
        raise ValueError('pageSize must be a positive integer')

    return [
        url if offset == 0 else url + '?start=' + str(offset)
        for offset in range(0, noOfListings + 1, pageSize)
    ]

In [61]:
def getSummaryPageLinks(baseZipUrl):
    """Return a DataFrame listing every result-page URL of one landing page.

    The listing count is read from ``baseZipUrl`` and expanded into page URLs.
    The result has one row per page with two columns: ``baseZipUrl`` (the
    input, repeated) and ``summaryPageLink`` (the page URL).
    """
    pageLinks = getSummaryUrlList(baseZipUrl, getNumberOfListings(baseZipUrl))
    columns = {
        'baseZipUrl': np.repeat(baseZipUrl, len(pageLinks)),
        'summaryPageLink': pageLinks,
    }
    return(pd.DataFrame(columns))

In [63]:
# Try the helper on a single landing page and preview the first rows of its result.
getSummaryPageLinks('https://hotpads.com/11102/apartments-for-rent').head()

Unnamed: 0,summaryPageLink,zipCode
0,https://hotpads.com/11102/apartments-for-rent,https://hotpads.com/11102/apartments-for-rent
1,https://hotpads.com/11102/apartments-for-rent?...,https://hotpads.com/11102/apartments-for-rent
2,https://hotpads.com/11102/apartments-for-rent?...,https://hotpads.com/11102/apartments-for-rent
3,https://hotpads.com/11102/apartments-for-rent?...,https://hotpads.com/11102/apartments-for-rent
4,https://hotpads.com/11102/apartments-for-rent?...,https://hotpads.com/11102/apartments-for-rent


In [86]:
def getAllSummaryPageLinks(neighborhoodsDF, delaySeconds=2):
    """Collect the result-page links for every landing page in a frame.

    Parameters
    ----------
    neighborhoodsDF : pandas.DataFrame
        Must have a ``zipUrlLink`` column holding the landing-page URLs.
    delaySeconds : float, optional
        Pause after each landing page, successful or not, so the site is not
        hit in a tight loop. Defaults to 2, the previously hard-coded value.

    Returns
    -------
    pandas.DataFrame or None
        The per-landing-page frames concatenated in input order, keeping each
        frame's own index. ``None`` when no landing page could be processed.

    Notes
    -----
    A failure on one landing page is reported and skipped (best effort).
    Only ``Exception`` subclasses are caught, so Ctrl-C / kernel interrupt
    still stops the loop.
    """
    collectedFrames = []
    for baseZipUrlLink in neighborhoodsDF.zipUrlLink:
        print(baseZipUrlLink)
        try:
            tempDF = getSummaryPageLinks(baseZipUrlLink)
        except Exception as exc:
            print('Failed !!!!:' + baseZipUrlLink)
            print('    reason: ' + repr(exc))
            tempDF = None

        if tempDF is not None:
            collectedFrames.append(tempDF)

        # adding delay so that I don't get banned from the website
        time.sleep(delaySeconds)

    if not collectedFrames:
        return None
    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    return pd.concat(collectedFrames)
    

In [87]:
# Gather the result-page links for every landing-page URL in neighborhoodsDF.
# One landing page is fetched per row, with a pause after each, so this cell is slow.
allLinksDF = getAllSummaryPageLinks(neighborhoodsDF)

https://hotpads.com/10453/apartments-for-rent
https://hotpads.com/10457/apartments-for-rent
https://hotpads.com/10460/apartments-for-rent
https://hotpads.com/10458/apartments-for-rent
https://hotpads.com/10467/apartments-for-rent
https://hotpads.com/10468/apartments-for-rent
https://hotpads.com/10451/apartments-for-rent
https://hotpads.com/10452/apartments-for-rent
https://hotpads.com/10456/apartments-for-rent
https://hotpads.com/10454/apartments-for-rent
https://hotpads.com/10455/apartments-for-rent
https://hotpads.com/10459/apartments-for-rent
https://hotpads.com/10474/apartments-for-rent
https://hotpads.com/10463/apartments-for-rent
https://hotpads.com/10471/apartments-for-rent
https://hotpads.com/10466/apartments-for-rent
https://hotpads.com/10469/apartments-for-rent
https://hotpads.com/10470/apartments-for-rent
https://hotpads.com/10475/apartments-for-rent
https://hotpads.com/10461/apartments-for-rent
https://hotpads.com/10462/apartments-for-rent
https://hotpads.com/10464/apartmen

In [88]:
# Display the collected links.
allLinksDF

Unnamed: 0,summaryPageLink,zipCode
0,https://hotpads.com/10453/apartments-for-rent,https://hotpads.com/10453/apartments-for-rent
1,https://hotpads.com/10453/apartments-for-rent?...,https://hotpads.com/10453/apartments-for-rent
2,https://hotpads.com/10453/apartments-for-rent?...,https://hotpads.com/10453/apartments-for-rent
3,https://hotpads.com/10453/apartments-for-rent?...,https://hotpads.com/10453/apartments-for-rent
4,https://hotpads.com/10453/apartments-for-rent?...,https://hotpads.com/10453/apartments-for-rent
0,https://hotpads.com/10457/apartments-for-rent,https://hotpads.com/10457/apartments-for-rent
1,https://hotpads.com/10457/apartments-for-rent?...,https://hotpads.com/10457/apartments-for-rent
2,https://hotpads.com/10457/apartments-for-rent?...,https://hotpads.com/10457/apartments-for-rent
3,https://hotpads.com/10457/apartments-for-rent?...,https://hotpads.com/10457/apartments-for-rent
4,https://hotpads.com/10457/apartments-for-rent?...,https://hotpads.com/10457/apartments-for-rent


Pickle away allLinks data frame

In [None]:
# Save the collected links so the slow collection step does not have to be repeated.
# (The method name is to_pickle; the previous spelling `to_picke` raised AttributeError.)
allLinksDF.to_pickle('allLinksDF.pkl')

There are nearly 30,000 listings on this website. In order to deal with failures, separate the summary links into batches.

In [130]:
# Reload the saved links and rename the 'zipCode' column to 'baseZipUrl'.
t = pd.read_pickle('allLinksDF.pkl').rename(columns={'zipCode': 'baseZipUrl'})

In [132]:
# Running row number (0 .. len(t)-1) over the whole frame.
t['ordNum'] = range(len(t))

In [133]:
# Every five consecutive rows share one batch number.
t['batchGroup'] = t['ordNum'] // 5
# All rows start out with the same scraping status.
t['scrapingStatus'] = 'NotStarted'

In [134]:
# Write the frame t to disk.
t.to_pickle('allSummaryLinksDF.pkl')

In [145]:
# Load the frame stored in 'allSummaryLinksDF.pkl' back into t.
t = pd.read_pickle('allSummaryLinksDF.pkl')

In [154]:
# Attach borough / neighborhood / zip information to every summary-page link by
# matching the landing-page URL on both sides.
neighborhoodSummaryLinkAllDF = pd.merge(
    neighborhoodsDF,
    t,
    left_on='zipUrlLink',
    right_on='baseZipUrl',
)
neighborhoodSummaryLinkAllDF.to_pickle('data/neighborhoodSummaryLinkAllDF.pkl')

In [140]:
# Write one pickle per batchGroup value: data/summaryLinks_<batch>.pkl
for n, g in t.groupby('batchGroup'):
    # n is the batch number, g the rows of t belonging to that batch.
    fileName = 'summaryLinks_'+ str(n) + '.pkl'
    g.to_pickle('data/'+fileName)

In [181]:
#neighborhoodSummaryLinkAllDF.groupby(['zipCode','ordNum']).size().hist()
# Write one pickle per zip code: data/summaryLinksDF_<zip>.pkl
# NOTE(review): these file names also match the pattern
# "data/summaryLinksDF_*.pkl" that the scraping loop further down globs for -
# confirm they are removed or moved before that loop runs.
for zipcode,dat in neighborhoodSummaryLinkAllDF.groupby('zipCode'):
    filename = 'data/summaryLinksDF_{}.pkl'.format(zipcode)
    dat.to_pickle(filename)

In [194]:
# Number the rows 0, 1, 2, ... within each zip code, in their current order.
# cumcount() is the built-in for this and returns plain integers; the previous
# transform(lambda x: range(0, len(x))) relied on pandas coercing a range
# object produced from the string-valued zipCode column.
neighborhoodSummaryLinkAllDF['ordNum'] = neighborhoodSummaryLinkAllDF.groupby('zipCode').cumcount()

In [197]:
# Every five consecutive ordNum values share one batchGroup number.
neighborhoodSummaryLinkAllDF['batchGroup'] = neighborhoodSummaryLinkAllDF['ordNum']//5

In [216]:
# Remove the baseZipUrl column from the frame.
# NOTE: not re-runnable - a second execution raises KeyError because the column is gone.
del neighborhoodSummaryLinkAllDF['baseZipUrl']

In [217]:
# Write one pickle per (zip code, batch) pair: data/summaryLinksDF_<zip>_<batch>.pkl
for n,g in neighborhoodSummaryLinkAllDF.groupby(['zipCode','batchGroup']):
    # n is the (zipCode, batchGroup) key tuple, g the rows of that group.
    filename = 'data/summaryLinksDF_{}_{}.pkl'.format(n[0],n[1])
    g.to_pickle(filename)

In [218]:
# Spot-check one of the batch files just written.
pd.read_pickle('data/summaryLinksDF_11106_0.pkl').head()

Unnamed: 0,borough,neighborhood,zipCode,zipUrlLink,summaryPageLink,ordNum,batchGroup,scrapingStatus
957,Queens,Northwest Queens,11106,https://hotpads.com/11106/apartments-for-rent,https://hotpads.com/11106/apartments-for-rent,0,0,NotStarted
958,Queens,Northwest Queens,11106,https://hotpads.com/11106/apartments-for-rent,https://hotpads.com/11106/apartments-for-rent?...,1,0,NotStarted
959,Queens,Northwest Queens,11106,https://hotpads.com/11106/apartments-for-rent,https://hotpads.com/11106/apartments-for-rent?...,2,0,NotStarted
960,Queens,Northwest Queens,11106,https://hotpads.com/11106/apartments-for-rent,https://hotpads.com/11106/apartments-for-rent?...,3,0,NotStarted
961,Queens,Northwest Queens,11106,https://hotpads.com/11106/apartments-for-rent,https://hotpads.com/11106/apartments-for-rent?...,4,0,NotStarted


In [237]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# type `which chromedriver` from shell to find chromedriver
#chromedriver = "/Volumes/Files/homebrew/bin/chromedriver"
# NOTE(review): machine-specific absolute path; adjust to the local chromedriver location.
chromedriver = "/usr/local/bin/chromedriver"
# Create the Chrome webdriver object, passing the chromedriver path above.
# NOTE(review): passing the path positionally is presumably the older Selenium
# calling convention - confirm it is still accepted by the installed version.
driver = webdriver.Chrome(chromedriver)

In [269]:
# Load one landing page in the Selenium-driven browser, then close its window.
driver.get('https://hotpads.com/11102/apartments-for-rent')
#
driver.close()

In [2]:
#driver.find_elements_by_xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "name", " " ))')
#driver.find_elements_by_xpath("//div[@class='listing-info']/div/a")[0]

def getListingSummaries(url, timeout=30):
    """Download one results page and extract a summary of each listing on it.

    Parameters
    ----------
    url : str
        Address of the results page.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    list of dict
        One dict per element with class ``listing-info``, with the keys
        ``name``, ``link``, ``city``, ``min_price`` and ``no_bedrooms``.
        ``link`` is the ``href`` attribute of the first
        ``Linker Linker-default`` element; the others are element texts.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If a listing lacks one of the expected child elements.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of silently returning an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    def firstByClass(card, className):
        # First descendant of `card` carrying the given class; IndexError if none.
        return card.find_all(class_=className)[0]

    result = [
        {
            'name': firstByClass(card, 'name').text,
            'link': firstByClass(card, 'Linker Linker-default')['href'],
            'city': firstByClass(card, 'city').text,
            'min_price': firstByClass(card, 'min-price').text,
            'no_bedrooms': firstByClass(card, 'beds').text,
        }
        for card in soup.find_all(class_='listing-info')
    ]
    return result

In [3]:
def getAllListingSummariesInForSumPkl(sumPkl, delaySeconds=1):
    """Collect the listing summaries of every results page named in a frame.

    Parameters
    ----------
    sumPkl : pandas.DataFrame
        Must have a ``summaryPageLink`` column with the page URLs to visit.
    delaySeconds : float, optional
        Pause after each page, successful or not. Defaults to 1, the
        previously hard-coded value.

    Returns
    -------
    list
        The items returned by ``getListingSummaries`` for all pages, in page
        order. Pages that fail are reported and skipped (best effort).

    Notes
    -----
    The link is read before the ``try`` so the error message always refers to
    the page that actually failed. Only ``Exception`` subclasses are caught,
    so Ctrl-C / kernel interrupt still stops the loop.
    """
    listingSummariesAllPagesInZip = []
    for summaryPageLink in sumPkl.summaryPageLink:
        print('Accessing summary page: ' + summaryPageLink)
        try:
            listingSummariesAllPagesInZip.extend(getListingSummaries(summaryPageLink))
        except Exception as exc:
            print('Error getListingSummaries: ' + summaryPageLink)
            print('    reason: ' + repr(exc))
        time.sleep(delaySeconds)
    return listingSummariesAllPagesInZip

Go through each summary listings page and get the individual listing links and high-level information about each listing.

In [4]:
#getListingSummaries('https://hotpads.com/11102/apartments-for-rent')
import shutil
import os


import glob
import time

# Output locations. Create them up front: to_pickle and shutil.move below fail
# if the target directory does not exist.
summaryListingsDir = 'data/summaryListings/'
doneDir = 'data/done/'
os.makedirs(summaryListingsDir, exist_ok=True)
os.makedirs(doneDir, exist_ok=True)

# Every batch file still waiting in data/; sorted so the processing order is
# the same on every run.
allSummaryLinkPkls = sorted(glob.glob("data/summaryLinksDF_*.pkl"))

for count, sumPklFile in enumerate(allSummaryLinkPkls, start=1):
    sumPkl = pd.read_pickle(sumPklFile)
    print('Going through summary pages in '+ sumPklFile)

    # Derive the output names from the bare file name rather than from a
    # literal 'data/' prefix, so they do not depend on the path separator
    # that glob returns.
    baseName = os.path.basename(sumPklFile)
    summaryListingFn = summaryListingsDir + baseName.replace('summaryLinksDF_','summary')

    summaryListings = getAllListingSummariesInForSumPkl(sumPkl)
    summaryListingsDf = pd.DataFrame(summaryListings)

    print('Pickling: '+ summaryListingFn)
    summaryListingsDf.to_pickle(summaryListingFn)

    # Move the batch file out of data/ only after its results are written, so
    # an interrupted run picks up with the batches that are still left.
    shutil.move(sumPklFile, doneDir + baseName)
    print('')
    print(str(count) +' out of ' + str(len(allSummaryLinkPkls)) + ' Done.')
    print('')
    
    



Going through summary pages in data/summaryLinksDF_10001_0.pkl
Accessing summary page: https://hotpads.com/10001/apartments-for-rent
Accessing summary page: https://hotpads.com/10001/apartments-for-rent?start=20
Accessing summary page: https://hotpads.com/10001/apartments-for-rent?start=40
Accessing summary page: https://hotpads.com/10001/apartments-for-rent?start=60
Accessing summary page: https://hotpads.com/10001/apartments-for-rent?start=80
Pickling: data/summaryListings/summary10001_0.pkl

1 out of 304 Done.

Going through summary pages in data/summaryLinksDF_10002_0.pkl
Accessing summary page: https://hotpads.com/10002/apartments-for-rent
Accessing summary page: https://hotpads.com/10002/apartments-for-rent?start=20
Accessing summary page: https://hotpads.com/10002/apartments-for-rent?start=40
Accessing summary page: https://hotpads.com/10002/apartments-for-rent?start=60
Accessing summary page: https://hotpads.com/10002/apartments-for-rent?start=80
Pickling: data/summaryListings/

Check if data frame is being pickled away correctly

In [321]:
# Load one of the result files to confirm it was written correctly.
pd.read_pickle('data/summaryListings/summary11101_0.pkl')

Unnamed: 0,city,link,min_price,name,no_bedrooms
0,"Long Island City, NY 11101",/27-on-27th-long-island-city-ny-11101-1q2qtrj/pad,"$2,900+",27 On 27th,1 to 2 beds
1,"Long Island City, NY 11101",/4441-purves-st-long-island-city-ny-11101-w3g9...,"$2,299+",4441 Purves St,Studio to 2 beds
2,"Long Island City, NY 11101",/29-28-41st-ave-long-island-city-ny-11101-1pgh...,"$2,000",29-28 41st Ave,Studio
3,"Long Island City, NY 11101",/3427-43rd-st-long-island-city-ny-11101-tuwhca...,"$2,000",3427 43rd St #2R,2 beds
4,"Long Island City, NY 11101",/4237-27th-st-long-island-city-ny-11101-wd1sbr...,"$2,700+",4237 27th St,1 bed
5,"Long Island City, NY 11109",/475-48th-ave-long-island-city-ny-11109-sjmbwr...,"$2,489+",475 48th Ave,Studio to 3 beds
6,"Long Island City, NY 11101",/42-60-crescent-st-long-island-city-ny-11101-w...,"$4,350",42-60 Crescent St #7C,2 beds
7,"Long Island City, NY 11101",/519-borden-ave-long-island-city-ny-11101-sntt...,"$2,850+",519 Borden Ave,1 to 2 beds
8,"Long Island City, NY 11101",/3451-41st-st-long-island-city-ny-11101-1jbevw...,"$2,075",3451 41st St #1,2 beds
9,"Long Island City, NY 11101",/42-17-27th-st-long-island-city-ny-11101-1pg3v...,"$3,600",42-17 27th St #17E,1 bed
