In [1]:
import os
import sys
from bs4 import BeautifulSoup
from requests import get
import requests


In [2]:
# Scheme + host shared by every listing link scraped from this RE/MAX office site.
BASE_URL = 'https://leadingedge-northcarolina.remax.com'

# Charlotte, NC search-results page; the query string is passed to the site verbatim
# (the `p001` path segment presumably selects the first page of results -- TODO confirm).
remax_url = 'https://leadingedge-northcarolina.remax.com/realestatehomesforsale/charlotte-nc-p001.html?query=listingdate-dorder/price-dorder/'

In [3]:
# Download the search-results page at remax_url and parse it with the 'html.parser' backend.
remax_soup = BeautifulSoup(get(remax_url).text,'html.parser')

In [4]:
# Every listing anchor on the results page carries a site-relative href;
# prefix each one with BASE_URL to get an absolute detail-page URL.
search = remax_soup.find_all('a', class_='listing-pane-info js-detaillink')
remax_urls = [BASE_URL + link['href'] for link in search]

In [6]:
# Fetch and parse the second listing (index 1) collected in remax_urls; used as the sample page
# for the spot-check cells in this notebook.
house_soup = BeautifulSoup(get(remax_urls[1]).text,'html.parser')

In [7]:
def findReMaxURLS(soup, base_url='https://leadingedge-northcarolina.remax.com'):
    """
    Collect the detail-page URL of every home listed on a remax search result page.

    soup:  bs4 soup -  BeautifulSoup object of a url of a remax search result page
    base_url: string - (optional) the beginning of the URL that remax uses; it is
              prepended unchanged to each listing's site-relative href
    -
    returns: list - a list of all URLs of homes on the given page, in page order.
             Anchors without an href are skipped; an empty list is returned when
             the page has no listing anchors.
    """
    listing_links = soup.find_all('a', class_='listing-pane-info js-detaillink')
    # .get() instead of ['href'] so a malformed anchor is skipped rather than raising KeyError
    return [base_url + anchor.get('href') for anchor in listing_links if anchor.get('href')]

In [8]:
# Spot-check: stripped text of the first <li hmsitemprop="Address"> on the sample listing.
house_soup.find_all('li',attrs={'hmsitemprop':'Address'})[0].text.strip()

u'2100 Shadyview Drive'

In [9]:
def _firstText(soup, tag, **criteria):
    """Return the stripped text of the first `tag` element in `soup` matching `criteria`.

    Raises IndexError when nothing matches.
    """
    return soup.find_all(tag, **criteria)[0].text.strip()


def pullHomeData(home_url):
    """
    Scrape one remax listing page and return its data as a nested dict.

    home_url - string: url of the remax home from which we wish to extract data

    -
    returns: home_dict: dict {

            scrape_address: {   (exactly as shown on the listing page)
                    address_line1: [string] - street number, street
                    city: [string]
                    state: [string]
                    zipcode: [string]
                    }

            address: {          (as returned by canonicalizeAddress; every value passed through str())
                    address_line1: [string] - street number, street
                    city: [string]
                    zipcode: [string]
                    state: [string]
                    unit: [string] - the string 'None' when the geocoder reports no unit
                    lat: [string]
                    lon: [string]
                    slug: [string]
                    }

            listing_data: {
                    num_bedrooms: float - number of bedrooms
                    num_bathrooms: float - number of bathrooms (half baths such as 2.5 are accepted)
                    building_area_sq_ft: int
                    list_price: int - home listing price
                    }

            features: {
                    MLS: [int] MLS listing number, or None if missing / not numeric
                    is_foreclosure: [boolean] True when the page says 'True'; None if the page has no such field
                    desc: [text] a paragraph description of the home, or None if missing
                    year_built: whatever findNestedInfo returns for 'Year Built'
                    school_score: result of getAverageSchoolRating, or None if that lookup fails
                    }

            images: {
                    img_urls: list - a list of all the image urls associated with this house
                    }
    }

    Fields from the original plan that are NOT scraped yet: valuation, lot_size, floors,
    garage, date_listed_on_site, coords (as a tuple), recent_selling_history.

    Side effects: performs HTTP requests and prints the raw geocode response plus a
    one-line verification status.
    """
    homesoup = BeautifulSoup(get(home_url).text, 'html.parser')

    # --- address exactly as displayed on the page ---
    scrape_address = {}
    for key, itemprop in (('address_line1', 'Address'),
                          ('city', 'City'),
                          ('state', 'State'),
                          ('zipcode', 'Zip')):
        scrape_address[key] = _firstText(homesoup, 'li', attrs={'hmsitemprop': itemprop})

    # canonicalizeAddress returns a sequence of results; only the first one is used.
    response = canonicalizeAddress(scrape_address)[0]
    address_info = response['address_info']

    # --- standardized address ---
    # NOTE(review): str() turns a missing unit (None) into the string 'None'; kept for
    # backward compatibility with existing consumers of this dict.
    address = {}
    for key, source_key in (('address_line1', 'address'),
                            ('city', 'city'),
                            ('zipcode', 'zipcode'),
                            ('state', 'state'),
                            ('unit', 'unit'),
                            ('lat', 'lat'),
                            ('lon', 'lng'),
                            ('slug', 'slug')):
        address[key] = str(address_info[source_key])
    print(response)

    # A response without the expected status structure counts as "not verified".
    try:
        verified = address_info['status']['details'][0] == 'Address fully verified'
    except (KeyError, IndexError, TypeError):
        verified = False
    if verified:
        print("verified address with house canary API")
    else:
        print('error for address %s %s %s' % (address['address_line1'], address['city'], address['state']))

    # --- headline listing numbers ---
    listing_data = {}
    # float() rather than int(): the page can show half baths ("2.5"), which int() rejects.
    listing_data['num_bedrooms'] = float(_firstText(homesoup, 'span', class_='listing-detail-beds-val'))
    listing_data['num_bathrooms'] = float(_firstText(homesoup, 'span', class_='listing-detail-baths-val'))
    listing_data['building_area_sq_ft'] = int(
        _firstText(homesoup, 'span', class_='listing-detail-sqft-val').replace(',', ''))
    listing_data['list_price'] = int(
        _firstText(homesoup, 'span', class_='listing-detail-price-amount  pad-half-right').replace(',', ''))

    # --- optional features ---
    features = {}
    try:
        features['MLS'] = int(_firstText(homesoup, 'li', attrs={'hmsitemprop': 'MLSNumber'}))
    except (IndexError, ValueError):
        features['MLS'] = None
    # Read from the page being scraped (homesoup), not from a notebook-global soup.
    try:
        features['is_foreclosure'] = _firstText(homesoup, 'li', attrs={'hmsitemprop': 'IsForeclosure'}) == 'True'
    except IndexError:
        features['is_foreclosure'] = None
    try:
        features['desc'] = _firstText(homesoup, 'p', class_="listing-bio")
    except IndexError:
        features['desc'] = None
    features['year_built'] = findNestedInfo(homesoup, 'Year Built')

    # The school lookup is a best-effort network call; any failure yields None.
    try:
        school_score = getAverageSchoolRating(homesoup, address['lat'], address['lon'], address['zipcode'])
    except Exception:
        school_score = None
    features['school_score'] = school_score

    images = {}
    images['img_urls'] = pullImageURLSFromSlideshow(homesoup)

    home_dict = {}
    home_dict['scrape_address'] = scrape_address
    home_dict['address'] = address
    home_dict['listing_data'] = listing_data
    home_dict['images'] = images
    home_dict['features'] = features

    return home_dict
    

In [10]:
# Show the listing URL that house_soup was fetched from (second entry of remax_urls).
remax_urls[1]

u'https://leadingedge-northcarolina.remax.com/realestatehomesforsale/2100-shadyview-drive-charlotte-nc-28210-id305413655.html'

In [295]:
# Parse one more listing page (Mint Hill, NC per the URL) for ad-hoc inspection.
# NOTE(review): `char` is not referenced by any other visible cell.
char = BeautifulSoup(get('https://executive6-northcarolina.remax.com/realestatehomesforsale/6823-olde-sycamore-drive-mint-hill-nc-28227-gid400015464390.html').text,'html.parser')

In [18]:
# Scrape one complete listing; pullHomeData prints the raw geocode response and a
# verification message as a side effect (see the recorded output below).
pool=pullHomeData('https://executive1-northcarolina.remax.com/realestatehomesforsale/4914-dawnridge-drive-charlotte-nc-28226-gid400015487345.html')

{u'address_info': {u'status': {u'changes': [], u'errors': [], u'requested_item': {u'city': u'Charlotte', u'state': u'NC', u'zipcode': u'28226', u'address': u'4914 Dawnridge Drive'}, u'details': [u'Address fully verified'], u'match': True}, u'city': u'Charlotte', u'county_fips': u'37119', u'geo_precision': u'rooftop', u'block_id': u'371190030083005', u'zipcode': u'28226', u'slug': u'4914-Dawnridge-Dr-Charlotte-NC-28226', u'address_full': u'4914 Dawnridge Dr Charlotte NC 28226', u'zipcode_plus4': u'8088', u'state': u'NC', u'metrodiv': None, u'unit': None, u'address': u'4914 Dawnridge Dr', u'lat': 35.09606, u'lng': -80.83135, u'blockgroup_id': u'371190030083', u'msa': u'16740'}, u'property/geocode': {u'api_code_description': u'ok', u'api_code': 0, u'result': True}}
verified address with house canary API


In [302]:
# Check whether the scraped description contains the substring 'pool'.
'pool' in pool['features']['desc']

True

In [15]:
# Spot-check: raw square-footage text, which includes a thousands separator.
house_soup.find_all('span',class_='listing-detail-sqft-val')[0].text.strip()

u'1,562'

In [16]:
# Same value with the comma removed so that int() can parse it.
int(house_soup.find_all('span',class_='listing-detail-sqft-val')[0].text.strip().replace(',',''))

1562

In [12]:
def pullImageURLSFromSlideshow(soup):
    """
    Gather the slideshow image URLs from a listing page.

    soup: [bs4 soup object]  parsed listing page to read from.

    returns: [list] the 'data-href' value of every <figure class="figure figure__slideshow">
             whose len() is exactly 3, in document order
    """
    slides = soup.find_all('figure', class_='figure figure__slideshow')
    # Only figures of length 3 are kept (for a bs4 Tag, len() is its number of direct children).
    return [slide['data-href'] for slide in slides if len(slide) == 3]

In [33]:
# Spot-check: image URL stored in 'data-href' on the third slideshow <figure> (index 2).
house_soup.find_all('figure', class_='figure figure__slideshow')[2]['data-href']

u'https://cdn-4.eneighborhoods.com/x2/@v=-242405177@/130/9/932/3352932/3352932_2.jpg'

In [35]:
# List every slideshow image URL found on the sample listing.
pullImageURLSFromSlideshow(house_soup)

[u'https://cdn-4.eneighborhoods.com/x2/@v=-239259449@/130/9/932/3352932/3352932_1.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-242405177@/130/9/932/3352932/3352932_2.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-241356601@/130/9/932/3352932/3352932_3.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-236113721@/130/9/932/3352932/3352932_4.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-235065145@/130/9/932/3352932/3352932_5.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-238210873@/130/9/932/3352932/3352932_6.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-237162297@/130/9/932/3352932/3352932_7.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-248696633@/130/9/932/3352932/3352932_8.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=-247648057@/130/9/932/3352932/3352932_9.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=320079634@/130/9/932/3352932/3352932_10.jpg',
 u'https://cdn-4.eneighborhoods.com/x2/@v=319031058@/130/9/932/3352932/3352932_11.jpg',
 u'https://cdn-4.eneighborhoods.

In [43]:
# Spot-check: raw MLS number text from the sample listing.
house_soup.find_all('li',attrs={'hmsitemprop':'MLSNumber'})[0].text.strip()

u'3352932'

In [54]:
# Spot-check: foreclosure flag, obtained by comparing the field text with the literal 'True'.
str(house_soup.find_all('li',attrs={'hmsitemprop':'IsForeclosure'})[0].text.strip()) == 'True'

False

In [58]:
# Spot-check: the listing description paragraph (<p class="listing-bio">).
house_soup.find_all('p',class_="listing-bio")[0].text.strip()

u'Second floor unit. Open layout. Completely Renovated, Updated Bathroom,  Granite Counter Tops, New Kitchen Appliances  New Engineered Wood floors. New carpet. Screened in porch. Stone fireplace. Plenty of storage inside the unit and outside. Fridge, and washer/dryer to remain. Large MBR with his/her closets. MBA has large garden tub. Easy access to shopping, dining, and entertainment.Instructions:Lockbox/Key, Showing ServiceDirections:Located in Brandywine on Albemarle Rd... Right on Dockside, then left on Winery Ln.'

In [13]:
def findNestedInfo(soup,info):
    """
    Look up one value in a listing's main stats list.

    Each <dt class="listing-detail-stats-main-key"> label is paired with the
    <dd class="listing-detail-stats-main-val"> found at the same position.

    soup: [bs4 soup obj] the soup obj of webpage to scrape
    info: [string] label to look for, e.g. 'Year Built'; matched as a substring
          of the stripped label text

    returns: [string] stripped text of the value paired with the matching label
             (the last one if several labels match), or None when no label matches

    """
    labels = soup.find_all('dt', class_='listing-detail-stats-main-key')
    values = soup.find_all('dd', class_="listing-detail-stats-main-val")

    data = None
    # zip() stops at the shorter list, so a label without a value at its position is ignored
    for label, value in zip(labels, values):
        if info in label.text.strip():
            data = value.text.strip()
    return data

In [87]:
# Spot-check: the seventh main-stats value (index 6) on the sample listing
# (presumably the year built, given the '1982' output -- TODO confirm).
house_soup.find_all('dd',class_="listing-detail-stats-main-val")[6].text.strip()

u'1982'

In [95]:
# Fetch and parse a Durham, NC search-results page from the main www.remax.com site.
test_url = 'https://www.remax.com/realestatehomesforsale/durham-nc-p001.html'
test_soup = BeautifulSoup(get(test_url).text,'html.parser')

In [97]:
# Collect the listing URLs found on the Durham results page.
durham= findReMaxURLS(test_soup)

In [218]:
# NOTE(review): the recorded output below is a NameError for `requests`, i.e. this cell was
# last run in a kernel where `import requests` had not been executed. Re-run it after the
# imports cell to refresh the output.
pullHomeData(durham[4])

NameError: global name 'requests' is not defined

In [100]:
# Show the fifth Durham listing URL (index 4).
durham[4]

u'https://leadingedge-northcarolina.remax.com/realestatehomesforsale/207-edward-street-durham-nc-27701-gid400014629847.html'

In [14]:
def getAverageSchoolRating(soup,lat,lon,zipcode,radius=5):
    """
    this queries the remax homefacts API endpoint for the schools around a location and
    averages their letter grades after converting each one with gradeToScore.

    inputs
    soup: [bs4 soup object] soup object of webpage for home we are scraping.
          Not used by the lookup; kept so existing callers keep working.
    lat, lon: latitude / longitude of the home, formatted into the query string
    zipcode: zipcode of the home, formatted into the query string
    radius: (optional, default 5) search radius passed to the API as-is

    -
    output
    returns [float] the average school score, on a scale of 0 to 100, or None when the
    API lists no school with an available grade
    A+ 100
    A 95
    A- 91
    B+ 88
    B 85
    B- 81
    C+ 78
    C 75
    C- 71
    D+ 68
    D 65
    D- 61
    F 50

    this rating excludes any schools whose grade is 'Unavailable' from the average

    raises: whatever `get` / `raise_for_status` raise on a failed request, and KeyError
    if the response has no 'HFSchools' entry or a grade is unknown to gradeToScore
    """
    API_call_BASE = "https://leadingedge-northcarolina.remax.com/api/homefacts/"
    API_call = API_call_BASE + '?&radius={}&lat={}&long={}&schoolspergrades=true&zipcode={}'.format(str(radius),str(lat),str(lon),str(zipcode))

    response = get(API_call)
    response.raise_for_status()
    home_stats = response.json()

    # Schools without a published grade carry no information, so they are left out
    # of both the sum and the count.
    scores = [gradeToScore(school['SchoolGrade'])
              for school in home_stats['HFSchools']
              if school['SchoolGrade'] != 'Unavailable']

    # No rated school nearby: report "unknown" instead of dividing by zero.
    if not scores:
        return None
    return float(sum(scores)) / float(len(scores))
    
    

In [15]:
def findSiteID(soup):
    """
    Extract the numeric site ID embedded in a page's text.

    input
    soup [bs4 soup object] the bs4 soup object for the page we are scraping

    returns siteID [int] the 8 characters that start 18 positions after the first
    occurrence of 'siteid', converted with int()

    Raises ValueError when those characters are not a valid integer. When 'siteid'
    is absent, find() yields -1 and characters 17..24 of the text are parsed instead.
    """
    marker_at = soup.text.strip().find('siteid')
    # NOTE(review): marker_at is an index into the *stripped* text, but the slice below is
    # taken from the unstripped text; the two only line up when soup.text has no leading
    # whitespace. Behavior kept as-is -- confirm against a real page before changing.
    id_chars = soup.text[marker_at + 18:marker_at + 26]
    siteID = int(id_chars)
    return siteID

In [184]:
# Fetch and parse the sixth Charlotte listing (index 5) for the school-rating experiment.
test3 = BeautifulSoup(get(remax_urls[5]).text,'html.parser')

In [211]:
# NOTE(review): stale call -- getAverageSchoolRating is defined as
# (soup, lat, lon, zipcode, radius=5), so passing only the soup raises TypeError on a fresh
# run. The recorded score comes from an earlier version of the function.
score=getAverageSchoolRating(test3)

In [212]:
# Display the average school score computed in the previous cell.
score

80.0

In [16]:
def gradeToScore(gradestr):
    """
    Convert a school letter grade, as spelled by the remax API, into a numeric score.

    input
    gradestr: [string] grade name such as 'Aplus', 'B' or 'Cminus'

    returns school_score [int] the score assigned to that grade (50 for 'F' up to 100 for 'Aplus')

    Raises KeyError for any grade name that is not listed below.
    """
    # The two tuples are parallel: grade_points[i] is the score for grade_names[i].
    grade_names = ('Aplus', 'A', 'Aminus',
                   'Bplus', 'B', 'Bminus',
                   'Cplus', 'C', 'Cminus',
                   'Dplus', 'D', 'Dminus',
                   'F')
    grade_points = (100, 95, 91,
                    88, 85, 81,
                    78, 75, 71,
                    68, 65, 61,
                    50)
    return dict(zip(grade_names, grade_points))[gradestr]
    

In [213]:
# Show the URL of the listing that was parsed into test3.
remax_urls[5]

u'https://leadingedge-northcarolina.remax.com/realestatehomesforsale/2944-brahman-meadows-lane-charlotte-nc-28273-gid400015469703.html'

In [17]:
def canonicalizeAddress(remax_address_dict):
    """
    this takes a dictionary of address data scraped from the remax website and calls the housecanary
    geocode API to make sure that all of the data is standardized

    input

    remax_address_dict [dict] {
        address_line1: [string] street number, street
        city: [string] city name
        state: [string] state, two letter abbreviation
        zipcode: [string] - five number zipcode
    }
    (any other keys, e.g. unit, are ignored)

    The API credentials are read from the environment variables
    HOUSECANARY_API_KEY and HOUSECANARY_API_SECRET.

    returns the decoded JSON body of the geocode response (pullHomeData uses element [0] of it)

    raises RuntimeError when either credential is missing, KeyError when a required
    address key is missing, and whatever requests raises for a failed call or HTTP error status

    """
    # Credentials come from the environment so they are never stored in the notebook.
    # NOTE(review): the key/secret that used to be hard-coded here should be rotated.
    hc_key = os.environ.get('HOUSECANARY_API_KEY')
    hc_secret = os.environ.get('HOUSECANARY_API_SECRET')
    if not hc_key or not hc_secret:
        raise RuntimeError('HOUSECANARY_API_KEY and HOUSECANARY_API_SECRET must be set in the environment')

    params = {
        'address': remax_address_dict['address_line1'],
        'state': remax_address_dict['state'],
        'zipcode': remax_address_dict['zipcode'],
        'city': remax_address_dict['city'],
    }
    geocode_url = 'https://api.housecanary.com/v2/property/geocode'
    response = requests.get(geocode_url, params=params, auth=(hc_key, hc_secret))
    # Fail loudly on 4xx/5xx instead of handing an error payload to the caller.
    response.raise_for_status()
    return response.json()



    
    

input
Built 1990 or newer

150-500 price range (in thousands of USD)

at least 3 bedrooms

at least 2 bathrooms

must have garage

good school ratings and low crime

no pools, wells, or septic systems

located in an established subdivision