### Testing


In [None]:
#!conda install -c conda-forge folium --yes


In [None]:
import folium

print('Folium installed and imported')

In [None]:
world_map = folium.Map()



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Get geo json file showing vancouver districts
!wget 'https://maps.vancouver.ca/server/rest/services/Hosted/NeighbourhoodBoundaries/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson' -O world_countries.json



In [None]:
world_geo = r'world_countries.json'

world_geo

In [None]:
lat = 49.246292
lng =  -123.116226
van_map = folium.Map(location=[lat,lng], zoom_start=12)




### Start Web Scraping

We are going to use BeautifulSoup for scraping

In [20]:
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import requests
import pandas as pd
import numpy as np
import re

In [21]:
# url to scrape
base_url = 'https://www.zolo.ca/new-westminister-real-estate/'
neighbourhoods = ['connaught-heights',
                  'downtown-nw',
                 'fraserview-nw',
                 'glenbrooke-north',
                 'moody-park',
                 'quay',
                 'queensborough',
                 'queens-park',
                 'sapperton',
                 'the-heights-nw',
                 'uptown-nw',
                 'west-end-nw']
                

neighbourhood_urls= []

for n in neighbourhoods:
    neighbourhood_urls.append(base_url + n)
    

In [36]:

listings = []

for url in neighbourhood_urls:
    data = get_listings(url)
    for d in data:
        listings.append(d)

print(len(listings))
    
column_names = ['MLS', 'Price', 'Beds', 'Baths', 'SQFT', 'Street', 'City', 'Neighborhoud', 'Latitude', 'Longitude', 'Built In']
df = pd.DataFrame(listings, columns=column_names)



259


Unnamed: 0,MLS,Price,Beds,Baths,SQFT,Street,City,Neighborhoud,Latitude,Longitude,Built In
0,R2428839,1799999,7,6,3694,2030 Edinburgh Street,New Westminster,Connaught Heights,49.202618,-122.950287,2017
1,R2428756,1050000,2,1,1600,2223 Ninth Avenue,New Westminster,Connaught Heights,49.202888,-122.956581,1943
2,R2426278,7500000,–,–,982,2021 Marine Way,New Westminster,Connaught Heights,49.200081,-122.947113,1990
3,R2426078,539000,2,2,3764,217-6707 Southpoint Drive,Burnaby,South Slope,49.208576,-122.962044,2019
4,R2425871,2080000,9,8,3144,2309 Dublin Street,New Westminster,Connaught Heights,49.200985,-122.956169,1930
5,R2406911,1335000,5,3,2404,1728 Seventh Avenue,New Westminster,West End NW,49.203651,-122.941628,2016
6,R2424120,1099000,6,4,2500,522 Fourteenth Street,Vancouver,West End NW,49.205284,-122.933258,1937
7,R2424116,1299000,6,4,4199,1514 Dublin Street,New Westminster,West End NW,49.208740,-122.941208,2011
8,R2422370,1448000,–,–,4101,6182 9th Avenue,Burnaby,Big Bend,49.196922,-122.962654,2015
9,R2418212,1838000,8,7,2228,2117 Ninth Avenue,New Westminster,Connaught Heights,49.204094,-122.954353,9999


In [12]:
def get_listings(url):
    r = requests.get(url)
    
    # setup object and parser
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # SoupStrainer needs to match class exactly
    attrs_dict = {"class": "listings md-flex md-flex-row md-flex-wrap md-flex-align-start list-unstyled js-listings"}

    # create the listings subset doc
    
    # use the soupstrainer to get the subset of the document 
    # containing the information you would like to retreive
    listings = SoupStrainer("ul", attrs=attrs_dict)
    
    # The card-listing--location list contains location information
    locations = soup.find_all("div", class_="card-listing--location")

    addresses=[]
    cities=[]
    neighbourhoods=[]
    latitudes=[]
    longitudes=[]

    for l in locations:
        try:
            addresses.append(l.find_next(itemprop="streetAddress").string)
            cities.append(l.find_next(itemprop="addressLocality").string)   
            neighbourhoods.append(l.find_next(class_="neighbourhood").contents[1])
            latitudes.append(l.find_next(itemprop="latitude")['content'])
            longitudes.append(l.find_next(itemprop="longitude")['content'])
            pass
        except:
            continue
        
             
    # The card-listing--values list contains price info
    info = soup.find_all("ul", class_="card-listing--values")

    prices=[]
    beds=[]
    baths=[]
    sqft=[]
    built_in=[]

    
    for i in info:
        try:
            price_ele = i.find_next(class_="price")  
            prices.append(price_ele.find_next(itemprop="price").string)

            bed_ele = price_ele.find_next_sibling()    
            beds.append(bed_ele.string.split()[0])

            bath_ele = bed_ele.find_next_sibling()
            baths.append(bath_ele.contents[1].split()[0])
            sqft_ele = bath_ele.find_next_sibling("li")

            sqft_ = sqft_ele.contents[1].split()[0] if (sqft_ele.contents[1].split()[0]).isdigit() else np.nan
            sqft.append(sqft_)

            built_in_ele = sqft_ele.find_next_sibling("li")      
            year = built_in_ele.contents[2].string if built_in_ele != None else np.nan   
            built_in.append(year)

            pass
        except:
            continue
        
    # The card-listing--image div contains the MLS number
    mls_info = soup.find_all("div", "card-listing--image")

    mls=[]

    for i in mls_info:
        # use regex to split the string to get MLS
        mls.append(re.split(":", i.img['alt'])[1].strip())
        
    # zip all the data into an iterable zipped object. Once action performed it loses values
    zipped = zip(mls,prices,beds,baths,sqft,addresses,cities,neighbourhoods,latitudes,longitudes,built_in)

    # store zipped list data into variable
    data = list(zipped)
    
    
    return data

In [13]:

df.dropna(inplace=True)

indexNames=df[df['SQFT']==np.nan].index
 

# Delete these row indexes from dataFrame
df.drop(indexNames,inplace=True)

In [14]:
# convert price to int
df['Price'] = df['Price'].replace(',', '', regex=True).astype(int)

# can't conver with missing data
#df['Beds'] = df['Beds'].astype(int)
#df['Baths'] = df['Baths'].astype(int)
df['SQFT'] = df['SQFT'].astype(int)
df['PPS'] = (df['Price'] / df['SQFT']).astype(int)
df['Built In'] = df['Built In'].astype(int)

from datetime import date
current_year = date.today().year

df['Age'] = current_year - df['Built In']

In [15]:
df.sort_values(['PPS'], axis=0, ascending=True, inplace=True)
df

Unnamed: 0,MLS,Price,Beds,Baths,SQFT,Street,City,Neighborhoud,Latitude,Longitude,Built In,PPS,Age
20,R2382536,600000,2,2,5751,213-6707 Southpoint Drive,Burnaby,South Slope,49.208576,-122.962044,1937,104,83
3,R2426078,539000,2,2,3764,217-6707 Southpoint Drive,Burnaby,South Slope,49.208576,-122.962044,2019,143,1
14,R2404423,432800,2,2,2605,302-518 Thirteenth Street,New Westminster,Uptown NW,49.205818,-122.931702,1928,166,92
7,R2424116,1299000,6,4,4199,1514 Dublin Street,New Westminster,West End NW,49.20874,-122.941208,2011,309,9
17,R2395106,1149000,4,3,3331,1-1408 Sixth Avenue,New Westminster,Uptown NW,49.205521,-122.934036,2007,344,13
8,R2422370,1448000,–,–,4101,6182 9th Avenue,Burnaby,Big Bend,49.196922,-122.962654,2015,353,5
15,R2403308,999900,2,1,2341,6255 Thorne Avenue,Burnaby,Big Bend,49.197052,-122.959877,2018,427,2
16,R2399999,1088000,5,2,2476,1805 Eighth Avenue,New Westminster,West End NW,49.20504,-122.944817,2019,439,1
6,R2424120,1099000,6,4,2500,522 Fourteenth Street,Vancouver,West End NW,49.205284,-122.933258,1937,439,83
0,R2428839,1799999,7,6,3694,2030 Edinburgh Street,New Westminster,Connaught Heights,49.202618,-122.950287,2017,487,3


In [18]:
df.count()

MLS             21
Price           21
Beds            21
Baths           21
SQFT            21
Street          21
City            21
Neighborhoud    21
Latitude        21
Longitude       21
Built In        21
PPS             21
Age             21
dtype: int64