### Testing


In [None]:
!conda install -c conda-forge folium --yes


In [None]:
# Smoke test: the import fails with ImportError if folium is not available.
import folium

print("Folium installed and imported")

In [None]:
# Build a folium map with all-default settings (no location or zoom level given).
world_map = folium.Map()



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Download the Vancouver neighbourhood-boundaries layer from maps.vancouver.ca as GeoJSON.
# The response is saved as world_countries.json; despite that file name, it holds
# the Vancouver boundaries requested here, not country outlines.
!wget 'https://maps.vancouver.ca/server/rest/services/Hosted/NeighbourhoodBoundaries/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson' -O world_countries.json



In [None]:
# Local path of the GeoJSON file written by the wget cell; echoed as the cell output.
world_geo = 'world_countries.json'

world_geo

In [None]:
# Coordinates used as the centre point of the map, and the map itself.
lat, lng = 49.246292, -123.116226
van_map = folium.Map(zoom_start=12, location=[lat, lng])




### Start Web Scraping

We use BeautifulSoup to scrape real-estate listings from zolo.ca.

In [74]:
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import requests
import pandas as pd
import numpy as np
import re

In [87]:
# Listing pages live at <base_url><neighbourhood-slug>.
# NOTE(review): "new-westminister" is not the usual spelling of New Westminster;
# confirm this path resolves on the site before relying on it.
base_url = 'https://www.zolo.ca/new-westminister-real-estate/'
neighbourhoods = ['connaught-heights',
                  'downtown-nw',
                  'fraserview-nw',
                  'glenbrooke-north',
                  'moody-park',
                  'quay',
                  'queensborough',
                  'queens-park',
                  'sapperton',
                  'the-heights-nw',
                  'uptown-nw',
                  'west-end-nw']

print(neighbourhoods)

# One page URL per neighbourhood, built from base_url so the cell runs on a
# fresh kernel without depending on any leftover variable.
neighbourhood_urls = [base_url + slug for slug in neighbourhoods]

['connaught-heights', 'downtown-nw', 'fraserview-nw', 'glenbrooke-north', 'moody-park', 'quay', 'queensborough', 'queens-park', 'sapperton', 'the-heights-nw', 'uptown-nw', 'west-end-nw']


In [76]:
def get_listings(url):
    """Scrape the listing cards on one results page into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the listings page to download.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns MLS, Price, Beds, Baths, SQFT,
        Street, City, Neighborhoud, Latitude, Longitude and Built In.
        Values are kept as scraped and are not converted to numbers here.
        SQFT is NaN when the scraped token is not a plain run of digits or
        the entry is absent; Built In is NaN when the card has no such entry.

    Notes
    -----
    Location, price and MLS data are collected in three separate passes over
    the page and joined by position. ``zip`` stops at the shortest list, so a
    RuntimeWarning is emitted when the passes find different numbers of
    elements, because rows may then be dropped or misaligned.
    """
    import warnings

    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.text, 'html.parser')

    # --- Pass 1: location blocks (address, city, neighbourhood, coordinates) ---
    addresses, cities, hoods, latitudes, longitudes = [], [], [], [], []
    for loc in soup.find_all("div", class_="card-listing--location"):
        addresses.append(loc.find_next(itemprop="streetAddress").string)
        cities.append(loc.find_next(itemprop="addressLocality").string)
        # The neighbourhood name is taken from the element's second child node.
        hoods.append(loc.find_next(class_="neighbourhood").contents[1])
        latitudes.append(loc.find_next(itemprop="latitude")['content'])
        longitudes.append(loc.find_next(itemprop="longitude")['content'])

    # --- Pass 2: value lists (price, beds, baths, floor area, build year) ---
    prices, beds, baths, sqft, built_in = [], [], [], [], []
    for values in soup.find_all("ul", class_="card-listing--values"):
        price_ele = values.find_next(class_="price")
        prices.append(price_ele.find_next(itemprop="price").string)

        # Beds, baths, sqft and build year follow the price as sibling elements.
        bed_ele = price_ele.find_next_sibling()
        beds.append(bed_ele.string.split()[0])

        bath_ele = bed_ele.find_next_sibling()
        baths.append(bath_ele.contents[1].split()[0])

        sqft_ele = bath_ele.find_next_sibling("li")
        if sqft_ele is None:
            # No floor-area entry, hence no build-year entry after it either.
            sqft.append(np.nan)
            built_in.append(np.nan)
            continue

        sqft_token = sqft_ele.contents[1].split()[0]
        sqft.append(sqft_token if sqft_token.isdigit() else np.nan)

        built_in_ele = sqft_ele.find_next_sibling("li")
        built_in.append(
            built_in_ele.contents[2].string if built_in_ele is not None else np.nan
        )

    # --- Pass 3: image blocks; the MLS number follows the colon in the alt text ---
    mls = [
        card.img['alt'].split(":")[1].strip()
        for card in soup.find_all("div", "card-listing--image")
    ]

    if len({len(mls), len(prices), len(addresses)}) > 1:
        warnings.warn(
            "get_listings: found %d MLS, %d price and %d location elements; "
            "rows may be truncated or misaligned" % (len(mls), len(prices), len(addresses)),
            RuntimeWarning,
        )

    # 'Neighborhoud' (sic) is kept as-is because it is the column name the
    # notebook's recorded output shows.
    column_names = ['MLS', 'Price', 'Beds', 'Baths', 'SQFT', 'Street', 'City',
                    'Neighborhoud', 'Latitude', 'Longitude', 'Built In']

    rows = list(zip(mls, prices, beds, baths, sqft, addresses, cities,
                    hoods, latitudes, longitudes, built_in))

    return pd.DataFrame(rows, columns=column_names)

In [77]:
# Scrape a single neighbourhood page.
# NOTE(review): `url` was not defined anywhere in the visible notebook. The
# recorded output lists GlenBrooke North listings, so that page is assumed
# here; confirm this is the intended page.
url = base_url + 'glenbrooke-north'
df = get_listings(url)

# Drop every listing with a missing field. This already removes rows whose
# SQFT or Built In is NaN, so no separate SQFT filter is needed; an equality
# test against np.nan would never match anyway, because NaN compares unequal
# to everything, itself included.
df.dropna(inplace=True)

In [78]:
from datetime import date

# Prices arrive as text with thousands separators; remove the commas, then cast.
df['Price'] = df['Price'].replace(to_replace=',', value='', regex=True).astype(int)

# Beds and Baths stay as scraped: they can contain missing data, which a plain
# int cast cannot handle.
df['SQFT'] = df['SQFT'].astype(int)

# PPS = price per square foot, truncated to a whole number.
df['PPS'] = df['Price'].div(df['SQFT']).astype(int)
df['Built In'] = df['Built In'].astype(int)

# Age of the building in years, relative to the year the cell is run.
current_year = date.today().year
df['Age'] = df['Built In'].rsub(current_year)

In [79]:
# Order the listings by price per square foot, lowest first, and display the result.
df.sort_values(by='PPS', inplace=True)
df

Unnamed: 0,MLS,Price,Beds,Baths,SQFT,Street,City,Neighborhoud,Latitude,Longitude,Built In,PPS,Age
1,R2426481,2098000,7,6,4483,220 Durham Street,New Westminster,GlenBrooke North,49.21751,-122.918228,2017,467,3
5,R2410706,2299888,7,6,4530,92 Glover Avenue,New Westminster,GlenBrooke North,49.222878,-122.917427,1958,507,62
0,R2427592,1398000,5,3,2576,314 Seventh Avenue,New Westminster,GlenBrooke North,49.216042,-122.918625,1936,542,84
3,R2418025,509999,2,1,882,205-38 Seventh Avenue,New Westminster,GlenBrooke North,49.219677,-122.912064,2003,578,17
6,R2408074,1098000,4,2,1764,44 Eighth Avenue,New Westminster,GlenBrooke North,49.220573,-122.915092,1947,622,73
2,R2419591,1858000,6,6,2948,622 Colborne Street,New Westminster,GlenBrooke North,49.218906,-122.912308,2018,630,2
4,R2412728,515000,2,1,803,308-38 Seventh Avenue,New Westminster,GlenBrooke North,49.219677,-122.912064,2003,641,17
