# CKME136 - Capstone Project - Steven Lee
### Toronto Real Estate Listings

### 1. Imports for webscraping and data cleaning

In [1]:
import csv
import itertools
import re
import requests
import time

from bs4 import BeautifulSoup
# Beautiful Soup Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

### 2. Create a list of all toronto listing pages

In [5]:
#example list page https://listing.ca/mls/?.cy.........12..$
listing_page_urls = []
def all_listing_page_urls(url_start, url_end):
    for i in range(1,175):
        generated_urls = url_start+str(i)+url_end
        listing_page_urls.append(generated_urls)

toronto_listing_url_start = 'https://listing.ca/mls/?.cy.........'
toronto_listing_url_end = '..$'

all_listing_page_urls(toronto_listing_url_start, toronto_listing_url_end)

listing_page_urls[:4]

['https://listing.ca/mls/?.cy.........1..$',
 'https://listing.ca/mls/?.cy.........2..$',
 'https://listing.ca/mls/?.cy.........3..$',
 'https://listing.ca/mls/?.cy.........4..$']

### 3. Create list of specific listing urls

In [3]:
def individual_listing_urls(listing_page_urls):
    #create empty list
    all_page_text = []
    #extract html from list of page urls
    for page_url in listing_page_urls:
        resp = requests.get(page_url).text
        soup = BeautifulSoup(resp)
        page_text = soup.find_all('a')
        all_page_text.extend(page_text)
        time.sleep(2)
    #extract urls from each text
    urls = [x.get("href") for x in all_page_text]
    #regex for urls that are street numbers
    r = re.compile('https://toronto.listing.ca/[0-9]')
    #create unique list of listing urls
    individual_listing_urls = list(set(filter(r.match, urls)))
    return individual_listing_urls


In [218]:
#individual_listing_urls
individual_listing_urls = individual_listing_urls(listing_page_urls)

In [8]:
#create a test set of listing urls
test = listing_page_urls[:4]
test_1 = individual_listing_urls(test)
test_1 = test_1[5:8]

In [9]:
test_1

['https://toronto.listing.ca/88-scott-st-504.C4361608.htm#15-l',
 'https://toronto.listing.ca/60-fairfax-cres-414.E4361425.htm#15-w',
 'https://toronto.listing.ca/70-baylawn-dr.E4361632.htm#15-c']

### 4. Create a CSV of raw html text

In [215]:
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

def raw_indiviudal_listing_scraper(individual_listing_urls):
    with open('test.csv','w') as f1:
        writer = csv.writer(f1, delimiter=',',lineterminator='\n',)
        
        for listing in individual_listing_urls:
            resp = [requests.get(listing).text]
        
        writer.writerow(resp)
        
raw_indiviudal_listing_scraper(test_1)

In [214]:
with open('test.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    
    for row in csv_reader:
        print row

In [20]:
test_1

['https://toronto.listing.ca/88-scott-st-504.C4361608.htm#15-l',
 'https://toronto.listing.ca/60-fairfax-cres-414.E4361425.htm#15-w',
 'https://toronto.listing.ca/70-baylawn-dr.E4361632.htm#15-c']

### 5. Extract listing features from individual listing urls

In [22]:
#to extract rooms data from a listing
def rooms_indiviudal_listing_scraper(urls):

    for listing in urls:
        resp = requests.get(listing).text
        soup = BeautifulSoup(resp, 'html.parser')
        
        regex = re.compile('([0-9]){2}')
        content = soup.findAll("div", {"class": "lpc15"})[2].text
        if re.match(regex, content) is None:
            start_range = 8
        else:
            start_range = 10
            
        rooms = soup.findAll("div", {"class": "lpc15"})[start_range:]
        rooms_string = "".join(map(str, rooms))
        rooms_soup = BeautifulSoup(rooms_string, 'html.parser')
        rooms_soup_range = len(rooms_soup)
        
        room_keys = []
        room_values = []
        for i in range(start_range, start_range+rooms_soup_range):
            value = soup.findAll("div", {"class": "lpc15"})[i].text
            key_extract = str(soup.findAll("div", {"class": "lpc15"})[i].previousSibling)
            key = BeautifulSoup(key_extract).text
            
            room_keys.append(key)
            room_values.append(value)
            
        rooms_dict = dict(zip(room_keys, room_values))

        print [rooms_dict]

rooms_indiviudal_listing_scraper(test_1)

[{u'LIVING ROOM': u'Flat Level, 4.89m x 3.90mWood Floor, Combined W/Dining, East View', u'MASTER BEDROOM': u'Flat Level, 5.78m x 3.37mEnsuite Bath, Irregular Rm, Double Closet', u'DINING ROOM': u'Flat Level, 4.89m x 3.90mWood Floor, Combined W/Living', u'BATHROOMS': u'4-piece on flat level4-piece on flat level', u'KITCHEN': u'Flat Level, 3.55m x 3.52mStainless Steel Appl, Quartz Counter, O/Looks Living', u'2ND BEDROOM': u'Flat Level, 4.70m x 4.53mEnsuite Bath, Wood Floor, Closet'}]
[{u'BATHROOM': u'Main Level, 1.52m x 1.55mCeramic Floor', u'LIVING ROOM': u'Main Level, 5.54m x 2.80mLaminate, W/O To Balcony', u'MASTER BEDROOM': u'Main Level, 3.93m x 2.46mLaminate, Closet', u'BATHROOMS': u'3-piece on main level', u'KITCHEN': u'Main Level, 2.49m x 2.74mCeramic Floor, Granite Counter, Stainless Steel Appl', u'2ND BEDROOM': u'Main Level, 3.07m x 2.22mLaminate, Closet, French Doors'}]
[{u'LIVING ROOM': u'Ground Level, 3.30m x 5.56mParquet Floor, Sunken Room, W/O To Yard', u'MASTER BEDROOM': u

In [235]:
resp = requests.get('https://toronto.listing.ca/37-meadowcliffe-dr.E4317312.htm#15-4').text
print resp

In [23]:
#to extract features from a listing including sale price, sold comparables, listing url, address, community, type of dwelling, property, features, description, extras
def other_features_indiviudal_listing_scraper(individual_listing_urls):

        for listing in individual_listing_urls:
            resp = requests.get(listing).text
            soup = BeautifulSoup(resp, 'html.parser')
            
            #listing url and MLS ID
            x = [listing]
            
            #sale price and listing date
            x.append(soup.find("div", {"class": "sales"}).findChildren()[3].text)
            x.append(soup.find("div", {"class": "sales"}).findChildren()[1].text)
            
            #address, community
            x.extend([soup.findAll("div", {"class": "lpc15"})[0].text])
            
            #type of dwelling
            x.extend([soup.findAll("div", {"class": "lpc15"})[1].text])
            
            #Comparables
            comparables_class = soup.find("div", {"class": "sales", "class": "comparables"})
            
            comparable_sold_price_1 = comparables_class.find(text=re.compile('Sold Price')).findNext('div').contents[0]
            comparable_list_price_1 = comparables_class.find(text=re.compile('List Price')).findNext('div').contents[0]
            comparable_sold_date_1 = comparables_class.find(text=re.compile('Sold Date')).findNext('div').contents[0]
            
            comparable_sold_price_2 = comparables_class.find(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext('div').contents[0]
            comparable_list_price_2 = comparables_class.find(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext('div').contents[0]
            comparable_sold_date_2 = comparables_class.find(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext('div').contents[0]
            
            comparable_sold_price_3 = comparables_class.find(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext('div').contents[0]
            comparable_list_price_3 = comparables_class.find(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext('div').contents[0]
            comparable_sold_date_3 = comparables_class.find(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext('div').contents[0]
            
            comparable_sold_price_4 = comparables_class.find(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext('div').contents[0]
            comparable_list_price_4 = comparables_class.find(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext('div').contents[0]
            comparable_sold_date_4 = comparables_class.find(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext('div').contents[0]
            
            comparable_sold_price_5 = comparables_class.find(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext(text=re.compile('Sold Price')).findNext('div').contents[0]
            comparable_list_price_5 = comparables_class.find(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext(text=re.compile('List Price')).findNext('div').contents[0]
            comparable_sold_date_5 = comparables_class.find(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext(text=re.compile('Sold Date')).findNext('div').contents[0]
            
            x.extend([comparable_sold_price_1, 
                     comparable_list_price_1, 
                     comparable_sold_date_1,
                     comparable_sold_price_2,
                     comparable_list_price_2,
                     comparable_sold_date_2,
                     comparable_sold_price_3,
                     comparable_list_price_3,
                     comparable_sold_date_3,
                     comparable_sold_price_4,
                     comparable_list_price_4,
                     comparable_sold_date_4,
                     comparable_sold_price_5,
                     comparable_list_price_5,
                     comparable_sold_date_5])
            
            #property features, description, and extras
            #required to account for some listings with property layout element
            regex = re.compile('([0-9]){2}')
            content = soup.findAll("div", {"class": "lpc15"})[2].text
            if re.match(regex, content) is None:
                x.append([soup.findAll("div", {"class": "lpc15"})[2].text])
                x.append([soup.findAll("div", {"class": "lpc15"})[6].text])
                x.append([soup.findAll("div", {"class": "lpc15"})[7].text])
                
            else:
                x.append([soup.findAll("div", {"class": "lpc15"})[3].text])
                x.append([soup.findAll("div", {"class": "lpc15"})[7].text])
                x.append([soup.findAll("div", {"class": "lpc15"})[8].text])
            
            print x

other_features_indiviudal_listing_scraper(test_1)

['https://toronto.listing.ca/88-scott-st-504.C4361608.htm#15-l', u'$999,000', u'02/18/2019', u'88 Scott St 504, Church-Yonge Corridor, Toronto M5E0A9', u'Apartment, Condo Apartment5 Rooms Total, 1 Kitchen1 Parking Space', u'$1,195,000', u'$1,199,800', u'11/08/2018', u'$830,500', u'$799,800', u'10/29/2018', u'$1,300,069', u'$1,249,000', u'09/26/2018', u'$907,000', u'$929,000', u'01/23/2019', u'$685,000', u'$659,000', u'01/16/2019', [u'Hospital, Park, Place Of Worship, Public Transit, SchoolNone BasementConcrete, Stone Exterior'], [u'Unbeatable Location! This Gorgeous, Capacious, Bright And Airy, Brand New, 2 Bed, 2 Ensuite Bathsuite Is Located Steps From The Fin District, Union Station, St Lawrence Mkt, 24 Hr Metro, Ttc. Located On The Top Floor Of The Historic Loft Podium Of The Building, This Stunning Suite Boasts Soaring 11Ft Ceilings, Open Concept Living Area, Split Bedroom Plan For Maximum Privacy And Overlooks Quiet Scott St. Tons Of Storage Throughout. Parking And Locker Included

### 4. Build a cleaner for the data set
- categorical variables in extras and property features
- light NLP in description
- room sizes and organization
- comparables section
- MLS ID
- LAT and LON

In [152]:
#code for writing to CSV
#dataset keys
with open('test.csv','w') as f1:
    writer = csv.writer(f1, delimiter=',',lineterminator='\n',)
    writer.writerow(keys)
    writer.writerow(x)

    dict_writer = csv.DictWriter(f1, keys, extrasaction = 'ignore')
    dict_writer.writerows(z)
    
    keys = ['listing_url', 'sale_price', 'listing_date', 'address','dwelling_type',
        'comparable_sold_price_1', 'comparable_list_price_1', 'comparable_sold_date_1',
        'comparable_sold_price_2', 'comparable_list_price_2', 'comparable_sold_date_2',
        'comparable_sold_price_3', 'comparable_list_price_3', 'comparable_sold_date_3',
        'comparable_sold_price_4', 'comparable_list_price_4', 'comparable_sold_date_4',
        'comparable_sold_price_5', 'comparable_list_price_5', 'comparable_sold_date_5',
        'features', 'description', 'extras',
        'BATHROOMS', 'LIVING ROOM', 'DINING ROOM', 'KITCHEN', 'LIBRARY', 'FAMILY ROOM', 'REC', '5TH BEDROOM', '4TH BEDROOM', '3RD BEDROOM', '2ND BEDROOM', 'MASTER BEDROOM', 'OFFICE']

In [24]:
# find the term Lot Size
# soup.find("div", {"class": "lpc15"}).findNext("div", {"class": "lpc15"}).findNext("div", {"class": "lpc15"}).find_previous_sibling()
# soup.find('div', attrs={'style': 'color: #3a5fac; padding-top: 5px; font-size: 18px; '})

In [100]:
# !pip install geopy

# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="specify_your_app_name_here")
# location = geolocator.geocode("88 Park Lawn Rd 2717, Mimico, Toronto M8Y0B5")
# print(location.address)
# print((location.latitude, location.longitude))


In [119]:
# Phase 2: Impact of Economic indicators
# interest rates ex. Bank of Canada
# index price ex. S&P 500 / past months trajectory
# Inflation Rate
# real estate etfs
# Employment Rate