# CKME136 - Capstone Project - Toronto Real Estate Listings
## Step 1: Data Extraction - Web Scraping with requests & BeautifulSoup

<div class="alert alert-block alert-info">
1A. Imports for webscraping and data cleaning

In [291]:
import csv
import itertools
import random
import re
import time

import requests
from bs4 import BeautifulSoup
# Beautiful Soup Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

<div class="alert alert-block alert-info">
1B. Create a list of all Toronto listing pages
    
- example list page https://listing.ca/mls/?.cy.........12..$


In [23]:
# Module-level accumulator; all_listing_page_urls() appends to it in place.
listing_page_urls = []


def all_listing_page_urls(url_start, url_end, pages):
    """Append one search-result page URL per page number to listing_page_urls.

    Each URL is ``url_start + <page number> + url_end``. Page numbers run
    from 1 up to, but not including, ``pages``. Nothing is returned; the
    module-level list is extended in place.
    """
    listing_page_urls.extend(
        [url_start + str(page_number) + url_end
         for page_number in range(1, pages)]
    )


# URL pieces for the Toronto search-result pages; the page number is
# inserted between the two.
toronto_listing_url_start = 'https://listing.ca/mls/?.cy.........'
toronto_listing_url_end = '..$'
# Used as the exclusive upper bound of the page range, so page numbers
# 1 through 174 are generated.
max_pages = 175

# Fill the module-level listing_page_urls list with the page URLs.
all_listing_page_urls(toronto_listing_url_start, toronto_listing_url_end, max_pages)

<div class="alert alert-block alert-info">
1C. Create CSV of specific listing urls
    
- https://toronto.listing.ca/168-bonis-ave-1211.E4349723.htm#15-1dr

In [26]:
def individual_listing_urls(listing_page_urls):
    """Collect the unique listing URLs linked from the search-result pages.

    Every page in ``listing_page_urls`` is downloaded, all ``<a>`` tags are
    gathered, and the hrefs that start with the Toronto listing host
    followed by a digit (a street number) are de-duplicated, sorted and
    written as a single row to ``individual_listing_urls.csv``.

    Args:
        listing_page_urls: iterable of search-result page URLs.

    Returns:
        None. The result is the CSV file written to the working directory.
    """
    delays = [0.3, 0.4, 0.7, 0.9, 1.0, 1.3]
    # regex for urls that have street numbers (dots escaped so they only
    # match a literal '.')
    listing_url_pattern = re.compile(r'https://toronto\.listing\.ca/[0-9]')

    # extract the 'a' tags from every page in the list of page urls
    anchors = []
    for page_url in listing_page_urls:
        html = requests.get(page_url).text
        # name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed
        soup = BeautifulSoup(html, 'html.parser')
        anchors.extend(soup.find_all('a'))

        # sleep between requests to avoid getting blocked
        time.sleep(random.choice(delays))

    # anchors without an href yield None, which the regex cannot match
    # against, so those are skipped explicitly
    hrefs = (anchor.get("href") for anchor in anchors)
    unique_listing_urls = sorted(set(
        href for href in hrefs
        if href is not None and listing_url_pattern.match(href)
    ))

    # save urls in a csv (one row holding every url); sorted so that the
    # position of each url is reproducible between runs
    with open('individual_listing_urls.csv', 'w') as f1:
        writer = csv.writer(f1, delimiter=',', lineterminator='\n')
        writer.writerow(unique_listing_urls)

In [None]:
# Download every search-result page and write the unique listing urls to
# individual_listing_urls.csv
individual_listing_urls(listing_page_urls)

<div class="alert alert-block alert-info">
1D. Create individual text files of raw html text for each listing

In [33]:
def raw_indiviudal_listing_scraper():
    """Download each listing page and save its HTML to a numbered text file.

    Reads the listing URLs from ``individual_listing_urls.csv`` and writes
    the page at position ``i`` to ``listing_html_<i>.txt``, encoded as
    UTF-8. Returns None.
    """
    with open("individual_listing_urls.csv", 'r') as f1:
        reader = csv.reader(f1)
        # transpose the rows and chain the columns to get one flat list of urls
        individual_listing_urls = list(itertools.chain.from_iterable(zip(*reader)))

    delays = [0.3, 0.4, 0.7, 0.9, 1.0, 1.3, 1.5]

    for i, listing in enumerate(individual_listing_urls):
        # fetch first, so a failed request does not leave an empty file behind
        html_bytes = requests.get(listing).text.encode('utf-8')

        # binary mode because the payload is already encoded; the with-block
        # closes the file even if the write fails
        with open("listing_html_" + str(i) + ".txt", "wb") as txt_file:
            txt_file.write(html_bytes)

        # pause on every 7th listing to avoid getting blocked
        if i % 7 == 0:
            time.sleep(random.choice(delays))

In [None]:
# Fetch every url in individual_listing_urls.csv and save the raw HTML to
# listing_html_<i>.txt
raw_indiviudal_listing_scraper()

<div class="alert alert-block alert-info">
5. Create a sample listing_url CSV and use it to build the unique key set

- Use sample of 100 listings to determine key set for rooms

In [270]:
# make sample csv
SAMPLE_SIZE = 100

# NOTE: the full list is deliberately not named ``individual_listing_urls``;
# assigning that name at module level would replace the function of the
# same name defined earlier in this file.
with open('individual_listing_urls.csv', 'r') as f1:
    all_listing_urls = [url for row in csv.reader(f1) for url in row]

# keep only the first SAMPLE_SIZE urls
sample_listing_urls = all_listing_urls[:SAMPLE_SIZE]

# written as a single row, the same layout as the full url csv
with open('sample_individual_listing_urls.csv', 'w') as f2:
    writer = csv.writer(f2, delimiter=',', lineterminator='\n')
    writer.writerow(sample_listing_urls)

In [32]:
# print list of unique keys
def key_finder_indiviudal_listing_scraper():
    """Print the distinct room labels found in the sampled listing pages.

    For every url in ``sample_individual_listing_urls.csv`` the saved page
    ``listing_html_<i>.txt`` is parsed. The ``lpc15`` divs from the room
    section onwards are treated as values, and the node immediately before
    each one is taken as its label. The set of labels is printed as a list.
    Returns None.
    """
    with open("sample_individual_listing_urls.csv", 'r') as f1:
        reader = csv.reader(f1)
        individual_listing_urls = list(itertools.chain.from_iterable(zip(*reader)))

    # when the third lpc15 div starts with two digits the room section
    # begins two divs later (index 10 instead of 8)
    leading_digits = re.compile('([0-9]){2}')
    unique_room_keys = set()

    for i, _ in enumerate(individual_listing_urls):
        with open("listing_html_" + str(i) + ".txt", 'r') as listing:
            soup = BeautifulSoup(listing, 'html.parser')

        # look the divs up once per page instead of once per room
        lpc15_divs = soup.findAll("div", {"class": "lpc15"})
        if leading_digits.match(lpc15_divs[2].text) is None:
            start_range = 8
        else:
            start_range = 10

        for room_div in lpc15_divs[start_range:]:
            # the label is the text of whatever node precedes the value div
            label_markup = str(room_div.previousSibling)
            unique_room_keys.add(BeautifulSoup(label_markup, 'html.parser').text)

    # parenthesised so the line is valid on both Python 2 and Python 3
    print(list(unique_room_keys))

# Print the room labels found in the sample listings
key_finder_indiviudal_listing_scraper()

[u'OFFICE', u'FAMILY ROOM', u'DINING ROOM', u'DEN', u'REC', u'FOYER', u'2ND BEDROOM', u'BATHROOM', u'BATHROOMS', u'LIBRARY', u'EXERCISE', u'KITCHEN', u'LIVING ROOM', u'MASTER BEDROOM', u'STUDY', u'4TH BEDROOM', u'3RD BEDROOM', u'OTHER', u'LEGAL DESCRIPTION', u'5TH BEDROOM', u'SUNROOM', u'GREAT ROOM', u'LAUNDRY', u'SITTING', u'SOLARIUM', u'LOCKER', u'BEDROOM', u'BREAKFAST', u'MEDIA/ENT', u'UTILITY']


<div class="alert alert-block alert-info">
1E. Extract rooms features from individual listing urls
    
- ex. FAMILY ROOM, BEDROOM, BATHROOM

In [33]:
#to extract rooms data from a listing
def rooms_indiviudal_listing_scraper():
    """Write one CSV row of room details for every saved listing page.

    For each url in ``individual_listing_urls.csv`` the saved page
    ``listing_html_<i>.txt`` is parsed. The ``lpc15`` divs from the room
    section onwards supply the values, and the node immediately before each
    div supplies its label. The rows are written to
    ``individual_listing_rooms.csv`` with a fixed set of columns; labels
    outside that set are ignored and absent labels are left empty.
    Returns None.
    """
    with open("individual_listing_urls.csv", 'r') as f1:
        reader = csv.reader(f1)
        individual_listing_urls = list(itertools.chain.from_iterable(zip(*reader)))

    # when the third lpc15 div starts with two digits the room section
    # begins two divs later (index 10 instead of 8)
    leading_digits = re.compile('([0-9]){2}')
    all_rooms_dicts = []

    for i, listing_url in enumerate(individual_listing_urls):
        with open("listing_html_" + str(i) + ".txt", 'r') as listing:
            soup = BeautifulSoup(listing, 'html.parser')

        # look the divs up once per page instead of once per room
        lpc15_divs = soup.findAll("div", {"class": "lpc15"})
        if leading_digits.match(lpc15_divs[2].text) is None:
            start_range = 8
        else:
            start_range = 10

        room_dict = {'listing_url': listing_url}
        for room_div in lpc15_divs[start_range:]:
            # the label is the text of whatever node precedes the value div
            label_markup = str(room_div.previousSibling)
            label = BeautifulSoup(label_markup, 'html.parser').text
            # a label that occurs more than once keeps only its last value
            room_dict[label] = room_div.text

        all_rooms_dicts.append(room_dict)

    keys = [
        'listing_url', 'OFFICE', 'FAMILY ROOM', 'DINING ROOM', 'DEN', 'REC',
        'FOYER', '2ND BEDROOM', 'BATHROOM', 'BATHROOMS', 'LIBRARY',
        'EXERCISE', 'KITCHEN', 'LIVING ROOM', 'MASTER BEDROOM', 'STUDY',
        '4TH BEDROOM', '3RD BEDROOM', 'OTHER', 'LEGAL DESCRIPTION',
        '5TH BEDROOM', 'SUNROOM', 'GREAT ROOM', 'LAUNDRY', 'SITTING',
        'SOLARIUM', 'LOCKER', 'BEDROOM', 'BREAKFAST', 'MEDIA/ENT', 'UTILITY',
    ]
    # text mode with an explicit line terminator, the same way the other
    # csv files in this notebook are written
    with open('individual_listing_rooms.csv', 'w') as f2:
        dict_writer = csv.DictWriter(f2, keys, extrasaction='ignore',
                                     lineterminator='\n')
        dict_writer.writeheader()
        dict_writer.writerows(all_rooms_dicts)

# Parse the saved listing pages and write individual_listing_rooms.csv
rooms_indiviudal_listing_scraper()

<div class="alert alert-block alert-info">
1F. Extract other listing features from individual listing urls

- ex. comparable sold data, listing price, dwelling type, address

In [286]:
#Functions to extract sold comparable data
def _nth_sold_value(data, occurrence):
    """Return the value that follows the ``occurrence``-th ``data`` label.

    Searches the module-level ``comparables_class`` element for text
    matching the regex ``data``, steps forward to the requested occurrence
    (1 = first match), and returns the first child of the next ``div``.
    Returns the string '0' when any step of the lookup fails.
    """
    try:
        label_pattern = re.compile(data)
        node = comparables_class.find(text=label_pattern)
        for _ in range(occurrence - 1):
            node = node.findNext(text=label_pattern)
        return node.findNext('div').contents[0]
    except Exception:
        # best effort: a comparable that is not present is recorded as '0'
        return '0'


def sold_data_1(data):
    """Return the value after the 1st ``data`` label, or '0' if not found."""
    return _nth_sold_value(data, 1)


def sold_data_2(data):
    """Return the value after the 2nd ``data`` label, or '0' if not found."""
    return _nth_sold_value(data, 2)


def sold_data_3(data):
    """Return the value after the 3rd ``data`` label, or '0' if not found."""
    return _nth_sold_value(data, 3)


def sold_data_4(data):
    """Return the value after the 4th ``data`` label, or '0' if not found."""
    return _nth_sold_value(data, 4)


def sold_data_5(data):
    """Return the value after the 5th ``data`` label, or '0' if not found."""
    return _nth_sold_value(data, 5)

In [287]:
#to extract features from a listing including sale price, sold comparables, listing url, address, community, type of dwelling, property, features, description, extras
def _lpc15_text_cell(lpc15_divs, index):
    """Return ``[text]`` for the div at ``index``, or None when it is absent."""
    try:
        return [lpc15_divs[index].text]
    except IndexError:
        return None


def other_features_indiviudal_listing_scraper():
    """Write the non-room features of every saved listing page to a CSV.

    For each url in ``individual_listing_urls.csv`` the saved page
    ``listing_html_<i>.txt`` is parsed and one row is built: listing url,
    listing price and date (from the ``sales`` div), address and dwelling
    type (first two ``lpc15`` divs), sold price / list price / sold date for
    up to five comparables ('0' where missing), and the features,
    description and extras divs. The rows are written, with a header, to
    ``individual_listing_other_features_v3.csv``. Returns None.

    Side effect: rebinds the module-level ``comparables_class`` for every
    listing, because the ``sold_data_*`` helpers read it from there.
    """
    # must be declared before the first assignment in the function
    global comparables_class

    with open("individual_listing_urls.csv", 'r') as f1:
        reader = csv.reader(f1)
        individual_listing_urls = list(itertools.chain.from_iterable(zip(*reader)))

    # required to account for some listings with property layout element:
    # when the third lpc15 div starts with two digits, the features,
    # description and extras divs sit one position later
    leading_digits = re.compile('([0-9]){2}')
    comparable_lookups = (sold_data_1, sold_data_2, sold_data_3,
                          sold_data_4, sold_data_5)
    comparable_labels = ('Sold Price', 'List Price', 'Sold Date')

    all_other_features = []

    for i, listing_url in enumerate(individual_listing_urls):
        with open("listing_html_" + str(i) + ".txt", 'r') as listing:
            soup = BeautifulSoup(listing, 'html.parser')

        # look each group of elements up once per page
        sales_children = soup.find("div", {"class": "sales"}).findChildren()
        lpc15_divs = soup.findAll("div", {"class": "lpc15"})

        x = [
            listing_url,             # listing url and MLS ID
            sales_children[3].text,  # sale price
            sales_children[1].text,  # listing date
            lpc15_divs[0].text,      # address, community
            lpc15_divs[1].text,      # type of dwelling
        ]

        #Comparables
        # The previous filter was a dict literal with the "class" key given
        # twice ("sales", then "comparables"); only the last value survives
        # in such a literal, so this is the lookup that was actually run.
        comparables_class = soup.find("div", {"class": "comparables"})
        for lookup in comparable_lookups:
            for label in comparable_labels:
                x.append(lookup(label))

        #property features, description, and extras
        if leading_digits.match(lpc15_divs[2].text) is None:
            feature_indexes = (2, 6, 7)
        else:
            feature_indexes = (3, 7, 8)
        # NOTE(review): each of these cells is a one-element list, so the csv
        # holds the list's repr rather than the bare text. Kept as-is in case
        # later cleaning steps rely on it -- confirm whether that is intended.
        for index in feature_indexes:
            x.append(_lpc15_text_cell(lpc15_divs, index))

        all_other_features.append(x)

    keys = ['listing_url', 'listing_price', 'listing_date', 'address', 'dwelling_type',
            'comparable_sold_price_1', 'comparable_list_price_1', 'comparable_sold_date_1',
            'comparable_sold_price_2', 'comparable_list_price_2', 'comparable_sold_date_2',
            'comparable_sold_price_3', 'comparable_list_price_3', 'comparable_sold_date_3',
            'comparable_sold_price_4', 'comparable_list_price_4', 'comparable_sold_date_4',
            'comparable_sold_price_5', 'comparable_list_price_5', 'comparable_sold_date_5',
            'listing_features', 'listing_description', 'listing_extras']

    with open('individual_listing_other_features_v3.csv', 'w') as f2:
        writer = csv.writer(f2, delimiter=',', lineterminator='\n')
        writer.writerow(keys)
        writer.writerows(all_other_features)

# Parse the saved listing pages and write
# individual_listing_other_features_v3.csv
other_features_indiviudal_listing_scraper()

  global comparables_class
