In [9]:
import pandas as pd
import csv
import json
import time
import random
import chromedriver_autoinstaller

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from lxml import etree

![alt text](images/tag_venue_home_page.png)
# Tagvenue Venue Web Scrape
### Introduction 

The [Tagvenue](https://www.tagvenue.com/) website is basically an Air BnB for finding and booking venues for an event. The website hosts thousands of venues in the UK that can be booked for events such as weddings, work drinks, birthdays etc. Each venue has one or more **spaces** available to be booked. A **space** is basically a room or area within the venue. Some venues have just a single space, often the whole venue, whilst others offer a selection of rooms, each offered as a separate space. Each Space has its own webpage on Tagvenue. This webpage contains all the data needed to choose which space to book for your event. Example data includes price, location, size, capacity, features, licensing etc. This notebook will scrape the data from all spaces on the [Tagvenue](https://www.tagvenue.com/) website that are located in **London**. 

### Key Variables
The following key variables define and tweak the specifics of the web scrape: 

- **progress_report_interval** - Periodic progress reports (% completed) are printed during scraping. This variable defines in seconds how often the report is output. 
- **connection_error_retry_time** - This defines how long in seconds the program will wait before trying to re-load a webpage when it fails to load due to a connection error. 
- **headless_mode** - Set to *True* if you want chrome to be launched in headless mode i.e. not visible. Set to *False* if you wish chrome to be visible while scraping.  
- **longitude_min**, **longitude_max**, **latitude_min** and **latitude_max** - Defines the area that will be searched for venues. The intersection of the four longitude / latitude lines defines a square area.

In [10]:
progress_report_interval = 1800 #1800 for normal run, 300 for test
connection_error_retry_time = 300  # 300 for normal run, 30 for test
# Set True to have chrome open in headless mode 
headless_mode = False 
# longitude and latidue max and min define four lines, the intersection 
# of these lines defines a square area used for the venue search
# Normal run values, comment out when not wanted 
latitude_min = 51.326626 
latitude_max = 51.7297765
longitude_min = -0.446500003
longitude_max = 0.2190751
# Test Values, comment out when not wanted 
#longitude_min = -0.100501
#longitude_max = -0.059614
#latitude_min = 51.494423
#latitude_max = 51.50697

### Initiate Web Scraper
We will use Selenium and Chromedriver / Chrome to crawl the Hire Space website and download data. An initial check is performed by *chromedriver_autoinstaller()* to ensure chromedriver is up to data. If it is not, then the latest version is downloaded. Selenium then initiates an instance of chrome that it can control. This instance will either be visible or invisible (headless mode) depending on the *headless_mode* variable.  

In [28]:
# Check if the current version of chromedriver exists
# and if it doesn't exist, download it automatically,
# then add chromedriver to path
chromedriver_autoinstaller.install()
# If headless_mode was True, open chrome in headless mode, 
# otherwise open a visible chrome browser
if (headless_mode):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
else:
    # Initialise chromedriver
    driver = webdriver.Chrome()

### Define Page Load Function
We will frequently load new webpages with Selenium. We want to wait a certain amount of time between successive page loads to minimise our impact on the server and avoid being detected as a bot. We also want to detect any connection errors that might occur during the loading of a page for example due to a wifi issue. For this purpose, we created the *load_page* function. This function basically takes a url then loads it into chrome once enough time has passed since the last webpage was loaded in chrome. 

The function has some extra features detailed below: 
- Handles *timeout* (page took longer than 30 seconds to load) and *connection* errors (couldn't connect to internet). In either case, the programme will wait some time then try to reload the page. If it still fails, the error is logged and any data scraping for that page is aborted.  
- Measures time it took for last page to load and then waits double this time (with a small random offset to appear less bot like) before the next page is loaded. This dynamically increases or decreases the frequency of requests according to how well the webserver is coping, ensuring we don't overwhelm it.   
- It only needs to take a single argument, the url you wish to load. However, it can take a second argument to assist with error reporting when the url you are loading did not come directly from the Hire Space search results. In other words, if you clicked on a search result and on the subsequent page you clicked on a link, it can be useful to connect this latest link to the original hire space search result url. This can be done by including the Hire Space search result url as the second argument.

In [12]:
# Set chromedriver timeout error to trigger if page takes more 
# than 30 seconds to load
driver.set_page_load_timeout(30)
# Initialise error log for page loading
scraping_error_log = []
# Note - the below function returns True when no errors occur 
# during page load and is designed to be put within an 'if' 
# i.e. if(load_page(url)): to only do the steps in the 'if' 
# when the page load doesn't have errors
def load_page(url):
    """Load provided url in chrome then sleep for interval of time. 
    
    Handles and logs timout and connection errors. Calculates 
    the time to wait by multiplying the time it took the page to 
    load by 2, then adding some random offset.
    
    keyword arguments: 
    url -- url you wish to load in chrome
    main_url -- Used for error logging. Provide the search results url
    whilst scraping spaces within a venue. 
    
    Returns -- True if page load was successful, returns False if there 
    was an error
    """
    
    try:
        # Loads url in chrome and calculates the time it took to load page
        time_of_request = time.time()
        driver.get(url)
        page_load_time = time.time() - time_of_request
        # Calculate time required to wait before next url is 
        # loaded
        wait_time_till_next_request = wait_time_calculation(page_load_time)
        time.sleep(wait_time_till_next_request)
        # Returns True to indicate page load had no errors
        return True
    # Execution pauses if timeout or connection issue occurs 
    except (TimeoutException, WebDriverException) as e:
        time.sleep(connection_error_retry_time)  
        try:
            # Loads url in chrome and calculates the time it took to load page
            time_of_request = time.time()
            driver.get(url)
            page_load_time = time.time() - time_of_request
            # Calculate time required to wait before next url is 
            # loaded (next time load_page is called)
            wait_time_till_next_request = wait_time_calculation(page_load_time)
            time.sleep(wait_time_till_next_request)
        except TimeoutException:
            scraping_error_log.append([url, 
                              'page failed to load, web page timed out'])
            # Returns False to indicate page load had an error
            return False
        except WebDriverException:
            scraping_error_log.append([url, 
                              'page failed to load, no internet connection'])
            # Returns False to indicate page load had an error
            return False
            
def wait_time_calculation(page_load_time):
    """Returns time required to wait before loading next url
    
    The wait time is 2 times the page_load_time, with 
    random variation"""
    
    average_wait_time = 2 * page_load_time
    upper_wait_time = 1.33333 * average_wait_time
    lower_wait_time = 0.77777 * average_wait_time
    return random.uniform(lower_wait_time, upper_wait_time)

### Search for Venues
We will use Tagvenue's [search page](https://www.tagvenue.com/) to find all venues located in London. The Tagvenue search requires an 'event type' to be chosen for the search. There are around **190** different 'event types' available to choose from. To find all venue's in London, we will have to repeat the search for all 190 available 'event types'. Below  we scrape the 'event type' options from the Tagvenue [search page](https://www.tagvenue.com/).  

In [13]:
tagvenue_search_page_url = 'https://www.tagvenue.com/'
if (load_page(tagvenue_search_page_url)):
    # Find event type html input element
    form_event_type_input = driver.find_element_by_xpath("//input[@name='room_tag_autocomplete']")
    # Click on event type html input element - this loads the 'event 
    # type' html elements that contain the event type options into the
    # webpage html
    form_event_type_input.click()
    # Find event type html elements 
    form_event_types_elements = driver.find_elements_by_xpath("//div[@class='autocomplete-suggestions']//div")
    # Extract text from event types html elements 
    form_event_types = [element.get_attribute('innerHTML')for element in form_event_types_elements]
    # replace spaces with '-', to make the event type conform to the 
    # url format used by Tagvenue - remove, from old approach 
    #event_types = [item.replace(' ','-') for item in event_types]
else: raise Exception('page load error - cannot find event types')

print(f"There are {len(form_event_types)} event types on Tagvenue")

There are 190 event types on Tagvenue


Not all event types are input in the url in the same way as shown in th auto-completion list. We correct these differences here manually. 

In [14]:
def find_url_event_type(event_type):
    load_page(tagvenue_search_page_url)
    event_type_input = driver.find_element_by_xpath("//input[@name='room_tag_autocomplete']")
    event_type_input.send_keys(event_type)
    event_type_input.send_keys(Keys.ENTER)
    search_button_element = driver.find_element_by_xpath("//button[@class='c-button-cta c-button-cta--big js-hero-search']")
    search_button_element.click()
    time.wait(5)
    search_url = driver.current_url
    end_of_event_type = search_url.find('?')
    start_of_event_type = search_url.rfind('/', 0, end_of_event_type) + 1
    return search_url[start_of_event_type : end_of_event_type]

In [15]:
# Ignore, left in for testing / debugging purposes
#form_event_types = form_event_types[0:5]

In [16]:
url_event_types = []
for form_event_type in form_event_types: 
    url_event_type = find_url_event_type(form_event_type)
    url_event_types.append(url_event_type)

In [17]:
len(url_event_types)

5

In [18]:
url_event_types

['18th-birthday-party',
 '30th-birthday-party',
 '40th-birthday-party',
 '50th-birthday-party',
 'academic-venues']

In [19]:
# Tagvenues changes the longitude and latitude values of the url in 
# Chrome after you load the url, so you need to recreate the whole url
# whenever you change page or event type to keep the results within 
# the desired longitude and latitude range. 
def create_search_url(event_type, page):
    """Build and return search url string"""
    return f"""https://www.tagvenue.com/uk/search/{event_type}?
           longitude_from={longitude_min}&longitude_to={longitude_max}
           &latitude_from={latitude_min}&latitude_to={latitude_max}&page={page}"""

We will incorporate the latitude, longitude and radius variables from above to define a url that will return all London venues in the search results. We then load this url into chrome, allowing us to see the search results. 

In [20]:
def find_total_results_pages():
    """Returns the number of pages of search results showing in Chrome"""
    # Find pagination html elements - these create the clickable page
    # numbers and arrows at bottom of search results page to naviagte 
    # through search results pages 
    pagination_elements = driver.find_elements_by_xpath("//div[@class='results-pagination results-pagination--center']/ul/li/a")
    # Convert pagination elements to text values  
    pagination = [element.get_attribute('innerHTML') for element in pagination_elements]
    # If list is not empty i.e. len > 0 then return second last 
    # element - this is the total number of pages
    if (len(pagination) > 1):
        return int(pagination[-2])
    # If list empty, then there is only one page, return 1
    else: return 1

In [None]:
# This block is just for creating a search url for debugging purposes 
event_type = 'pop-up-event'
#event_type = 'corporate-event'
page = 1
search_url = create_search_url('kids-partybus', page)
load_page(search_url) 

In [21]:
def get_space_urls():
    # Find the html elements of the urls of spaces returned by the search
    search_result_url_elements = driver.find_elements_by_xpath("//div[@class='v-search-results-items']/div/a")
    if(len(search_result_url_elements) == 0):
        try:
            no_search_results_message_element = driver.find_element_by_xpath("//h3")
            no_search_results_message = no_search_results_message_element.get_attribute('innerHTML')
            no_search_results_message = no_search_results_message.replace("'", "").lower().strip()
            expected_message = ('sorry, we couldnt find any venues matching your criteria.')
            if(no_search_results_message != expected_message):
                scraping_error_log.append([search_url, 'search url failed'])
        except NoSuchElementException: 
            scraping_error_log.append([search_url, 'search url failed'])

    return [element.get_attribute('href') for element in search_result_url_elements]

- Need a tagvenue specific failed url test OOPs error!!! - need errors to include event_type
- add progress complete and time taken bit
- manually fix event type issue - or worse case use selenium to manually search through 190 different auto-complete options and add the correct bit...

In [None]:
# Ignore, left in for testing / debugging purposes
#event_types = event_types[0:7]

In [30]:
space_urls = []
time_last_update = time.time()
total_event_types = len(url_event_types)

for event_number, event_type in enumerate(url_event_types): 
    if (time.time() - time_last_update > progress_report_interval):
        print(f"Scraped {event_number} of {total_event_types} event_types")
        pages_of_urls_scraped = len(space_urls)/36
        print(f"Approximately {pages_of_urls_scraped:0} pages of search results scraped\n")
        time_last_update = time.time()
    search_url = create_search_url(event_type, 1)
    load_page(search_url)
    total_pages = find_total_results_pages()
    for current_page in range(1, total_pages + 1):
        space_urls.extend(get_space_urls())
        if(current_page < total_pages):
            search_url = create_search_url(event_type, current_page + 1)
            load_page(search_url)

0.0003581047058105469
Scraped 1 of 5 event_types
Approximately 0.25 pages of search results scraped

0.1997997760772705
0.6479842662811279
Scraped 2 of 5 event_types
Approximately 2.138888888888889 pages of search results scraped

0.4792027473449707
0.5636961460113525
Scraped 3 of 5 event_types
Approximately 4.027777777777778 pages of search results scraped

0.4968390464782715
0.5072238445281982
Scraped 4 of 5 event_types
Approximately 5.916666666666667 pages of search results scraped

0.4110429286956787


note: if page fails to load then it will result in search url failed error as well - need to rerun whole search url and scrape all its pages rather than just redoing the pages that failed in case the 'total pages' calculation was incorrect due to page load error and calculated it as 1 

In [150]:
scrape_errors = pd.DataFrame(scraping_error_log, columns = ['url','error'])
# Function to make urls clickable in jupyter
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

scrape_errors.style.format({'url': make_clickable})

Unnamed: 0,url,error


In [151]:
space_urls_unique = list(set(space_urls))
print(f"There are {len(space_urls_unique)} spaces to scrape")

There are 5505 spaces to scrape


In [157]:
# Save space_urls_uniqe to file (as json)
with open("space_urls.json", 'w') as f:
    # indent=2 is not needed but makes the file human-readable
    json.dump(space_urls_unique, f, indent=2) 

If you wish to load a saved list of space urls, remove #s and run the below. 

In [157]:
#with open("space_urls.json", 'r') as f:
#    space_urls_unique = json.load(f)

#print(f"There are {len(space_urls_unique)} spaces to scrape")

In [233]:
urls_df = pd.DataFrame(space_urls_unique)

urls_with_qmark = urls_df[urls_df[0].str.contains('\?')].shape[0]
urls_with_qmark_event_offer = urls_df[urls_df[0].str.contains('\?event-offer')].shape[0]
if (urls_with_qmark == urls_with_qmark_event_offer): 
    print(f"All urls with a '?' are of form '?event-offer'" )
else: 
    print(f"There are urls with '?' not of the form '?event-offer'")

space_urls_cleaned = urls_df[~urls_df[0].str.contains('\?event-offer')].values.tolist()
print(f"Removed {urls_with_qmark_event_offer} urls containing '?event-offer' \nThere are now {len(space_urls_cleaned)} space urls")

All urls with a '?' are of form '?event-offer'
Removed 1017 urls containing '?event-offer' 
There are now 4488 space urls


Below shows all urls that contain a '?' but don't contain 'event-offer'

In [230]:
view = urls_df[(urls_df[0].str.contains('\?'))&(~urls_df[0].str.contains('event-offer'))]
view.style.format({0: make_clickable})

Unnamed: 0,0


The below code is useful for searching through the venue urls to see the different venues and to find different packages available at the venues. 

In [246]:
# Switch between space_urls_unique and space_urls_cleaned to 
# get with and without packages, and to find specific venues 
# or venues with packages e.g. search for '\?event-offer=wedding' 
# to get venues with wedding packages
urls_df = pd.DataFrame(space_urls_unique)
view = urls_df[urls_df[0].str.contains('south')]
view.style.format({0: make_clickable})

Unnamed: 0,0
280,https://www.tagvenue.com/rooms/london/3903/tanner-warehouse/tanner-warehouse-courtyard
2055,https://www.tagvenue.com/rooms/london/3308/tanner-warehouse/industrial-wedding
3639,https://www.tagvenue.com/rooms/london/321/tanner-warehouse/tanner-warehouse


In [None]:
import sys
from operator import itemgetter
local_vars = list(locals().items())
# Size gives us variable size in Bytes
size = [[var,sys.getsizeof(obj)] for var, obj in local_vars]
size = sorted(size, key=itemgetter(1), reverse = True)
for var, size in size:
    print(var,f"-> {size/1000000:,} MB")

In [130]:
get_space_urls()

[]

In [66]:
for b in ['a','b','c']:
    for a in range(1,2 + 1):
        print(a,b)

1 a
2 a
1 b
2 b
1 c
2 c


In [68]:
page = 1
search_url = create_search_url('Christmas-Dinner', page)
load_page(search_url) 

True

In [None]:
event_types

# Collect Venue URLs from Search Results
The Hire Space search results show 18 results on the page, where each result is a clickable picture of a venue. These venue's are never duplicated in the results, you only see the venue once in the results page even if the venue has multiple spaces available within it. To see more results, you must click the 'show more' button at the bottom of the page, which will reveal a further 18 results. This can be repeated until all results are showing.

We will use Selenium to click 'show more' every 10 seconds (to minimise the impact on Hire Space's servers) until all venues returned by the search are made visible. 

We then scrape the url that each search result links to when you click on it and store these urls in a list. This list of urls forms the scope of our web scrape - we want to scrape data from every venue in this list and therefore from each of these urls. We call these the *venue urls* because they provide a unique link from the search results to the venue (since a venue never appears twice in the search results). 

In [6]:
# Flag to indicate 'show more' button is clickable 
more_search_results = True
# Loop that clicks on 'show more' until all search results are visible 
# (loops until you can no longer click on 'show more') 
while (more_search_results):
    # Finds the html element of the 'show more' button
    show_more = driver.find_element_by_xpath(
        "//button[@class='btn btn-default btn-large btn-block']")
    # Make chrome click the 'show more' button. If this causes an error, 
    # then end the while loop (because all results are now visible)
    try:
        # Wait 10 seconds to minimise server impact
        time.sleep(10)
        show_more.click()
    except ElementNotInteractableException:  
        # End while loop
        more_search_results = False
    
# Finds and places the html element of each venue returned by the
# search (each venue appears as a clickible picture in chrome) into a list   
venues = driver.find_elements_by_xpath("//div[@class='searchresult']/a")
    
# Extract url each search result html element links to
venue_urls = [venue.get_attribute('href') for venue in venues]
print('There are', len(venue_urls), 'venues')

There are 2729 venues


In [7]:
# Save venue url list to file as a csv
with open('venue_url_list', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(venue_urls)

### Venue Crawl Strategy
Each venue has one or more spaces and each space has a dedicated webpage. We will scrape the full html code of every space's web page for all venues returned in the search. If *scrape_spaces* was set to False, we will only scrape a single space webpage from each venue. 

We scrape the full web page rather than scraping individual elements to avoid unexpected errors when scraping e.g. from an inconsistent html element. The web scraping process takes several hours to complete due to waiting several seconds between subsequent page loads. Once we have scraped the full set of web pages, we will extract the specific data we want. Any errors encountered due to inconsistent html or incompletely loaded pages etc. can be fixed quickly, without crashing the slow web scraping process.

The venue urls from the search results page each link to a different venue. They will take you to the web page of a space within the venue. If the venue offers more than one space, a button will be visible called 'See All Spaces Here'. This links to a general overview page for the venue, showing clickable links to all spaces within the venue. 

The following process was used to scrape the space web pages from a single venue url: 

1. Load venue url from search results page -> takes you to web page for a space in the venue
2. Save full page html code 
3. If 'See All Spaces Here' button exists
   1. Load 'See All Spaces Here' url -> takes you to venue overview page  
   2. Gather urls of each space within venue from venue overview page
   2. load each space url in turn and scrape full page html

If an error occurs when loading a webpage during this process, the scrape is aborted for that venue and the error is logged. 

In [8]:
# Stores the url and full html for every space web page scraped. 
# Each element of raw data is a list [url, html]. 
raw_data = []

def scrape_venue(venue_url):
    """Scrapes full html of all the spaces in the venue related to the 
    provided venue_url"""
    
    global error_flag
    # Loads the venue url in chrome. This opens the web page for a space
    # in the venue.  
    load_page(venue_url)
    # If there was a page load error, break out of function, dont scrape the venue
    if (error_flag):
        error_flag = False
        return
    # Saves html of the space web page currently showing in chrome 
    raw_data.append([venue_url, driver.page_source])
    # Find html element for 'see all spaces here' button, returns a list
    # of length 0 if the button is not available
    see_all_spaces_here = driver.find_elements_by_xpath(
        "//a[@class='btn btn-default btn-block btn-lgText']")
    # If more spaces exist that need to be downloaded (if 'see all spaces 
    # here' button exists) and scrape_spaces is True, load the venue 
    # overview, which contains the other spaces to be scraped. 
    if (len(see_all_spaces_here) == 1) & (scrape_spaces):
        venue_overview_url = see_all_spaces_here[0].get_attribute('href')
        load_page(venue_overview_url, venue_url)
        # If there was a page load error, break out of function, dont scrape
        # the rest of venue
        if (error_flag):
            error_flag = False
            return
        # get html elements for all spaces in venue
        all_spaces_for_venue = driver.find_elements_by_xpath(
            "//a[@class='btn']")
        # Extract url for every space in venue
        all_space_urls_for_venue = [space.get_attribute('href') 
                                    for space in all_spaces_for_venue]
        # Need to avoid re-scraping the space page loaded by venue_url. 
        # This is done by matching the space name in the venue_url with
        # the space name in the urls taken from the venue overview page.
        # Extract space name from venue_url
        space_name_from_venue_url = venue_url.split("/")[-2]
        for space_url in all_space_urls_for_venue:
            # Extract space name from url taken from overview page
            space_name_from_overview_url = space_url.split("/")[-2]
            # Checks url doesnt link to same space as venue_url 
            if (space_name_from_overview_url != space_name_from_venue_url):
                load_page(space_url, venue_url)
                # If there was a page load error, break out of function,
                # dont scrape the rest of venue
                if (error_flag):
                    error_flag = False
                    return
                # Saves html of the space web page currently showing in chrome
                raw_data.append([space_url, driver.page_source])

### Scrape Venues
The process described above was applied to every venue url taken from the search results page to scrape all the required venue data. This is a slow process due to the waiting several seconds between subsequent page loads, adding up to hours of waiting time.

A periodic process is run during the scraping to provide progress updates and save the scraped data to file in case of any crashes. 

In [9]:
total_venues = len(venue_urls)
time_last_progress_update = time.time() 
scraping_time = 0

# Loop through venue urls and scrape data from each venue. 
for venue_num, venue_url in enumerate(venue_urls):
    scrape_venue(venue_url)
    # Provide progress update and hard save scraped data every 30 mins 
    # in case of interruption to scrape
    if time.time() - time_last_progress_update > backup_interval:
        perc_complete = ((venue_num / total_venues))
        scraping_time += 0.5
        print(f"{perc_complete:.1%} Completed -> Scraped {venue_num} out of {total_venues} venues")
        print(f"Spent {scraping_time} hours scraping", '\n')   
        time_last_progress_update = time.time()
        # Save data scraped so far to csv
        with open("backup_html_pages.csv", "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(raw_data)

13.0% Completed -> Scraped 354 out of 2729 venues
Spent 0.5 hours scraping 

25.1% Completed -> Scraped 685 out of 2729 venues
Spent 1.0 hours scraping 

37.4% Completed -> Scraped 1021 out of 2729 venues
Spent 1.5 hours scraping 

49.0% Completed -> Scraped 1336 out of 2729 venues
Spent 2.0 hours scraping 

56.0% Completed -> Scraped 1528 out of 2729 venues
Spent 2.5 hours scraping 

67.5% Completed -> Scraped 1843 out of 2729 venues
Spent 3.0 hours scraping 

78.9% Completed -> Scraped 2153 out of 2729 venues
Spent 3.5 hours scraping 

88.5% Completed -> Scraped 2415 out of 2729 venues
Spent 4.0 hours scraping 



In [10]:
# Save all scraped data to csv
with open("html_pages.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(raw_data)

### Scraping Error Log
If the scraping of a venue was aborted due to a page load error, it is displayed below: 

In [159]:
errors = pd.DataFrame(load_page_error_log, columns = ['venue_url','problem_url','error'])
# Function to make urls clickable in jupyter
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

errors.style.format({'venue_url': make_clickable,'problem_url': make_clickable})

Unnamed: 0,venue_url,problem_url,error


### Load Saved HTML Data
The below commented out code allows you to load the html_pages.csv file if you want to come back to it without scraping all the data again. 

In [13]:
#csv.field_size_limit(sys.maxsize)
#with open('html_pages.csv', newline='') as f:
#    reader = csv.reader(f)
#    raw_data = list(reader)

### Extract Data From HTML
We will use the lxml library to extract useful data from the html code of each space's website. The extraction code handles 2 specific issues detailed below: 
- Some spaces were no longer hosted by Hire Space and therefore had no data available. The code excludes these venues by checking whether the start of the header reads 'venue no longer hosted'. 
- Some web pages didn't load properly during scraping. They were missing data and trigger an error during extraction. The extraction code deals with them by re-loading the webpage and trying to extract data from the reloaded page's html.  
Any other errors will be handled by aborting the extraction process and adding the error to an error log. 

In [14]:
# Error log for extraction errors
extraction_errors = []

def extract_data(website_data):
    """Returns extracted data from an element of raw_data. 
    
    Each element of raw_data is in form [url, html]. This
    function handles and logs errors during extraction and 
    sends the html to the extract_from_html function for extraction.
    
    Returns - Extracted data as a list or returns None if unable to 
    extract data
    """
    
    url = website_data[0]
    # check if the venue is no longer hosted
    if (check_page_not_hosted(website_data[1])):
        # Log extraction error due to venue no longer hosted 
        extraction_errors.append([url, 'venue no longer hosted',
                                  website_data[1]])
        return None
    try:
        # Extract the data from the html code
        return extract_from_html(url, website_data[1])
    except:
        # If initial data extraction fails, re-load the url in chrome
        load_page(url)
        if 
        try:
            # Try to extract the data using the html code of the 
            # re-loaded page
            return extract_from_html(url, driver.page_source)  
        except Exception as e:
            # Log error, including error message in log
            extraction_errors.append([url, e, website_data[1]])
            return None

In [15]:
# WHEN EXPANDING FOR RICH DATA - test and run this function before the 'extract data' 
# function

def extract_from_html(url, website_html):
    """Extracts and returns data from website_html."""
    # Parse html with lxml library
    tree = etree.HTML(website_html)
    
    # Extract venue and space name from h1 and h2 headers
    venue_name = tree.xpath("//h1/text()")[0].lower()
    space_name = tree.xpath("//h2/text()")[0]
    # Remove trailing ' in' text
    space_name = space_name[:-3].lower()

    # Find html element for map of venue location
    space_map = tree.xpath("//div[@id='map_canvas']")[0]
    # Extract 'style' attribute from element, this contains the 
    # longitude and latitude in a url
    space_map_style = space_map.get('style')
    # Extract the longitude and latitude from url, they are
    # between the first '=' and first '&' in the url 
    style_start_of_long_lat = space_map_style.find('=')
    style_end_of_long_lat = space_map_style.find('&')
    long_lat = space_map_style[style_start_of_long_lat : style_end_of_long_lat]
    # longitude and latitude in form of string '=latitude%2C-longitude'. 
    # Split longitude and latitude into list, using the % as delimiter
    # then clean up by removing excess text characters 
    long_lat = long_lat.split('%')
    long_lat[0] = long_lat[0].replace('=','')
    long_lat[1] = long_lat[1].replace('2C','')
    # Convert to float
    long_lat = [float(item) for item in long_lat]
    
    # Find the html elements of the address (5 seperate items in a list)
    address = tree.xpath("//div[@class='address-overlay']/ul//li")
    # Extracts text from each html element of the address. If no 
    # text exist, it returns 'None'
    address = [line.text for line in address]
    # put address in lower case, ignoring entries which say 'None' 
    address = [item.lower() if (item != None) else item for item in address]
    # remove space in postcode
    address[4] = address[4].replace(' ','')

    return [url, venue_name, space_name, long_lat[0], long_lat[1], 
                address[0], address[1], address[2], address[3], address[4]]

In [16]:
def check_page_not_hosted(website_html): 
    """Check if venue no longer hosted, return result (True or False)"""
    
    # Parse html with lxml library
    tree = etree.HTML(website_html)
    # Find text of h1 heading
    heading = tree.xpath("//h1/text()")
    # Ensure heading was returned (will return list length 0 if 
    # no heading found)
    if (len(heading) > 0):
        # Return true if heading matches expected heading 
        # for no longer hosted page
        return (heading[0][:34] == 'Hire Space does not currently list')
    return False

In [17]:
# Extract data from the space webpages that were scraped
data = [extract_data(website_data) for website_data in raw_data]
# Remove None entries in data (These are from the erroneous 
# web pages that couldn't be scraped)
data = [item for item in data if (item != None)]
print(f"{len(data)} spaces were successfully scraped")

2693 spaces were successfully scraped


In [18]:
print(f"{len(extraction_errors)} spaces could not be scraped due to errors while extracting the data")

36 spaces could not be scraped due to errors while extracting the data


### Extraction Error Log
If extraction failed on a webpage, display error below along with error message. 

In [19]:
errors_2 = pd.DataFrame(extraction_errors, columns = ['url','error','html'])

# View error log, making urls clickable and excluding the html code in
# the error log from view
errors_2[['url','error']].style.format({'url': make_clickable})

NameError: name 'make_clickable' is not defined

### Summarise and Save Data
Below we convert the extracted data to a dataframe, show first 10 rows and summary statistics. We then save the data to file as a csv. 

In [20]:
df = pd.DataFrame(data, columns = ['url','venue_name','space_name', 'longitude','latitude', 'address_line_1',
                                   'address_line_2','address_line_3','address_line_4','address_line_5'])
df.head()

Unnamed: 0,url,venue_name,space_name,longitude,latitude,address_line_1,address_line_2,address_line_3,address_line_4,address_line_5
0,https://hirespace.com/Spaces/London/187488/Ano...,anomalous space,georgian townhouse,51.532138,-0.108149,anomalous space,36-38 pentonville road,"angel, islington",london,n19hf
1,https://hirespace.com/Spaces/London/107767/The...,"the zetter townhouse, clerkenwell",the games room,51.52317,-0.103552,49-50 st john's square,,"clerkenwell, farringdon",,ec1v4jj
2,https://hirespace.com/Spaces/London/139311/The...,the postal museum,the courtyard,51.524727,-0.113533,15-20 phoenix place,phoenix place,farringdon,greater london,wc1x0da
3,https://hirespace.com/Spaces/London/131840/The...,the hac (honourable artillery company),prince consort rooms,51.523119,-0.087275,armoury house,city road,old street,london,ec1y2bq
4,https://hirespace.com/Spaces/London/189080/The...,the phoenix london,full venue hire,51.506317,-0.223909,"the phoenix london, westfield london shopping ...",,"shepherds bush, west london, london",,w127ga


In [21]:
df.describe(include='all')

Unnamed: 0,url,venue_name,space_name,longitude,latitude,address_line_1,address_line_2,address_line_3,address_line_4,address_line_5
count,2693,2693,2693,2693.0,2693.0,2693,936,2453,1529,2693
unique,2693,2688,1446,,,2610,810,638,113,2345
top,https://hirespace.com/Spaces/London/187488/Ano...,bianca road brew co,whole venue,,,"unit 3, greenwich business park, 53 norman rd,...",london,shoreditch,london,se100dx
freq,1,2,543,,,6,57,110,1066,8
mean,,,,51.513439,-0.121308,,,,,
std,,,,0.036504,0.073681,,,,,
min,,,,51.326626,-0.4465,,,,,
25%,,,,51.50192,-0.149493,,,,,
50%,,,,51.513676,-0.119519,,,,,
75%,,,,51.525684,-0.080269,,,,,


In [33]:
df.to_csv('hire_space_venue_data.csv', index=False)

In [34]:
# Close chrome page
driver.quit()

In [None]:
import sys
from operator import itemgetter
local_vars = list(locals().items())
# Size gives us variable size in Bytes
size = [[var,sys.getsizeof(obj)] for var, obj in local_vars]
size = sorted(size, key=itemgetter(1), reverse = True)
for var, size in size:
    print(var,f"-> {size/1000000:,} MB")

### Investigating Extraction Errors
Below provides examples of using the extraction_errors list to re-create the errors during extraction to aid with debugging.

In [31]:
# Extracts error message from first error 
Error_number = 0
repr(extraction_errors[Error_number][1])

"'venue no longer hosted'"

In [32]:
# Runs extraction on erroneous html, returning full original error
Error_number = 0
extract_from_html('url', extraction_errors[Error_number][2])

IndexError: list index out of range