In [1]:
import pandas as pd
import csv
import json
import time
import random
import chromedriver_autoinstaller

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from lxml import etree

![alt text](images/tag_venue_home_page.png)
# Tagvenue Venue Web Scrape
### Introduction 

The [Tagvenue](https://www.tagvenue.com/) website is basically an Air BnB for finding and booking venues for an event. The website hosts thousands of venues in the UK that can be booked for events such as weddings, work drinks, birthdays etc. Each venue has one or more **spaces** available to be booked. A **space** is basically a room or area within the venue. Some venues have a single space, often the whole venue, whilst others offer a selection of rooms, each offered as a separate space. Each Space has its own webpage on Tagvenue. This webpage contains all the data needed to choose which space to book for your event. Example data includes price, location, size, capacity, features, licensing etc. This notebook will scrape the data from all spaces on the [Tagvenue](https://www.tagvenue.com/) website that are located in **London**. At the time of writing this amounts to **~4400** spaces. 

### Key Variables
The following key variables define and tweak the specifics of the web scrape: 

- **progress_report_interval** - Periodic progress reports (% completed) are printed during scraping. This variable defines in seconds how often the report is output. 
- **connection_error_retry_time** - This defines how long in seconds the program will wait before trying to re-load a webpage when it fails to load due to a connection error. 
- **headless_mode** - Set to *True* if you want chrome to be launched in headless mode i.e. not visible. Set to *False* if you wish chrome to be visible while scraping.  
- **longitude_min**, **longitude_max**, **latitude_min** and **latitude_max** - Defines the area that will be searched for venues. The intersection of the four longitude / latitude lines defines a square area.

In [2]:
progress_report_interval = 1800 #1800 for normal run, 300 for test
connection_error_retry_time = 300  # 300 for normal run, 30 for test
# Set True to have chrome open in headless mode 
headless_mode = False 
# longitude and latidue max and min define four lines, the intersection 
# of these lines defines a square area used for the venue search
# Normal run values, comment out when not wanted 
latitude_min = 51.326626 
latitude_max = 51.7297765
longitude_min = -0.446500003
longitude_max = 0.2190751
# Test Values, comment out when not wanted 
#longitude_min = -0.100501
#longitude_max = -0.059614
#latitude_min = 51.494423
#latitude_max = 51.50697

### Initiate Web Scraper
We will use Selenium and Chromedriver / Chrome to crawl the Hire Space website and scrape data. An initial check is performed by *chromedriver_autoinstaller()* to ensure chromedriver is up to data. If it is not, then the latest version is downloaded. Selenium then initiates an instance of chrome that it can control. This instance will either be visible or invisible (headless mode) depending on the *headless_mode* variable.  

In [4]:
# Check if the current version of chromedriver exists
# and if it doesn't exist, download it automatically,
# then add chromedriver to path
chromedriver_autoinstaller.install()
# If headless_mode was True, open chrome in headless mode, 
# otherwise open a visible chrome browser
if (headless_mode):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
else:
    # Initialise chromedriver
    driver = webdriver.Chrome()

### Define Page Load Function
We will frequently load new webpages with Selenium. We want to wait a certain amount of time between successive page loads to minimise our impact on the server and avoid being detected as a bot. We also want to detect any connection errors that might occur during the loading of a page for example due to a wifi issue. 

For this purpose, we created the *load_page* function. This function basically takes a url and loads it into chrome. It then pauses the program for twice the time it took for the page to load (with random variation to look less like a bot). In this way, the strain we put on the server will dynamically change. The slower the server becomes, the more time we will wait between successive page loads and vice versa if the server speeds up. 

The function also Handles *timeout* (page took longer than 30 seconds to load) and *connection* errors (couldn't connect to internet). In either case, the programme will wait some time then try to reload the page. If it still fails, the error is logged.  

The function returns *True* if there were no errors loading the page and returns *False* when there were errors. It can be placed in an *if* statement so that the page will only be processed if the page loaded successfully.   

In [5]:
# Set chromedriver timeout error to trigger if page takes more 
# than 30 seconds to load
driver.set_page_load_timeout(30)
# Initialise scraping error log
scraping_error_log = []
# Note - the below function returns True when no errors occur 
# during page load and is designed to be put within an 'if' 
# i.e. if(load_page(url)): to only do the steps in the 'if' 
# when the page load doesn't have errors
def load_page(url):
    """Load provided url in chrome then sleep for interval of time. 
    
    Handles and logs timout and connection errors. Calculates 
    the time to wait by multiplying the time it took the page to 
    load by 2, then adding some random offset.
    
    Returns -- True if page load was successful, returns False if there 
    was an error
    """
    
    try:
        # Loads url in chrome and calculates the time it took to load page
        time_of_request = time.time()
        driver.get(url)
        page_load_time = time.time() - time_of_request
        # Calculate time required to wait before next url is 
        # loaded
        wait_time_till_next_request = wait_time_calculation(page_load_time)
        time.sleep(wait_time_till_next_request)
        # Returns True to indicate page load had no errors
        return True
    # Execution pauses if timeout or connection issue occurs 
    except (TimeoutException, WebDriverException) as e:
        time.sleep(connection_error_retry_time)  
        try:
            # Loads url in chrome and calculates the time it took to load page
            time_of_request = time.time()
            driver.get(url)
            page_load_time = time.time() - time_of_request
            # Calculate time required to wait before next url is 
            # loaded (next time load_page is called)
            wait_time_till_next_request = wait_time_calculation(page_load_time)
            time.sleep(wait_time_till_next_request)
        except TimeoutException:
            scraping_error_log.append([url, 
                              'page failed to load, web page timed out'])
            # Returns False to indicate page load had an error
            return False
        except WebDriverException:
            scraping_error_log.append([url, 
                              'page failed to load, no internet connection'])
            # Returns False to indicate page load had an error
            return False
            
def wait_time_calculation(page_load_time):
    """Returns time required to wait before loading next url
    
    The wait time is 2 times the page_load_time, with 
    random variation"""
    
    average_wait_time = 2 * page_load_time
    upper_wait_time = 1.33333 * average_wait_time
    lower_wait_time = 0.77777 * average_wait_time
    return random.uniform(lower_wait_time, upper_wait_time)

### Creating Search URL
![alt text](images/tag_venue_search_bar.png)
We will use Tagvenue's [search page](https://www.tagvenue.com/) to find all venues located in London. The Tagvenue search requires an 'event type' to be chosen for the search. There are around **190** different 'event types' available to choose from. To find all venue's hosted by the website, we will have to repeat the search for all 190 available 'event types'. When a user clicks on the 'event type' field on the search page, a list of options is shown for them to choose from. Below we scrape the 'event type' options provided to the user.  

In [13]:
tagvenue_search_page_url = 'https://www.tagvenue.com/'
if (load_page(tagvenue_search_page_url)):
    # Find event type html input element
    form_event_type_input = driver.find_element_by_xpath(
        "//input[@name='room_tag_autocomplete']")
    # Click on event type html input element - this loads the 'event 
    # type' html elements that contain the event type options into the
    # webpage html
    form_event_type_input.click()
    # Find event type html elements 
    form_event_types_elements = driver.find_elements_by_xpath(
        "//div[@class='autocomplete-suggestions']//div")
    # Extract text from event types html elements 
    form_event_types = [element.get_attribute('innerHTML')
                        for element in form_event_types_elements]
    # replace spaces with '-', to make the event type conform to the 
    # url format used by Tagvenue - remove, from old approach 
    #event_types = [item.replace(' ','-') for item in event_types]
else: raise Exception('page load error - cannot find event types')

print(f"There are {len(form_event_types)} event types on Tagvenue")

There are 190 event types on Tagvenue


After you select an 'event type' and click search, a search url is created and loaded into chrome. You are then taken to the search results page. The 'event type' forms a sub-directory of the search url. An example search url is shown below, where the 'event type' chosen was '18th Birthday Party'.  

https://www.tagvenue.com/uk/search/18th-birthday-party?location_id=6&people=&neighbourhood=London

The text of the 'event type' options provided to the user when completing the form is not consistent with the text used in the url e.g. the event type *'Academic'* becomes *'academic-venues'* in the search url. As such, we will use selenium to perform a search using each 'event-type' option and will extract the text of the 'event-type' from the search url. This will enable us to create our own custom urls with custom longitude and latitude values later, rather than having to rely on the functionality of the search page.  

In [14]:
def find_url_event_type(event_type):
    """Returns the event type text used in the search url for the
    given event type"""
    
    load_page(tagvenue_search_page_url)
    # Find event type input html element
    event_type_input = driver.find_element_by_xpath(
        "//input[@name='room_tag_autocomplete']")
    # Input the event type
    event_type_input.send_keys(event_type)
    # Press enter
    event_type_input.send_keys(Keys.ENTER)
    # Find search button html element
    search_button_element = driver.find_element_by_xpath(
        "//button[@class='c-button-cta c-button-cta--big js-hero-search']")
    # Click search button tom perform search
    search_button_element.click()
    # Wait to give search page time to load
    time.wait(5)
    # Extract search url used
    search_url = driver.current_url
    # Extract event type text from search url. Event type sits between
    # the last '/' and first '?' in the search url.
    end_of_event_type = search_url.find('?')
    start_of_event_type = search_url.rfind('/', 0, end_of_event_type) + 1
    return search_url[start_of_event_type : end_of_event_type]

In [15]:
# Ignore, left in for testing / debugging purposes
#form_event_types = form_event_types[0:5]

In [16]:
url_event_types = []
# Perform search on Tagvenue using each of the available event types
# and extract the text used in the search url to denote the event type
for form_event_type in form_event_types: 
    url_event_type = find_url_event_type(form_event_type)
    url_event_types.append(url_event_type)

In [40]:
print(f"Found {len(url_event_types)} search url event types")

Found 5 search url event types


In [43]:
# Show first 10 event types
url_event_types[0:10]

['18th-birthday-party',
 '30th-birthday-party',
 '40th-birthday-party',
 '50th-birthday-party',
 'academic-venues']

We can build a custom search url from an event type and latitude and longitude maximum and minimum values. This url will return search results showing all spaces within the defined latitude and longitude range that are suitable for the event type chosen. We chose to use a custom url so that we had full control over the latitude and longitude ranges. If we had used the Tagvenue search page to generate a search url, we would have to rely on Tagvenue's definition of the area of London.   

An example custom search url is shown below: 

https://www.tagvenue.com/uk/search/18th-birthday-party?longitude_from=-0.270&longitude_to=0.069&latitude_from=51.31&latitude_to=51.69&page=1

We also include the page number in the url. The search results display 36 results per page. The page number defines which page of the results to show. We included this in our custom url because the Tagvenue website alters the url in Chrome after it shows you the results. This alteration changes the latitude and longitude values. As such, if you navigate to another page by selecting a page using the page navigation links at the bottom of the results, it will display spaces from the new longitude and latitude range, and not from the range in the original search url. As a result, when we want to navigate to a new page of the search results, we create a new search result url with an updated page number. This keeps the latitude and longitude range constant as we navigate through results pages.   

Below, we define the function to create a custom search url from an event type and page number (note that the latitude and longitude ranges are defined in the Key Variables section above.)

In [19]:
# Tagvenues changes the longitude and latitude values of the url in 
# Chrome after you load the url, so you need to recreate the whole url
# whenever you change page or event type to keep the results within 
# the desired longitude and latitude range. 
def create_search_url(event_type, page):
    """Build and return search url string"""
    return f"""https://www.tagvenue.com/uk/search/{event_type}?
           longitude_from={longitude_min}&longitude_to={longitude_max}
           &latitude_from={latitude_min}&latitude_to={latitude_max}&page={page}"""

### Collect Space URLs From Search Results

Using custom search urls, we can search for space's within our defined latitude and longitude ranges. The url will return a search results page as shown in the image below:  

![alt text](images/tag_venue_search_results.png)

Each space returned by the search is shown as a clickable picture. Clicking on the picture will take you to the web page of that space. We want to gather the urls of every space web page in the results. Once we have all the space urls, we can start scraping data from them.

The results page will only show spaces which are suitable for the event type in the search url. As such, we will need to repeat the search for all ~190 event types to ensure that we find every space hosted on the website. The spaces will be duplicated in different search results i.e. sometimes the same space will appear for 'corporate event' and '18th Birthday party'. As such, we will need to remove duplicates at the end. We could have used a *set()* to hold the list of space urls, which would negate the need to remove duplicates at the end. However, it was determined that it was useful when reporting the progress of the scrape, to be able to count the total number of urls scraped. 

In [20]:
def find_total_results_pages():
    """Returns the number of pages of search results showing in Chrome"""
    # Find pagination html elements - these create the clickable page
    # numbers and arrows at bottom of search results page to naviagte 
    # through search results pages 
    pagination_elements = driver.find_elements_by_xpath(
        "//div[@class='results-pagination results-pagination--center']/ul/li/a")
    # Convert pagination elements to text values  
    pagination = [element.get_attribute('innerHTML') 
                  for element in pagination_elements]
    # If list is not empty i.e. len > 0 then return second last 
    # element - this is the total number of pages
    if (len(pagination) > 1):
        return int(pagination[-2])
    # If list empty, then there is only one page, return 1
    else: return 1

In [None]:
# This block is just for creating a search url for debugging purposes 
#event_type = 'pop-up-event'
#event_type = 'corporate-event'
#page = 1
#search_url = create_search_url('kids-partybus', page)
#load_page(search_url) 

In [21]:
def get_space_urls():
    """Returns the url of every space webpage result showing in Chrome."""
    # Find the html elements of the urls of spaces returned by the search
    search_result_url_elements = driver.find_elements_by_xpath("//div[@class='v-search-results-items']/div/a")
    if(len(search_result_url_elements) == 0):
        try:
            no_search_results_message_element = driver.find_element_by_xpath("//h3")
            no_search_results_message = no_search_results_message_element.get_attribute('innerHTML')
            no_search_results_message = no_search_results_message.replace("'", "").lower().strip()
            expected_message = ('sorry, we couldnt find any venues matching your criteria.')
            if(no_search_results_message != expected_message):
                scraping_error_log.append([search_url, 'search url failed'])
        except NoSuchElementException: 
            scraping_error_log.append([search_url, 'search url failed'])

    return [element.get_attribute('href') for element in search_result_url_elements]

- Need a tagvenue specific failed url test OOPs error!!! - need errors to include event_type
- add progress complete and time taken bit
- manually fix event type issue - or worse case use selenium to manually search through 190 different auto-complete options and add the correct bit...

In [None]:
# Ignore, left in for testing / debugging purposes
#event_types = event_types[0:7]

In [30]:
space_urls = []
time_last_update = time.time()
total_event_types = len(url_event_types)

for event_number, event_type in enumerate(url_event_types): 
    if (time.time() - time_last_update > progress_report_interval):
        print(f"Scraped {event_number} of {total_event_types} event_types")
        pages_of_urls_scraped = len(space_urls)/36
        print(f"Approximately {pages_of_urls_scraped:0} pages of search results scraped\n")
        time_last_update = time.time()
    search_url = create_search_url(event_type, 1)
    load_page(search_url)
    total_pages = find_total_results_pages()
    for current_page in range(1, total_pages + 1):
        space_urls.extend(get_space_urls())
        if(current_page < total_pages):
            search_url = create_search_url(event_type, current_page + 1)
            load_page(search_url)

0.0003581047058105469
Scraped 1 of 5 event_types
Approximately 0.25 pages of search results scraped

0.1997997760772705
0.6479842662811279
Scraped 2 of 5 event_types
Approximately 2.138888888888889 pages of search results scraped

0.4792027473449707
0.5636961460113525
Scraped 3 of 5 event_types
Approximately 4.027777777777778 pages of search results scraped

0.4968390464782715
0.5072238445281982
Scraped 4 of 5 event_types
Approximately 5.916666666666667 pages of search results scraped

0.4110429286956787


note: if page fails to load then it will result in search url failed error as well - need to rerun whole search url and scrape all its pages rather than just redoing the pages that failed in case the 'total pages' calculation was incorrect due to page load error and calculated it as 1 

### Scraping Error Log
If the scraping of a venue was aborted due to a page load error, it is displayed below: 

In [150]:
scrape_errors = pd.DataFrame(scraping_error_log, columns = ['url','error'])
# Function to make urls clickable in jupyter
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

scrape_errors.style.format({'url': make_clickable})

Unnamed: 0,url,error


In [151]:
space_urls_unique = list(set(space_urls))
print(f"There are {len(space_urls_unique)} spaces to scrape")

There are 5505 spaces to scrape


In [157]:
# Save space_urls_uniqe to file (as json)
with open("space_urls.json", 'w') as f:
    # indent=2 is not needed but makes the file human-readable
    json.dump(space_urls_unique, f, indent=2) 

If you wish to load a saved list of space urls, remove #s and run the below. 

In [49]:
#with open("space_urls.json", 'r') as f:
#    space_urls_unique = json.load(f)

#print(f"There are {len(space_urls_unique)} spaces to scrape")

There are 5505 spaces to scrape


In [50]:
urls_df = pd.DataFrame(space_urls_unique)

urls_with_qmark = urls_df[urls_df[0].str.contains('\?')].shape[0]
urls_with_qmark_event_offer = urls_df[urls_df[0].str.contains('\?event-offer')].shape[0]
if (urls_with_qmark == urls_with_qmark_event_offer): 
    print(f"All urls with a '?' are of form '?event-offer'" )
else: 
    print(f"There are urls with '?' not of the form '?event-offer'")

space_urls_cleaned = urls_df[~urls_df[0].str.contains('\?event-offer')][0].to_list()
print(f"Removed {urls_with_qmark_event_offer} urls containing '?event-offer' \nThere are now {len(space_urls_cleaned)} space urls")

All urls with a '?' are of form '?event-offer'
Removed 1017 urls containing '?event-offer' 
There are now 4488 space urls


Below shows all urls that contain a '?' but don't contain 'event-offer'

In [7]:
view = urls_df[(urls_df[0].str.contains('\?'))&(~urls_df[0].str.contains('event-offer'))]
view.style.format({0: make_clickable})

NameError: name 'make_clickable' is not defined

The below code is useful for searching through the venue urls to see the different venues and to find different packages available at the venues. 

In [246]:
# Switch between space_urls_unique and space_urls_cleaned to 
# get with and without packages, and to find specific venues 
# or venues with packages e.g. search for '\?event-offer=wedding' 
# to get venues with wedding packages
urls_df = pd.DataFrame(space_urls_unique)
view = urls_df[urls_df[0].str.contains('tanner')]
view.style.format({0: make_clickable})

Unnamed: 0,0
280,https://www.tagvenue.com/rooms/london/3903/tanner-warehouse/tanner-warehouse-courtyard
2055,https://www.tagvenue.com/rooms/london/3308/tanner-warehouse/industrial-wedding
3639,https://www.tagvenue.com/rooms/london/321/tanner-warehouse/tanner-warehouse


In [None]:
import sys
from operator import itemgetter
local_vars = list(locals().items())
# Size gives us variable size in Bytes
size = [[var,sys.getsizeof(obj)] for var, obj in local_vars]
size = sorted(size, key=itemgetter(1), reverse = True)
for var, size in size:
    print(var,f"-> {size/1000000:,} MB")

In [8]:
len(space_urls_cleaned)

4488

### Scrape Space Webpage HTML Code
- click on Read all to load venue review breakdown score 
- error logging 
- save results...

In [5]:
def click_read_all():
    """Clicks 'Read all' button to load breakdown of venue review score.  
    
    Each venue has an overall user review score. By clicking on 
    'Read all' you can see a breakdown of the overall review score 
    into 6 different scoring categories e.g. 'Catering', 
    'location', 'Value' etc. The function waits until the page has
    finished loading to ensure all data has loaded before scraping the html"""
    try:
        # Finds 'Read all' button html element
        read_all_element = driver.find_element_by_xpath(
            "//button[@class='c-button-link' and contains(text(),'Read all')]")
        read_all_element.click()
    except NoSuchElementException:
        # End function, no reviews so no 'Read all' button to click
        return
    
    try:
        # Wait until page finishes loading following the click 
        WebDriverWait(driver, 10).until(lambda d: d.execute_script('return document.readyState') == 'complete')
    except TimeoutException: 
        scraping_error_log.append([url, 'failed to load user review score breakdowns'])

In [10]:
# Dictionary storing the url and html of each space webpage, in 
# format {url:html} 
space_webpages = {}
total_urls_to_scrape = len(space_urls_cleaned)
time_last_update = time.time()

# Loop through space urls and scrape the html code for each space web page
for url_number, url in enumerate(space_urls_cleaned):
    if (load_page(url)):
        click_read_all()
        space_webpages[url] = driver.page_source
    if (time.time() - time_last_update > progress_report_interval):
        perc_complete = url_number / total_urls_to_scrape
        print(f"Scraped {url_number} of {total_urls_to_scrape} -> {perc_complete:.1%} Completed\n")
        time_last_update = time.time()

Scraped 382 of 4488 -> 8.5% Completed

Scraped 1035 of 4488 -> 23.1% Completed

Scraped 1672 of 4488 -> 37.3% Completed

Scraped 2291 of 4488 -> 51.0% Completed

Scraped 2898 of 4488 -> 64.6% Completed

Scraped 3476 of 4488 -> 77.5% Completed

Scraped 4034 of 4488 -> 89.9% Completed



### Errors During Space HTML Scraping
If the scraping of a venue was aborted due to a page load error, it is displayed below: 

In [11]:
scrape_errors = pd.DataFrame(scraping_error_log, columns = ['url','error'])
# Function to make urls clickable in jupyter
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

scrape_errors.style.format({'url': make_clickable})

Unnamed: 0,url,error


In [12]:
# Save space webpage html data to file 
with open('space_htmls.json', 'w') as fp:
    json.dump(space_webpages, fp)

In [6]:
# Uncomment if you wish to load space_htmls from file
#with open('space_htmls.json', 'r') as fp:
#    space_webpages = json.load(fp)

### Extract Data From HTML
We will use the lxml library to extract useful data from the html code of each space's website. The extraction code handles 2 specific issues detailed below: 
- Some spaces were no longer hosted by Hire Space and therefore had no data available. The code excludes these venues by checking whether the start of the header reads 'venue no longer hosted'. 
- Some web pages didn't load properly during scraping. They were missing data and trigger an error during extraction. The extraction code deals with them by re-loading the webpage and trying to extract data from the reloaded page's html.  
Any other errors will be handled by aborting the extraction process and adding the error to an error log. 

In [47]:
# Error log for extraction errors
extraction_error_log = []

def extract_data(url, html):
    """Attempts to extract data from the space url and html provided. 
    
    First, the function checks if the space is no longer hosted by Tagvenue. 
    If it is hosted, the url and html are passed to the extract_from_html 
    function which returns the extracted data. If an error occurs during 
    extraction, the error is logged. 
    
    Returns - Extracted data as a list or returns None if unable to 
    extract data due to venue no longer being hosted, or due to an 
    unexpected error. 
    """
    # check if the venue is no longer hosted
    if (check_page_not_hosted(html)):
        # Log extraction error due to venue no longer hosted 
        extraction_error_log.append([url, 'venue no longer hosted', html])
        return None
    try:
        # Extract the data from the html code
        return extract_from_html(url, html)
    except Exception as e:
        # Log unexpected error, including error message in log
        extraction_error_log.append([url, e, html])
        return None

In [8]:
def find_element_by_xpath(xpath):
    """Returns first element found using provided xpath. 
    
    If no element is found, an error is raised"""
    # Finds all elements from provided Xpath as a list
    elements = tree.xpath(xpath)
    try:
        return elements[0]
    except IndexError:
        raise IndexError('No element found via provided Xpath')
    
def find_elements_by_xpath(xpath):
    """Returns all elements found via provided xpath as a list"""
    return tree.xpath(xpath)

In [39]:
driver.get('https://www.tagvenue.com/rooms/london/26243/adam-house-event-space/cambric-suite')
tree = etree.HTML(driver.page_source)

In [41]:
find_element_by_xpath("//title/text()")[0:3]

'404'

In [51]:
# WHEN EXPANDING FOR RICH DATA - test and run this function before the 'extract data' 
# function

def extract_from_html(url, html):
    """Extracts and returns data from provided html."""
    # Make tree global so that custom lxml find html element functions 
    # work without having to pass the tree as an argument 
    global tree
    # Parse html with lxml library
    tree = etree.HTML(html)
    
    # Find h1 header html element that contains venue and space name 
    header_element = find_element_by_xpath("//h1")
    # Extract space and venue name string from html element
    header = header_element.get('title').lower()
    # header has general form 'space_name at venue_name'. Split venue
    # and space name into a list using ' at ' as separator. 
    venue_and_space_name = header.split(' at ')
    space_name = venue_and_space_name[0]
    venue_name = venue_and_space_name[1]
    # If list has more than 2 elements, an issue has occured
    if (len(venue_and_space_name) > 2):
        # log error - commented out, decided error only affects ~70 spaces and is fixed with below change
        #extraction_error_log.append([url, header
        #                          + '> error identifying space name and venue name', html])
        # Take venue and space name from url. End of url has 
        # format /venue_name/space_name. We split the url 
        # into a list with separator '/'
        url_split = url.split('/')
        # Last element of url split is space_name
        space_name = url_split[-1].replace('-',' ')
        # Second last element of url split is venue_name
        venue_name = url_split[-2].replace('-',' ')

    # Find address html element
    address_element = find_element_by_xpath(
        "//span[@class='c-room-header__text_link' and contains(text(),',')]")
    # Extract text, remove '\n's and whitespace
    address = address_element.text.replace('\n','').strip()
    
    # Find html element for map
    map_element = find_element_by_xpath("//a[@href='#map-modal']")
    # Extract latitude and convert to float
    latitude = float(map_element.get('data-lat'))
    # Extract longitude and convert to float
    longitude = float(map_element.get('data-long'))

    return [url, venue_name, space_name, latitude, longitude, address]

In [44]:
def check_page_not_hosted(html): 
    """Check if venue no longer hosted, return result (True or False)"""
    # Make tree global so that custom lxml find html element functions 
    # work without having to pass the tree as an argument 
    global tree
    # Parse html with lxml library
    tree = etree.HTML(html)
    # Find text of title
    title = find_element_by_xpath("//title/text()")
    # When not hosted, Tagvenue returns a 404 error at the 
    # beggining of the html title  
    if (title[0:3] == '404'):
        return True
    return False

In [10]:
space_webpages_test = {url:html for url, html in list(space_webpages.items())[0:10]}

In [11]:
space_webpages_test.keys()

dict_keys(['https://www.tagvenue.com/rooms/london/7552/graeae-theatre-company/creative-hub', 'https://www.tagvenue.com/rooms/london/6679/city-cruises-ltd/christmas-cruises', 'https://www.tagvenue.com/rooms/london/7134/jamies-tudor-street/ground-floor-room', 'https://www.tagvenue.com/rooms/london/20299/the-fisheries/the-boardroom', 'https://www.tagvenue.com/rooms/london/6854/brewers-hall/livery-hall', 'https://www.tagvenue.com/rooms/london/8287/1-wimpole-street/ent-room', 'https://www.tagvenue.com/rooms/london/24727/studio-spaces/full-venue', 'https://www.tagvenue.com/rooms/london/2451/3-perseverance-works/studio', 'https://www.tagvenue.com/rooms/london/23541/cococure/basement', 'https://www.tagvenue.com/rooms/london/1358/grand-ballroom-at-the-montcalm/grand-ballroom'])

In [51]:
space_urls_cleaned[0:10]

['https://www.tagvenue.com/rooms/london/7552/graeae-theatre-company/creative-hub',
 'https://www.tagvenue.com/rooms/london/6679/city-cruises-ltd/christmas-cruises',
 'https://www.tagvenue.com/rooms/london/7134/jamies-tudor-street/ground-floor-room',
 'https://www.tagvenue.com/rooms/london/20299/the-fisheries/the-boardroom',
 'https://www.tagvenue.com/rooms/london/6854/brewers-hall/livery-hall',
 'https://www.tagvenue.com/rooms/london/8287/1-wimpole-street/ent-room',
 'https://www.tagvenue.com/rooms/london/24727/studio-spaces/full-venue',
 'https://www.tagvenue.com/rooms/london/2451/3-perseverance-works/studio',
 'https://www.tagvenue.com/rooms/london/23541/cococure/basement',
 'https://www.tagvenue.com/rooms/london/1358/grand-ballroom-at-the-montcalm/grand-ballroom']

In [58]:
# Extract data from the space webpages that were scraped
data = [extract_data(url, html) for url, html in space_webpages.items()]
# Remove None entries in data (These are from the erroneous 
# web pages that couldn't be scraped)
data = [item for item in data if (item != None)]
print(f"{len(data)} spaces were successfully scraped")

4481 spaces were successfully scraped


### Extraction Error Log
If extraction failed on a webpage, display error below along with error message. 

In [59]:
extraction_errors = pd.DataFrame(extraction_error_log, columns = ['url','error', 'html'])
# Function to make urls clickable in jupyter
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

extraction_errors[['url','error']].style.format({'url': make_clickable})

Unnamed: 0,url,error
0,https://www.tagvenue.com/rooms/london/1358/grand-ballroom-at-the-montcalm/grand-ballroom,grand ballroom at grand ballroom at the montcalm> error identifying space name and venue name
1,https://www.tagvenue.com/rooms/london/1175/headspace-farringdon/kings-landing-at-headspace,kings landing at headspace! at headspace farringdon> error identifying space name and venue name
2,https://www.tagvenue.com/rooms/london/661/indigo-at-the-o2/indigo-at-the-o2-vip-lounge,indigo at the o2 vip lounge at indigo at the o2> error identifying space name and venue name
3,https://www.tagvenue.com/rooms/london/655/searcys-at-the-gherkin/exclusive-hire-of-helix-iris,exclusive hire of helix & iris at searcys at the gherkin> error identifying space name and venue name
4,https://www.tagvenue.com/rooms/london/7073/jamies-st-mary-at-hill/the-hooke-room,the hooke room at jamies st mary at hill> error identifying space name and venue name
5,https://www.tagvenue.com/rooms/london/6049/searcys-at-the-gherkin/exclusive-hire-level-38,exclusive hire level 38 at searcys at the gherkin> error identifying space name and venue name
6,https://www.tagvenue.com/rooms/london/26243/adam-house-event-space/cambric-suite,venue no longer hosted
7,https://www.tagvenue.com/rooms/london/7862/be-at-one-regent-street/whole-venue,whole venue at be at one regent street> error identifying space name and venue name
8,https://www.tagvenue.com/rooms/london/7024/jamies-st-mary-at-hill/whisky-room,whisky room at jamies st mary at hill> error identifying space name and venue name
9,https://www.tagvenue.com/rooms/london/12713/hello-darling/darling-house,venue no longer hosted


### Summarise and Save Data
Below we convert the extracted data to a dataframe, show first 10 rows and summary statistics. We then save the data to file as a csv. 

In [60]:
df = pd.DataFrame(data, columns = ['url','venue_name','space_name', 'latitude','longitude', 'address_line'])
df.head(10)

Unnamed: 0,url,venue_name,space_name,latitude,longitude,address_line
0,https://www.tagvenue.com/rooms/london/7552/gra...,graeae theatre company,creative hub,51.532108,-0.077022,"Kingsland Road, 138, London, E2 8DY"
1,https://www.tagvenue.com/rooms/london/6679/cit...,city cruises ltd,christmas cruises,51.50216,-0.123207,"Westminster Pier, Victoria Embankment, London,..."
2,https://www.tagvenue.com/rooms/london/7134/jam...,jamies tudor street,ground floor room,51.512615,-0.10879,"Tudor Street, 36, London, EC4Y 0BH"
3,https://www.tagvenue.com/rooms/london/20299/th...,the fisheries,the boardroom,51.540489,-0.057509,"Mentmore Terrace, 1, London, E8 3PN"
4,https://www.tagvenue.com/rooms/london/6854/bre...,brewers' hall,livery hall,51.517175,-0.092663,"Aldermanbury Square, London, EC2V 7HR"
5,https://www.tagvenue.com/rooms/london/8287/1-w...,1 wimpole street,ent room,51.516531,-0.147532,"Wimpole Street, 1, London, W1G 0AE"
6,https://www.tagvenue.com/rooms/london/24727/st...,studio spaces,full venue,51.508843,-0.061467,"Studio Spaces, Unit 2, 100 Pennington Street, ..."
7,https://www.tagvenue.com/rooms/london/2451/3-p...,3 perseverance works,studio,51.527873,-0.078011,"25-27 Hackney Road, London, E2 8DD"
8,https://www.tagvenue.com/rooms/london/23541/co...,cococure,basement,51.513407,-0.075685,"Minories, 5, London, EC3N 1BJ"
9,https://www.tagvenue.com/rooms/london/1358/gra...,grand ballroom at the montcalm,grand ballroom,51.515059,-0.159386,"2 Wallenberg PLace, Marble Arch, London, W1H 7TN"


In [61]:
df.describe(include='all')

Unnamed: 0,url,venue_name,space_name,latitude,longitude,address_line
count,4481,4481,4481,4481.0,4481.0,4481
unique,4481,1526,3142,,,1516
top,https://www.tagvenue.com/rooms/london/7552/gra...,radisson blu edwardian heathrow,whole venue,,,"Bath Road, 140, London, UB3 5AW"
freq,1,38,334,,,38
mean,,,,51.50955,-0.125521,
std,,,,0.030236,0.076415,
min,,,,51.329423,-0.443729,
25%,,,,51.501546,-0.147253,
50%,,,,51.512952,-0.124687,
75%,,,,51.521381,-0.085673,


In [62]:
df.to_csv('tag_venue_data.csv', index=False)

In [63]:
# Close chrome page
driver.quit()



In [None]:
import sys
from operator import itemgetter
local_vars = list(locals().items())
# Size gives us variable size in Bytes
size = [[var,sys.getsizeof(obj)] for var, obj in local_vars]
size = sorted(size, key=itemgetter(1), reverse = True)
for var, size in size:
    print(var,f"-> {size/1000000:,} MB")

### Investigating Extraction Errors
Below provides examples of using the extraction_errors list to re-create the errors during extraction to aid with debugging.

In [66]:
# Extracts error message from first error 
Error_number = 0
repr(extraction_error_log[Error_number][1])

"'grand ballroom at grand ballroom at the montcalm> error identifying space name and venue name'"

In [67]:
# Runs extraction on erroneous html, returning full original error
Error_number = 0
extract_from_html('url', extraction_error_log[Error_number][2])

IndexError: list index out of range

In [None]:
review_breakdown_title_elements = driver.find_elements_by_xpath(
            "//div[@class='reviews-modal-score__title' and contains(text(),'')]")
    review_breakdown_score_elements = driver.find_elements_by_xpath(
            "//div[@class='reviews-modal-score__score' and contains(text(),'')]")
    for element in review_breakdown_title_elements:
        print(element.get_attribute('innerHTML'))
        
    for element in review_breakdown_score_elements:
        print(element.get_attribute('innerHTML'))   