In [1]:
import csv
import json
import random
import time
from datetime import datetime

import chromedriver_autoinstaller
import lxml.html
import numpy as np
import pandas as pd
from IPython.core.display import HTML
from joblib import Parallel, delayed
from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait

![alt text](../images/tag_venue_home_page.png)
# Tagvenue Venue Web Scrape
### Introduction 

The [Tagvenue](https://www.tagvenue.com/) website is basically an Air BnB for finding and booking venues for an event. The website hosts thousands of venues in the UK that can be booked for events such as weddings, work drinks, birthdays etc. Each venue has one or more **spaces** available to be booked. A **space** is basically a room or area within the venue. Some venues have a single space, often the whole venue, whilst others offer a selection of rooms, each offered as a separate space. Each Space has its own webpage on Tagvenue. This webpage contains all the data needed to choose which space to book for your event. Example data includes price, location, size, capacity, features, licensing etc. This notebook will scrape the data from all spaces on the [Tagvenue](https://www.tagvenue.com/) website that are located in **London**. At the time of writing this amounts to **~4400** spaces. 

The data will be saved in 2 csv files detailed below: 
- **tag_venue_space_data.csv**: Stores general information on each space, e.g. location, area, capacity, catering details, features etc. One row per event space. 
- **tag_venue_space_prices.csv**: Stores price data for each space. The price data is a bit complex, with prices shown for different days of the week and for different time periods e.g. per hour or per day. Each row is one price offering for a single space on a single day of the week. Each space will have many price offerings and thus each space will have many rows in the csv.    

### Key Variables
The following key variables define and tweak the specifics of the web scrape: 

- **progress_report_interval** - Periodic progress reports (% completed) are printed during scraping. This variable defines in seconds how often the report is output. 
- **connection_error_retry_time** - This defines how long in seconds the program will wait before trying to re-load a webpage when it fails to load due to a connection error. 
- **headless_mode** - Set to *True* if you want chrome to be launched in headless mode i.e. not visible. Set to *False* if you wish chrome to be visible while scraping.  
- **longitude_min**, **longitude_max**, **latitude_min** and **latitude_max** - Defines the area that will be searched for venues. The intersection of the four longitude / latitude lines defines a square area.

In [2]:
# --- Timing (seconds) ---
# Gap between progress reports: 1800 for a normal run, 300 for a test
progress_report_interval = 1800
# Pause before retrying a page that failed to load: 300 for a normal
# run, 30 for a test
connection_error_retry_time = 300

# --- Browser ---
# True opens chrome in headless mode, False opens a visible browser
headless_mode = False

# --- Search area ---
# The minimum and maximum latitude and longitude define four lines; the
# area enclosed by these lines is the area used for the venue search.
# Normal run values, comment out when not wanted
latitude_min, latitude_max = 51.326626, 51.7297765
longitude_min, longitude_max = -0.446500003, 0.2190751
# Test values, comment out when not wanted
#latitude_min, latitude_max = 51.494423, 51.50697
#longitude_min, longitude_max = -0.100501, -0.059614

### Initiate Web Scraper
We will use Selenium and Chromedriver / Chrome to crawl the Tagvenue website and scrape data. An initial check is performed by *chromedriver_autoinstaller.install()* to ensure chromedriver is up to date. If it is not, then the latest version is downloaded. Selenium then initiates an instance of chrome that it can control. This instance will either be visible or invisible (headless mode) depending on the *headless_mode* variable.  

In [3]:
# Make sure a chromedriver for the current version of chrome exists;
# install() downloads it automatically when it is missing and adds
# chromedriver to path
chromedriver_autoinstaller.install()
# Build the browser options. The headless flag is only added when
# headless_mode is True, otherwise a visible chrome browser is opened
chrome_options = Options()
if headless_mode:
    chrome_options.add_argument("--headless")
# Initialise chromedriver
driver = webdriver.Chrome(options=chrome_options)

### Define Page Load Function
We will frequently load new webpages with Selenium. We want to wait a certain amount of time between successive page loads to minimise our impact on the server and avoid being detected as a bot. We also want to detect any connection errors that might occur during the loading of a page for example due to a wifi issue. 

For this purpose, we created the *load_page* function. This function basically takes a url and loads it into chrome. It then pauses the program for twice the time it took for the page to load (with random variation to look less like a bot). In this way, the strain we put on the server will dynamically change. The slower the server becomes, the more time we will wait between successive page loads and vice versa if the server speeds up. 

The function also handles *timeout* errors (the page took longer than 30 seconds to load) and *connection* errors (couldn't connect to the internet). In either case, the programme waits *connection_error_retry_time* seconds and then tries once more to load the page. If the second attempt also fails, the error is logged.  

The function returns *True* if there were no errors loading the page and returns *False* when there were errors. It can be placed in an *if* statement so that the page will only be processed if the page loaded successfully.   

In [4]:
# Scraping error log: each entry appended to it is a two item list of
# the form [identifier, error message]
scraping_error_log = list()
# Make chromedriver trigger a timeout error when a page takes more
# than 30 seconds to load
driver.set_page_load_timeout(30)
# Note - the below function returns True when no errors occur 
# during page load and is designed to be put within an 'if' 
# i.e. if(load_page(url)): to only do the steps in the 'if' 
# when the page load doesn't have errors
def load_page(url):
    """Load provided url in chrome then sleep for interval of time.

    Handles and logs timeout and connection errors. If the first attempt
    raises one of these errors, the function sleeps for
    connection_error_retry_time seconds and then tries exactly once
    more. Only a failure of that second attempt is written to
    scraping_error_log.

    The time slept after a successful load is calculated by
    wait_time_calculation from the time it took the page to load.

    Returns -- True if the page loaded (on the first attempt or on the
    retry), False if both attempts failed
    """

    try:
        _get_and_pause(url)
        # Returns True to indicate page load had no errors
        return True
    # Execution pauses if timeout or connection issue occurs
    except (TimeoutException, WebDriverException):
        time.sleep(connection_error_retry_time)

    # Second and final attempt
    try:
        _get_and_pause(url)
    except TimeoutException:
        scraping_error_log.append([url,
                          'page failed to load, web page timed out'])
        # Returns False to indicate page load had an error
        return False
    except WebDriverException:
        scraping_error_log.append([url,
                          'page failed to load, no internet connection'])
        # Returns False to indicate page load had an error
        return False
    # The retry succeeded. This must be reported as a success, otherwise
    # callers using 'if load_page(url):' would skip a page that is
    # actually loaded (previously None was returned here).
    return True


def _get_and_pause(url):
    """Load url in chrome once, then sleep before the next request.

    Errors raised while loading the page are not caught here; they
    propagate to the caller.
    """
    # Loads url in chrome and calculates the time it took to load page
    time_of_request = time.time()
    driver.get(url)
    page_load_time = time.time() - time_of_request
    # Calculate time required to wait before next url is loaded (next
    # time load_page is called) and wait for that long
    wait_time_till_next_request = wait_time_calculation(page_load_time)
    time.sleep(wait_time_till_next_request)
            
def wait_time_calculation(page_load_time):
    """Pick a randomised wait, in seconds, before the next url is loaded.

    The baseline wait is twice page_load_time. The value returned is
    drawn uniformly from between 0.77777 and 1.33333 times that
    baseline.
    """

    baseline = 2 * page_load_time
    return random.uniform(0.77777 * baseline, 1.33333 * baseline)

### Creating Search URL
![alt text](../images/tag_venue_search_bar.png)
We will use Tagvenue's [search page](https://www.tagvenue.com/) to find all venues located in London. The Tagvenue search requires an 'event type' to be chosen for the search. There are around **190** different 'event types' available to choose from. To find all venues hosted by the website, we will have to repeat the search for all 190 available 'event types'. When a user clicks on the 'event type' field on the search page, a list of options is shown for them to choose from. Below we scrape the 'event type' options provided to the user.  

In [5]:
tagvenue_search_page_url = 'https://www.tagvenue.com/'
# Without the search page there is nothing to read the event types
# from, so stop here if it failed to load
if not load_page(tagvenue_search_page_url):
    raise Exception('page load error - cannot find event types')

# Find event type html input element. The find_element(By.XPATH, ...)
# form is used because the find_element_by_xpath helpers no longer
# exist in current Selenium releases.
form_event_type_input = driver.find_element(
    By.XPATH, "//input[@name='room_tag_autocomplete']")
# Click on event type html input element - this loads the 'event
# type' html elements that contain the event type options into the
# webpage html
form_event_type_input.click()
# Find event type html elements
form_event_types_elements = driver.find_elements(
    By.XPATH, "//div[@class='autocomplete-suggestions']//div")
# Extract text from event types html elements
form_event_types = [element.get_attribute('innerHTML')
                    for element in form_event_types_elements]

print(f"There are {len(form_event_types)} event types on Tagvenue")

There are 190 event types on Tagvenue


After you select an 'event type' and click search, a search url is created and loaded into chrome. You are then taken to the search results page. The 'event type' forms a sub-directory of the search url. An example search url is shown below, where the 'event type' chosen was '18th Birthday Party'.  

https://www.tagvenue.com/uk/search/18th-birthday-party?location_id=6&people=&neighbourhood=London

The text of the 'event type' options provided to the user when completing the form is not consistent with the text used in the url e.g. the event type *'Academic'* becomes *'academic-venues'* in the search url. As such, we will use selenium to perform a search using each 'event-type' option and will extract the text of the 'event-type' from the search url. This will enable us to create our own custom urls with custom longitude and latitude values later, rather than having to rely on the functionality of the search page.  

In [6]:
def find_url_event_type(event_type):
    """Returns the event type text used in the search url for the
    given event type.

    The search page is loaded, event_type is typed into the event type
    input and the search is submitted. The event type text is then read
    from the url chrome ends up on: it is the last path segment, i.e.
    the text after the final '/' and before the '?' that starts the
    query string (or the end of the url if there is no '?').

    If the results page has not finished loading after 10 seconds, an
    entry is added to scraping_error_log and the text is still extracted
    from whatever url chrome is showing at that point.
    """

    load_page(tagvenue_search_page_url)
    # Find event type input html element
    event_type_input = driver.find_element(
        By.XPATH, "//input[@name='room_tag_autocomplete']")
    # Input the event type
    event_type_input.send_keys(event_type)
    # Press enter
    event_type_input.send_keys(Keys.ENTER)
    # Find search button html element
    search_button_element = driver.find_element(
        By.XPATH,
        "//button[@class='c-button-cta c-button-cta--big js-hero-search']")
    # Click search button to perform search
    search_button_element.click()
    try:
        # Wait until page finishes loading following the click
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script('return document.readyState') == 'complete'
        )
    # If page fails to finish loading after 10 seconds, log error
    except TimeoutException:
        scraping_error_log.append([event_type, 'failed to load search results'])
    # Drop the query string (if any) from the search url, then keep the
    # last path segment. Splitting, rather than slicing on the index
    # returned by str.find, stays correct when the url has no '?'
    # (find would return -1 and the last character would be cut off).
    url_without_query = driver.current_url.split('?', 1)[0]
    return url_without_query.rsplit('/', 1)[-1]

In [7]:
# Ignore, left in for testing / debugging purposes
#form_event_types = form_event_types[0:5]

In [8]:
# Perform a search on Tagvenue with each of the available event types
# and keep the text that the resulting search url uses to denote that
# event type (one entry per form event type, in the same order)
url_event_types = [find_url_event_type(form_event_type)
                   for form_event_type in form_event_types]

In [9]:
# One url event type is collected per form event type, so this count
# should equal the number of event types found on the search form
print(f"Found {len(url_event_types)} search url event types")

Found 190 search url event types


In [10]:
# Preview the first 10 url event types
url_event_types[:10]

['18th-birthday-party',
 '30th-birthday-party',
 '40th-birthday-party',
 '50th-birthday-party',
 'academic-venues',
 'activity-day',
 'afternoon-tea',
 'agm',
 'anniversary-party',
 'art-studio']

We can build a custom search url from an event type and latitude and longitude maximum and minimum values. This url will return search results showing all spaces within the defined latitude and longitude range that are suitable for the event type chosen. We chose to use a custom url so that we had full control over the latitude and longitude ranges. If we had used the Tagvenue search page to generate a search url, we would have to rely on Tagvenue's definition of the area of London.   

An example custom search url is shown below: 

https://www.tagvenue.com/uk/search/18th-birthday-party?longitude_from=-0.270&longitude_to=0.069&latitude_from=51.31&latitude_to=51.69&page=1

We also include the page number in the url. The search results display 36 results per page. The page number defines which page of the results to show. We included this in our custom url because the Tagvenue website alters the url in Chrome after it shows you the results. This alteration changes the latitude and longitude values. As such, if you navigate to another page by selecting a page using the page navigation links at the bottom of the results, it will display spaces from the new longitude and latitude range, and not from the range in the original search url. As a result, when we want to navigate to a new page of the search results, we create a new search result url with an updated page number. This keeps the latitude and longitude range constant as we navigate through results pages.   

Below, we define the function to create a custom search url from an event type and page number (note that the latitude and longitude ranges are defined in the Key Variables section above.)

In [11]:
# Tagvenues changes the longitude and latitude values of the url in 
# Chrome after you load the url, so you need to recreate the whole url
# whenever you change page or event type to keep the results within 
# the desired longitude and latitude range. 
def create_search_url(event_type, page, *, longitude_from=None,
                      longitude_to=None, latitude_from=None,
                      latitude_to=None):
    """Build and return the search url string for one page of results.

    event_type -- event type text in the form used by the search url
    page -- page number of the search results to request
    longitude_from, longitude_to, latitude_from, latitude_to --
        optional overrides for the search area. Any value left as None
        falls back to the matching notebook level setting
        (longitude_min, longitude_max, latitude_min, latitude_max).

    The url is assembled from adjacent string pieces so that it holds
    no line breaks or indentation. A triple quoted string spread over
    several lines would embed that whitespace in the query string.
    """
    if longitude_from is None:
        longitude_from = longitude_min
    if longitude_to is None:
        longitude_to = longitude_max
    if latitude_from is None:
        latitude_from = latitude_min
    if latitude_to is None:
        latitude_to = latitude_max
    return (
        f"https://www.tagvenue.com/uk/search/{event_type}"
        f"?longitude_from={longitude_from}&longitude_to={longitude_to}"
        f"&latitude_from={latitude_from}&latitude_to={latitude_to}"
        f"&page={page}"
    )

### Scraping Event Types Error Log
If an error occurred whilst scraping the event types it is displayed below.   

In [12]:
# Each scraping_error_log entry is [identifier, error]. At this stage the
# identifier is either an event type (search results failed to load) or
# a url (load_page failed), and both end up in the first column.
scrape_errors = pd.DataFrame(scraping_error_log, columns = ['event_type','error'])

# Function to make urls clickable in jupyter
def make_clickable(val):
    """Return val wrapped in an html link if it is a url, else unchanged."""
    # Only urls are turned into links; plain event type names are
    # displayed as they are
    if isinstance(val, str) and val.startswith('http'):
        return '<a href="{}">{}</a>'.format(val,val)
    return val

# The formatter key has to match the name of the column defined above,
# otherwise the formatter is not applied to any column
scrape_errors.style.format({'event_type': make_clickable})

Unnamed: 0,event_type,error


### Collect Space URLs From Search Results

Using custom search urls, we can search for spaces within our defined latitude and longitude ranges. The url will return a search results page as shown in the image below:  

![alt text](../images/tag_venue_search_results.png)

Each space returned by the search is shown as a clickable picture. Clicking on the picture will take you to the web page of that space. We want to navigate through every page of search results and to gather the urls of every space web page shown. The below functions will help us achieve this, the first returns the total number of pages of results and the second scrapes all the space web page urls showing on the current page. 

In [13]:
def find_total_results_pages():
    """Returns the number of pages of search results showing in Chrome.

    The number is read from the pagination links at the bottom of the
    search results page. When fewer than two pagination links are found
    the results are treated as a single page and 1 is returned.
    """
    # Find pagination html elements - these create the clickable page
    # numbers and arrows at bottom of search results page to navigate
    # through search results pages. find_elements(By.XPATH, ...) is used
    # because the find_elements_by_xpath helper no longer exists in
    # current Selenium releases.
    pagination_elements = driver.find_elements(
        By.XPATH,
        "//div[@class='results-pagination results-pagination--center']/ul/li/a")
    # Convert pagination elements to text values in a list e.g. it may
    # look like [<< 1 2 3 ... 17 >>]
    pagination = [element.get_attribute('innerHTML')
                  for element in pagination_elements]
    # The second last entry is the total number of pages, so at least
    # two entries are needed before it can be read
    if len(pagination) > 1:
        return int(pagination[-2])
    # Otherwise there is only one page of results
    return 1

In [14]:
# This block is just for creating a search url for debugging purposes 
#event_type = 'pop-up-event'
#event_type = 'corporate-event'
#page = 1
#search_url = create_search_url('kids-partybus', page)
#load_page(search_url) 

In [15]:
def get_space_urls():
    """Returns the url of every space webpage result showing in Chrome.

    Tagvenue will show a maximum of 36 results per page. This function
    will return the urls for these 36 (or fewer) results currently visible.
    If no venues are returned by the search, the function checks that this
    is due to the search not finding anything and not because an erroneous
    page was loaded. In the latter case a 'search url failed' entry is
    added to scraping_error_log.

    Note: the url written to the error log is read from the notebook level
    variable search_url, which the calling cell must set before each call.
    """
    # Find the html elements that store the url of each space search result
    # (this is the url that is opened when you click on a search result).
    # find_elements(By.XPATH, ...) is used because the find_elements_by_xpath
    # helper no longer exists in current Selenium releases.
    search_result_url_elements = driver.find_elements(
        By.XPATH, "//div[@class='v-search-results-items']/div/a")
    # Checks whether no search result urls were found i.e. list was empty
    if len(search_result_url_elements) == 0:
        # When a search returns no results, Tagvenue displays a h3
        # html text message that says 'sorry, we couldnt find any venues
        # matching your criteria.'
        expected_message = (
            'sorry, we couldnt find any venues matching your criteria.')
        try:
            # Find h3 html element and extract its text
            no_search_results_message = (driver
                                         .find_element(By.XPATH, "//h3")
                                         .get_attribute('innerHTML'))
            # Remove apostrophes and surrounding whitespace and lower
            # case the text before comparing it
            no_search_results_message = (no_search_results_message
                                         .replace("'", "").lower().strip())
        # If no h3 element exists there is no message to compare
        except NoSuchElementException:
            no_search_results_message = None
        # A missing or unexpected message means the empty result may be
        # due to an erroneous page, so log a possible error
        if no_search_results_message != expected_message:
            scraping_error_log.append([search_url, 'search url failed'])
    # Extract urls from html elements and return them. If no urls found,
    # this will return an empty list.
    return [element.get_attribute('href') for element in search_result_url_elements]

The search results page will only show spaces which are suitable for the event type in the search url. As such, we will need to repeat the search for all ~190 event types to ensure that we find every space hosted on the website. The spaces will be duplicated in different search results i.e. sometimes the same space will appear for 'corporate event' and '18th Birthday party'. As such, we will need to remove duplicates at the end. We could have used a *set()* to hold the list of space urls, which would negate the need to remove duplicates at the end. However, it was determined that it was useful when reporting the progress of the scrape, to be able to count the total number of urls scraped.

Below, we perform a separate search for every available event type. For each search, we crawl through the results and gather all the urls for the spaces. A progress update is printed every 30 mins detailing the number of event type searches that have been completed and an approximate total number of results pages scraped.   

In [16]:
# Ignore, left in for testing / debugging purposes
#url_event_types = url_event_types[0:7]

In [17]:
space_urls = []
total_event_types = len(url_event_types)
time_last_update = time.time()
# One search per event type
for event_number, event_type in enumerate(url_event_types):
    # Periodic progress report
    seconds_since_report = time.time() - time_last_update
    if seconds_since_report > progress_report_interval:
        print(f"Scraped {event_number} of {total_event_types} event_types")
        # The number of pages scraped is approximated as the total
        # number of urls found divided by the max results per page (36)
        pages_of_urls_scraped = len(space_urls)/36
        print(f"Approximately {pages_of_urls_scraped:.0f} pages of search results scraped\n")
        time_last_update = time.time()
    # Page 1 of the results is loaded first because it is needed to
    # find out how many pages of results there are
    search_url = create_search_url(event_type, 1)
    load_page(search_url)
    total_pages = find_total_results_pages()
    space_urls.extend(get_space_urls())
    # Every remaining page gets a freshly built search url, is loaded
    # and then has its space urls extracted
    for next_page in range(2, total_pages + 1):
        search_url = create_search_url(event_type, next_page)
        load_page(search_url)
        space_urls.extend(get_space_urls())

Scraped 8 of 190 event_types
Approximately 142 pages of search results scraped

Scraped 19 of 190 event_types
Approximately 278 pages of search results scraped

Scraped 27 of 190 event_types
Approximately 405 pages of search results scraped

Scraped 40 of 190 event_types
Approximately 550 pages of search results scraped

Scraped 52 of 190 event_types
Approximately 671 pages of search results scraped

Scraped 58 of 190 event_types
Approximately 815 pages of search results scraped

Scraped 67 of 190 event_types
Approximately 957 pages of search results scraped

Scraped 77 of 190 event_types
Approximately 1074 pages of search results scraped

Scraped 87 of 190 event_types
Approximately 1194 pages of search results scraped

Scraped 102 of 190 event_types
Approximately 1323 pages of search results scraped

Scraped 111 of 190 event_types
Approximately 1443 pages of search results scraped

Scraped 118 of 190 event_types
Approximately 1554 pages of search results scraped

Scraped 129 of 190 ev

### Scraping Space URLs Error Log
If an error occurred whilst scraping the space urls it is displayed below. 

Note that if a page fails to load due to a connection issue, then it will result in a 'search url failed' error as well and you must re-scrape the entire search url that failed, not just the page that failed.   

In [18]:
# Tabulate every entry logged so far in scraping_error_log
scrape_errors = pd.DataFrame(data=scraping_error_log, columns=['url', 'error'])

def make_clickable(val):
    """Wrap val in an html anchor so that jupyter shows a clickable link."""
    link_template = '<a href="{}">{}</a>'
    return link_template.format(val, val)

# Display the table with the url column rendered as links
scrape_errors.style.format({'url': make_clickable})

Unnamed: 0,url,error


### Cleaning Space URLs

Below we remove duplicates from the space urls and save the data to file. 

In [19]:
# Remove duplicate space urls 
space_urls_unique = list(set(space_urls))
print(f"There are {len(space_urls_unique)} unique space urls")

There are 5511 unique space urls


In [20]:
# Save space_urls_uniqe to file (as json)
with open("space_urls.json", 'w') as f:
    # indent=2 is not needed but makes the file human-readable
    json.dump(space_urls_unique, f, indent=2) 

If you wish to load a saved list of space urls, remove #s and run the below. 

In [21]:
#with open("space_urls.json", 'r') as f:
#    space_urls_unique = json.load(f)

#print(f"There are {len(space_urls_unique)} spaces to scrape")

Every space web page has a standard url format, an example is shown below: 
https://www.tagvenue.com/rooms/london/6637/the-goldsmiths-centre/bench

By exploring the space urls we scraped, we found that sometimes the above format was tweaked by adding extra text to the end of it. This text had the form '?event-offer=offer' where 'offer' varied, and could be 'Christmas' or 'Wedding' or some other value. 

Below we show an example of a tweaked url:  
https://www.tagvenue.com/rooms/london/6637/the-goldsmiths-centre/bench?event-offer=christmas

Some web pages have a 'event offers and packages' section which lists one or more special packages available at the venue. The tweaked url would typically link to the same web page as the normal format url but would auto-expand a special package defined by the extra bit of url e.g. the extra bit of url '?event-offer=christmas' would auto-expand the Christmas package when the page loads.

Occasionally, the tweaked url would open a slightly different web page for the space. All the key space data e.g. price, location, square footage etc. would be the same. The only difference is that a unique special package would be available in the 'event offers and packages' section that is not available on the normal web page. 

For our purposes, the special package data is not useful. We chose to remove all the tweaked urls because they were effectively duplicated links to the same space web page as the normal format url. 

Below, we perform a quick check to ensure all urls with a '?' are of the form we have investigated, namely '?event-offer='. This is to make sure we don't have other unusual url formats that need to be analysed. 

In [22]:
urls_df = pd.DataFrame(space_urls_unique)

# The urls are matched against literal text (regex=False). This avoids
# having to escape the '?' and the invalid '\?' escape sequence that a
# non-raw string pattern would need.
has_qmark = urls_df[0].str.contains('?', regex=False)
has_event_offer = urls_df[0].str.contains('?event-offer', regex=False)

# Find number of urls that include a '?'
urls_with_qmark = int(has_qmark.sum())
# Find number of urls that include '?event-offer'
urls_with_qmark_event_offer = int(has_event_offer.sum())
# Check all strings with '?' are of format '?event-offer'
if urls_with_qmark == urls_with_qmark_event_offer:
    print("All urls with a '?' are of form '?event-offer'")
else:
    print("There are urls with '?' not of the form '?event-offer'")

# Remove urls that contain '?event-offer' and save as list
space_urls_cleaned = urls_df[~has_event_offer][0].to_list()

print(f"Removed {urls_with_qmark_event_offer} urls containing '?event-offer' \n")
print(f"There are now {len(space_urls_cleaned)} space urls")

All urls with a '?' are of form '?event-offer'
Removed 1010 urls containing '?event-offer' 

There are now 4501 space urls


Below we display any urls that contain a '?' but don't contain 'event-offer' (this should show nothing)

In [23]:
# view dataframe contains all urls with a '?' that do not
# include the words 'event-offer'. Literal matching (regex=False) is
# used so the '?' needs no escaping ('\?' in a non-raw string is an
# invalid escape sequence).
view = urls_df[urls_df[0].str.contains('?', regex=False)
               & ~urls_df[0].str.contains('event-offer', regex=False)]
# Display view dataframe with clickable url links
view.style.format({0: make_clickable})

Unnamed: 0,0


The below code is useful for searching through the venue urls to see the different venues and to find different packages available at the venues. It was used to investigate the '?event-offer' urls. 

In [24]:
# Build the dataframe from space_urls_unique to include the package
# urls, or from space_urls_cleaned to leave them out. Change the
# search text to find specific venues or venues with packages e.g.
# search for '\?event-offer=wedding' to get venues with wedding packages
urls_df = pd.DataFrame(space_urls_unique)
# Rows whose url includes the search text
matches_search_text = urls_df[0].str.contains('tanner')
view = urls_df.loc[matches_search_text]
# Show the matching urls as clickable links
view.style.format({0: make_clickable})

Unnamed: 0,0
1733,https://www.tagvenue.com/rooms/london/321/tanner-warehouse/tanner-warehouse
1943,https://www.tagvenue.com/rooms/london/3903/tanner-warehouse/tanner-warehouse-courtyard
2912,https://www.tagvenue.com/rooms/london/3308/tanner-warehouse/industrial-wedding


### Scrape Space Webpage HTML
We will now scrape the full html code from every space web page. We chose to scrape the full html code rather than scraping individual elements that we need. This decision was made because the html elements tend to be inconsistent between web pages. Anticipating and handling all the errors and issues that might crop up is very challenging. The scraping process itself is very slow because time is waited between successive page loads. This means when an unexpected error occurs during the scrape we need to restart the whole process. Pages that failed to scrape or didn't scrape as intended need to be re-loaded and re-scraped.     
It is much simpler, faster and less error prone to download the full html and then extract what we want separately.    

Below we scrape the full html code from all space web pages that we have a url for. This includes a periodic progress report showing the number and percentage of pages that have been scraped. 

In [25]:
# Dictionary storing the url and html of each space webpage, in
# format {url:html}
space_webpages = {}
total_urls_to_scrape = len(space_urls_cleaned)
time_last_update = time.time()

# Loop through space urls and scrape the html code for each space web
# page. Counting starts at 1 so that, when the progress report runs
# after a url has been processed, url_number is the number of urls
# processed so far (a 0-based index would under-report by one).
for url_number, url in enumerate(space_urls_cleaned, start=1):
    # Only store the html when the page loaded successfully
    if load_page(url):
        # Save space html code in space_webpages
        space_webpages[url] = driver.page_source
    # Periodic progress update
    if time.time() - time_last_update > progress_report_interval:
        perc_complete = url_number / total_urls_to_scrape
        print(f"Scraped {url_number} of {total_urls_to_scrape} -> {perc_complete:.1%} Completed\n")
        time_last_update = time.time()

Scraped 467 of 4501 -> 10.4% Completed

Scraped 934 of 4501 -> 20.8% Completed

Scraped 1389 of 4501 -> 30.9% Completed

Scraped 1835 of 4501 -> 40.8% Completed

Scraped 2278 of 4501 -> 50.6% Completed

Scraped 2703 of 4501 -> 60.1% Completed

Scraped 3108 of 4501 -> 69.1% Completed

Scraped 3538 of 4501 -> 78.6% Completed

Scraped 3933 of 4501 -> 87.4% Completed

Scraped 4329 of 4501 -> 96.2% Completed



### Errors During Space HTML Scraping
If an error occurred whilst scraping the space html code, it is displayed below: 

In [26]:
# Tabulate every entry logged so far in scraping_error_log
scrape_errors = pd.DataFrame(data=scraping_error_log, columns=['url', 'error'])

def make_clickable(val):
    """Wrap val in an html anchor so that jupyter shows a clickable link."""
    link_template = '<a href="{}">{}</a>'
    return link_template.format(val, val)

# Display the table with the url column rendered as links
scrape_errors.style.format({'url': make_clickable})

Unnamed: 0,url,error


In [27]:
# Write the {url: html} dictionary of scraped space web pages to a json
# file in the current working directory
with open('space_htmls.json', 'w') as html_file:
    json.dump(space_webpages, html_file)

Uncomment the code below if you wish to load a saved space_htmls file

In [3]:
#with open('space_htmls.json', 'r') as fp:
#    space_webpages = json.load(fp)

### Extract Data From HTML
#### Approach

We will use the lxml library to extract the data we want from the html code of each space's website. The extraction code below loops through each space's web page html and extracts the data.
![alt text](../images/tag_venue_pricing_data.png)

The pricing data for each space is quite complex. It quotes one or several prices for each day of the week. The types of prices shown include a per hour, a per day and per morning price etc. Due to this complexity, we decided that after scraping the data, we would store the price data in a separate dataframe to the general space data. This would allow the complex pricing data to be extensively explored whilst keeping its size a bit more manageable. Thus, the results of this notebook will be **2 separate tables**, a **space data** table and a **space prices** table. 

Some spaces were no longer hosted by Tagvenue and therefore an error page was loaded into chrome when scraping their html rather than a Space web page. To handle these instances, the extraction code checks whether the title of the html page starts with '404' - if so the venue is no longer hosted and the extraction is aborted. A 'venue no longer hosted' error is then logged. If any other unexpected error occurs during extraction, the extraction is also aborted and the error message is logged. 

In [4]:
# Error log for extraction errors. Each entry is a list of 
# [url, error, html] where error is either the string 
# 'venue no longer hosted' or the exception that was raised.
extraction_error_log = []

def extract_data(url, html):
    """Attempts to extract data from the space url and html provided. 
    
    First, the function checks if the space is no longer hosted by Tagvenue. 
    If it is hosted, the url and html are passed to the extract_from_html 
    function which returns the extracted data. If an error occurs during 
    the hosted check or during extraction, the error is logged in 
    extraction_error_log instead of being raised, so a single bad page 
    cannot abort the extraction of all other pages. 
    
    Arguments: 
    url: url of the space web page
    html: html source of the space web page
    
    Returns - Extracted data as a list or returns None if unable to 
    extract data due to venue no longer being hosted, or due to an 
    unexpected error. 
    """
    # NOTE(review): this function appends to the module level 
    # extraction_error_log. When it is run in separate worker processes 
    # (e.g. joblib's process based backend) the appends presumably happen 
    # on the worker's copy of the list and never reach this one - confirm.
    try:
        # The hosted check parses the html and looks up the page title, 
        # both of which can fail on a malformed page, so it is placed 
        # inside the try as well
        if check_page_not_hosted(html):
            # Log extraction error due to venue no longer hosted 
            extraction_error_log.append([url, 'venue no longer hosted', html])
            return None
        # Extract the data from the html code
        return extract_from_html(url, html)
    except Exception as e:
        # Log unexpected error, including error message in log
        extraction_error_log.append([url, e, html])
        return None

#### Define custom extraction functions
The lxml html extraction method always returns a list of results and doesn't raise an error if no html elements are found. We prefer the Selenium methods, which allow you to return either a list of results or a single result, and which raise an error if the single result is not found. Below we define some custom functions to make the lxml extraction methods behave the same as the selenium find_elements_by_xpath and find_element_by_xpath methods. 

In [5]:
def find_element_by_xpath(xpath):
    """Return the first result of evaluating xpath against the global tree.
    
    Mirrors selenium's find_element_by_xpath: an IndexError is raised when 
    the xpath matches nothing. The global tree must have been set (by 
    parsing a page) before this is called. 
    """
    # Evaluate the xpath against the currently parsed page
    matches = tree.xpath(xpath)
    try:
        first_match = matches[0]
    except IndexError:
        # Nothing matched - raise with a more descriptive message
        raise IndexError('No element found via provided Xpath')
    else:
        return first_match
    
def find_elements_by_xpath(xpath):
    """Return every result of evaluating xpath against the global tree.
    
    Mirrors selenium's find_elements_by_xpath: no error is raised here 
    when nothing matches, the result of tree.xpath is returned as is. 
    """
    all_matches = tree.xpath(xpath)
    return all_matches

In [6]:
def clean_list(raw_list):
    """Strip surrounding whitespace from every string in raw_list and 
    discard the strings that are empty once stripped. 
    
    Returns - new list of the stripped, non-empty strings in their 
    original order. 
    """
    cleaned = []
    for raw_item in raw_list:
        stripped_item = raw_item.strip()
        # Whitespace-only strings become '' after stripping - skip them
        if stripped_item == '':
            continue
        cleaned.append(stripped_item)
    return cleaned

#### Data Extraction Details
We will extract all space data (general data and pricing data) into a single list of lists that can easily be turned into a dataframe and separated out into the separate *space prices* and *space data* dataframes. The extraction code below extracts all data we are interested in from a single space webpage and returns this in a list.

![alt text](../images/tag_venue_non_standard_list.png)

Some sections of the web pages include non-standard lists of strings (i.e. the items in the lists change from web page to web page). These lists come in 2 distinct flavours detailed below: 
  1. List of strings where the strings describe whether an item is available. E.g. the string will either be 'External Catering' or 'External Catering not allowed'. These are mutually exclusive declarations i.e. you do not see both strings since they contradict one another.
  2. List of strings that detail an option e.g. 'Wifi' and they are either greyed out or not to indicate whether they are available.
  
You can see both flavours in the Catering section screenshot shown above. The top section is the first type and the bottom section is the second. We decided that the easiest way to scrape this data was to save the data as either a list or dictionary and then to expand this data into separate columns later when making the dataframes. We deal with the first type by saving as a list of strings e.g. ['In-house catering', 'External catering not allowed']. We deal with the second type by saving as a dictionary where the keys are the strings shown and the value is True or False depending on whether the option is available or not e.g. {'Wi-Fi': True}.
![alt text](../images/tag_venue_capacity_section.png)

The *capacity* section of the website (shown above) was similar to the second string case. It shows the event layouts available at the space (up to 7 options available) and the maximum capacity for each layout. We stored this data as a dictionary where the layout was the key and max capacity was the value. 
![alt text](../images/tag_venue_pricing_data.png)

The price data (shown above) was stored as a list of lists, with the intention of separating this out into a separate dataframe later on. Each list has elements [day_of_week, pricing_period, time_period, price, price_type]. An example list would be ['Monday', 'Per day', '9:00 – 17:00', '£660', 'hire fee'].

The below table details all data scraped from each Space web page (when available): 

Data |Data type|Description
:---|:---:|:---
venue_url|string|url of the venue web page on Tagvenue.com. The venue owns the space i.e. the event space resides within the venue. 
venue_name|string|Name of venue. The venue owns the space, i.e. the space resides within the venue. 
space_name |string|Name of event space within venue
latitude|float|Latitude of venue
longitude|float|Longitude of venue
address|string|Address of venue
nearest_tube_station|string|Nearest tube station to venue, includes distance from nearest tube in yards when available 
max_seating|int|maximum seating capacity of event space
max_standing|int|maximum standing capacity of event space
area_in_msqrd|int|Area of event space in metres squared
catering_offered_by_venue|Bool|Is catering offered by the venue for this event space (True / False)
external_catering_allowed|Bool|Is external catering allowed for this event space (True/False)
supervenue|Bool|Is this venue a supervenue (True / False) - 'Supervenue program is based on our customers' feedback and highlights venues that are most dedicated to providing outstanding hospitality, customer service and event experience' 
capacity|dictionary|Dictionary of layouts available at the venue (e.g. 'Boardroom') as keys with maximum capacity for that layout as the value. e.g. {'Boardroom':25}
top_catering_list|list of strings|List of string descriptions of catering options e.g. ['In-house catering', 'External catering not allowed']. This data is the top portion of the Catering section.
bottom_catering_dic|dictionary|Dictionary of catering options available (e.g. 'Halal menu') as keys and the value being True or False depending on whether the option is available. E.g. {'Halal menu':True}. This data is the bottom portion of the catering section. 
features|dictionary|Dictionary of all feature options on the web page, taken from the Accessibility, facilities, sound and music sections etc. The dictionary uses the option name as keys e.g. 'Wi-Fi' and the value is True or False depending on whether the option is available. E.g. {'Wi-Fi': False}
prices|list of lists|List of lists where each list is a row of the pricing data from the web page. Each list has elements [day_of_week, pricing_period, time_period, price, price_type]. An example list would be ['Monday', 'Per day', '9:00 – 17:00', '£660', 'hire fee']

In [7]:
def extract_from_html(url, html):
    """Extracts and returns data from provided html as a list.
    
    Parses the html of a single space web page with lxml and extracts 
    the venue and space names, location, capacity, catering, feature 
    and pricing data. 
    
    Arguments: 
    url: url of the space web page. It is returned as the first element 
        of the result and is used to derive the venue and space names 
        when they cannot be read from the page header. 
    html: html source of the space web page. 
    
    Returns - list with elements [url, venue_url, venue_name, space_name, 
    latitude, longitude, address, nearest_tube_station, max_seated, 
    max_standing, area_in_m2, catering_offered, external_catering_allowed, 
    supervenue, capacity, top_catering_list, bottom_catering_dic, 
    features, prices]. nearest_tube_station and area_in_m2 are np.nan 
    when not found on the page, max_seated and max_standing are 0 when 
    not found, and top_catering_list / bottom_catering_dic are empty 
    when the page has no Catering section. 
    
    Raises - IndexError when a required element (venue link, h1 header, 
    address, map link, catering summary, pricing row fields) is not found. 
    Errors raised while converting extracted text to numbers are also 
    not handled here. 
    """
    # Make tree global so that custom lxml find html element functions 
    # work without having to pass the tree as an argument 
    global tree
    # Parse html with lxml library
    tree = lxml.html.fromstring(html)
    
    # Find html element containing url link to venue page (an <a> 
    # directly inside an h2 with class='room-at-venue__section__title')
    venue_url_element = find_element_by_xpath(
        "//h2[@class='room-at-venue__section__title']/a")
    # Extract venue url from venue url element
    venue_url = venue_url_element.get('href')
    
    # Find h1 header html element that contains venue and space name 
    header_element = find_element_by_xpath("//h1")
    # Extract space and venue name string from html element
    header = header_element.text.strip().lower()
    # header has general form 'space_name at venue_name'. Split venue
    # and space name into a list using ' at ' as separator. 
    venue_and_space_name = header.split(' at ')
    if len(venue_and_space_name) == 2:
        space_name, venue_name = venue_and_space_name
    else:
        # The header did not split into exactly 2 parts (either ' at ' 
        # is missing or it occurs more than once), so the names cannot 
        # be read from it reliably. Take venue and space name from url 
        # instead. End of url has format /venue_name/space_name. We 
        # split the url into a list with separator '/'
        url_split = url.split('/')
        # Last element of url split is space_name
        space_name = url_split[-1].replace('-',' ')
        # Second last element of url split is venue_name
        venue_name = url_split[-2].replace('-',' ')

    # Find address html element
    address_element = find_element_by_xpath(
        "//span[@class='c-room-header__text_link' and contains(text(),',')]")
    # Extract text, remove '\n's and whitespace
    address = address_element.text.replace('\n','').strip()
    
    # Find html element for map
    map_element = find_element_by_xpath("//a[@href='#map-modal']")
    # Extract latitude and convert to float
    latitude = float(map_element.get('data-lat'))
    # Extract longitude and convert to float
    longitude = float(map_element.get('data-long'))
    
    # Extract nearest tube station text and remove whitespace. The 
    # lookup raises an IndexError when the page shows no tube station, 
    # in which case the value is recorded as missing. 
    try:
        nearest_tube_station = find_element_by_xpath(
            "//div[@class='c-room-header__transport "
            + "js-open-map-modal']//span/text()").strip()
    except IndexError:
        nearest_tube_station = np.nan
    
    # Max seated and max standing data is not always present. when it is
    # not present, it means the respective max value is 0. We will set
    # both values to 0 initially and update with the extracted value if 
    # it can be found. The area is initialised to nan so that a page 
    # without an area value does not leave area_in_m2 undefined. 
    max_seated = 0
    max_standing = 0 
    area_in_m2 = np.nan
    
    # Extract list of capacity data. It has typical format [max seated,
    # max standing, area of venue] but will have fewer elements if max
    # seating or max standing is not available.  
    capacity_data = find_elements_by_xpath(
        "//a[@href='#capacitySection']//strong/text()")
    
    # Extracts the html elements for the capacity data, one element 
    # each for max standing, max sitting and area.
    capacity_data_text = find_elements_by_xpath(
        "//a[@href='#capacitySection']//div[@class='c-venue-feature__label']")
    # Extract text from each of the capacity html elements - this text
    # includes strings that indicate if the value is for standing or 
    # sitting etc. We need to use this text to determine what max 
    # value is contained in each element because sometimes some of 
    # the elements are missing, so we dont know the index for 
    # seating or standing in that case.  
    capacity_data_text = [element.text_content().strip() 
                          for element 
                          in capacity_data_text]
    
    # Loop through capacity text and data, determine what value
    # each data element refers to by checking whether the text 
    # contains the words 'seats', 'standing' or 'm' and then record that 
    # value. The single letter 'm' check is the least specific so it is 
    # only tried when the label is not a seats or standing label. 
    for text, data in zip(capacity_data_text, capacity_data):
        if 'seats' in text:
            max_seated = int(data)
        elif 'standing' in text:
            max_standing = int(data)
        elif 'm' in text:
            # Last 2 characters of the area value are dropped before 
            # converting to int (presumably the unit suffix - confirm)
            area_in_m2 = int(data[:-2])
    
    # Get list of venue catering data [catering_offered, external_catering]
    venue_catering_data = find_elements_by_xpath(
        "//a[@href='#cateringSection']/div/div[@class='c-" 
        + "venue-feature__label']/text()")
    
    catering_offered = venue_catering_data[0].strip()
    external_catering_allowed = venue_catering_data[1].strip()
    
    # Try to find the html element showing the venue is a super venue. If 
    # the element is found, flag the venue as a supervenue, otherwise an 
    # IndexError is raised and the venue is flagged as not a super venue.
    try:
        find_element_by_xpath(
            "//img[@class='supervenue-badge-sm visible-xs-inline visible-md-inline']")
        supervenue = True
    except IndexError:
        supervenue = False
    
    # Extract list of capacity names available for space (Boardroom, 
    # theatre, cabaret etc.)
    capacity_names = find_elements_by_xpath(
        "//div[@class='capacity-name']/text()")
    # Extract list of maximum capacity numbers for each capacity type above
    capacity_maxes = find_elements_by_xpath(
        "//div[@class='capacity-number']/strong/text()")
    
    # Combine together into single dictionary 
    # {capacity_name + '_max' : capacity_max}
    capacity = {}
    for capacity_name, capacity_max in zip(capacity_names, capacity_maxes):
        capacity[capacity_name.strip() + '_max'] = int(capacity_max)
    
    # the catering info is split into 2 data types - one is the top_catering_list.
    # This is a list of strings that describe whether something is available 
    # e.g. 'External Catering Allowed'. The second data type is the 
    # bottom_catering_dic - this is a dictionary of catering facilities e.g. 
    # 'Halal menu' and either True or false to indicate if its available. 
    # Both start empty so they are defined even when the page has no 
    # Catering section. 
    top_catering_list = []
    bottom_catering_dic = {}
    
    # Loop through different features sections of space webpage 
    for element in find_elements_by_xpath("//div[@class='row room__section']"):
        # Check if this section is the 'Catering Section' by seeing if the h2 
        # within the element says 'Catering'. Sections without an h2, or 
        # whose h2 has no text, are not the catering section. 
        section_headings = element.xpath(".//h2")
        if section_headings and 'Catering' in (section_headings[0].text or ''):
            # Extract the list of strings in the top portion of the catering
            # section. This contains a list of descriptive strings 
            # that say whether something is available or not. 
            raw_top_catering_list = element.xpath(
                ".//div[@class='c-room-feature-list "
                + "c-room-feature-list--pull-up']//span/text()")
            # The extraction includes many elements we dont want which just 
            # contain whitespace characters so we clean the list
            top_catering_list = clean_list(raw_top_catering_list) 
            # Extract the html elements for the bottom portion of 
            # Catering - this contains a series of catering attributes
            # which are either greyed out or not to indicate if they are 
            # available. 
            bottom_catering_element = element.xpath(
                ".//div[@class='js-catering-details']")[0]
            # Extract the html element of each individual catering attribute
            bottom_catering_elements = bottom_catering_element.xpath(
                ".//div[contains(@class,'c-room-feature')]")
            # Loop through each catering attribute html element 
            for catering_element in bottom_catering_elements:
                try:
                    # Extract the text string of the attribute - this will be
                    # extracted as a list containing multiple whitespace
                    # elements we dont want 
                    raw_catering_attribute = catering_element.xpath(".//span/text()")
                    # Clean the attribute text to remove whitespace elements 
                    catering_attribute = clean_list(raw_catering_attribute)[0]
                    # If the class of the html element includes 'inactive' 
                    # then the catering attribute is not available 
                    bottom_catering_dic[catering_attribute] = (
                        'inactive' not in catering_element.get('class'))
                # Sometimes the raw_catering_attribute xpath returns only 
                # whitespace and no useful text. This results in an index
                # error when cleaning the attribute. We simply skip these 
                # errors, they do not impact our scraping, we still scrape
                # all catering attributes. 
                except IndexError:
                    pass
    
    # General space features such as 'wheelchair access' are either greyed 
    # out to indicate not available or shown in black text to indicate 
    # available. We will store this data in a dictionary of 
    # format {feature:True or False} where True would indicate the feature 
    # is available. 
    features = {}
    # Each feature is contained in a div whose class contains 
    # 'room__feature'. First we find all these divs then loop 
    # through them.  
    features_elements = find_elements_by_xpath(
        "//div[contains(@class,'room__feature')]")
    for feature_element in features_elements:
        # We place in a try to avoid an index error that occurs when an 
        # element contains no text i.e. has no visible data on the web page. 
        try:
            # Extract the text from all spans within the element as a list
            # - this will describe the feature e.g. 'wheelchair access' 
            # but will also contain whitespace elements we dont want
            raw_feature_text = feature_element.xpath(".//span/text()")
            # Clean the feature text to extract only the text we want
            feature_text = clean_list(raw_feature_text)[0]
            # If the html element's class contains the words 'strikethrough'
            # then the feature is greyed out and therefore not available. 
            features[feature_text] = (
                'strikethrough' not in feature_element.get('class'))
        except IndexError:
            pass
    # Store price information in list of lists 
    prices = []
    # Price data for each day of the week (Monday, Tuesday etc.) is 
    # stored in a separate div with attribute 'data-day-of-week'. 
    # Loop through the day of week pricing html elements. 
    for element in find_elements_by_xpath("//div[@data-day-of-week]"):
        # Extract the day of week of current element
        day_of_week = element.get('data-day-of-week')
        # Check if venue is closed on this day of week. If so, the 
        # class attribute of the element will contain 'closed'.
        if 'closed' in element.get('class'):
            # Add row to pricing list indicating closed on this day. 
            pricing_period = 'closed' 
            time_period = np.nan
            price = np.nan
            price_type = np.nan
            prices.append([day_of_week, pricing_period, time_period, 
                           price, price_type])
        else:
            # Each day of week has one or more rows of pricing data. Each 
            # row is stored in a separate div with class 
            # 'c-pricing-table__row-inner'. Below we extract all available 
            # row data html elements
            pricing_rows = element.xpath(".//div[@class='c-pricing-table__row-inner']")
            # Remove first row (it has no data)
            pricing_rows = pricing_rows[1:]
            # Loop through each price row's html element
            for pricing_row in pricing_rows:
                # Extract all text within element as a string - this will 
                # contain pricing data and lots of whitespace and '\n' 
                # between each bit of data. 
                raw_pricing_string = pricing_row.text_content()
                # Split string into list using '\n' as separator
                raw_pricing_list = raw_pricing_string.split('\n')
                # Clean list, removing all elements only containing whitespace. 
                pricing_list = clean_list(raw_pricing_list)
                # An example cleaned list is ['Per day', '9:00 – 17:00', 'from',
                # '£660', 'hire fee']. Extract the relevant pricing data and add 
                # to prices list (index 2 is skipped, it is the word 'from' 
                # in the example above). 
                pricing_period = pricing_list[0] 
                time_period = pricing_list[1]
                price = pricing_list[3]
                price_type = pricing_list[4]
           
                prices.append([day_of_week, pricing_period, time_period, 
                               price, price_type])
        
    return [url, venue_url, venue_name, space_name, latitude, longitude, address, 
            nearest_tube_station, max_seated, max_standing, area_in_m2,
            catering_offered, external_catering_allowed, supervenue, capacity,
            top_catering_list, bottom_catering_dic, features, prices]

In [8]:
def check_page_not_hosted(html): 
    """Report whether the html is Tagvenue's error page for a venue that 
    is no longer hosted. 
    
    Returns - True if the page title begins with '404', otherwise False. 
    """
    # The custom lxml find functions read the parsed page from the 
    # global tree rather than taking it as an argument, so the parsed 
    # html is stored there 
    global tree
    tree = lxml.html.fromstring(html)
    # Text of the page's <title>
    title = find_element_by_xpath("//title/text()")
    # When not hosted, Tagvenue returns a 404 error at the 
    # beginning of the html title  
    return title.startswith('404')

In [9]:
# ignore - used for testing and debugging
# space_webpages_test = {url:html for url, html in list(space_webpages.items())[0:10]}

In [10]:
# Run extract_data over every scraped space web page. joblib's Parallel 
# spreads the calls over all available cores (n_jobs=-1); the result is 
# the same list that this comprehension would produce: 
# data = [extract_data(url, html) for url, html in space_webpages.items()] 
# NOTE(review): the output below shows the LokyBackend being used; entries 
# that extract_data appends to extraction_error_log inside the workers 
# presumably do not reach the list in this kernel - confirm. 
extraction_tasks = (delayed(extract_data)(url, html)
                    for url, html in space_webpages.items())
data = Parallel(n_jobs=-1, verbose=6)(extraction_tasks)
# extract_data returns None for pages it could not extract data from 
# - keep only the pages that produced data 
data = [item for item in data if item is not None]
print(f"{len(data)} spaces were successfully scraped")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 336 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 1032 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 1936 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 3032 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 4336 tasks      | elapsed:   12.5s


4501 spaces were successfully scraped


[Parallel(n_jobs=-1)]: Done 4501 out of 4501 | elapsed:   12.9s finished


### Extraction Error Log
If extraction failed on a webpage, display error below along with error message. 

In [11]:
# Remember to reset extraction error log when re-running extraction! 
#extraction_error_log = [] 

In [12]:
# Tabulate the extraction error log, one row per logged 
# [url, error, html] entry
extraction_errors = pd.DataFrame(extraction_error_log,
                                 columns=['url', 'error', 'html'])


def make_clickable(val):
    """Return val wrapped in an html anchor tag so that jupyter renders 
    it as a clickable link"""
    return f'<a href="{val}">{val}</a>'


# Display url and error only (the html column is left out of the 
# display), with the url rendered as a link
extraction_errors[['url', 'error']].style.format({'url': make_clickable})

Unnamed: 0,url,error


### Build Space Prices and Space Data Dataframes
We will create 2 dataframes from our extracted data as detailed below: 
1. Space Prices dataframe (called *df_prices* in the code) that contains the price data for all spaces. 
2. Space Data dataframe (called *df_final* in the code) that contains general info about each space. 

#### Load Dataframe
Below we convert the extracted data to a dataframe and show the first few rows. 

In [13]:
# Show up to 80 columns when displaying a dataframe
pd.set_option('display.max_columns', 80)

# Column labels, in the same order as the elements of the list that 
# extract_from_html returns for each space
space_columns = [
    'space_url', 'venue_url', 'venue_name', 'space_name',
    'latitude', 'longitude', 'address', 'nearest_tube_station',
    'max_seated', 'max_standing', 'area_in_m2',
    'catering_offered', 'external_catering_allowed', 'supervenue',
    'capacity', 'top_catering_list', 'bottom_catering_dic',
    'features', 'prices',
]
df = pd.DataFrame(data, columns=space_columns)
df.head()

Unnamed: 0,space_url,venue_url,venue_name,space_name,latitude,longitude,address,nearest_tube_station,max_seated,max_standing,area_in_m2,catering_offered,external_catering_allowed,supervenue,capacity,top_catering_list,bottom_catering_dic,features,prices
0,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/10000/t...,techspace aldgate east,aldgate east 3.2 (pairing),51.51405,-0.070852,"32-38 Leman Street, London, E1 8EW",Aldgate East Station (200 yd),2,0,4,Venue doesn’t offer catering,External catering allowed,False,{'Boardroom_max': 2},"[No in-house catering, External catering allow...","{'Buyout fee for external catering': False, 'K...","{'Wi-Fi': True, 'Flatscreen TV': True, 'Whiteb...","[[Monday, Per hour, 9:00 – 17:00, £30, hire fe..."
1,https://www.tagvenue.com/rooms/london/4947/gre...,https://www.tagvenue.com/venues/london/2111/gr...,green rooms hotel,the gallery (3rd floor),51.59705,-0.11091,"13-27 STATION ROAD , WOOD GREEN , LONDON, N22 6UW",Wood Green Station (150 yd),88,120,111,Venue offers catering,External catering not allowed,False,"{'Standing_max': 120, 'Dining_max': 88, 'Theat...","[In-house catering, Approved caterers only, Ex...",{'Alcohol licence until 03:00 (extension avail...,"{'Wi-Fi': True, 'Projector': True, 'Flipchart'...","[[Monday, Per hour, 8:00 – 23:00, £110, hire f..."
2,https://www.tagvenue.com/rooms/london/18440/14...,https://www.tagvenue.com/venues/london/8258/14...,148 leadenhall street,john taylor,51.51347,-0.083224,"Leadenhall Street, 148, London, EC3V 4QT",Bank Station (450 yd),10,0,20,Venue doesn’t offer catering,External catering allowed,False,{'Boardroom_max': 10},"[No in-house catering, External catering allow...","{'Buyout fee for external catering': False, 'K...","{'Wi-Fi': True, 'Flatscreen TV': True, 'Whiteb...","[[Monday, Per morning, 8:30 – 13:00, £318, hir..."
3,https://www.tagvenue.com/rooms/london/419/conw...,https://www.tagvenue.com/venues/london/186/con...,conway hall,brockway room,51.519792,-0.118337,"25 Red Lion Square, London, WC1R 4RL",Holborn Station (350 yd),60,60,81,Venue offers catering,External catering not allowed,False,"{'Standing_max': 60, 'Dining_max': 40, 'Theatr...","[In-house catering, External catering not allo...","{'Halal menu': True, 'Kosher menu': True, 'Ext...","{'Wi-Fi': True, 'Natural light': True, 'Projec...","[[Monday, Per day, 9:00 – 17:00, £75, per pers..."
4,https://www.tagvenue.com/rooms/london/12387/wh...,https://www.tagvenue.com/venues/london/5716/wh...,whyte & brown,terrace,51.512505,-0.138641,"Whyte & Brown, Unit G2 Kingly Court, London, W...",Oxford Circus Station (400 yd),40,50,45,Venue offers catering,External catering allowed,False,"{'Standing_max': 50, 'Dining_max': 40}","[In-house catering, External catering allowed,...","{'Alcohol licence until 23:00': True, 'Buyout ...","{'Wi-Fi': True, 'Whiteboard': True, 'Flipchart...","[[Monday, Per morning, 8:00 – 11:00, £200, min..."


#### Build Space Prices Dataframe
First we will build the Space Prices dataframe by splitting the pricing data from the rest of the data and removing the pricing column from our main data. 

In [14]:
# Columns that identify a space, plus its raw list of price rows; these 
# form the starting point of the separate pricing dataframe
price_frame_columns = ['space_url', 'venue_url', 'venue_name', 'space_name',
                       'latitude', 'longitude', 'prices']
df_prices = df[price_frame_columns]
# The pricing data now lives in df_prices, so remove it from the main 
# data (in place; re-running this cell fails once the column is gone)
del df['prices']

Each row of the *prices* column contains a list of lists, where each of the lists is a row of the pricing data for that space. Each space (each row of df) thus has many rows of pricing data stored within the list of lists, each of which includes *day_of_week, pricing_period, time_period, price* and *price_type*. We will expand the list of lists data such that we have separate columns for each attribute in the lists (e.g. a *day of week* column) and will have a separate row for each pricing row (e.g. a single space will now have many rows within df_prices).  

In [15]:
# Spread each space's list of price rows over the columns of a frame 
# (one row per space), transpose it so there is one column per space, 
# then melt into long format. The result has 2 columns - 'value' which 
# holds a single price row (a list) and 'variable' which holds the 
# column label it came from, i.e. the position of the space in 
# df_prices. Spaces with fewer price rows than the longest one are 
# padded with missing values, which dropna removes. 
price_rows_wide = pd.DataFrame(df_prices['prices'].tolist())
df_melted = price_rows_wide.T.melt().dropna()
# Expand each price row list into its own columns, indexed by the 
# df_prices row it belongs to 
price_columns = ['day_of_week', 'pricing_period',
                 'time_period', 'price', 'price_type']
df_tmp = pd.DataFrame(df_melted['value'].tolist(),
                      columns=price_columns,
                      index=df_melted['variable'])
# Join on the index: each space row of df_prices is repeated once per 
# matching price row of df_tmp. The raw 'prices' column is no longer 
# needed afterwards. 
df_prices = df_prices.join(df_tmp).drop(columns=['prices'])
df_prices.head()

Unnamed: 0,space_url,venue_url,venue_name,space_name,latitude,longitude,day_of_week,pricing_period,time_period,price,price_type
0,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/10000/t...,techspace aldgate east,aldgate east 3.2 (pairing),51.51405,-0.070852,Monday,Per hour,9:00 – 17:00,£30,hire fee per hour
0,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/10000/t...,techspace aldgate east,aldgate east 3.2 (pairing),51.51405,-0.070852,Tuesday,Per hour,9:00 – 17:00,£30,hire fee per hour
0,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/10000/t...,techspace aldgate east,aldgate east 3.2 (pairing),51.51405,-0.070852,Wednesday,Per hour,9:00 – 17:00,£30,hire fee per hour
0,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/10000/t...,techspace aldgate east,aldgate east 3.2 (pairing),51.51405,-0.070852,Thursday,Per hour,9:00 – 17:00,£30,hire fee per hour
0,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/10000/t...,techspace aldgate east,aldgate east 3.2 (pairing),51.51405,-0.070852,Friday,Per hour,9:00 – 17:00,£30,hire fee per hour


#### Build Space Data Dataframe
We have created the *space prices* dataframe above and will now finalise the *Space Data* dataframe.  During the extraction of data from the html, we saved some data as dictionaries in the form {attribute:True or False} such that each row had a dictionary. We also saved data as a list of strings such that each row had a list. Below we expand these columns out so the data is in a tabular form.

When we expand the dictionary data out, we create a new column for every key (each key being a different attribute description e.g. 'Disabled access') within all dictionaries in the column. The value in each column will then be True or False, depending on what value was within the dictionary for that row, or null if the dictionary of that row did not contain the column attribute.

We perform a similar operation to expand the list of strings. A new column is created for the unique set of all strings in the column. The value in the column will then be 1 if the string is contained in the list of strings for that row or 0 if it is not.

Sometimes during this expansion process, we create the same column names in separate expansions. This happens when the same attribute is shown in different sections of the website e.g. 'Venue provides alcohol' may be included in the facilities section as well as the catering section, and thus occurs in 2 of our expansions.

To deal with this, we decided to merge each pair of columns with the same names together into a single column. The columns will either contain 0s and 1s or True, False or nan values. The merging logic is as follows:

When merging 2 columns which both contain True, False or nan values -> For each row, if either of the columns has a True then the result is recorded as True. Otherwise, if either of the columns contains False it is set to False. Otherwise the value is nan.

When merging a column with 0s and 1s to a column with True, False or nan values -> For each row, if either of the columns has a 1 or a True then the result is recorded as 1, otherwise it is 0.

The expansions were done in the following order (to align the columns in the most useful way):
 1. capacity 
 2. top_catering_list
 3. bottom_catering_dic
 4. features

In [16]:
def join_or_merge(df, new_df_columns):
    """Join new_df_columns to df, merging any columns whose names clash.

    Columns of new_df_columns whose labels already exist in df would make
    a plain join fail, so they are left out of the join and their data is
    merged into the existing df column instead.

    There are 2 intended situations for the merge, chosen from the dtype
    of the existing df column:

    Situation 1 - the df column is not an integer column (both columns
    hold True, False or nan). If either value is True the result is True,
    otherwise if either value is False the result is False, otherwise it
    is nan. Note that np.select stores the result in a float array, so
    the merged column holds 1.0 / 0.0 / nan.

    Situation 2 - the df column has an integer dtype (0s and 1s) and the
    new_df_columns column holds True, False or nan. If either value is
    1 / True the result is 1, otherwise it is 0.

    Arguments:
    df: dataframe we are adding columns to
    new_df_columns: A dataframe of new columns being joined to df. The
        join and the element-wise comparisons align on the index, so it
        is expected to share df's index.

    Returns:
    A new dataframe holding df's columns (clashing ones merged) followed
    by the non-clashing columns of new_df_columns.
    """
    # Labels present in both frames, kept in df's column order
    new_column_labels = set(new_df_columns.columns)
    duplicated_columns = [column for column in df.columns
                          if column in new_column_labels]

    # Join df to all columns of new_df_columns that don't share a name
    # with a column in df
    df = df.join(new_df_columns.drop(columns=duplicated_columns))

    for column in duplicated_columns:
        # Decide from the column dtype rather than from the first cell:
        # this also works when the index has no label 0, when the frame
        # is empty, and for integer widths other than int64.
        if pd.api.types.is_integer_dtype(df[column].dtype):
            # Situation 2: 1 if df has 1 or new_df_columns has True, else 0
            df[column] = np.where((df[column] == 1)
                                  | (new_df_columns[column] == True), 1, 0)
        else:
            # Situation 1: True wins over False, False wins over nan
            conditions = [(df[column] == True)
                          | (new_df_columns[column] == True),
                          (df[column] == False)
                          | (new_df_columns[column] == False)]
            choices = [True, False]
            df[column] = np.select(conditions, choices, default=np.nan)

    return df

In [17]:
# Flatten the nested columns of df into ordinary tabular columns. The
# expansions run in a fixed order because that order determines where the
# new columns end up in the dataframe.

# 'capacity' holds one dictionary per row. json_normalize creates one
# column per dictionary key; a row whose dictionary lacks a key gets a
# null in that key's column.
capacity_columns = pd.json_normalize(df.capacity)
df = df.join(capacity_columns)

# 'top_catering_list' holds a list of descriptive strings per row, e.g.
# ['Approved caterers only', 'venue provides alcohol']. Each distinct
# string becomes a separate column holding 1 if the string is in that
# row's list and 0 if it is not.
top_catering_flags = df.top_catering_list.str.join('|').str.get_dummies()
df = df.join(top_catering_flags)

# 'bottom_catering_dic' is another column of dictionaries
bottom_catering_columns = pd.json_normalize(df.bottom_catering_dic)
df = df.join(bottom_catering_columns)

# 'features' is also a column of dictionaries, but some of its keys
# repeat column names already in df, so it is attached with
# join_or_merge, which merges same-named columns into a single column.
feature_columns = pd.json_normalize(df.features)
df = join_or_merge(df, feature_columns)

# The original dictionary / list-of-strings columns are no longer needed
expanded_source_columns = ['capacity', 'bottom_catering_dic', 'features',
                           'top_catering_list']
df = df.drop(columns=expanded_source_columns)

Expanding the 'features' column has added **~2000 columns** to the dataframe. Most of these columns contain mostly null values! This is because a lot of the feature attributes were tailored to individual spaces, so you get a lot of attributes that only appear on a single web page and only relate to a single space.  

We need a quick and easy way to compare our data for a space to the web page version so we can check it is correct. This is difficult because we now have ~2000 columns, most of which will have a null value! The below code selects a single row in the dataframe and for this row prints the value in each column if the value is non-null. 

In [18]:
# Index label of the row (space) we want to inspect
row = 4000
# Walk the chosen row as (column label, value) pairs
for column, value in df.loc[row].items():
    # NaN compares unequal to itself, so this skips the null entries
    if not (value == value):
        continue
    print(column,'->', value)

# alternative -> df.loc[0,df.loc[0].notnull()]

space_url -> https://www.tagvenue.com/rooms/london/13261/nhow-london/tech-lab
venue_url -> https://www.tagvenue.com/venues/london/5749/nhow-london
venue_name -> nhow london
space_name -> tech lab
latitude -> 51.529304751718
longitude -> -0.097192525863647
address -> Macclesfield Road, 2, London, SE25 4RZ
nearest_tube_station -> Angel Station (800 yd)
max_seated -> 8
max_standing -> 0
area_in_m2 -> 36
catering_offered -> Venue offers catering
external_catering_allowed -> External catering not allowed
supervenue -> False
Boardroom_max -> 8.0
Approved caterers only -> 0
BYO alcohol allowed -> 0
BYO alcohol not allowed -> 1
External catering allowed -> 0
External catering not allowed -> 1
In-house catering -> 1
No in-house catering -> 0
Venue doesn’t provide alcohol -> 0
Venue provides alcohol -> 1
Complimentary water -> 0.0
Complimentary tea and coffee -> 0.0
Halal menu -> False
Kosher menu -> False
Extensive vegan menu -> True
Extensive gluten-free menu -> True
Alcohol licence until 22:0

In [19]:
# The columns created by expanding 'features' start at 'Wi-Fi' and run
# through to the last column, so a label slice picks them all up.
first_feature_column = 'Wi-Fi'
df_features = df.loc[:, first_feature_column:]

feature_column_count = df_features.shape[1]
print(f"There are {feature_column_count} features columns")

There are 2064 features columns


We have far too many columns with very sparse data (over 2000!) and we don't want to keep all this useless data. So before we save the data to csv, we will remove the columns that contain very little information. We will keep only the columns from the 'features' expansion that have more than 400 non-null entries. 400 was chosen because it is roughly 10% of the rows (there are about 4,500 rows), which makes it a generous minimum threshold. Below we display the columns that meet this criterion.  

In [20]:
# Count the non-null entries of every feature column and keep only the
# columns that have more than 400 of them
non_null_counts = df_features.count()
has_enough_entries = non_null_counts > 400
df_useful_features = df_features.loc[:, has_enough_entries]
# Show the non-null count of each retained column
df_useful_features.count()

Wi-Fi                                       4501
Flatscreen TV                               4501
Whiteboard                                  4501
Natural light                               4501
Projector                                   4501
Flipchart                                   4501
Conference call facilities                  4501
Air conditioning                            4501
Storage space                               4501
Accommodation available                     4501
Parking available                           2788
Own music allowed                           4501
Bring your own DJ                           4501
PA system / music speakers available        4501
Lift to all floors                           650
Wheelchair accessible                       4501
Promoted / ticketed events                  4501
Loud music / events                         4501
Wedding licence                             4501
Temporary event notices (TENs) available    4501
Free parking is avai

There are about **23** columns out of **~2000** that we actually want to keep. Most of them have no null data at all and thus must be mandatory data on Tagvenue. Below we drop the rest of the columns from the data. This completes the *Space Data* dataframe.

In [21]:
# The columns to drop are the feature columns that are in df_features but
# not in df_useful_features. A plain difference (rather than a symmetric
# one) states that intent directly and can never yield a label that is
# missing from df_features.
columns_to_drop = df_features.columns.difference(
    df_useful_features.columns, sort=False
)
df_final = df.drop(columns=columns_to_drop)
df_final.head()

Unnamed: 0,space_url,venue_url,venue_name,space_name,latitude,longitude,address,nearest_tube_station,max_seated,max_standing,area_in_m2,catering_offered,external_catering_allowed,supervenue,Boardroom_max,Standing_max,Dining_max,Theatre_max,Cabaret_max,U-Shaped_max,Classroom_max,Approved caterers only,BYO alcohol allowed,BYO alcohol not allowed,External catering allowed,External catering not allowed,In-house catering,No in-house catering,Venue doesn’t provide alcohol,Venue provides alcohol,Buyout fee for external catering,Kitchen facilities available for guests,Complimentary water,Complimentary tea and coffee,Alcohol licence until 03:00 (extension available),Halal menu,Kosher menu,Extensive vegan menu,Extensive gluten-free menu,Alcohol licence until 23:00,...,Alcohol licence until 22:30 (extension available),Alcohol licence until 00:30,Alcohol licence until 02:00,Alcohol licence until 23:30 (extension available),Alcohol licence until 05:00 (extension available),Alcohol licence until 23:30,Alcohol licence until 04:00 (extension available),Alcohol licence until 00:30 (extension available),Alcohol licence until 22:30,Alcohol licence until 05:00,Alcohol licence until 01:30,Alcohol licence until 21:00 (extension available),Alcohol licence until 22:00,Alcohol licence until 22:00 (extension available),Alcohol licence until 02:30,Alcohol licence until 02:30 (extension available),Alcohol licence until 21:00,Wi-Fi,Flatscreen TV,Whiteboard,Natural light,Projector,Flipchart,Conference call facilities,Air conditioning,Storage space,Accommodation available,Parking available,Own music allowed,Bring your own DJ,PA system / music speakers available,Lift to all floors,Wheelchair accessible,Promoted / ticketed events,Loud music / events,Wedding licence,Temporary event notices (TENs) available,Free parking is available on-site,Paid parking facilities available nearby,Disabled access toilets
0,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/10000/t...,techspace aldgate east,aldgate east 3.2 (pairing),51.51405,-0.070852,"32-38 Leman Street, London, E1 8EW",Aldgate East Station (200 yd),2,0,4,Venue doesn’t offer catering,External catering allowed,False,2.0,,,,,,,0,0,1,1,0,0,1,1,0,False,True,1.0,1.0,,,,,,,...,,,,,,,,,,,,,,,,,,True,True,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,,,
1,https://www.tagvenue.com/rooms/london/4947/gre...,https://www.tagvenue.com/venues/london/2111/gr...,green rooms hotel,the gallery (3rd floor),51.59705,-0.11091,"13-27 STATION ROAD , WOOD GREEN , LONDON, N22 6UW",Wood Green Station (150 yd),88,120,111,Venue offers catering,External catering not allowed,False,50.0,120.0,88.0,80.0,8.0,40.0,80.0,1,0,1,0,1,1,0,0,1,,,0.0,0.0,True,True,True,True,True,,...,,,,,,,,,,,,,,,,,,True,False,False,True,True,True,False,False,True,True,,True,False,True,,False,True,True,False,False,True,,
2,https://www.tagvenue.com/rooms/london/18440/14...,https://www.tagvenue.com/venues/london/8258/14...,148 leadenhall street,john taylor,51.51347,-0.083224,"Leadenhall Street, 148, London, EC3V 4QT",Bank Station (450 yd),10,0,20,Venue doesn’t offer catering,External catering allowed,False,10.0,,,,,,,0,0,1,1,0,0,1,1,0,False,False,0.0,0.0,,,,,,,...,,,,,,,,,,,,,,,,,,True,True,True,True,False,True,False,True,False,False,False,False,False,True,,False,False,False,False,False,,,
3,https://www.tagvenue.com/rooms/london/419/conw...,https://www.tagvenue.com/venues/london/186/con...,conway hall,brockway room,51.519792,-0.118337,"25 Red Lion Square, London, WC1R 4RL",Holborn Station (350 yd),60,60,81,Venue offers catering,External catering not allowed,False,28.0,60.0,40.0,60.0,30.0,,36.0,0,0,1,0,1,1,0,1,0,,,1.0,1.0,,True,True,True,True,,...,,,,,,,,,,,,,,,,,,True,False,False,True,False,False,False,False,False,False,False,False,False,True,,False,False,False,False,False,,,
4,https://www.tagvenue.com/rooms/london/12387/wh...,https://www.tagvenue.com/venues/london/5716/wh...,whyte & brown,terrace,51.512505,-0.138641,"Whyte & Brown, Unit G2 Kingly Court, London, W...",Oxford Circus Station (400 yd),40,50,45,Venue offers catering,External catering allowed,False,,50.0,40.0,,,,,0,0,1,1,0,1,0,0,1,True,False,1.0,0.0,,False,True,True,True,True,...,,,,,,,,,,,,,,,,,,True,False,True,True,False,True,False,False,False,False,False,True,False,False,,True,False,False,False,True,,,


### Summary Statistics and Save to File
Below we show summary statistics for the data we have scraped and save it to file. 

In [22]:
# Summary statistics for every column of the Space Data dataframe;
# include='all' reports the non-numeric columns as well as the numeric ones.
df_final.describe(include='all')

Unnamed: 0,space_url,venue_url,venue_name,space_name,latitude,longitude,address,nearest_tube_station,max_seated,max_standing,area_in_m2,catering_offered,external_catering_allowed,supervenue,Boardroom_max,Standing_max,Dining_max,Theatre_max,Cabaret_max,U-Shaped_max,Classroom_max,Approved caterers only,BYO alcohol allowed,BYO alcohol not allowed,External catering allowed,External catering not allowed,In-house catering,No in-house catering,Venue doesn’t provide alcohol,Venue provides alcohol,Buyout fee for external catering,Kitchen facilities available for guests,Complimentary water,Complimentary tea and coffee,Alcohol licence until 03:00 (extension available),Halal menu,Kosher menu,Extensive vegan menu,Extensive gluten-free menu,Alcohol licence until 23:00,...,Alcohol licence until 22:30 (extension available),Alcohol licence until 00:30,Alcohol licence until 02:00,Alcohol licence until 23:30 (extension available),Alcohol licence until 05:00 (extension available),Alcohol licence until 23:30,Alcohol licence until 04:00 (extension available),Alcohol licence until 00:30 (extension available),Alcohol licence until 22:30,Alcohol licence until 05:00,Alcohol licence until 01:30,Alcohol licence until 21:00 (extension available),Alcohol licence until 22:00,Alcohol licence until 22:00 (extension available),Alcohol licence until 02:30,Alcohol licence until 02:30 (extension available),Alcohol licence until 21:00,Wi-Fi,Flatscreen TV,Whiteboard,Natural light,Projector,Flipchart,Conference call facilities,Air conditioning,Storage space,Accommodation available,Parking available,Own music allowed,Bring your own DJ,PA system / music speakers available,Lift to all floors,Wheelchair accessible,Promoted / ticketed events,Loud music / events,Wedding licence,Temporary event notices (TENs) available,Free parking is available on-site,Paid parking facilities available nearby,Disabled access toilets
count,4501,4501,4501,4501,4501.0,4501.0,4501,4144,4501.0,4501.0,4501.0,4501,4501,4501,2240.0,3355.0,2952.0,1954.0,1376.0,1190.0,1338.0,4501.0,4501.0,4501.0,4501.0,4501.0,4501.0,4501.0,4501.0,4501.0,1237,1237,4501.0,4501.0,139,3807,3807,3807,3807,501,...,1,52,130,67,35,73,22,41,17,2,13,12,19,15,3,3,3,4501,4501,4501,4501,4501,4501,4501,4501,4501,4501,2788,4501,4501,4501,650,4501,4501,4501,4501,4501,913,700,509
unique,4501,1520,1517,3075,,,1504,831,,,,2,2,2,,,,,,,,,,,,,,,,,2,2,,,1,2,2,2,2,1,...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,1,2,2,2,1,2,2,2,2,2,1,1,1
top,https://www.tagvenue.com/rooms/london/22452/te...,https://www.tagvenue.com/venues/london/4657/ra...,radisson blu edwardian heathrow,whole venue,,,"Bath Road, 140, London, UB3 5AW",Oxford Circus Station (400 yd),,,,Venue offers catering,External catering not allowed,False,,,,,,,,,,,,,,,,,False,False,,,True,True,False,True,True,True,...,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True,False,False,False,True,False,False,False,True,False,True,True,True,False,False,False,False,True,True,True
freq,1,38,38,308,,,38,50,,,,3807,3264,4234,,,,,,,,,,,,,,,,,816,798,,,139,1919,2538,1988,1985,501,...,1,52,130,67,35,73,22,41,17,2,13,12,19,15,3,3,3,4196,2258,3278,3192,2453,2450,3341,3205,3429,3601,2788,2570,3957,2719,650,2546,2745,2866,3740,3089,913,700,509
mean,,,,,51.510236,-0.124579,,,87.098423,124.127749,215.842257,,,,24.525,166.527273,96.065718,118.969806,77.758721,35.047059,59.41704,0.21884,0.253721,0.746279,0.274828,0.725172,0.778938,0.221062,0.281049,0.718951,,,0.580538,0.182626,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,0.030372,0.075606,,,354.307044,425.771496,1132.187006,,,,27.504245,485.962022,401.185838,253.423322,134.129842,80.465777,95.996818,0.413506,0.435188,0.435188,0.446477,0.446477,0.415008,0.415008,0.449561,0.449561,,,0.493526,0.386403,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,51.329423,-0.443729,,,0.0,0.0,0.0,,,,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,51.501706,-0.146393,,,14.0,0.0,30.0,,,,11.0,40.0,24.0,35.0,28.0,18.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,0.0,0.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,51.513283,-0.123736,,,40.0,50.0,70.0,,,,20.0,80.0,50.0,60.0,45.0,26.0,33.5,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,,,1.0,0.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,51.521982,-0.085345,,,90.0,130.0,160.0,,,,30.0,180.0,100.0,120.0,80.0,40.0,60.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,,,1.0,0.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [23]:
# Summary statistics for every column of the space prices dataframe;
# include='all' reports the non-numeric columns as well as the numeric ones.
df_prices.describe(include='all')

Unnamed: 0,space_url,venue_url,venue_name,space_name,latitude,longitude,day_of_week,pricing_period,time_period,price,price_type
count,53494,53494,53494,53494,53494.0,53494.0,53494,53494,50346,50346,50346
unique,4501,1520,1517,3075,,,7,7,465,1008,6
top,https://www.tagvenue.com/rooms/london/12696/fl...,https://www.tagvenue.com/venues/london/4657/ra...,radisson blu edwardian heathrow,whole venue,,,Wednesday,Per day,9:00 – 17:00,£500,hire fee
freq,56,707,707,2880,,,8156,16797,5516,2333,21686
mean,,,,,51.510616,-0.125695,,,,,
std,,,,,0.028595,0.074932,,,,,
min,,,,,51.329423,-0.443729,,,,,
25%,,,,,51.503571,-0.14541,,,,,
50%,,,,,51.513309,-0.126191,,,,,
75%,,,,,51.521936,-0.087097,,,,,


In [24]:
# Stamp both output files with today's date (day-month abbreviation-year)
todays_date = datetime.today().strftime('%d-%b-%y')

# Write each dataframe to its own dated csv, leaving out the index column
for frame, file_stem in ((df_final, 'tag_venue_space_data_'),
                         (df_prices, 'tag_venue_space_prices_')):
    frame.to_csv(file_stem + todays_date + '.csv', index=False)

In [None]:
# Shut down the browser session held by `driver` (presumably the selenium
# Chrome WebDriver created earlier in the notebook - confirm).
driver.quit()

### Investigating Extraction Errors
The cells below show how to use the `extraction_error_log` list to re-create errors that occurred during extraction, to aid with debugging.

In [None]:
# Extracts error message from first error.
# Position 1 of a log entry is shown via repr; judging from how the entries
# are used in these debugging cells, it appears to hold the raised error -
# confirm against the code that populates extraction_error_log.
Error_number = 0
repr(extraction_error_log[Error_number][1])

In [None]:
# Runs extraction on erroneous html, returning full original error.
# Position 2 of the log entry is passed as the html argument; the literal
# 'url' is passed in place of a real space url (presumably a placeholder,
# since only the html matters for reproducing the error - confirm).
Error_number = 0
extract_from_html('url', extraction_error_log[Error_number][2])

In [None]:
# Displays the erroneous space web page html as html
# HTML(space_webpages[extraction_error_log[Error_number][0]])

### Debugging Tools
The tools below can be ignored; they are only useful when debugging the data extraction. 

In [None]:
# Peek at the first ten keys of space_webpages
webpage_keys = list(space_webpages.keys())
webpage_keys[:10]

In [None]:
#with open('website.html', 'a') as the_file:
#    the_file.write(space_webpages['https://www.tagvenue.com/rooms/london/4472/bma-house/aldrich-blake'])

In [None]:
#driver.get('https://www.tagvenue.com/rooms/london/4472/bma-house/aldrich-blake')
#click_read_all()
#with open('website.html', 'a') as the_file:
#    the_file.write(driver.page_source)

In [None]:
#HTML(space_webpages['https://www.tagvenue.com/rooms/london/4472/bma-house/aldrich-blake'])