In [33]:
import pandas as pd
from datetime import datetime
from selenium import webdriver
import chromedriver_autoinstaller
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, InvalidArgumentException, StaleElementReferenceException
import re
from IPython.display import clear_output

# Scrape Email Addresses From Websites
The aim of this notebook is to scrape email addresses from websites. It takes a list of website urls, loads each website url in a browser and will attempt to find and scrape any email addresses present within the html of the page. Most websites have some variation on the 'contact us' page that contains contact details including email addresses. To try and find this, it will look for any html links on the webpage that contain the word 'contact'. It will open the pages they link to and will scrape any email addresses found on these pages. 

## Load Website URLs
We load the website urls from a csv file. The csv must have the following format: 
- Be named **'websites.csv'**
- Have a column called **'website'** that contains a single website url per row. This define the websites that will be scraped for email addresses.
- It can have other columns e.g. below we have 'venue_name' and 'fsq_id' which is metadata for each website url. 

In [2]:
df = pd.read_csv('websites.csv')
df.head()

Unnamed: 0,fsq_id,venue_name,website
0,4ae47fb4f964a520049b21e3,The Mudlark,http://www.nicholsonspubs.co.uk/restaurants/lo...
1,54e0c3e7498e1f5a17d47621,London Grind,http://grind.co.uk/londongrind
2,4c07d1fa3cbed13a820c0cc0,The Refectory,http://www.digbytrout.co.uk
3,4be4730b477d9c743091e62d,PizzaExpress,http://www.pizzaexpress.com/
4,5d70cdc529ed6f0008fd9dc0,The Secret Boxing Gym,http://www.thesecret.london


## Define Scraping Functions
The main email scraping function is called **get_email**. It requires a website url and regex pattern as arguments. It performs the following: 
- Loads the url into Chrome (using chromedriver / selenium) using the **load_url** function. This will handle and log any errors that occur during the load. 
- Finds any emails on the webpage using the **find_emails** function. This uses the regex pattern to identify email addresses in the html. 
- Finds any html links (i.e. hyperlinks) that contain the word 'contact'. Most websites have some variation on the 'contact us' page and this is our attempt to navigate to it.
- Extracts the urls contained in these html links and uses the **get_emails_from_contact_us** function to open each one and scrape any emails found in them. 

In [3]:
# Regex pattern to identify email addresses. Essentially accepts anything 
# of the form joebloggs@domain.something
pattern = re.compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

def get_email(website, pattern):
    """Loads website url in chrome and scrapes email addresses. 
    
    Args: 
        website (string): url of website you wish to scrape
        pattern (regex): regex pattern to identify email addresses in html
        
    returns: 
        (list): A list of email address strings found."""
    
    
    # Load webpage
    if load_url(website, 'Homepage'):
        # if webpage fails to load (load_url returns True), ends function
        return []
    
    # Scrape emails from current webpage
    emails = find_emails(website, 'Homepage')
    
    # xpath that performs case insensitive search for html links 
    # that contain the word 'contact' 
    xpath_to_find_contact_links = "//a[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'contact')]"
    # Find elements in current page that are contact links
    elements = driver.find_elements_by_xpath(xpath_to_find_contact_links)
    
    # Extract url from each html link element
    try:
        urls = [element.get_attribute('href') for element in elements]
    except StaleElementReferenceException:
        # If Stale Element Error, re-load page and try again
        if load_url(website, 'Homepage'):
            # If page reload fails (load_url returns True), make emails found
            # lower case
            emails = [email.lower() for email in emails]
            # Remove duplicates on emails
            emails = list(set(emails))
            # Return emails and end function
            return emails
        # Find elements in current page that are contact links
        elements = driver.find_elements_by_xpath(xpath_to_find_contact_links)
        try:
            # Extract url from each html link element
            urls = [element.get_attribute('href') for element in elements]
        except StaleElementReferenceException:
            # If Stale Element Error Re-Occurs, log error
            errors.append([website, 'Contact_us', 
                           'Stale Element Exception (could not unpack urls from elements)'])
            # make emails found lower case
            emails = [email.lower() for email in emails]
            # Remove duplicates on emails
            emails = list(set(emails))
            # return emails and end function
            return emails
    
    # remove duplicates on urls
    urls = list(set(urls))
    # Scrape emails from each url and add to emails list
    emails.extend(get_emails_from_contact_us(urls))
    # Convert all emails to lowercase
    emails = [email.lower() for email in emails]
    # Remove duplicates on emails
    emails = list(set(emails))
    return emails

def load_url(website, scrape_stage):
    """Load website url into chrome and log error if page doesnt load.
    
    Args: 
        website (string): The full website url you wish to load into chrome.
        scrape_stage (string): A description used in the error log (if an 
            error occurs when loading the page) that indicates during which 
            stage of the scrape the error occured. This is typically either 
            'Homepage' for the initial website load and 'Contact_us' when 
            loading links that say 'contact'. 
            
    Returns: 
       (bool): False if website loaded, True if website failed to load."""
    try: 
        # Load webpage
        driver.get(website)
        return False
    except InvalidArgumentException:
        # Log Invalid Argument error in error log
        errors.append([website, scrape_stage, 'invalid_website_url'])
        return True
    except TimeoutException:
        # Log Timeout error in error log
        errors.append([website, scrape_stage, 'website_timeout'])
        return True
    except Exception as e:
        # Catch any other exception and log 
        error_message = getattr(e, 'message', repr(e))
        errors.append([website, scrape_stage, error_message])
        return True

def find_emails(website, scrape_stage):
    """Find email addresses in currently opened webpage. 
    
    Args: 
        website (string): url of website currently open in chromedriver. This is 
            used solely for error loggin if there is an error during email 
            scraping. 
        scrape_stage (string): A description used in the error log (if an 
            error occurs when scraping emails) that indicates during which 
            stage of the scrape the error occured. This is typically either 
            'Homepage' for the initial website load and 'Contact_us' when 
            loading links that say 'contact'.
    
    Returns: 
        (list): List of email addresses found on the currently open webpage."""
    try:
        # Extract html of webpage
        html = driver.page_source
    except Exception as e:
        # Catch any exception and log 
        error_message = getattr(e, 'message', repr(e))
        errors.append([website, scrape_stage + ' - while extracting html', error_message])
        # End function (return empty list)
        return []
    
    # Extract emails from html
    emails = re.findall(pattern, html)
    # Return emails
    return emails

def get_emails_from_contact_us(urls):
    """Load each url provided and return all emails found on these web pages.
    
    Args: 
        urls (list): The urls you wish to load and scrape for email addresses.
        
    Returns: 
        (list): emails found during scraping.
        
    """
    more_emails = []
    # Loop through urls
    for url in urls: 
        # Open new tab
        driver.execute_script("window.open('');")
        # Switch to new tab (tab index starts at 0)
        driver.switch_to.window(driver.window_handles[1])
        # Load url in new tab
        if not load_url(url, 'Contact_us'):
            # If load successful (returns False) then find emails on the webpage
            more_emails.extend(find_emails(url, 'Contact_us'))
        # Close new tab
        driver.close()
        # Switch back to original tab
        driver.switch_to.window(driver.window_handles[0])
    return more_emails

## Perform Scrape
Below we perform the email address scrape by initiating the browser via chromedriver / selenium and looping through the email address column of the dataframe and scraping each webpage. 

In [4]:
# Check if latest version of chromedriver installed, if not this will install it. 
chromedriver_autoinstaller.install()

# Set chromedriver to headless mode i.e. can't see the browser window
chrome_options = Options()
chrome_options.add_argument("--headless")
# Initiate chromedriver
driver = webdriver.Chrome(options=chrome_options)

#driver = webdriver.Chrome()
# By default, selenium tries loading page forever. This sets the seconds
# for selenium to wait till a time out error is triggered. 
driver.set_page_load_timeout(30)

email = []
# Error log to record any errors during scraping
errors = []
total_websites = df.shape[0]

# Loop through website urls and scrape email addresses from each website. 
for number, website in enumerate(df.website):
    email.append(get_email(website, pattern))
    # Print progress update after every 250 websites scraped
    if (number + 1) % 50 == 0:
        clear_output(wait = True)
        print(f"Completed {number + 1} out of {total_websites}")
    
df['email'] = email
driver.quit()

Completed 2950 out of 2970


## Display Error Log
Below we display the error log and have made the website urls clickable so you can easily open them from your browser to check if they are broken. Note that errors with a Scrape Stage label of 'Homepage' are errors that occurred when loading the provided url. Errors with a Scrape Stage label of 'Contact_us' are errors that occurred when loading html links that contained the word 'contact'.     

In [11]:
# Build error log df
error_df = pd.DataFrame(errors, columns = ['url', 'Scrape Stage','error desc'])

def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val,val)

# Display error log with clickable website url column
error_df.style.format({'url' : make_clickable})

Unnamed: 0,url,Scrape Stage,error desc
0,http://counsellingincentrallondon.co.uk,Homepage,"WebDriverException('unknown error: net::ERR_NAME_NOT_RESOLVED\n (Session info: headless chrome=108.0.5359.124)', None, None)"
1,javascript:app.openGeneralEnquiry();,Contact_us,"WebDriverException('unknown error: unsupported protocol\n (Session info: headless chrome=108.0.5359.124)', None, None)"
2,http://www.cuisson.co.uk/section/140/1/popdown,Homepage,"WebDriverException('unknown error: net::ERR_CONNECTION_REFUSED\n (Session info: headless chrome=108.0.5359.124)', None, None)"
3,http://www.borobistro.co.uk,Homepage,website_timeout
4,http://www.london-translations.co.uk,Homepage,"WebDriverException('unknown error: net::ERR_NAME_NOT_RESOLVED\n (Session info: headless chrome=108.0.5359.124)', None, None)"
5,http://www.costaireland.ie,Homepage,website_timeout
6,http://www.capricciforlondon.co.uk,Homepage,website_timeout
7,http://www.removalsbermondsey.com,Homepage,"WebDriverException('unknown error: net::ERR_NAME_NOT_RESOLVED\n (Session info: headless chrome=108.0.5359.124)', None, None)"
8,http://www.telstraglobal.com,Homepage,website_timeout
9,http://www.ipcmedia.com,Homepage,website_timeout


Below we summarise the error log with a value count: 

In [32]:
# Shorten string length of error messages to make value counts readable
error_df['error desc short'] = error_df['error desc'].str[0:60]
error_df[['Scrape Stage','error desc short']].value_counts().sort_index()

Scrape Stage                        error desc short                                            
Contact_us                          Stale Element Exception (could not unpack urls from elements      1
                                    WebDriverException('unknown error: net::ERR_NAME_NOT_RESOLVE      1
                                    WebDriverException('unknown error: unsupported protocol\n  (     11
                                    invalid_website_url                                              24
Contact_us - while extracting html  UnexpectedAlertPresentException('unexpected alert open: {Ale      1
Homepage                            WebDriverException('unknown error: net::ERR_ADDRESS_UNREACHA      4
                                    WebDriverException('unknown error: net::ERR_CONNECTION_CLOSE      1
                                    WebDriverException('unknown error: net::ERR_CONNECTION_REFUS     10
                                    WebDriverException('unknown error: 

**Error Findings:**

During the 'Contact Us' Scraping stage (scraping from html links containing the word 'contact') we see the following: 
- 24 Invalid_website_url errors caused by the url in the html link being invalid (its often a string saying 'None' or javascript code). 
- A Stale Element Exception and Unexpected Alert Exception, both caused by website specific issues i.e. issues with the websites themselves. 
- The remaining errors were from webpages that cannot load and presumably no longer exist.
- There were no time out errors during the contact us phase. 

During the 'Homepage' Scraping Stage (when initially loading the provided url) we see the following:
- There were 165 website timeouts. We loaded a handful of these urls after the scrape and they loaded fine. It is probably worth re-running the webscrape for these urls alone to see if they can be scraped successfully. 
- All remaining errors were likely due to the websites no longer existing. We loaded a sample of websites within each error category. Each time, the page was unable to load and it looked like the website no longer existed. 

## Cleaning Email URLs
The regex we used to identify email addresses in the html of the websites was not perfect and would return strings which were not valid websites. These tended to be image files such as jpgs or pngs, placeholder email adresses (joebloggs\@email.com) or other elements of html which contained an @. Some examples are shown below: 

- '8c4075d5481d476e945486754f783364\@sentry.io'
- 'fancybox_sprite\@2x.png'
- 'address\@email.com'

Below we filter out these non-email addresses: 

In [8]:
def filter_emails(emails):
    """Filter non-email addresses from list of email addresses. 
    
    Uses the check() function to test whether an email address 
    string is indeed an email"""
    emails_filtered = [email for email in emails if check(email)]
    return ';'.join(emails_filtered)

def check(email):
    """Returns True if email is an email address, otherwise returns false."""
    
    # checks email address doesnt contain .jpg, .png or .gif
    email_not_jpg = '.jpg' not in email
    email_not_png = '.png' not in email
    email_not_gif = '.gif' not in email
    placeholder_domains = ['@email.com', '@example.com', '@domain.com']
    # Checks that no placeholder_domains are in email address
    email_not_placeholder = not any(placeholder_domain in email 
                                    for placeholder_domain in placeholder_domains)
    
    email_not_too_long = len(email) <= 40
    number_digits_in_email = sum(char.isdigit() for char in email)
    email_not_too_many_digits = number_digits_in_email <= 10
    
    # Returns True if ALL conditions above are True, otherwise returns False
    return all([email_not_jpg, email_not_png, email_not_gif, email_not_placeholder,
                email_not_too_long, email_not_too_many_digits])
    
# Create cleaned_emails column
df['cleaned_emails'] = [filter_emails(emails) for emails in df.email]

In [9]:
df

Unnamed: 0,fsq_id,venue_name,website,email,cleaned_emails
0,4ae47fb4f964a520049b21e3,The Mudlark,http://www.nicholsonspubs.co.uk/restaurants/lo...,[],
1,54e0c3e7498e1f5a17d47621,London Grind,http://grind.co.uk/londongrind,"[grind@uktc.fospha.com, londonbridge@grind.co....",grind@uktc.fospha.com;londonbridge@grind.co.uk...
2,4c07d1fa3cbed13a820c0cc0,The Refectory,http://www.digbytrout.co.uk,[],
3,4be4730b477d9c743091e62d,PizzaExpress,http://www.pizzaexpress.com/,[],
4,5d70cdc529ed6f0008fd9dc0,The Secret Boxing Gym,http://www.thesecret.london,[contact@thesecret.london],contact@thesecret.london
...,...,...,...,...,...
2965,4e4cfe75d164e69742d20ebf,Jamie's,http://www.jamiesbars.co.uk,[],
2966,568bc92b26db4f48ad33323b,Justmatz UK,http://www.justmatz.co.uk,[],
2967,e8f8bf48f8b6461741842ea2,Henry Ives Personal Training,https://www.hplusperformance.co.uk,[henry@hplusperformance.co.uk],henry@hplusperformance.co.uk
2968,5b53530b8a144c256bbd9d92,E B 7,http://www.eb7.co.uk,[info@eb7.co.uk],info@eb7.co.uk


## Save to CSV
Save data to csv with todays date in the name. 

In [10]:
todays_date = datetime.today().strftime('%d-%b-%y')
df.to_csv('scraped_emails' + todays_date + '.csv',
                       index = False)

## Debugging
The below code cells are useful for debugging (if the code encounters an error and stops during the email scrape). It has taken the basic elements of the scraping code and runs them in separate cells.  

**Load webpage and Scrape any emails present in html**

In [16]:
driver = webdriver.Chrome()

driver.get('https://www.graingerplc.co.uk/')
find_emails('blag','blag')

[]

**Find any Contact Us Clickable Links in page**

In [17]:
elements = driver.find_elements_by_xpath("//a[contains(translate(text(),'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'contact')]")
elements

[<selenium.webdriver.remote.webelement.WebElement (session="f0757077245fcc9f43258d2ff5951062", element="70648369-cb27-4c9d-af1d-f585a7adba58")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f0757077245fcc9f43258d2ff5951062", element="236036f4-9b6c-4d3e-a9c7-f8ff7489d474")>]

**Extract URLs from HTML Elements**

In [18]:
urls = [element.get_attribute('href') for element in elements]

**Loop through URLS and Open each in new Tab and find emails**

In [20]:
for url in urls: 
    # Open new tab
    driver.execute_script("window.open('');")
    # Switch to new tab (tab index starts at 0)
    driver.switch_to.window(driver.window_handles[1])
    # Load new url with driver (on new tab)
    driver.get(url)
    print(find_emails('blag','blag'))
    # Close new tab
    driver.close()
    # Switch back to original tab
    driver.switch_to.window(driver.window_handles[0])

['lettings@graingerplc.co.uk', 'lettings@graingerplc.co.uk', 'info@graingerplc.co.uk', 'kmueller@graingerplc.co.uk', 'kmueller@graingerplc.co.uk', 'grainger@camarco.co.uk', 'grainger@camarco.co.uk', 'kmueller@graingerplc.co.uk', 'kmueller@graingerplc.co.uk', 'chopkinson@graingerplc.co.uk', 'chopkinson@graingerplc.co.uk', 'amcghin@graingerplc.co.uk', 'amcghin@graingerplc.co.uk', 'mailtoJBlackledge@graingerplc.co.uk', 'JBlackledge@graingerplc.co.uk', 'help@graingerplc.co.uk', 'help@graingerplc.co.uk']
['lettings@graingerplc.co.uk', 'lettings@graingerplc.co.uk', 'info@graingerplc.co.uk', 'kmueller@graingerplc.co.uk', 'kmueller@graingerplc.co.uk', 'grainger@camarco.co.uk', 'grainger@camarco.co.uk', 'kmueller@graingerplc.co.uk', 'kmueller@graingerplc.co.uk', 'chopkinson@graingerplc.co.uk', 'chopkinson@graingerplc.co.uk', 'amcghin@graingerplc.co.uk', 'amcghin@graingerplc.co.uk', 'mailtoJBlackledge@graingerplc.co.uk', 'JBlackledge@graingerplc.co.uk', 'help@graingerplc.co.uk', 'help@graingerpl

## Appendix - Simple Scraping Function
The following function was an older approach to scraping that only scraped the url site provided, it did not search for links to contacts pages and look for emails there as well. 

In [54]:
# Regex pattern to identify email addresses. Essentially accepts anything 
# of the form joebloggs@domain.something
pattern = re.compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

def get_email(website, pattern):
    """Loads website url in chrome and scrapes email addresses
    
    Args: 
        website (string): url of website you wish to scrape
        pattern (regex): regex pattern to identify email addresses in html
        
    returns: 
        (list): A list of email address strings. If there was an error,
        returns a list containing the error description as a string."""
    
    try: 
        # Load webpage
        driver.get(website)
        # Extract html of webpage
        html = driver.page_source
        # Extract emails from html
        emails = re.findall(pattern, html)
        # Convert all emails to lowercase
        emails = [email.lower() for email in emails]
        # Remove duplicates on emails
        emails = list(set(emails))
        # Return emails
        return emails
    except InvalidArgumentException:
        # Log Invalid Argument error in error log
        errors.append([website, 'invalid_website_url'])
        return ['invalid_website_url']
    except TimeoutException:
        # Log Timeout error in error log
        errors.append([website, 'website_timeout'])
        return ['website_timeout']
    except Exception as e:
        # Catch any other exception and log 
        error_message = getattr(e, 'message', repr(e))
        errors.append([website, error_message])
        return [error_message]