In [15]:
import re
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
from bs4 import BeautifulSoup

def websiteScrapper(url_of_interest, depth_from_site):
    # starting url.
    starting_url = url_of_interest
    if starting_url.endswith('/'):
        starting_url = starting_url[:-1]

    starting_base_url = "{0.scheme}://{0.netloc}".format(urlsplit(starting_url))

    unprocessed_urls = deque([starting_url])
    processed_urls = set()

    emails = set()
    email_dict = dict()

    counter = 0
    depth = depth_from_site

    # process urls one by one from unprocessed_url queue until queue is empty
    while len(unprocessed_urls) and counter < depth:

        # move next url from the queue to the set of processed urls
        url = unprocessed_urls.popleft()
        processed_urls.add(url)

        # extract base url to resolve relative links
        parts = urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        if base_url[-1] == '.':
            base_url = base_url[:-1]
        path = url[:url.rfind('/')+1] if '/' in parts.path else url

        counter = checkBase(base_url, starting_base_url, counter)

        # get url's content
        print("Crawling URL %s" % url)
        try:
            response = requests.get(url)
        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
            print('error accessing page')
            continue
        except:
            print('final error')
            continue

        # get email addresses
        new_emails = set(re.findall(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", response.text, re.I))
        emails.update(new_emails)
        for e in new_emails:
            if '.jpg' not in e or '.png' not in e:
                if e not in email_dict:
                    email_dict[e] = set()
                email_dict[e].add(url)

        # create a beutiful soup for the html document
        soup = BeautifulSoup(response.text, 'lxml')

        # Once this document is parsed and processed, now find and process all the anchors i.e. linked urls in this document
        for anchor in soup.find_all("a"):
            # extract link url from the anchor
            link = anchor.attrs["href"] if "href" in anchor.attrs else ''
            # resolve relative links (starting with /)
            if link.startswith('/'):
                link = base_url + link
            elif not link.startswith('http'):
                link = path + link
            # add the new url to the queue if it was not in unprocessed list nor in processed list yet
            if not link in unprocessed_urls and not link in processed_urls and 'twitter' not in link and 'facebook' not in link and '.pdf' not in link:
                unprocessed_urls.append(link)
    print('Url crawled')
    return email_dict

def checkBase(base_url, starting_base_url, counter):
    mod_base_url = base_url
    mod_starting_base_url = starting_base_url
    if 'http:' in base_url:
        mod_base_url = re.sub('http://','',url)
    if 'https:' in base_url:
        mod_base_url = re.sub('https://','',url)
    if 'http:' in starting_base_url:
        mod_starting_base_url = re.sub('http://','',url)
    if 'https:' in starting_base_url:
        mod_starting_base_url = re.sub('https://','',url)
    if mod_base_url != mod_starting_base_url:
        counter += 1
    else:
        counter = 0
    return counter

In [16]:
url_list = ['http://feve.org']
depth_from_website = 20
collected_emails = dict()
for url in url_list:
    email_dictionary = websiteScrapper(url, depth_from_website)
    collected_emails.update(email_dictionary)
print('All Done!')

Crawling URL http://feve.org
Crawling URL https://feve.org/about-glass/
Crawling URL https://feve.org/about-glass/european-glass-container-industry/
Crawling URL https://feve.org/about-glass/facts-product-details/
Crawling URL https://feve.org/about-glass/visions/
Crawling URL https://feve.org/about-glass/statistics/
Crawling URL https://feve.org/glass-news/
Crawling URL https://feve.org/category/feve-press-office/
Crawling URL https://feve.org/category/industry-positions/
Crawling URL https://feve.org/category/industry-news/
Crawling URL https://feve.org/category/events/
Crawling URL https://feve.org/glass-news/media-partnerships/
Crawling URL https://feve.org/about-feve/
Crawling URL https://feve.org/about-feve/who-is-feve/
Crawling URL https://feve.org/about-feve/feve-team/
Crawling URL https://feve.org/about-feve/feve-jobs/
Crawling URL https://feve.org/about-feve/feve-members/
Crawling URL https://feve.org/about-feve/partners/
Crawling URL https://feve.org/about-feve/library/
Craw

In [13]:
import pandas as pd
email_dataframe = pd.DataFrame.from_dict(collected_emails, orient='index')

In [14]:
email_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
secretariat@feve.org,https://feve.org/about-glass/facts-product-det...,https://feve.org/category/feve-press-office/,https://feve.org/about-feve/feve-publications/,https://feve.org/about-glass/visions/,https://feve.org/about-feve/,https://feve.org/,https://feve.org/about-glass/european-glass-co...,https://feve.org/feve-plants/,https://feve.org/category/industry-positions/,https://feve.org/production-2017/,...,https://feve.org/category/industry-news/,https://feve.org/about-glass/statistics/,https://feve.org/category/events/,https://feve.org/about-glass/visions/health/,http://feve.org,https://feve.org/feve-voicing-glass-flaconnage...,https://feve.org/the-gob/,https://feve.org/about-feve/who-is-feve/,https://feve.org/about-feve/feve-jobs/,https://feve.org/new-life-cycle-assessment-pro...
ts@packagingeurope.com,https://feve.org/glass-news/media-partnerships/,,,,,,,,,,...,,,,,,,,,,
a.farrelly@feve.org,https://feve.org/about-feve/feve-team/,,,,,,,,,,...,,,,,,,,,,
l.vanderbrugge@feve.org,https://feve.org/about-feve/feve-team/,,,,,,,,,,...,,,,,,,,,,
m.delleselve@feve.org,https://feve.org/about-feve/feve-team/,,,,,,,,,,...,,,,,,,,,,
f.rivet@feve.org,https://feve.org/about-feve/feve-team/,,,,,,,,,,...,,,,,,,,,,
jp.judson@feve.org,https://feve.org/about-feve/feve-team/,,,,,,,,,,...,,,,,,,,,,
info@saverglass.com,https://feve.org/about-feve/feve-members/,,,,,,,,,,...,,,,,,,,,,
info.kipfenberg@sgdgroup.com,https://feve.org/about-feve/feve-members/,,,,,,,,,,...,,,,,,,,,,
bavidro@baglass.com,https://feve.org/about-feve/feve-members/,,,,,,,,,,...,,,,,,,,,,


In [23]:
email_dataframe.to_csv('email_addresses.csv', sep=",")