In [19]:
from bs4 import BeautifulSoup as bs
import requests
import re

def get_main_url(url):
    pattern = r"(?:https\:\/\/)?(?:www\.)?([\w\-]+\.\w+)"
    match = re.search(pattern, url)
    if match is not None:
        return match.group(1)
    else:
        return None

class EmailScrapper:
    def __init__(self, original_url):
        self.original_url = original_url
        self.main_url = get_main_url(original_url)
        self.visited_urls = set()
        self.all_urls = set()
        self.all_emails = set()
        self.max_depth = 20

    def get_all_links(self, soup):
        try:
            for tag in soup.find_all():
                if tag.has_attr("href"):
                    url = tag.get("href")
                    if (get_main_url(url)==self.main_url) and (url not in self.visited_urls):
                        self.all_urls.add(url)
        except Exception as ex:
            print(ex)
    
    def get_all_emails(self, soup):
        pattern = r"([\w\-\.]+\@(?:[\w\-]+\.)+[\w\-]{2,4})"
        content = str(soup.body)
        for email in re.findall(pattern, content):
            self.all_emails.add(email)
    
    def get_content(self, url):
        try:
            request = requests.get(url)
            soup = bs(request.text, features="html.parser")
            return soup
        except Exception as ex:
            print(ex)

    def run(self):
        soup = self.get_content(self.original_url)
        self.get_all_emails(soup)
        self.get_all_links(soup)
        i = 0
        while (len(self.all_urls) > 0) and (i < self.max_depth):
            i += 1
            url = self.all_urls.pop()
            soup = self.get_content(url)
            self.get_all_emails(soup)
            self.get_all_links(soup)            
            self.visited_urls.add(url)

In [20]:
email = EmailScrapper("https://www.12pinguinos.com/newweb/en/home/")

In [21]:
email.run()



In [31]:
len(email.all_urls)

100

In [5]:
def get_main_url(url):
    pattern = r"(?:https\:\/\/)?(?:www\.)?([\w\-]+\.\w+)"
    match = re.search(pattern, url)
    if match is not None:
        return match.group(1)
    else:
        return None

In [6]:
original_url = "https://www.12pinguinos.com/newweb/en/home/"
main_url = get_main_url(original_url)
visited_urls = set()
all_urls = set()
all_emails = set()
max_depth = 20

In [13]:
def get_all_links(soup):
    try:
        for h in soup.find_all('a'):
            for url in h.get('href'):
                if (get_main_url(url) in main_url) and (url not in visited_urls):
                    all_urls.add(url)
    except Exception as ex:
        print(ex)
    
def get_all_emails(soup):
    pattern = r"([\w\-\.]+\@(?:[\w\-]+\.)+[\w\-]{2,4})"
    content = str(soup.body)
    for email in re.findall(pattern, content):
        all_emails.add(email)

In [14]:
request = requests.get(original_url)
soup = bs(request.text, features="html.parser")
get_all_emails(soup)
get_all_links(soup)
i = 0

'in <string>' requires string as left operand, not NoneType


In [30]:
for tag in soup.find_all():
    if tag.has_attr("href"):
        url = tag.get("href")
        print(url, type(url))
        if (get_main_url(url)==main_url) and (url not in visited_urls):
            all_urls.add(url)

https://gmpg.org/xfn/11 <class 'str'>
//s.w.org <class 'str'>
https://www.12pinguinos.com/newweb/en/feed/ <class 'str'>
https://www.12pinguinos.com/newweb/en/comments/feed/ <class 'str'>
https://www.12pinguinos.com/newweb/wp-content/plugins/column-shortcodes//assets/css/shortcodes.css?ver=1.0.1 <class 'str'>
https://www.12pinguinos.com/newweb/wp-content/themes/twentyseventeen/assets/fonts/font-libre-franklin.css?ver=20230328 <class 'str'>
https://www.12pinguinos.com/newweb/wp-content/themes/twentyseventeen/style.css?ver=20240116 <class 'str'>
https://www.12pinguinos.com/newweb/wp-content/themes/twentyseventeen/assets/css/blocks.css?ver=20220912 <class 'str'>
https://www.12pinguinos.com/newweb/wp-content/plugins/youtube-embed-plus/styles/ytprefs.min.css?ver=14.2.1 <class 'str'>
https://www.12pinguinos.com/newweb/wp-json/ <class 'str'>
https://www.12pinguinos.com/newweb/xmlrpc.php?rsd <class 'str'>
https://www.12pinguinos.com/newweb/wp-includes/wlwmanifest.xml <class 'str'>
https://www.1

In [63]:
a = {1,2,3,4,5}

{2, 3, 4, 5}


In [77]:
a.pop()
print(a)

KeyError: 'pop from an empty set'

In [17]:
all_emails

{'12pinguinos@12pinguinos.com'}

In [None]:
while (len(all_urls)>0) or (i < max_depth):
    i += 1
    