# Web scraping for an NLP task

The goal of this notebook is to build a tool that can scrape text from a given list of websites, in order to use it later for clustering the sites. 

The task indicates that we should get text from the landing page, as well as text from the links contained in the landing page. 

Since many requests will be necessary, some mechanism (rotating user agents, proxies, etc.) has to be put in place in order to avoid being blocked.

As each page contains many links, parallel processing can be implemented in order to speed up the scraping.

The final product should be able to take a list of websites and build text files with the contents of each site. 
Additional parameters could be included for managing, for instance, the parallel processing, or some further filtering of the contents.

## Scraping from one site

Let's use one of the given URLs to get an idea of the kind of websites we have. 

For example, this British pipe supplier: http://www.besseges-vtf.co.uk/

In [None]:
import os
import random
import time
from multiprocessing import Pool
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

In [None]:
def get_header():
    """
    Return a random browser-like header dictionary to be passed to requests.

    The header sets were captured by visiting httpbin.org with different
    browsers. The "Host" entry echoed back by httpbin is deliberately NOT
    included: it named httpbin.org, and sending it to any other site would
    address the request to the wrong virtual host.

    Returns:
        dict mapping header names to values, chosen uniformly at random
        among the five profiles defined below.
    """

    # Headers for user agent rotation.
    # Full headers obtained from httpbin.org

    # Firefox 84 Ubuntu
    h1 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "TE": "Trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0",
    }

    # Firefox 84 Windows 10
    h2 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en;q=0.5",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0",
    }

    # Chrome 87 Ubuntu
    h3 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,fr;q=0.8,es;q=0.7",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    }

    # Chrome 87 Windows 10
    h4 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "es-419,es;q=0.9,fr;q=0.8,en;q=0.7",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
    }

    # Microsoft Edge 87 Windows 10
    h5 = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75",
    }

    headers_list = [h1, h2, h3, h4, h5]

    return random.choice(headers_list)

In [None]:
def get_links(soup, tags='a'):
    """ Get all the links for the given tags from a parsed page.

    soup: an html page parsed with beautifulsoup.
    tags: string or list of strings indicating the html tags to search.

    Returns a list with the distinct 'href' values found, in the order in
    which they were first encountered.
    """

    # A bare string must be treated as ONE tag name; iterating over it
    # directly would search for each of its characters instead.
    if isinstance(tags, str):
        tags = [tags]

    links = []  # list to store the links found

    for tag in tags:
        for link in soup.find_all(tag, href=True):
            links.append(link['href'])

    # Avoid repetitions while keeping first-seen order, so that any later
    # truncation of the list is reproducible between runs.
    return list(dict.fromkeys(links))

In [None]:
def filter_links(home, links_list):
    """
    Takes a home address and a list of links, and filters out links to external sites
    and to some common file types.

    home: string. The URL of the home page.
    links_list: list of strings with the links found on the page, as produced by get_links.

    Returns a list of distinct absolute URLs (first-seen order) that belong to
    the home domain or one of its subdomains. The home URL itself, with or
    without a trailing '/', is excluded.
    """

    home_parts = urlparse(home)

    # Domain used to check for external links. Host names are case-insensitive,
    # so both sides of the comparison are lowercased.
    domain = home_parts.netloc.lower()

    # path to include before an internal link. Remove final '/' if present.
    path = home[:-1] if home.endswith('/') else home

    unwanted_starts = ('javascript:', 'mailto:', 'tel:', '#', '..')

    unwanted_endings = ('.pdf', '.jpg', '.jpeg', '.png', '.gif', '.exe', '.js',
                        '.zip', '.tar', '.gz', '.7z', '.rar'
                       )

    kept = {}  # dict used as an insertion-ordered set

    for link in links_list:
        lowered = link.lower()

        if lowered.startswith(unwanted_starts) or lowered.endswith(unwanted_endings):
            continue

        # Protocol-relative URL ('//host/path'): it names a host, so it must be
        # checked like an absolute URL rather than glued to the home path.
        if link.startswith('//'):
            link = (home_parts.scheme or 'http') + ':' + link
            lowered = link.lower()

        if lowered.startswith(('http://', 'https://')):
            # Compare the host of the link, not the whole URL: a substring test
            # would accept external URLs that merely mention the domain
            # (e.g. in a query string).
            host = urlparse(link).netloc.lower()
            if host == domain or host.endswith('.' + domain):
                kept[link] = None
        else:
            # Internal link without the full URL: ensure starting '/' and
            # prepend the home address.
            if not link.startswith('/'):
                link = '/' + link
            kept[path + link] = None

    # remove home url if present.
    kept.pop(path, None)
    kept.pop(path + '/', None)

    return list(kept)
    

In [None]:
def scrape_main(main_url):

    """
    Takes the URL of the main site, scrapes the text and the links.

    main_url: string. url of the desired site.

    Returns a tuple (page_text, page_links): the visible text of the page and
    the list of internal links found in it (as produced by filter_links).

    Raises requests.exceptions.RequestException if the request fails.
    """

    # 'Host' and 'Accept-Encoding' are left to requests: the former must match
    # the site actually being requested, and the latter must only advertise
    # encodings that the local installation is able to decode.
    request_headers = {
        name: value for name, value in get_header().items()
        if name.lower() not in ('host', 'accept-encoding')
    }

    # The headers must be passed through the `headers` keyword; the second
    # positional argument of requests.get is `params` (the query string).
    page = requests.get(main_url, headers=request_headers, timeout=(2, 5))
    soup = BeautifulSoup(page.content, 'html.parser')

    page_text = soup.get_text(separator='\n', strip=True)

    page_links = get_links(soup)
    page_links = filter_links(main_url, page_links)

    return page_text, page_links

In [None]:
def scrape_links(link_url):

    """
    Takes the URL of one of the links, scrapes it and returns the text.

    link_url: string. url of the desired page.

    Returns the visible text of the page as a single string.

    Raises requests.exceptions.RequestException if the request fails.
    """

    # 'Host' and 'Accept-Encoding' are left to requests: the former must match
    # the site actually being requested, and the latter must only advertise
    # encodings that the local installation is able to decode.
    request_headers = {
        name: value for name, value in get_header().items()
        if name.lower() not in ('host', 'accept-encoding')
    }

    # Send the dictionary as HTTP headers; `params` would append it to the
    # URL as a query string instead.
    page = requests.get(link_url, headers=request_headers, timeout=(2, 5))
    soup = BeautifulSoup(page.content, 'html.parser')

    page_text = soup.get_text(separator='\n', strip=True)

    return page_text

In [None]:
def scrape_links_try(mylink_url):
    """
    Wrapper around scrape_links that never raises on request errors, so it can
    be mapped over a list of links in a worker pool.

    mylink_url: string. url of the page to scrape.

    Returns the page text (a string) on success, or a two-element list
    [mylink_url, error_message] if the request failed.
    """

    try:
        scrapping_result = scrape_links(mylink_url)
    except requests.exceptions.RequestException as err:
        # Return the message rather than the exception object: the result has
        # to be pickled to travel back from a worker process, and exception
        # objects may hold references that cannot be pickled.
        scrapping_result = [mylink_url, str(err)]

    return scrapping_result

Trial URLs

    *'http://www.besseges-vtf.co.uk'
    *'http://lumaquin.com'
    *'https://www.degso.com'
    *'http://www.ictsl.net'
    *'https://barrocorestaurante.mx'
    *'https://www.gummigoetz.de'
    *'http://www.suppliersof.com'


In [None]:
# Read the list of sites to scrape: one URL per line.
SITES_LIST = []

with open('./site_lists/01_websites.csv', 'r', newline='') as f:
    for line in f:
        site = line.strip()
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # otherwise they would be requested as empty URLs and reported as
        # failed sites.
        if site:
            SITES_LIST.append(site)

In [None]:
# Scrape every site in SITES_LIST: write the text of the landing page and of
# up to MAX_LINKS internal links to one file per domain, then write a global
# report of successful and failed sites.

MAX_LINKS = 20

CONTENTS_DIR = './site_contents/'
LOG_DIR = './logs/'

# Make sure the output folders exist before anything is written to them.
os.makedirs(CONTENTS_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

failed_requests = []
succes_requests = []

# Defined up front so the final report can be named even if SITES_LIST is empty.
timestr = time.strftime("%Y%m%d_%H%M", time.localtime())

for site in SITES_LIST:

    domain = urlparse(site).netloc

    start_time = time.localtime()
    timestr = time.strftime("%Y%m%d_%H%M", start_time)

    file_name = CONTENTS_DIR + domain

    # get links and text from the main site

    try:
        text, links_list = scrape_main(site)
    except requests.exceptions.RequestException as err:
        failed_requests.append((site, str(err)))
        print(f'Error in site: {site}')
        continue

    succes_requests.append(site)

    # Create file and write text of the main site.
    # Explicit UTF-8: scraped text is not limited to the platform's default
    # encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text)

    print(f'Text from main page {domain} written to {file_name}')

    start_time_p = time.time()
    print('BEGIN PARALLELL LINK SCRAPPING')

    links_list = links_list[:MAX_LINKS]

    # Reset for every site so that results from a previous site can never be
    # reused, and so the name exists even when the pool is not started.
    link_scrap_results = []
    if links_list and __name__ == '__main__':
        with Pool(6) as p:
            link_scrap_results = p.map(scrape_links_try, links_list)

    duration_p = time.time() - start_time_p

    print(f'{len(links_list)} links scrapped in {duration_p:.2f} seconds. ')

    # scrape_links_try returns a string on success and a [url, error] list
    # on failure.
    link_texts = [result for result in link_scrap_results
                  if isinstance(result, str)]
    link_errors = [[url, str(err)]
                   for url, err in (result for result in link_scrap_results
                                    if isinstance(result, list))]

    if link_texts:
        with open(file_name, 'a', encoding='utf-8') as f:
            # Leading newline keeps the last line of the main page separate
            # from the first line of the link texts.
            f.write('\n' + '\n'.join(link_texts))

    # Log link errors if at least half of the links fail.
    if link_errors and len(link_errors) >= len(link_scrap_results) / 2:

        with open(LOG_DIR + timestr + '_link_report_' + domain, 'w',
                  encoding='utf-8') as link_log:
            # One "url / error" entry per failure, separated by a blank line.
            link_log.writelines('\n'.join(error) + '\n\n'
                                for error in link_errors)

report_str = (f'{len(SITES_LIST) } requested.' + '\n'
              + f'{len(succes_requests) } SUCCESFUL.' + '\n'
              + f'{len(failed_requests) } FAILURES.' + '\n'
              + '='*20 + '\n FAILED SITES \n' + '='*20 + '\n\n')


with open(LOG_DIR + timestr + '_scrapping_repport', 'w',
          encoding='utf-8') as scrapping_log:

    scrapping_log.write(report_str)
    scrapping_log.writelines(['\n'.join(failure) + '\n\n' for failure in failed_requests])

    scrapping_log.write('\n\n SUCCESFUL REQUESTS \n\n')
    scrapping_log.write('\n'.join(succes_requests))
        