#  Web scraping for an NLP task

The goal of this notebook is to build a tool that can scrape text from a given list of websites, in order to use it later for clustering the sites. 

The task indicates that we should get text from the landing page, as well as text from the links contained in the landing page. 

Since many requests will be necessary, some mechanism (rotating user agents, proxies, etc.) has to be put in place in order to avoid being blocked.

As each page contains many links, parallel processing can be implemented in order to speed up the scraping.

The final product should be able to take a list of websites and build text files with the contents of each site. 
Additional parameters could be included for managing, for instance, the parallel processing, or maybe some further filtering of the contents.

##  Scraping from one site

Let's use one of the given URLs to get an idea of the kind of websites we have. 

For example, this British pipe supplier: http://www.besseges-vtf.co.uk/

In [None]:
import os
import random
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [None]:
# Headers for user agent rotation.
# Full browser header sets captured from httpbin.org.
#
# The captured "Host: httpbin.org" entry is deliberately left out: requests
# derives Host from the URL being fetched, and forcing httpbin.org onto every
# request would make the target server look up the wrong virtual host.
# "br" is left out of Accept-Encoding because requests can only decode Brotli
# responses when an optional brotli package is installed; advertising it
# without that package yields undecoded page content.

# Firefox 84 Ubuntu
h1 = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.5",
    "Connection": "keep-alive",
    "TE": "Trailers",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0",
}

# Chrome 87 Ubuntu
h2 = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9,fr;q=0.8,es;q=0.7",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
}

# Pool of header sets; one is picked at random for each request.
headers_list = [h1, h2]

In [None]:
def get_links(soup, tags='a', remove_duplicates=True):
    """Collect the href value of every matching tag in a parsed page.

    soup: an html page parsed with beautifulsoup.
    tags: string or list of strings indicating the html tags to search.
    remove_duplicates: when True (default), each link is returned only once,
        in order of first appearance.

    Returns a list with the raw href values found.
    """
    # A bare string is ONE tag name. Iterating over it directly would search
    # for each of its characters instead ('link' -> 'l', 'i', 'n', 'k').
    if isinstance(tags, str):
        tags = [tags]

    links = []  # list to store the links found
    for tag in tags:
        links.extend(element['href'] for element in soup.find_all(tag, href=True))

    if remove_duplicates:
        # dict.fromkeys drops repeats while keeping first-seen order, so the
        # result is reproducible between runs (a set would shuffle it).
        links = list(dict.fromkeys(links))

    return links

In [None]:
def filter_links(home, links_list):
    """
    Takes a home address and a list of links, and keeps only the links that
    point to pages of the same site, returned as absolute URLs.

    home: string. The absolute URL (scheme included) of the home page.
    links_list: list of strings with the links found on the page, as produced by get_links.

    Returns a list of absolute URL strings, in order of first appearance and
    without duplicates. Links are dropped when they:
      - are not http(s) pages (javascript:, mailto:, tel:, ftp:, ...),
      - are fragment-only ('#...') or parent-relative ('..'),
      - point to a host that is neither the home host nor one of its subdomains,
      - have a path ending in a common binary/asset file extension,
      - are the home page itself (with or without the final '/').
    """

    home_host = urlparse(home).netloc.lower()  # host used to detect external links

    # The home page itself, with and without the final '/', is never returned.
    path = home[:-1] if home.endswith('/') else home
    home_forms = (path, path + '/')

    unwanted_starts = ('javascript:', 'mailto:', 'tel:', '#', '..')

    unwanted_endings = ('.pdf', '.jpg', '.jpeg', '.png', '.gif', '.exe', '.js',
                        '.zip', '.tar', '.gz', '.7z', '.rar'
                       )

    filtered_links = []
    seen = set()  # resolved URLs already kept, to avoid scraping a page twice

    for link in links_list:
        link = link.strip()
        if link.lower().startswith(unwanted_starts):
            continue

        # urljoin resolves relative ('x'), root-relative ('/x') and
        # protocol-relative ('//host/x') links the way a browser would, and
        # leaves links that are already absolute untouched.
        absolute = urljoin(home, link)
        parts = urlparse(absolute)

        # urlparse lower-cases the scheme, so 'HTTP://' links are handled too.
        if parts.scheme not in ('http', 'https'):
            continue

        # Compare the parsed host rather than searching the whole URL, so an
        # external URL that merely mentions our domain (e.g. in its query
        # string) is not mistaken for an internal one.
        host = parts.netloc.lower()
        if host != home_host and not host.endswith('.' + home_host):
            continue

        # Test the extension on the path only, so 'file.pdf?download=1' is
        # still recognised as a pdf.
        if parts.path.lower().endswith(unwanted_endings):
            continue

        if absolute in home_forms or absolute in seen:
            continue

        seen.add(absolute)
        filtered_links.append(absolute)

    return filtered_links
    

In [None]:
def write_page(file, text):
    """Append text to a file, creating the file and its directory if needed.

    file: string. Path of the file to append to.
    text: string. The text to append.

    Returns None.
    """
    # open() creates a missing file but not a missing directory.
    directory = os.path.dirname(file)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Scraped pages routinely contain non-ASCII characters; an explicit
    # encoding keeps the write from depending on the platform default.
    with open(file, 'a', encoding='utf-8') as websitetext:
        websitetext.write(text)

    return None

Trial URLs

    *'http://www.besseges-vtf.co.uk'
    *'http://lumaquin.com'
    *'https://www.degso.com'
    *'http://www.ictsl.net'
    *'https://barrocorestaurante.mx'
    *'https://www.gummigoetz.de'
    *'http://www.suppliersof.com'


In [None]:
# Landing page of the site to scrape.
MAIN_URL = "http://www.besseges-vtf.co.uk"

# Directory prefix for the text file written for each site.
FILES_DIRECTORY = "./site_contents/"

In [None]:
# Request contents

# Pick one header set at random. requests fills in Host from the URL being
# fetched, so a Host entry captured from another site must not be sent along.
random_header = {
    name: value
    for name, value in random.choice(headers_list).items()
    if name.lower() != 'host'
}

# The headers must be passed with the `headers` keyword: the second positional
# argument of requests.get is `params` (the query string), so passing the dict
# positionally never sends it as request headers.
# A timeout keeps one unresponsive site from hanging the whole run.
landing_page = requests.get(MAIN_URL, headers=random_header, timeout=10)
landing_page.raise_for_status()  # do not parse the body of an error response
landing_html = BeautifulSoup(landing_page.content, 'html.parser')

In [None]:
# Extract the visible text of the landing page, one text node per line.
landing_page_text = landing_html.get_text(strip=True, separator='\n')

# The output file is named after the host of the site.
file_name = f"{FILES_DIRECTORY}{urlparse(MAIN_URL).netloc}"

write_page(file_name, landing_page_text)

In [None]:
# Collect the href of every <a> tag on the landing page.
link_list = get_links(landing_html)

In [None]:
# Display the raw links found on the landing page.
link_list

In [None]:
# Filter the collected links against the MAIN_URL site.
link_list = filter_links(MAIN_URL, link_list)

In [None]:
# Display the links that remain after filtering.
link_list