TODO:

*   Some image links only lead to the web page containing the image; need to find a better way to get the full URL of the image (ideally one that doesn't involve messy string manipulation)
*   Still results in some broken/inaccessible links; will need to either filter those out in real time or remove them after processing
*   "Catch-all" error handling can hide real errors; would like to be able to list and catch specific errors instead
*   Would like to store image URLs along with the keywords their alt contained so that they can be used to help with data cleaning later



In [None]:
# Import dependencies
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
import random
import requests
import urllib
import time
import threading

# Google Drive integration: mount the drive at /content/gdrive so that files
# stored on it (such as the keyword and seed-URL lists) can be opened like
# local files. Only works inside Google Colab.
from google.colab import drive
drive.mount("/content/gdrive")

In [129]:
class Scraper:
  """Crawl outward from seed URLs and collect image URLs whose alt text contains a keyword.

  Queued page URLs are grouped by host in `hosts`. Each round selects THREADS
  hosts (preferring distinct ones), and one thread per selection scrapes a
  random queued page of that host: matching <img> URLs are added to
  `image_urls` and the page's links are queued for later rounds. Crawling
  stops once NUM_IMAGES_TO_COLLECT image URLs are stored, when no queued pages
  remain, or when the user interrupts it.
  """

  def __init__(self, PATH_TO_KEYWORDS, PATH_TO_SEED_URLS,
               FOLLOW_ONLY_IF_IMAGES_FOUND=True,
               MAX_TIMEOUT_IN_SECONDS=3,
               NUM_IMAGES_TO_COLLECT=1000,
               POLITE_MODE=True,
               THREADS=10,
               VERBOSE=1):
    """Store the configuration; no file or network access happens here.

    Args:
      PATH_TO_KEYWORDS: Path to a file containing keywords split by newlines.
      PATH_TO_SEED_URLS: Path to a file containing URLs split by newlines.
      FOLLOW_ONLY_IF_IMAGES_FOUND: Only queue the links of a page if matching
        images were found on that page.
      MAX_TIMEOUT_IN_SECONDS: Timeout for page and robots.txt requests.
      NUM_IMAGES_TO_COLLECT: Number of image URLs to collect before stopping.
      POLITE_MODE: Whether to respect robots.txt.
      THREADS: Number of scraper threads started per round.
      VERBOSE: Amount of information printed; should be 0, 1, or 2.
    """
    # Parameters
    self.FOLLOW_ONLY_IF_IMAGES_FOUND = FOLLOW_ONLY_IF_IMAGES_FOUND
    self.MAX_HOST_SELECTION_RETRIES = 5 # Re-draws allowed per thread slot when the drawn host was already selected this round
    self.MAX_TIMEOUT_IN_SECONDS = MAX_TIMEOUT_IN_SECONDS
    self.NUM_IMAGES_TO_COLLECT = NUM_IMAGES_TO_COLLECT
    self.PATH_TO_KEYWORDS = PATH_TO_KEYWORDS
    self.PATH_TO_SEED_URLS = PATH_TO_SEED_URLS
    self.POLITE_MODE = POLITE_MODE
    self.THREADS = THREADS
    self.VERBOSE = VERBOSE

    # Variable initialization
    self.keywords = None # List of lowercase keywords; an image is kept if its alt contains any of them
    self.seed_urls = None # Seed urls for the scraper to start on
    self.hosts = dict() # Maps hostname -> list of queued (not yet scraped) URLs
    self.already_visited = set() # URLs that have already been picked for scraping
    self.image_urls = set() # Scraped image URLs
    self.no_scrape_hosts = set() # Hosts whose robots.txt disallowed scraping
    self.lock = threading.Lock() # Guards hosts, already_visited, image_urls and no_scrape_hosts

    # State variables
    self.loaded_keywords = False
    self.loaded_seed_urls = False

  def begin_scrape(self):
    """Load the input files and crawl until a stop condition is met.

    Returns when NUM_IMAGES_TO_COLLECT image URLs are stored, when there is
    nothing left to scrape, or when the user interrupts the run.
    """
    self._loadData()
    self._sortUrlsByHosts()
    while len(self.image_urls) < self.NUM_IMAGES_TO_COLLECT:
      try:
        hosts_to_scrape = self._getHostsToScrape()
        if not hosts_to_scrape:
          # Without this the loop would spin forever once the queue runs dry.
          self._verboseLog("No hosts left to scrape, stopping early", 1)
          break
        self._startScraperThreads(hosts_to_scrape)
      except KeyboardInterrupt:
        # KeyboardInterrupt is not an Exception subclass, so it needs its own clause.
        self._verboseLog("Scrape interrupted by user", 1)
        return
      except Exception as e: # Keep crawling even if a whole round fails unexpectedly
        self._verboseLog(e, 2)

  def _scrape(self, host_to_scrape):
    """Thread entry point: scrape one random queued page of `host_to_scrape`.

    Any error is logged (level 2) instead of raised so that one bad page does
    not take the thread down with a traceback.
    """
    url = None
    try:
      # Pick and claim the URL atomically so two threads never scrape the same page.
      with self.lock:
        if not self.hosts.get(host_to_scrape):
          self._verboseLog("No URLs left for host " + str(host_to_scrape), 2)
          return
        url = self._getRandomUrlFromHost(host_to_scrape)
        self._markVisited(url, host_to_scrape)
      if self._stopIfScrapingNotAllowed(url, host_to_scrape):
        return
      parsed = self._parseHTML(url)
      num_images_found = self._findAndStoreImages(parsed, url)
      if self._shouldContinueSearch(num_images_found):
        links = self._getLinks(parsed, url)
        self._appendFoundLinksToAppropriateHosts(links)
    except Exception as e: # Malformed pages, timeouts, refused connections, ...
      self._verboseLog("Failed to scrape " + str(url) + ": " + repr(e), 2)

  def _startScraperThreads(self, hosts_to_scrape):
    """Run one `_scrape` thread per entry of `hosts_to_scrape` and wait for all of them."""
    threads = [threading.Thread(target=self._scrape, args=(host,))
               for host in hosts_to_scrape]
    for thread in threads:
      thread.start()
    for thread in threads:
      thread.join()
    self._verboseLog("Total image count: "+str(len(self.image_urls)), 1)

  def _getHostsToScrape(self):
    """Choose the hosts for the next round.

    Returns:
      A list of THREADS hostnames, or an empty list when no host has queued
      URLs. A host is repeated only if MAX_HOST_SELECTION_RETRIES re-draws
      all landed on hosts already chosen this round.
    """
    # Hosts that turned out to be off limits may have been re-queued; drop them.
    for host in self.no_scrape_hosts.intersection(self.hosts):
      del self.hosts[host]
    candidates = list(self.hosts)
    if not candidates:
      return []
    hosts_to_scrape = []
    already_selected = set()
    for _ in range(self.THREADS):
      host = random.choice(candidates)
      attempt = 0
      # We want to avoid scraping the same host a lot, so re-draw a few times.
      while host in already_selected and attempt < self.MAX_HOST_SELECTION_RETRIES:
        host = random.choice(candidates)
        attempt += 1
      already_selected.add(host)
      hosts_to_scrape.append(host)
    return hosts_to_scrape

  def _shouldContinueSearch(self, num_images_found):
    """Return True if the links of the page just scraped should be queued."""
    if not self.FOLLOW_ONLY_IF_IMAGES_FOUND or num_images_found > 0:
      return True
    self._verboseLog("Website did not yield any images, will not follow links", 2)
    return False

  def _appendFoundLinksToAppropriateHosts(self, links):
    """Queue every usable URL in `links` under its hostname.

    URLs are skipped when they are not absolute, have no hostname, were
    already visited, or belong to a host in `no_scrape_hosts`.
    """
    candidates = []
    for url in links:
      if self._link_is_invalid(url):
        continue
      hostname = urlparse(url).hostname
      if hostname is None:
        continue
      candidates.append((hostname, url))
    # Take the lock once for the whole batch rather than once per link.
    with self.lock:
      for hostname, url in candidates:
        if url in self.already_visited or hostname in self.no_scrape_hosts:
          continue
        self.hosts.setdefault(hostname, []).append(url)

  def _findAndStoreImages(self, parsed, url):
    """Store the matching image URLs of a parsed page and return how many the page had."""
    found_imgs = self._getImages(parsed, url)
    num_imgs_found = len(found_imgs)
    self._verboseLog("Found "+str(num_imgs_found)+" images", 2)
    with self.lock:
      self.image_urls.update(found_imgs)
    return num_imgs_found

  def _parseHTML(self, url):
    """Download `url` and return it parsed as a BeautifulSoup document."""
    self._verboseLog("Pulling HTML from " + url, 2)
    html = requests.get(url, timeout=self.MAX_TIMEOUT_IN_SECONDS).text # this could probably be done with urllib in order to reduce dependencies, but requests is easier (and safer) to use
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed, so results could differ between environments.
    return BeautifulSoup(html, "html.parser")

  def _getRandomUrlFromHost(self, host):
    """Return a random queued URL of `host` (raises KeyError if the host is not queued)."""
    return random.choice(self.hosts[host])

  def _markVisited(self, url, host):
    """Record `url` as visited and take it off `host`'s queue.

    The host entry is removed once its queue is empty. The caller must hold
    `self.lock`.
    """
    self.already_visited.add(url)
    remaining = [queued for queued in self.hosts.get(host, []) if queued != url]
    if remaining:
      self.hosts[host] = remaining
    else:
      self.hosts.pop(host, None)

  def _stopIfScrapingNotAllowed(self, url, host_to_scrape):
    """Return True (and blacklist the host) if robots.txt forbids fetching `url`.

    Always returns False when POLITE_MODE is off.
    """
    if not self.POLITE_MODE:
      return False
    if self._websiteAllowsScraping(url, host_to_scrape):
      return False
    self._verboseLog("robots.txt doesn't allow scraping, finding a new url to scrape", 2)
    with self.lock:
      # The host may already be gone (queue emptied or removed by another thread).
      self.hosts.pop(host_to_scrape, None)
      self.no_scrape_hosts.add(host_to_scrape)
    return True

  def _websiteAllowsScraping(self, url, host_to_scrape):
    """Return True if the robots.txt of `url`'s site lets any user agent fetch `url`.

    `host_to_scrape` is unused and kept only so existing callers keep working.
    Network errors raised while downloading robots.txt propagate to the caller.
    """
    parts = urlparse(url)
    # robots.txt lives at the root of the site, not at the page being scraped.
    robots_url = parts.scheme + "://" + parts.netloc + "/robots.txt"
    rp = TimeoutRobotFileParser(timeout=self.MAX_TIMEOUT_IN_SECONDS)
    rp.set_url(robots_url)
    rp.read()
    return bool(rp.can_fetch("*", url))

  def _sortUrlsByHosts(self):
    """Queue the seed URLs under their hostnames (invalid seed URLs are skipped)."""
    self._appendFoundLinksToAppropriateHosts(self.seed_urls)

  def _loadData(self):
    """Load the seed URLs and the keywords from their files."""
    self._loadSeedUrls()
    self._loadKeywords()

  def _loadKeywords(self):
    """Read the keyword file into `self.keywords` (lowercased to match lowercased alt text)."""
    self.keywords = self._loadFromFile(self.PATH_TO_KEYWORDS)
    self.loaded_keywords = True

  def _loadSeedUrls(self):
    """Read the seed URL file into `self.seed_urls`."""
    # URL paths are case-sensitive, so the seed URLs must keep their original case.
    self.seed_urls = self._loadFromFile(self.PATH_TO_SEED_URLS, lowercase=False)
    self.loaded_seed_urls = True

  def _loadFromFile(self, PATH, lowercase=True):
    """Return the non-empty lines of the file at `PATH`.

    Args:
      PATH: Path of a text file with one entry per line.
      lowercase: Lowercase the content before splitting (the default).
    """
    with open(PATH, 'r') as f:
      txt = f.read()
    if lowercase:
      txt = txt.lower()
    # Empty entries come from blank lines and from the trailing newline.
    return self._removeEmptyIndexes(txt.split('\n'))

  def _removeEmptyIndexes(self, l):
    """Return a copy of list `l` without empty-string elements."""
    return [elem for elem in l if elem != '']

  def _getLinks(self, parsed, url):
    """Return the href of every <a> element in `parsed`, made absolute against `url`."""
    links = []
    for element in parsed.find_all("a"):
      link = element.get("href")
      if not link: # If the link is missing or empty, ignore it
        continue
      links.append(self._ensureUniversal(link, url))
    self._verboseLog("Collected "+str(len(links))+" links", 2)
    return links

  def _ensureUniversal(self, link, url):
    """Resolve `link` as found on page `url` into an absolute URL without fragment.

    Handles protocol-relative (//host/x), root-relative (/x) and relative (x)
    links. A link that cannot be parsed is returned unchanged so that later
    validation can discard it.
    """
    try:
      absolute = urllib.parse.urljoin(url, link)
      # "#section" links point back at the same page, so drop the fragment.
      return urllib.parse.urldefrag(absolute).url
    except ValueError:
      return link

  def _getImages(self, parsed, url):
    """Return the set of absolute src URLs of <img> elements whose alt contains a keyword."""
    links = set()
    for element in parsed.find_all("img"):
      link = element.get("src")
      alt = element.get("alt")
      if not isinstance(alt, str) or not alt:
        continue # If the image doesn't have an alt caption, ignore it
      if not link: # If the src is missing or empty, ignore it
        continue
      alt = alt.lower()
      # Ensure that at least one keyword is present in alt
      if not any(keyword in alt for keyword in self.keywords):
        continue
      links.add(self._ensureUniversal(link, url))
    return links

  def _link_is_invalid(self, x): # Taken from https://stackoverflow.com/a/38020041
    """Return True unless `x` parses as a URL with both a scheme and a network location."""
    try:
      result = urlparse(x)
      return not all([result.scheme, result.netloc])
    except (AttributeError, ValueError): # Non-string input or malformed URL
      return True

  def _verboseLog(self, msg, lvl):
    """Print `msg` if the configured VERBOSE level is at least `lvl`."""
    if self.VERBOSE >= lvl:
      print(msg)


In [130]:
# Taken from https://stackoverflow.com/a/15235475
# Subclass with override of read method that allows for a shorter timeout
class TimeoutRobotFileParser(RobotFileParser):
    """RobotFileParser whose robots.txt download uses a configurable timeout."""

    def __init__(self, url='', timeout=60):
        """Create a parser for the robots.txt at `url`.

        Args:
            url: URL of the robots.txt file; can also be set later with set_url().
            timeout: Timeout in seconds passed to urlopen() by read().
        """
        super().__init__(url)
        self.timeout = timeout

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser.

        A 401/403 response marks everything as disallowed and any other
        response code >= 400 marks everything as allowed. Errors other than
        HTTPError (for example timeouts) are not handled here.
        """
        try:
            response = urllib.request.urlopen(self.url, timeout=self.timeout)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            # Close the connection as soon as the body has been read.
            with response:
                raw = response.read()
            self.parse(raw.decode("utf-8").splitlines())

In [131]:
# Input files with one entry per line. NOTE: these paths are relative, so they
# only point into the drive mounted at /content/gdrive while the working
# directory is /content.
KEYWORDS_PATH = "gdrive/MyDrive/Programming/Web Scraper/keywords.txt"
SEED_URLS_PATH = "gdrive/MyDrive/Programming/Web Scraper/seed-urls.txt"
# Create the scraper with its default settings; the constructor only stores
# configuration, no files are read and no requests are made yet.
scraper = Scraper(KEYWORDS_PATH, SEED_URLS_PATH)

In [None]:
# Start crawling. This call blocks until the scraper stops; the collected
# image URLs are available afterwards in scraper.image_urls.
scraper.begin_scrape()