"""waybacktrack.py
Use this to extract the Wayback Machine's
URL archives of any given domain!
A minimal usage sketch is included at the bottom of the file.

TODO: rework the entire design!
"""
import time
import os
import urllib2
import random
from math import ceil
try:
    from cStringIO import StringIO as BytesIO
except ImportError:
    from io import BytesIO

from lxml import html
from lxml.html import clean

ARCHIVE_DOMAIN = "http://web.archive.org"
CURR_DIR = os.path.dirname(__file__)
DATASET_DIR = os.path.join(CURR_DIR, '../../dataset/')


def archive_domain(domain, year, dir_path=DATASET_DIR,
                   percent=0, debug=False, throttle=1):
    """
    Crawl the Wayback Machine snapshots of a domain for a given year and
    save the archived forward-link pages to disk.

    @type domain: string
    @param domain: the domain of the website, ie. www.nytimes.com
    @type year: int
    @param year: the year to extract archives from
    @type dir_path: string
    @param dir_path: the directory path in which to store the archive; if
                     left at the default, a per-domain directory is created
                     automatically
                     TODO: think of a better solution for storing
                           downloaded archives
    @type percent: int
    @param percent: the percentage of Wayback snapshots to crawl
    @type debug: bool
    @param debug: print progress information while crawling
    @type throttle: int
    @param throttle: seconds to sleep after each saved download
    @rtype: tuple
    @return: (archived_links, duds) - the forward links that were archived
             and the ones that were skipped or failed
    """
    # TODO: improve this for module portability
    # WARNING: the module will likely break if used outside of the
    #          crawl-to-the-future project
    # The storage directory is found (or, if necessary, created)
    # automatically, based on the domain name.
    # Found a way to check whether this file is being run inside
    # crawl-to-the-future -- super "hacky" though.
    # TODO: find a better way to check whether the module is being run
    #       inside the crawl-to-the-future project
    if not isinstance(dir_path, basestring):
        raise Exception("Directory - third arg. - path must be a string.")
    if os.path.split(
            os.path.abspath(os.path.join(__file__, os.pardir)))[1] != "Way-Back":
        raise Exception("Please manually specify the 'dir_path' value")
    if dir_path is DATASET_DIR:
        dir_path = os.path.join(dir_path, domain + '/')
    if not os.path.exists(dir_path):
        # raise IOError("[Errno 2] No such file or directory: '" + dir_path + "'")
        # this part is shady
        os.makedirs(dir_path)
    ia_year_url = ARCHIVE_DOMAIN + "/web/" + str(year) + \
        "*/http://" + domain + "/"
    ia_parsed = html.parse(ia_year_url)
    domain_snapshots = list(set(ia_parsed.xpath('//*[starts-with(@id,"' +
                                                str(year) + '-")]//a/@href')))
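    # For illustration (hypothetical values): with domain="www.cnn.com" and
    # year=2005, ia_year_url is http://web.archive.org/web/2005*/http://www.cnn.com/
    # and each entry in domain_snapshots is a path such as
    # /web/20050403021735/http://www.cnn.com/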
    # snapshot_age_span is the number of snapshots to process from the
    # given year, derived from the requested percentage;
    # ie. if percent is 100, and there are a total of 50 snapshots for
    # www.cnn.com, we will crawl (to a depth of 1 atm) all 50 snapshots
    snapshot_age_span = 1 if percent <= 0 \
        else len(domain_snapshots) - 1 if percent >= 100 \
        else int(percent * len(domain_snapshots) / 100)
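    # Worked example (illustrative numbers): with percent=40 and 50
    # snapshots available, snapshot_age_span = int(40 * 50 / 100) = 20.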
    if debug:
        print "Extracting links from: ", domain
        # http://margerytech.blogspot.com/2011/06/python-get-last-directory-name-in-path.html
        print "Current directory: ", os.path.split(
            os.path.abspath(os.path.join(__file__, os.pardir)))[1]
        print "Storing files in: ", os.path.abspath(dir_path)
        print "Number of domain snapshots: ", len(domain_snapshots)
        print "Number of domain snapshots to process: ", snapshot_age_span
    random.shuffle(domain_snapshots)
    forward_links = []
    for snapshot in domain_snapshots[:snapshot_age_span]:
        curr_snapshot_flinks = get_forwardlink_snapshots(snapshot)
        forward_links.extend(curr_snapshot_flinks)
        if debug:
            print "snapshot url: ", snapshot
            print "forward link count: ", len(curr_snapshot_flinks)
    random.shuffle(forward_links)
    if debug:
        print "total number of forward links to download: ", len(forward_links)
    # archive the forward links
    archived_links = []
    duds = []
    for forwardlink in forward_links:
        if archive(forwardlink, year, dir_path, debug, throttle):
            archived_links.append(forwardlink)
        else:
            duds.append(forwardlink)
    if debug:
        print "Number of archived forward links: ", len(archived_links)
        print "Number of duds: ", len(duds)
    return archived_links, duds


# I know I'm breaking so many rules by not separating concerns
def archive(page, year, dir_path, debug=False, throttle=1):
    """
    Download a forward link and save it under dir_path, but only if the
    snapshot satisfies the archival year specification,
    ie. (2000, 2005, 2010).
    """
    # files = [f for f in os.listdir(dir_path) if os.path.isfile(f)]
    if debug:
        print "requesting ", page
    # derive a filesystem-safe file name from the snapshot path
    page_file = page.rsplit('/web/')[1].replace('http://', '').replace('-', '_')
    page_file = page_file.replace('/', '_').replace(':', '_').replace('&', '_')
    page_file = page_file.replace('?', '_').replace('*', '_').replace('=', '_')
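    # Illustrative example (hypothetical snapshot path): a page such as
    # /web/20050403021735/http://www.cnn.com/WORLD/ becomes the file name
    # 20050403021735_www.cnn.com_WORLD_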
    file_path = dir_path + page_file
    if os.path.isfile(file_path):
        if debug:
            print "Already saved: ", page_file
            print
        return False
    try:
        html_file = urllib2.urlopen(ARCHIVE_DOMAIN + page)
    except IOError:
        if debug:
            print "Failed to open request for ", ARCHIVE_DOMAIN + page
            print
        return False
    if html_file.getcode() == 302:
        if debug:
            print "Got HTTP 302 response for ", ARCHIVE_DOMAIN + page
            print
        return False
    html_string = str(html_file.read())
    if html_string.find("HTTP 302 response") != -1:
        if debug:
            print "Got HTTP 302 response for ", ARCHIVE_DOMAIN + page
            print
        return False
    archival_year_spec = ARCHIVE_DOMAIN + '/web/' + str(year)
    page_url = html_file.geturl()
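    # Illustrative check (hypothetical URL): a final URL such as
    # http://web.archive.org/web/20050403021735/http://www.cnn.com/WORLD/
    # starts with http://web.archive.org/web/2005, so it passes for
    # year=2005 and would be rejected for any other year.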
    if page_url.startswith(archival_year_spec):
        if debug:
            print "saving ", page_url
            print
        try:
            with open(file_path, 'wb') as f:
                f.write(BytesIO(html_string).read())
            time.sleep(throttle)
        except IOError as e:
            if debug:
                print "Got error: ", e
            return False
        return True
    else:
        return False


def get_forwardlink_snapshots(parent_site):
    """
    Extract forward-link snapshot URLs from an archived page.

    @type parent_site: string
    @param parent_site: the archived page (e.g. an index.html snapshot)
                        from which to extract forward links
    @rtype: list
    @return: the forward-link snapshot paths found in the page
    """
    try:
        parsed_parent_site = html.parse(ARCHIVE_DOMAIN + parent_site)
    except IOError:
        print "Could not extract links from ", ARCHIVE_DOMAIN + parent_site
        return []
    # cleaner = html.clean.Cleaner(scripts=True, javascript=True, style=True, kill_tags=["img"])
    cleaner = clean.Cleaner(scripts=True, javascript=True, comments=True,
                            style=True, meta=True, processing_instructions=True,
                            embedded=True, frames=True, forms=True,
                            kill_tags=["noscript", "iframe", "img"])
    parsed_parent_site = cleaner.clean_html(parsed_parent_site)
    # keep only links whose archival year matches that of the parent
    # (ie. 2000|2005|2010); parent_site[:9] is the "/web/YYYY" prefix
    all_forwardlinks = parsed_parent_site.xpath('//a[starts-with(@href,"' +
                                                parent_site[:9] + '")]/@href')
    return all_forwardlinks
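

# Minimal usage sketch (an assumption, not part of the original interface):
# run this file directly from the Way-Back directory of the
# crawl-to-the-future project; the domain and year below are hypothetical.
if __name__ == "__main__":
    saved, skipped = archive_domain("www.example.com", 2005,
                                    percent=5, debug=True)
    print "Number archived: ", len(saved)
    print "Number skipped: ", len(skipped)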