# The Dark Engine of Naggaroth : A tool for summoning data from the dead

In this notebook, we dig for dead links in the scientific literature,
raise their corpses, and turn them loose.

In [1]:
import xml.etree.ElementTree as ET
from glob import glob
from socket import getaddrinfo, gaierror
from urlparse import urlparse

import pandas
from pyprind import ProgBar
from requests import head, get, ConnectionError, ConnectTimeout, ReadTimeout
from requests import RequestException

First, we parse through PubMed Central's archived XML files and extract links from
the article bodies. These are pushed into a pandas DataFrame.

In [2]:
def is_valid( url ) :
    """Return True if `url` is an absolute http, https or ftp URL with a host part.

    The URL is split with `urlparse`; it is accepted only when the scheme is
    one of 'http', 'https' or 'ftp' and the network location is non-empty.
    """
    r = urlparse( url )
    # Compare by value: `is not ''` tests object identity, which is not
    # guaranteed to hold for equal strings (e.g. an empty unicode netloc).
    return r.scheme in ( 'http', 'https', 'ftp' ) and r.netloc != ''

file_list = glob('*.nxml')

progbar = ProgBar( len(file_list), monitor=True, title='processing files...' )

# Fully qualified name of the xlink:href attribute that carries the link target.
xlink_href = '{http://www.w3.org/1999/xlink}href'

# One row per (article, valid link) pair; an article without any valid link
# contributes a single row that has no 'link' entry.
records = []
for fname in file_list :
    tree  = ET.parse( fname )
    root  = tree.getroot()
    front = root.find('front')
    body  = root.find('body')

    # Start every article with empty identifiers, so that a missing PMID or
    # DOI shows up as a gap instead of inheriting the previous file's value.
    record = { 'pmid' : None, 'doi' : None }

    if front is not None :
        meta = front.find('article-meta')
        if meta is not None :
            for element in meta.findall('article-id') :
                idtype = element.attrib.get('pub-id-type')
                if idtype in ( 'pmid', 'doi' ) :
                    record[idtype] = element.text

        # When a tag occurs more than once, the last occurrence wins.
        for key in ['journal-title', 'subject', 'year' ] :
            for element in front.iter(tag=key) :
                record[key] = element.text

    haslinks = False
    # Some articles have no <body> at all; they simply have no links.
    if body is not None :
        for link in body.iter(tag='ext-link') :
            if link.attrib.get('ext-link-type') != 'uri' :
                continue
            url = link.attrib.get( xlink_href )
            if url and is_valid( url ) :
                record['link'] = url
                # Append a copy: `record` is reused for the article's next link.
                records.append( dict( record ) )
                haslinks = True
    if not haslinks :
        records.append( record )

    progbar.update()

processing files...
0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:12


In [3]:
# Turn the list of per-link dictionaries into a table; keys that are absent
# from a record (e.g. 'link') become missing values in that row.
papertable = pandas.DataFrame( data=records )

# Preview the first rows.
papertable.head()

Unnamed: 0,doi,journal-title,link,pmid,subject,year
0,10.1186/1471-2105-7-431,BMC Bioinformatics,http://bisearch.enzim.hu,17022803,Software,2006
1,10.1186/1471-2105-7-289,BMC Bioinformatics,http://genesis.UGent.be/cell_motility,16762054,Software,2006
2,10.1186/1471-2105-7-409,BMC Bioinformatics,http://www.ub.es/softevol/variscan,16968531,Software,2006
3,10.1186/1471-2105-7-167,BMC Bioinformatics,,16553946,Research Article,2006
4,10.1186/1471-2105-6-230,BMC Bioinformatics,,16171528,Research Article,2005


Next, we check to see if the links actually work. Instead of downloading anything,
we just check the response from the HTTP server and, before that, whether the domain resolves. The
results are added as a new column in the DataFrame.

In [4]:
from urlparse import urlparse
from threading import Thread
import httplib, sys
from Queue import Queue
from time import sleep
from random import random
from socket import getaddrinfo, gaierror
from requests import head, ConnectionError, ConnectTimeout, ReadTimeout

# Number of worker threads that probe URLs in parallel.
concurrent = 200

# (rowid, status) pairs, appended by the workers as results come in.
statusrecords = []

# The progress bar gets one tick per row that actually carries a link.
link_count = sum( papertable['link'].notnull() )
progbar = ProgBar( link_count, monitor=True, title='testing urls...' )

def doWork():
    """Worker loop: take (rowid, url) items off the queue `q` and test each URL.

    Runs forever; it is meant to be the target of a daemon thread. Each item
    is passed to `getStatus` and the outcome handed to `doSomethingWithResult`.
    """
    # Random start-up delay so that the workers do not all begin at once.
    sleep( random() )
    # Note: the thread's daemon flag is set by whoever creates the thread.
    # Assigning `Thread.daemon = True` here would overwrite the `daemon`
    # property on the Thread class itself for the whole process.
    while True:
        sleep( random() * 0.01 )
        rowid, url = q.get()
        try:
            status, url = getStatus(rowid, url)
            doSomethingWithResult(rowid, status)
        except Exception as err :
            # Keep the worker alive; the row is simply left without a result.
            sys.stderr.write( 'worker failed on %r: %r\n' % ( url, err ) )
        finally:
            # Always acknowledge the item, otherwise q.join() never returns.
            q.task_done()

def getStatus(rowid, ourl):
    """Probe `ourl` and return a `(status, ourl)` pair.

    The host name is resolved first, then a HEAD request (10 second timeout,
    following redirects) is sent. `status` is the HTTP status code of the
    response, or one of these strings:

      'NXDOMAIN' -- the host name does not resolve
      'TCPFAIL'  -- the connection could not be established
      'TIMEOUT'  -- the server did not answer in time
      'FAIL'     -- any other error raised by requests

    `rowid` is accepted for symmetry with the caller and is not used here.
    """
    try:
        # Resolve the bare host name: `netloc` may also contain a port or
        # credentials ("user@host:8080"), which getaddrinfo cannot look up.
        host = urlparse(ourl).hostname
        getaddrinfo( host, None )
        resp = head( ourl, timeout=10, allow_redirects=True )
        return resp.status_code, ourl
    except gaierror :
        return 'NXDOMAIN', ourl
    # The tuple is required: `except A, B` catches only A and binds it to B.
    except (ConnectionError, ConnectTimeout) :
        return 'TCPFAIL', ourl
    except ReadTimeout :
        return 'TIMEOUT', ourl
    # Must come last: the exceptions handled above are subclasses of this one.
    except RequestException :
        return 'FAIL', ourl
    
def doSomethingWithResult(rowid, status):
    """Store the outcome for one row in `statusrecords` and tick the progress bar."""
    outcome = (rowid, status)
    statusrecords.append( outcome )
    progbar.update()

# Bounded work queue, sized at twice the number of workers.
q = Queue(concurrent * 2)

# Start the pool of daemon worker threads.
for i in range(concurrent):
    worker = Thread(target=doWork)
    worker.daemon = True
    worker.start()

try:
    # Feed every (rowid, url) pair that has a link, then wait until the
    # workers have acknowledged all of them.
    links_to_test = papertable['link'][ papertable['link'].notnull() ]
    for urlrecord in links_to_test.iteritems() :
        q.put(urlrecord)
    q.join()
except KeyboardInterrupt:
    sys.exit(1)

testing urls...
0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:25


In [5]:
# Split the (rowid, status) pairs into parallel sequences. zip(*[]) yields
# nothing to unpack, so an empty result list is handled explicitly.
if statusrecords :
    rowids, results = zip( *statusrecords )
else :
    rowids, results = (), ()

# Assignment aligns on the row index; rows without a result stay empty.
papertable['status'] = pandas.Series( results, index=rowids )
papertable.head()

Unnamed: 0,doi,journal-title,link,pmid,subject,year,status
0,10.1186/1471-2105-7-431,BMC Bioinformatics,http://bisearch.enzim.hu,17022803,Software,2006,200.0
1,10.1186/1471-2105-7-289,BMC Bioinformatics,http://genesis.UGent.be/cell_motility,16762054,Software,2006,404.0
2,10.1186/1471-2105-7-409,BMC Bioinformatics,http://www.ub.es/softevol/variscan,16968531,Software,2006,200.0
3,10.1186/1471-2105-7-167,BMC Bioinformatics,,16553946,Research Article,2006,
4,10.1186/1471-2105-6-230,BMC Bioinformatics,,16171528,Research Article,2005,


In [6]:
# Show the distinct values that ended up in the 'status' column.
set(papertable['status'])

{nan, 200, 403, 404, 500, 'NXDOMAIN', 'TCPFAIL', 'TIMEOUT'}

Finally, we want to see if the dead links have snapshots in the Internet 
Archive's Wayback Machine. The archive.org URLs are easy to construct, so
we'll just decode the time stamp.

In [7]:
from requests import get
import json
import datetime

def wayback( url, timeout=10 ) :
    """Return the time of the Wayback Machine snapshot closest to now for `url`.

    Queries the archive.org availability API and parses the 'timestamp' of
    the 'closest' snapshot into a `datetime.datetime`.

    Parameters:
        url     -- the URL to look up
        timeout -- seconds to wait for archive.org before giving up

    Returns None when no snapshot is reported, when the API answers with a
    status other than 200, or when the API cannot be reached.
    """
    try :
        # Without a timeout a stalled connection would block the calling
        # worker thread indefinitely.
        response = get( 'http://archive.org/wayback/available',
                        params={'url' : url}, timeout=timeout )
    except ( ConnectionError, ConnectTimeout, ReadTimeout ) :
        return None

    if response.status_code != 200 :
        return None

    result = json.loads( response.content )
    snapshots = result.get( 'archived_snapshots' )
    if not snapshots :
        return None

    # Tolerate a snapshot listing that lacks a 'closest' entry.
    closest = snapshots.get( 'closest' )
    if not closest :
        return None
    return datetime.datetime.strptime( closest['timestamp'], '%Y%m%d%H%M%S' )

The API responds somewhat slowly, so we spawn 40 worker threads to query it.
We only ask for URLs that failed. The time stamps of URLs that have been
archived are added as a new column in the DataFrame.

In [9]:
from urlparse import urlparse
from threading import Thread
import httplib, sys
from Queue import Queue
from time import sleep
from random import random
from socket import getaddrinfo, gaierror
from requests import head, ConnectionError, ConnectTimeout, ReadTimeout

# Number of worker threads that query archive.org in parallel.
concurrent = 40

# (rowid, timestamp) pairs, appended by the workers as results come in.
timestamps = []

# Dead links: rows that have a link whose check did not end in HTTP 200.
# Both conditions are combined into one mask over the full table; applying
# the second mask to an already filtered frame makes pandas reindex it.
has_link = papertable['link'].notnull()
not_ok   = papertable['status'] != 200
deadlinks = papertable.loc[ has_link & not_ok, 'link' ]

progbar = ProgBar( sum(deadlinks.notnull()), monitor=True, title='checking archive.org...' )

def doWork():
    """Worker loop: take (rowid, url) items off the queue `q` and look each one up.

    Runs forever; it is meant to be the target of a daemon thread. Each item
    is passed to `getStatus` and the outcome handed to `doSomethingWithResult`.
    """
    # Random start-up delay so that the workers do not all begin at once.
    sleep( random() )
    # Note: the thread's daemon flag is set by whoever creates the thread.
    # Assigning `Thread.daemon = True` here would overwrite the `daemon`
    # property on the Thread class itself for the whole process.
    while True:
        sleep( random() * 0.01 )
        rowid, url = q.get()
        try:
            timestamp, url = getStatus(rowid, url)
            doSomethingWithResult(rowid, timestamp )
        except Exception as err :
            # Keep the worker alive; the row is simply left without a result.
            sys.stderr.write( 'worker failed on %r: %r\n' % ( url, err ) )
        finally:
            # Always acknowledge the item, otherwise q.join() never returns.
            q.task_done()

def getStatus(rowid, ourl):
    """Look `ourl` up via `wayback` and return `(result, ourl)`.

    `rowid` is accepted for symmetry with the caller and is not used here.
    """
    snapshot_time = wayback(ourl)
    return snapshot_time, ourl
    
def doSomethingWithResult(rowid, timestamp):
    """Store the lookup result for one row in `timestamps` and tick the progress bar."""
    entry = (rowid, timestamp)
    timestamps.append( entry )
    progbar.update()

# Bounded work queue, sized at twice the number of workers.
q = Queue(concurrent * 2)

# Start the pool of daemon worker threads.
for i in range(concurrent):
    worker = Thread(target=doWork)
    worker.daemon = True
    worker.start()

try:
    # Feed every dead (rowid, url) pair, then wait until the workers have
    # acknowledged all of them.
    pending = deadlinks.iteritems()
    for urlrecord in pending :
        q.put(urlrecord)
    q.join()
except KeyboardInterrupt:
    sys.exit(1)

checking archive.org...
0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:14


In [10]:
# Split the (rowid, timestamp) pairs into parallel sequences. When no link
# was dead there is nothing to unpack, so the empty case is handled explicitly.
if timestamps :
    rowids, results = zip( *timestamps )
else :
    rowids, results = (), ()

# Assignment aligns on the row index; rows that were not looked up stay empty.
papertable['archived'] = pandas.Series( results, index=rowids )
papertable

Unnamed: 0,doi,journal-title,link,pmid,subject,year,status,archived
0,10.1186/1471-2105-7-431,BMC Bioinformatics,http://bisearch.enzim.hu,17022803,Software,2006,200,
1,10.1186/1471-2105-7-289,BMC Bioinformatics,http://genesis.UGent.be/cell_motility,16762054,Software,2006,404,
2,10.1186/1471-2105-7-409,BMC Bioinformatics,http://www.ub.es/softevol/variscan,16968531,Software,2006,200,
3,10.1186/1471-2105-7-167,BMC Bioinformatics,,16553946,Research Article,2006,,
4,10.1186/1471-2105-6-230,BMC Bioinformatics,,16171528,Research Article,2005,,
5,10.1186/1471-2105-6-269,BMC Bioinformatics,http://microarray.cu-genome.org/ermineJ/,16280084,Software,2005,NXDOMAIN,2015-07-21 10:47:56
6,10.1186/1471-2105-6-173,BMC Bioinformatics,,16011807,Methodology Article,2005,,
7,10.1186/1471-2105-7-291,BMC Bioinformatics,,16762065,Research Article,2006,,
8,10.1186/1471-2105-7-295,BMC Bioinformatics,,16768797,Research Article,2006,,
9,10.1186/1471-2105-7-325,BMC Bioinformatics,http://www.uni-rostock.de/aidb,16803617,Database,2006,200,
