In [30]:
%%writefile doi.py
#!/usr/bin/env python
'''
Search doi by title and fist author surname
    based on https://github.com/torfbolt/DOI-finder
    See: http://www.crossref.org/guestquery/#textsearch
'''
def searchdoi(title='a model of  leptons', surname='Weinberg'):
    import re
    import urllib
    from bs4 import BeautifulSoup
    import httplib
    
    """
    based on https://github.com/torfbolt/DOI-finder
    See: http://www.crossref.org/guestquery/
    
    Search for the metadata of given a title; e.g.  "A model of  leptons" 
   (case insensitive), and the Surname (only) for the first author, 
    e.g. Weinberg 
                      
    returns a dictionary with the keys:

       ['Article Title','Author','ISSN','Volume','Persistent Link','Year',
        'Issue','Page','Journal Title'],

       where 'Author' is really the surname of the first author
    """
    title = re.sub(r"\$.*?\$","",title) # better remove all math expressions
    title = re.sub(r"[^a-zA-Z0-9 ]", " ", title) #remove non standard characters
    surname = re.sub(r"[{}'\\]","", surname) #remove non standard characters
    params = urllib.urlencode({"titlesearch":"titlesearch", "auth2" : surname, "atitle2" : title, "multi_hit" : "on", "article_title_search" : "Search", "queryType" : "author-title"})
    headers = {"User-Agent": "Mozilla/5.0" , "Accept": "text/html", "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.crossref.org"}
    conn = httplib.HTTPConnection("www.crossref.org:80")
    conn.request("POST", "/guestquery/", params, headers)
    response = conn.getresponse()
    # print response.status, response.reason
    data = response.read()
    conn.close()
    result = re.findall(r"\<table cellspacing=1 cellpadding=1 width=600 border=0\>.*?\<\/table\>" ,data, re.DOTALL)
    if (len(result) > 0):
        html=urllib.unquote_plus(result[0])
        #doi=re.sub('.*dx.doi.org\/(.*)<\/a>.*','\\1',doitmp)
        if re.search('No DOI found',html):
            html='<table><tr><td>No DOI found<td></tr></table>'
    else:
        doi={}
        #return {}         

    soup = BeautifulSoup(html)
    table = soup.find("table")

    dataset = []
    for row in table.find_all("tr"):
        for tdi in row.find_all("td"):
            dataset.append(tdi.get_text())
            
    if len(dataset)==20:
        headings=dataset[:9]
        datasets=dataset[10:-1]
        doi=dict(zip(headings,datasets))
        
    else:
        doi={}
        
    if doi:
        if doi.has_key('ISSN') and doi.has_key('Persistent Link'):
            doi['ISSN']=re.sub('([a-zA-Z0-9]{4})([a-zA-Z0-9]{4})','\\1-\\2',doi['ISSN'])
            doi[u'DOI']=doi['Persistent Link']
            
    return doi

if __name__ == "__main__":
    import sys
    title='';first_author_surname=''
    if sys.argv[1]:
        title=sys.argv[1]
    if sys.argv[2]:
        first_author_surname=sys.argv[2]
        
    d=searchdoi(title,first_author_surname)
    ref='';sep=','
    for k in ['Author','Article Title','Journal Title','Volume','Issue','Page','Year']:
        if d.has_key(k):
            if k=='Volume':
                d[k]='<strong>%s</strong>' %d[k]
            if k=='Year':
                sep=''
            ref=ref+d[k]+sep

    if d.has_key('DOI'):
        print '''
            <br/>DOI: <a href="%s">%s</a><br/>
            Ref: %s<br/>
            <br>
            CODE at <a href="https://github.com/restrepo/webpy">GitHub</a>: doi.py<br/><br/>
        ''' %(d['DOI'],d['DOI'],ref)
        print 'Official search at <a href="http://www.crossref.org/guestquery/#textsearch">crossref</a>'
    else:
        print '<br/>DOI lookup failed: try in: <a href="http://www.crossref.org/guestquery/#textsearch">crossref</a>'
    
        
    
    

Overwriting doi.py


In [27]:
run doi.py "Formation, habitability, and detection of extrasolar moons" "Heller"


            DOI: http://dx.doi.org/10.1089/ast.2014.1147<br/>
            Ref: Heller,Formation, Habitability, and Detection of Extrasolar Moons,Astrobiology,<strong>14</strong>,9,798,2014<br/>
            <br>
            CODE at https://github.com/restrepo/webpy: doi.py
        


In [20]:
d=searchdoi("Formation, habitability, and detection of extrasolar moons","Heller")
d

{u'Article Title': u'Formation, Habitability, and Detection of Extrasolar Moons',
 u'Author': u'Heller',
 u'DOI': u'http://dx.doi.org/10.1089/ast.2014.1147',
 u'ISSN': u'1531-1074',
 u'Issue': u'9',
 u'Journal Title': u'Astrobiology',
 u'Page': u'798',
 u'Persistent Link': u'http://dx.doi.org/10.1089/ast.2014.1147',
 u'Volume': u'14',
 u'Year': u'2014'}

In [23]:
if 1==1:
    ref='';sep=','
    for k in ['Author','Article Title','Journal Title','Volume','Issue','Page','Year']:
        if d.has_key(k):
            if k=='Volume':
                d[k]='<strong>%s</strong>' %d[k]
            if k=='Year':
                sep=''
            ref=ref+d[k]+sep

    print '''
        DOI: %s<br/>
        Ref: %s<br/>
        <br>
        CODE at https://github.com/restrepo/webpy: doi.py
    ''' %(d['DOI'],ref)


        DOI: http://dx.doi.org/10.1089/ast.2014.1147<br/>
        Ref: Heller,Formation, Habitability, and Detection of Extrasolar Moons,Astrobiology,<strong>14</strong>,9,798,2014<br/>
        <br>
        CODE at https://github.com/restrepo/webpy: doi.py
    


In [None]:
 conn = httplib.HTTPConnection("www.crossref.org:80")
    conn.request("POST", "/guestquery/", params, headers)

In [77]:
import  requests

title="Formation, habitability, and detection of extrasolar moons"; surname="Heller"
headers = {"User-Agent": "Mozilla/5.0" , "Accept": "text/html",\
               "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.crossref.org"}
payload = dict(titlesearch="titlesearch", auth2 = surname, atitle = title, multi_hit = "on",\
                   article_title_search = "Search", queryType = "author-title")
r = requests.post('http://www.crossref.org/guestquery/', data=payload,headers=headers)
r.text.find('http://dx.doi.org/10.1089/ast.2014.1147')

-1

In [76]:
import urllib
import httplib
title="Formation, habitability, and detection of extrasolar moons"; surname="Heller"
params = urllib.urlencode({"titlesearch":"titlesearch", "auth2" : surname, "atitle2" : title, "multi_hit" : "on", "article_title_search" : "Search", "queryType" : "author-title"})
headers = {"User-Agent": "Mozilla/5.0" , "Accept": "text/html", "Content-Type" : "application/x-www-form-urlencoded", "Host" : "www.crossref.org"}
conn = httplib.HTTPConnection("www.crossref.org:80")
conn.request("POST", "/guestquery/", params, headers)
response = conn.getresponse()
r = response.read()
r.find('http://dx.doi.org/10.1089/ast.2014.1147')

25620

In [78]:
import mechanize
browser = mechanize.Browser()
browser.set_handle_robots(False)
browser.addheaders = [('User-agent', 'Firefox')] 
browser.open("http://www.crossref.org/guestquery/")
assert browser.viewing_html()
browser.select_form(name="form2")
# use only surname of first author
browser["auth2"] =  "Heller" #re.sub(r'[A-Z] ', ' ',
#re.sub(r'[^a-zA-Z0-9 ]+', ' ', author).split("and")[0])
browser["atitle2"] = "Formation, habitability, and detection of extrasolar moons" #re.sub(r'[^a-zA-Z0-9 ]+', ' ', title)
response = browser.submit()
sourcecode = response.get_data()

In [80]:
sourcecode.find('http://dx.doi.org/10.1089/ast.2014.1147')

25627

In [90]:
import mechanize
browser = mechanize.Browser()
browser.set_handle_robots(False)
browser.addheaders = [('User-agent', 'Firefox')] 
browser.open("http://gfif.udea.edu.co/python/doi.php")
assert browser.viewing_html()
browser.select_form(name="form")
# use only surname of first author
browser["title"] = "Formation, habitability, and detection of extrasolar moons" #re.sub(r'[^a-zA-Z0-9 ]+', ' ', title)
browser["surname"] =  "Heller" #re.sub(r'[A-Z] ', ' ',
#re.sub(r'[^a-zA-Z0-9 ]+', ' ', author).split("and")[0])
response = browser.submit()
sourcecode = response.get_data()

FormNotFoundError: no form matching name 'form'

In [97]:
import requests

URL = 'http://gfif.udea.edu.co/python/doi.php'
payload = {
    'surname':"Heller",
    'title': "Formation, habitability, and detection of extrasolar moons",
}

session = requests.session()
r = requests.post(URL, data=payload)

In [107]:
r.url

u'http://gfif.udea.edu.co/python/doi.php'