In [1]:
import requests
import xml.etree.ElementTree as ElementTree
import re
import pandas as pd

In [2]:
# Query the McMaster SFX link resolver for one known ISSN and ask for the
# simplified XML response. The timeout stops the cell from hanging forever
# if the server never answers.
r = requests.get('http://sfx.scholarsportal.info/mcmaster?&sfx.response_type=simplexml&sfx.ignore_date_threshold=1&issn=03801330', timeout=30)

In [3]:
# Display the response object to check the HTTP status of the request.
r

<Response [200]>

In [4]:
# Parse the raw XML bytes of the response; `root` is the top-level element.
root = ElementTree.fromstring(r.content)

In [5]:
# Print the tag and attributes of each top-level section of the response
# to see how the XML is organised.
for section in root:
    print(section.tag, section.attrib)

ctx_obj_set {}
targets {}


In [6]:
# Collect every <target> inside the <targets> section. Selecting by element
# name (instead of the positional root[1]) keeps working if the order of the
# top-level sections changes, and yields an empty list when there is no
# <targets> section at all.
targets = root.findall('targets/target')

In [7]:
# Display the list of <target> elements that were found.
targets

[<Element 'target' at 0x11e505d18>,
 <Element 'target' at 0x11e50e1d8>,
 <Element 'target' at 0x11e50e688>,
 <Element 'target' at 0x11e50eb38>,
 <Element 'target' at 0x11e511138>,
 <Element 'target' at 0x11e511638>,
 <Element 'target' at 0x11e511bd8>,
 <Element 'target' at 0x11e513138>,
 <Element 'target' at 0x11e513638>,
 <Element 'target' at 0x11e513ae8>,
 <Element 'target' at 0x11e513f48>,
 <Element 'target' at 0x11e5184a8>]

In [8]:
# Look up the <target_name> child element of the first target.
targets[0].find('target_name')

<Element 'target_name' at 0x11e505d68>

In [9]:
# The element's text is the name of the service.
targets[0].find('target_name').text

'ULRICHSWEB_COM'

In [10]:
# Link resolver targets that are NOT actual full-text services.
irrelevant = ['ULRICHSWEB_COM', 'OSP_MCMASTER_LCL', 'LOCAL_CATALOGUE_INNOVATIVE_INNOPAC', 'LOCAL_FEEDBACK', 'CAPTURE_CITATION']
services = []

# Keep the name of every remaining target. findtext() returns None when a
# target has no <target_name> child, so such targets are skipped instead of
# raising AttributeError.
for target in targets:
    name = target.findtext('target_name')
    if name and name not in irrelevant:
        services.append(name)
    

In [11]:
# Display the target names that survived the filter.
services

['BIOONE_COMPLETE',
 'CRKN_ELSEVIER_SCIENCEDIRECT',
 'ELSEVIER_SCIENCE_DIRECT_AUTOLOAD_JOURNALS',
 'ELSEVIER_SD_BACKFILE_EARTH_SUPPLEMENT',
 'ELSEVIER_SD_BACKFILE_ENVIRONMENTAL_SUPPLEMENT',
 'ELSEVIER_SD_ELSEVIER']

In [12]:
# Load the journal list. The ISSN columns are read explicitly as strings so
# that leading zeros (e.g. 03801330) and the trailing 'X' check digit are
# never altered by type inference.
df = pd.read_csv('data/backup-journals.csv', dtype={'issn': str, 'eIssn': str, 'q_issn': str})

In [13]:
# Preview the first rows only instead of rendering the whole frame.
df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,publicationName,source_id,issn,eIssn,aggregationType,count,q_issn
0,0,0,Journal of Great Lakes Research,17510,03801330,,Journal,63.0,03801330
1,1,1,Canadian Journal of Fisheries and Aquatic Scie...,12016,0706652X,12057533,Journal,60.0,12057533
2,2,2,Environmental Toxicology and Chemistry,25094,07307268,15528618,Journal,38.0,15528618
3,3,3,Animal Behaviour,24580,00033472,,Journal,31.0,00033472
4,4,4,Biological Invasions,13257,13873547,15731464,Journal,29.0,15731464
5,5,5,Environmental Science and Technology,21537,0013936X,15205851,Journal,26.0,15205851
6,6,6,Molecular Ecology,20297,09621083,1365294X,Journal,24.0,1365294X
7,7,7,Journal of Fish Biology,22560,00221112,10958649,Journal,24.0,10958649
8,8,8,Hydrobiologia,15168,00188158,15735117,Journal,23.0,15735117
9,9,9,PLoS ONE,10600153309,,19326203,Journal,22.0,19326203


In [14]:
def mac_searchOpenURL(row):
    '''(pandas.Series) --> pandas.Series

    Specifically for the McMaster University link resolver.

    Look up one journal in the SFX OpenURL link resolver and report whether
    any full-text service offers it. The function reads the ISSN from
    row['q_issn'], sends an HTTP query to the link resolver, parses the XML
    response and collects the names of all targets that are not in the list
    of irrelevant (non-full-text) services.

    It is meant to be used with DataFrame.apply(..., axis=1), which calls it
    once per row.

    Parameters:
        row: one row of the journals DataFrame (anything that supports
            row['q_issn'] works). A missing ISSN (NaN/None) is reported as
            'No full-text available' without contacting the server.

    Returns:
        A pandas Series with two values:
        0 - the availability statement, either 'Full text available' or
            'No full-text available';
        1 - the list of full-text service (target) names, possibly empty.

    Raises:
        requests.exceptions.RequestException: if the request times out,
            fails, or the server answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.

    Side effect: prints one status line per call.
    '''
    # A list of link resolver services that AREN'T actual full-text services.
    irrelevant = ('ULRICHSWEB_COM', 'OSP_MCMASTER_LCL', 'LOCAL_CATALOGUE_INNOVATIVE_INNOPAC', 'LOCAL_FEEDBACK', 'CAPTURE_CITATION')

    issn = row['q_issn']

    # Names of the full-text services found for this journal
    svcs = []

    # Without an ISSN there is nothing to look up, so skip the request.
    if not pd.isnull(issn):
        # Create and run an HTTP request against the open URL link resolver.
        # The timeout prevents one unresponsive request from stalling a whole
        # DataFrame.apply run.
        r = requests.get('http://sfx.scholarsportal.info/mcmaster?&sfx.response_type=simplexml&sfx.ignore_date_threshold=1&issn={}'.format(issn), timeout=30)
        # Fail with a clear HTTP error instead of an obscure XML parse error
        # when the server returns an error page.
        r.raise_for_status()
        # Parse the XML response and store it as root
        root = ElementTree.fromstring(r.content)

        # for every target service in the XML data (selected by element name,
        # not by position, so the order of the sections does not matter)...
        for target in root.findall('targets/target'):
            # findtext() gives None when <target_name> is missing
            name = target.findtext('target_name')
            # if the target service is not one of the irrelevant services...
            if name and name not in irrelevant:
                svcs.append(name)

    # format() is used for the messages so a non-string ISSN cannot raise
    # a TypeError.
    if svcs:
        availability_statement = 'Full text available'
        print('Full-text available for {}'.format(issn))
    else:
        availability_statement = 'No full-text available'
        print('No full-text available for {}'.format(issn))
    return pd.Series([availability_statement, svcs])
    

In [15]:
# Query the link resolver for the first MAX_LOOKUPS journals only (one HTTP
# request per row) and store the two returned values as new columns.
MAX_LOOKUPS = 100
lookup_results = df.iloc[:MAX_LOOKUPS].apply(mac_searchOpenURL, axis=1)
df[['availability', 'services']] = lookup_results

Full-text available for 03801330
Full-text available for 03801330
Full-text available for 12057533
Full-text available for 15528618
Full-text available for 00033472
Full-text available for 15731464
Full-text available for 15205851
Full-text available for 1365294X
Full-text available for 10958649
Full-text available for 15735117
Full-text available for 19326203
Full-text available for 20457758
Full-text available for 13652427
Full-text available for 14724642
Full-text available for 14209101
Full-text available for 1600048X
Full-text available for 18791298
Full-text available for 14657279
Full-text available for 14712954
Full-text available for 18791026
Full-text available for 14320800
Full-text available for 13652540
Full-text available for 14390310
Full-text available for 03603199
Full-text available for 19395582
Full-text available for 00298549
Full-text available for 01718630
Full-text available for 19395590
Full-text available for 15729737
Full-text available for 00084301
Full-text 

In [16]:
# Preview the first rows, now including the availability and services columns.
df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,publicationName,source_id,issn,eIssn,aggregationType,count,q_issn,availability,services
0,0,0,Journal of Great Lakes Research,17510,03801330,,Journal,63.0,03801330,Full text available,"[BIOONE_COMPLETE, CRKN_ELSEVIER_SCIENCEDIRECT,..."
1,1,1,Canadian Journal of Fisheries and Aquatic Scie...,12016,0706652X,12057533,Journal,60.0,12057533,Full text available,[CANADIAN_BUSINESS_AND_CURRENT_AFFAIRS_DATABAS...
2,2,2,Environmental Toxicology and Chemistry,25094,07307268,15528618,Journal,38.0,15528618,Full text available,[CRKN_WILEY_ONLINE_LIBRARY]
3,3,3,Animal Behaviour,24580,00033472,,Journal,31.0,00033472,Full text available,"[CRKN_ELSEVIER_SCIENCEDIRECT, ELSEVIER_SCIENCE..."
4,4,4,Biological Invasions,13257,13873547,15731464,Journal,29.0,15731464,Full text available,[CRKN_SPRINGER_LINK_CURRENT]
5,5,5,Environmental Science and Technology,21537,0013936X,15205851,Journal,26.0,15205851,Full text available,"[AMERICAN_CHEMICAL_SOCIETY_JOURNALS, AMERICAN_..."
6,6,6,Molecular Ecology,20297,09621083,1365294X,Journal,24.0,1365294X,Full text available,"[CRKN_WILEY_BLACKWELL_BACKFILES, CRKN_WILEY_ON..."
7,7,7,Journal of Fish Biology,22560,00221112,10958649,Journal,24.0,10958649,Full text available,"[CRKN_WILEY_BLACKWELL_BACKFILES, CRKN_WILEY_ON..."
8,8,8,Hydrobiologia,15168,00188158,15735117,Journal,23.0,15735117,Full text available,"[CRKN_SPRINGER_LINK_CURRENT, CRKN_SPRINGER_LIN..."
9,9,9,PLoS ONE,10600153309,,19326203,Journal,22.0,19326203,Full text available,"[DOAJ_DIRECTORY_OPEN_ACCESS_JOURNALS_FREE, GAL..."


In [18]:
# Show the complete service list of the second journal (the table view
# truncates it).
df['services'].iloc[1]

['CANADIAN_BUSINESS_AND_CURRENT_AFFAIRS_DATABASE',
 'CANADIAN_SCIENCE_PUBLISHING_NRC_RESEARCH_PRESS_CURRENT_JOURNALS',
 'GALEGROUP_ACADEMIC_ONEFILE',
 'GALEGROUP_IT_CPI_Q']