# Checking the open access (OA) status of research papers using Unpaywall

In [1]:
#import modules
from urllib.parse import quote

import pandas as pd
import requests

## Define a list of DOIs to check

In [2]:
# DOIs whose open-access status we want to look up.
dois = [
    "10.1111/1746-692x.12291",
    "10.1080/07075332.2021.1976810",
    "10.1016/j.gfs.2019.02.002",
    "10.1108/ijlm-07-2020-0295",
    "10.1038/s43016-021-00346-1",
    "10.1007/s10845-015-1171-0",
    "10.3310/hsdr07140",
    "10.1136/bmjopen-2018-027934",
    "10.3389/fpsyg.2020.00605",
    "10.1108/josm-11-2019-0353",
    "10.1016/j.tsc.2020.100758",
    "10.1111/gwao.12675",
    "10.1111/ecin.12953",
    "10.1016/j.jcorpfin.2020.101718",
    "10.1016/j.euroecorev.2021.103764",
    "10.1007/s00199-021-01388-z",
    # Deliberately 'bad' DOI (a 'good' one missing from Unpaywall's DB would
    # behave similarly), included to see what happens on a failed lookup.
    "bad_doi",
]

## Unpaywall API base details

In [3]:
# The Unpaywall API expects an email address to be sent with every request.
my_email = "shane.jackson@gmx.co.uk"

# Root URL of the Unpaywall API (v2), which this exercise queries.
base_url = "https://api.unpaywall.org/v2/"


## Create the URLs to request

In [4]:
# Each request has the form "<base_url><doi>?email=<my_email>", so we build one
# URL per DOI. The DOI is percent-encoded (keeping "/" literal) because a DOI may
# contain characters such as "#", "?" or "<" that would otherwise corrupt the URL.
# For plain DOIs like the ones listed here the encoding changes nothing.
check_urls = [f"{base_url}{quote(d, safe='/')}?email={my_email}" for d in dois]

# check the outputs, one URL per line
print(*check_urls, sep="\n")

https://api.unpaywall.org/v2/10.1111/1746-692x.12291?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1080/07075332.2021.1976810?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1016/j.gfs.2019.02.002?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1108/ijlm-07-2020-0295?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1038/s43016-021-00346-1?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1007/s10845-015-1171-0?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.3310/hsdr07140?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1136/bmjopen-2018-027934?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.3389/fpsyg.2020.00605?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1108/josm-11-2019-0353?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1016/j.tsc.2020.100758?email=shane.jackson@gmx.co.uk
https://api.unpaywall.org/v2/10.1111/gwao.12675?email=shane.jackson@g

## Now run an example request

In [5]:
# Try a single request first, using the first URL in the list.
# The timeout (in seconds) stops the cell hanging indefinitely if the API
# never answers; requests applies no timeout by default.
test_response = requests.get(check_urls[0], timeout=30)

# Fail loudly on an HTTP error rather than parsing an error payload as a result.
test_response.raise_for_status()

# get the returned json (this API returns results in json format)
test_json = test_response.json()

# json_normalize flattens the nested result into a single-row dataframe
pd.json_normalize(test_json)

Unnamed: 0,doi,doi_url,title,genre,is_paratext,published_date,year,journal_name,journal_issns,journal_issn_l,...,first_oa_location.url_for_landing_page,first_oa_location.evidence,first_oa_location.license,first_oa_location.version,first_oa_location.host_type,first_oa_location.is_best,first_oa_location.pmh_id,first_oa_location.endpoint_id,first_oa_location.repository_institution,first_oa_location.oa_date
0,10.1111/1746-692x.12291,https://doi.org/10.1111/1746-692x.12291,An Odd Crisis: Covid‐19 and UK Food Prices,journal-article,False,2020-12-01,2020,EuroChoices,"1478-0917,1746-692X",1478-0917,...,https://doi.org/10.1111/1746-692x.12291,open (via crossref license),cc-by,publishedVersion,publisher,True,,,,2021-02-07


## Good result, but we only want certain fields, so let's list the available field names

In [6]:
# Flatten the example record and list every available field name, one per line.
test_df = pd.json_normalize(test_json)
field_names = test_df.columns.tolist()
print("\n".join(field_names))

doi
doi_url
title
genre
is_paratext
published_date
year
journal_name
journal_issns
journal_issn_l
journal_is_oa
journal_is_in_doaj
publisher
is_oa
oa_status
has_repository_copy
oa_locations
oa_locations_embargoed
updated
data_standard
z_authors
best_oa_location.updated
best_oa_location.url
best_oa_location.url_for_pdf
best_oa_location.url_for_landing_page
best_oa_location.evidence
best_oa_location.license
best_oa_location.version
best_oa_location.host_type
best_oa_location.is_best
best_oa_location.pmh_id
best_oa_location.endpoint_id
best_oa_location.repository_institution
best_oa_location.oa_date
first_oa_location.updated
first_oa_location.url
first_oa_location.url_for_pdf
first_oa_location.url_for_landing_page
first_oa_location.evidence
first_oa_location.license
first_oa_location.version
first_oa_location.host_type
first_oa_location.is_best
first_oa_location.pmh_id
first_oa_location.endpoint_id
first_oa_location.repository_institution
first_oa_location.oa_date


In [7]:
# Fields to keep in the final table; everything else gets dropped.
keep_fields = [
    "doi",
    "title",
    "year",
    "journal_name",
    "journal_issns",
    "is_oa",
    "oa_status",
    "oa_locations",
]

## Now create a DataFrame with all the results and only the relevant columns

In [8]:
# Fetch every URL. A single session reuses the connection to the API host, and
# the timeout (in seconds) stops one stalled request from hanging the whole cell.
with requests.Session() as session:
    responses = [session.get(u, timeout=30) for u in check_urls]

# get the json
oa_list = [r.json() for r in responses]

# Flatten into a dataframe and keep only the fields of interest. reindex is used
# instead of plain column selection so that a field missing from every response
# becomes a NaN column rather than raising a KeyError.
oa_df = pd.json_normalize(oa_list).reindex(columns=keep_fields)

oa_df  # note you get a row of NaNs for the 'bad' request

Unnamed: 0,doi,title,year,journal_name,journal_issns,is_oa,oa_status,oa_locations
0,10.1111/1746-692x.12291,An Odd Crisis: Covid‐19 and UK Food Prices,2020.0,EuroChoices,"1478-0917,1746-692X",True,hybrid,"[{'updated': '2022-05-03T09:42:46.676704', 'ur..."
1,10.1080/07075332.2021.1976810,"The “Big Survey”: Decolonisation, Development ...",2021.0,The International History Review,"0707-5332,1949-6540",True,hybrid,"[{'updated': '2022-01-22T18:30:32.210491', 'ur..."
2,10.1016/j.gfs.2019.02.002,Are Distributed Ledger Technologies the panace...,2019.0,Global Food Security,2211-9124,True,hybrid,"[{'updated': '2022-05-03T09:42:47.495975', 'ur..."
3,10.1108/ijlm-07-2020-0295,An in-depth case study of a modular service de...,2021.0,The International Journal of Logistics Management,0957-4093,True,green,"[{'updated': '2022-01-31T09:40:12.352511', 'ur..."
4,10.1038/s43016-021-00346-1,A trust framework for digital food systems,2021.0,Nature Food,2662-1355,True,green,"[{'updated': '2022-02-07T08:41:17.978876', 'ur..."
5,10.1007/s10845-015-1171-0,Modelling and simulation of operation and main...,2015.0,Journal of Intelligent Manufacturing,"0956-5515,1572-8145",True,green,"[{'updated': '2022-03-03T22:45:03.815151', 'ur..."
6,10.3310/hsdr07140,Policies and strategies to retain and support ...,2019.0,Health Services and Delivery Research,"2050-4349,2050-4357",True,gold,"[{'updated': '2022-05-03T09:42:49.403431', 'ur..."
7,10.1136/bmjopen-2018-027934,Workforce predictive risk modelling: developme...,2020.0,BMJ Open,"2044-6055,2044-6055",True,gold,"[{'updated': '2022-05-03T09:42:49.832673', 'ur..."
8,10.3389/fpsyg.2020.00605,A Framework for the Testing and Validation of ...,2020.0,Frontiers in Psychology,1664-1078,True,gold,"[{'updated': '2022-05-03T09:42:50.265313', 'ur..."
9,10.1108/josm-11-2019-0353,Development and validation of a measurement sc...,2020.0,Journal of Service Management,1757-5818,False,closed,[]
