From a564a1bc1d48c3e7ae654448ba73bf075ebd7828 Mon Sep 17 00:00:00 2001 From: Scott Chamberlain Date: Mon, 20 Nov 2017 14:38:44 -0800 Subject: [PATCH] added a bunch of new ones, tests probably wont work, need to fix those --- ignore/ssrn.rb | 54 ++++++++++++++++++++++++++++++ src/american_physical_society.json | 22 ++++++++++++ src/cogent.json | 1 + src/elsevier.json | 47 ++++++++++++++++++++++++++ src/emerald.json | 24 +++++++++++++ src/f1000.json | 31 +++++++++++++++++ src/karger.json | 22 ++++++++++++ src/pleiades.json | 23 +++++++++++++ src/royal_society_chemistry.json | 22 ++++++++++++ src/ssrn.json | 24 +++++++++++++ src/transtech.json | 22 ++++++++++++ 11 files changed, 292 insertions(+) create mode 100644 ignore/ssrn.rb create mode 100644 src/american_physical_society.json create mode 100644 src/elsevier.json create mode 100644 src/emerald.json create mode 100644 src/f1000.json create mode 100644 src/karger.json create mode 100644 src/pleiades.json create mode 100644 src/royal_society_chemistry.json create mode 100644 src/ssrn.json create mode 100644 src/transtech.json diff --git a/ignore/ssrn.rb b/ignore/ssrn.rb new file mode 100644 index 0000000..6e38bc1 --- /dev/null +++ b/ignore/ssrn.rb @@ -0,0 +1,54 @@ +require "test/unit" +require "multi_json" +require "faraday" +require "faraday_middleware" +require "faraday-cookie_jar" + +class TestSSRN < Test::Unit::TestCase + + def setup + @doi = "10.2139/ssrn.460001" + @ssrn = MultiJson.load(File.open('src/ssrn.json')) + end + + def test_ssrn_keys + assert_equal( + @ssrn.keys().sort(), + ["components", "cookies","crossref_member", "journals", + "open_access", "prefixes", "publisher", + "publisher_member", "publisher_parent", "regex", "urls"] + ) + assert_not_nil(@ssrn['urls']) + assert_nil(@ssrn['journals']) + end + + def test_ssrn_pdf + # scrape to get PDF URL first + conn = Faraday.new(:url => "https://doi.org/" + @doi) do |f| + f.use :cookie_jar + f.use Faraday::Response::Logger, Logger.new('faraday.log') + f.adapter Faraday.default_adapter + f.use FaradayMiddleware::FollowRedirects, limit: 3 + end + + res = conn.get + res.body + + # then get pdf + conn = Faraday.new(:url => @ssrn['urls']['pdf'] % @doi.match(@ssrn['components']['doi']['regex']).to_s) do |f| + f.use :cookie_jar + f.adapter Faraday.default_adapter + end + + res = conn.get do |f| + f.use :cookie_jar + f.adapter Faraday.default_adapter + end + assert_equal(Faraday::Response, res.class) + assert_equal(String, res.body.class) + end + +end + +# curl -c ssrncookies.txt 'http://ssrnoa.tandfonline.com/doi/pdf/10.1080/23312041.2015.1085296' +# curl -b ssrncookies.txt 'http://ssrnoa.tandfonline.com/doi/pdf/10.1080/23312041.2015.1085296' diff --git a/src/american_physical_society.json b/src/american_physical_society.json new file mode 100644 index 0000000..e0a588e --- /dev/null +++ b/src/american_physical_society.json @@ -0,0 +1,22 @@ +{ + "publisher": "american_physical_society", + "publisher_parent": null, + "crossref_member": 16, + "prefixes": [ + "10.1103" + ], + "urls": { + "pdf": "http://harvest.aps.org/v2/journals/articles/%s/fulltext" + }, + "components": { + "html": null, + "doi": { + "regex": "[0-9]{5}$" + } + }, + "cookies": false, + "regex": null, + "open_access": false, + "journals": null, + "notes": null +} diff --git a/src/cogent.json b/src/cogent.json index 4321961..3c254ad 100644 --- a/src/cogent.json +++ b/src/cogent.json @@ -1,6 +1,7 @@ { "publisher": "cogent", "publisher_parent": "informa", + "publisher_member": 301, "crossref_member": null, "prefixes": ["10.1080"], "urls": { diff --git a/src/elsevier.json b/src/elsevier.json new file mode 100644 index 0000000..492ac45 --- /dev/null +++ b/src/elsevier.json @@ -0,0 +1,47 @@ +{ + "publisher": "elsevier", + "publisher_parent": null, + "crossref_member": 78, + "prefixes": [ + "10.7424", + "10.14219", + "10.7811", + "10.1580", + "10.1533", + "10.1529", + "10.3816", + "10.1602", + "10.3921", + "10.1240", + "10.1205", + "10.4065", + "10.1197", + "10.1157", + "10.1383", + "10.1367", + "10.2353", + "10.2111", + "10.2139", + "10.1006", + "10.1016", + "10.1054", + "10.1053", + "10.1067", + "10.1078", + "10.3182" + ], + "urls": { + "xml": "http://api.elsevier.com/content/article/PII:%s?httpAccept=text/xml", + "plain": "http://api.elsevier.com/content/article/PII:%s?httpAccept=text/plain" + }, + "components": { + "html": null, + "doi": null, + "id": "get `alternative-id` from Crossref API" + }, + "cookies": false, + "regex": null, + "open_access": false, + "journals": null, + "notes": "need a different internal ID - get `alternative-id` from Crossref API" +} diff --git a/src/emerald.json b/src/emerald.json new file mode 100644 index 0000000..d079bec --- /dev/null +++ b/src/emerald.json @@ -0,0 +1,24 @@ +{ + "publisher": "emerald", + "publisher_parent": null, + "crossref_member": 140, + "prefixes": [ + "10.1108", + "10.5042" + ], + "urls": { + "html": "http://www.emeraldinsight.com/doi/full/%s", + "pdf": "http://www.emeraldinsight.com/doi/pdfplus/%s" + }, + "components": { + "html": null, + "doi": { + "regex": ".+" + } + }, + "cookies": false, + "regex": null, + "open_access": false, + "journals": null, + "notes": "Crossref link gives URL for html, but in pubpatternsapi just construct by hand" +} diff --git a/src/f1000.json b/src/f1000.json new file mode 100644 index 0000000..2bb8dc3 --- /dev/null +++ b/src/f1000.json @@ -0,0 +1,31 @@ +{ + "publisher": "f1000", + "publisher_parent": null, + "crossref_member": 4950, + "prefixes": ["10.12688"], + "urls": null, + "components": null, + "cookies": false, + "regex": null, + "open_access": true, + "journals": [ + { + "journal": "f1000", + "open_access": true, + "issn": "2050-084X", + "urls": { + "pdf": "https://f1000research.com/articles/6-221/v2/pdf", + "xml": "https://f1000research.com/articles/6-221/v2/xml" + }, + "components": { + "html": null, + "doi": { + "regex": "[0-9]{5}$" + } + } + } + ] +} + +// e.g. +// 10.12688/f1000research.10554.2 diff --git a/src/karger.json b/src/karger.json new file mode 100644 index 0000000..44685db --- /dev/null +++ b/src/karger.json @@ -0,0 +1,22 @@ +{ + "publisher": "karger", + "publisher_parent": null, + "crossref_member": 127, + "prefixes": [ + "10.1159" + ], + "urls": { + "pdf": "https://www.karger.com/Article/Pdf/%s" + }, + "components": { + "html": null, + "doi": { + "regex": "[0-9]{5}$" + } + }, + "cookies": false, + "regex": null, + "open_access": true, + "journals": null, + "notes": null +} diff --git a/src/pleiades.json b/src/pleiades.json new file mode 100644 index 0000000..4f5f93e --- /dev/null +++ b/src/pleiades.json @@ -0,0 +1,23 @@ +{ + "publisher": "pleiades", + "publisher_parent": null, + "crossref_member": 137, + "prefixes": [ + "10.1108", + "10.5042" + ], + "urls": { + "pdf": "https://link.springer.com/content/pdf/%s" + }, + "components": { + "html": null, + "doi": { + "regex": ".+" + } + }, + "cookies": false, + "regex": null, + "open_access": false, + "journals": null, + "notes": "follow redirects so that using a single URL pattern will work for new and old URLs" +} diff --git a/src/royal_society_chemistry.json b/src/royal_society_chemistry.json new file mode 100644 index 0000000..b8679fc --- /dev/null +++ b/src/royal_society_chemistry.json @@ -0,0 +1,22 @@ +{ + "publisher": "royal_society_chemistry", + "publisher_parent": null, + "crossref_member": 292, + "prefixes": [ + "10.1039" + ], + "urls": { + "pdf": "http://pubs.rsc.org/en/content/articlepdf/%s/JA/%s" + }, + "components": { + "html": null, + "doi": { + "regex": "[0-9]{5}$" + } + }, + "cookies": false, + "regex": null, + "open_access": false, + "journals": null, + "notes": null +} diff --git a/src/ssrn.json b/src/ssrn.json new file mode 100644 index 0000000..aa595b3 --- /dev/null +++ b/src/ssrn.json @@ -0,0 +1,24 @@ +{ + "publisher": "ssrn", + "publisher_parent": "elsevier", + "publisher_member": 78, + "crossref_member": null, + "prefixes": ["10.2139"], + "urls": { + "pdf": "https://papers.ssrn.com/sol3/%s" + }, + "components": { + "html": null, + "pdf" : { + "selector": "//a[@id=\"downloadPdf\"]", + "attribute": "href" + }, + "doi": { + "regex": ".+" + } + }, + "cookies": true, + "regex": null, + "open_access": true, + "journals": null +} diff --git a/src/transtech.json b/src/transtech.json new file mode 100644 index 0000000..279d6f3 --- /dev/null +++ b/src/transtech.json @@ -0,0 +1,22 @@ +{ + "publisher": "trans-tech-publications", + "publisher_parent": null, + "crossref_member": 2457, + "prefixes": [ + "10.4028" + ], + "urls": { + "pdf": "http://%s.pdf" + }, + "components": { + "html": null, + "doi": { + "regex": "www.scientific.net.+" + } + }, + "cookies": false, + "regex": null, + "open_access": true, + "journals": null, + "notes": null +}