Skip to content

Commit

Permalink
feat: enable scraper for OP mentions
Browse files Browse the repository at this point in the history
  • Loading branch information
ewan-escience committed Mar 4, 2024
1 parent 8b53d4e commit a8c69da
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <e.cahen@esciencecenter.nl>
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <e.cahen@esciencecenter.nl>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -11,6 +11,8 @@
import com.google.gson.JsonParser;
import nl.esciencecenter.rsd.scraper.Config;
import nl.esciencecenter.rsd.scraper.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Instant;
import java.util.ArrayList;
Expand All @@ -20,9 +22,6 @@
import java.util.TreeMap;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class MainMentions {

private static final Logger LOGGER = LoggerFactory.getLogger(MainMentions.class);
Expand All @@ -45,7 +44,7 @@ public static void main(String[] args) {

String doisJoined = mentionsToScrape.stream()
.map(mention -> mention.doi)
.map(doi -> Utils.urlEncode(doi))
.map(Utils::urlEncode)
.collect(Collectors.joining(","));
String jsonSources = null;
try {
Expand Down Expand Up @@ -88,9 +87,26 @@ public static void main(String[] args) {
}
}

// Look up all mentions whose DOI source is "OP" at OpenAlex in a single batch request.
// NOTE(review): the variable name suggests "OP" stands for the EU Publications Office — confirm.
String email = Config.crossrefContactEmail().orElse(null);
Collection<String> europeanPublicationsOfficeDois = doiToSource.entrySet()
	.stream()
	// Constant-first equals so an entry with a null source is skipped instead of throwing.
	.filter(doiSourceEntry -> "OP".equals(doiSourceEntry.getValue()))
	.map(Map.Entry::getKey)
	.toList();
try {
	Collection<MentionRecord> openalexMentions = new OpenAlexCitations().mentionData(europeanPublicationsOfficeDois, email);
	for (MentionRecord openalexMention : openalexMentions) {
		// A mention that OpenAlex returned is no longer counted as failed.
		mentionsFailedToScrape.remove(openalexMention.doi);
		scrapedMentions.add(openalexMention);
	}
} catch (Exception e) {
	// Best effort: record the failure and continue with the remaining mentions.
	// The failing call is the OpenAlex lookup, so it is recorded under that scraper's name.
	Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
}

Instant now = Instant.now();
for (MentionRecord mention : mentionsFailedToScrape.values()) {
mention.scrapedAt = now;
LOGGER.info("Failed to scrape mention with DOI {}", mention.doi);
}
scrapedMentions.addAll(mentionsFailedToScrape.values());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import java.util.Optional;
import java.util.UUID;
import java.util.function.Predicate;
Expand All @@ -28,6 +29,37 @@ public class OpenAlexCitations {

static final String DOI_FILTER_URL_UNFORMATTED = "https://api.openalex.org/works?filter=doi:%s";

/**
 * Fetches mention data from OpenAlex for a batch of DOIs using one request.
 *
 * <p>The non-null DOIs are joined with {@code |} into a single {@code doi:} filter and the
 * first page (at most 200 results) of the response is read; no further pages are requested.
 * Each entry of the {@code results} array is converted with {@code parseCitationAsMention};
 * entries that fail to parse are saved via {@code Utils.saveExceptionInDatabase} and skipped.
 *
 * @param dataciteDois the DOIs to look up; {@code null} elements are ignored
 * @param email        optional contact address; when non-blank it is sent as a
 *                     {@code mailto:} value in the {@code User-Agent} header
 * @return the successfully parsed mentions; empty when there are no DOIs to look up
 * @throws IOException          if the request fails or the response has no {@code results} array
 * @throws InterruptedException if the request is interrupted
 */
public Collection<MentionRecord> mentionData(Collection<String> dataciteDois, String email) throws IOException, InterruptedException {
	String filter = dataciteDois.stream().filter(Objects::nonNull).collect(Collectors.joining("|"));
	if (filter.isEmpty()) {
		// Nothing to look up: do not send a request with an empty "doi:" filter.
		return new ArrayList<>();
	}
	String worksUri = DOI_FILTER_URL_UNFORMATTED.formatted(Utils.urlEncode(filter)) + "&per-page=200";

	HttpResponse<String> response;
	if (email == null || email.isBlank()) {
		response = Utils.getAsHttpResponse(worksUri);
	} else {
		response = Utils.getAsHttpResponse(worksUri, "User-Agent", "mailto:" + email);
	}

	JsonObject tree = JsonParser.parseString(response.body()).getAsJsonObject();
	JsonArray citationsArray = tree.getAsJsonArray("results");
	if (citationsArray == null) {
		// getAsJsonArray returns null for a missing member (e.g. an error payload);
		// fail with context instead of a bare NullPointerException in the loop below.
		throw new IOException("OpenAlex response (HTTP status %d) for %s contains no \"results\" array"
			.formatted(response.statusCode(), worksUri));
	}

	Collection<MentionRecord> mentions = new ArrayList<>();
	// One shared timestamp so every mention of this batch carries the same scrape time.
	Instant now = Instant.now();
	for (JsonElement citation : citationsArray) {
		MentionRecord citationAsMention;
		try {
			citationAsMention = parseCitationAsMention(citation, now);
		} catch (RuntimeException e) {
			// A single malformed entry must not discard the rest of the batch.
			Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e);
			continue;
		}
		mentions.add(citationAsMention);
	}

	return mentions;
}

public Collection<MentionRecord> citations(String doi, String email, UUID id) throws IOException, InterruptedException {

String doiUrlEncoded = Utils.urlEncode(doi);
Expand Down

0 comments on commit a8c69da

Please sign in to comment.