diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..51214d54 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,28 @@ +# Build outputs (regenerated inside the builder stage) +build/ +.gradle/ +out/ + +# IDE / editor / OS +.idea/ +.vscode/ +*.iml +.DS_Store + +# Runtime & logs (never needed at build time) +logs/ +tmp/ +*.log + +# Research / developer helpers never copied into the image +scripts/ +doc/ + +# Markdown & meta — not copied into the image +*.md +LICENSE +.claude/ +.github/ + +# Note: .git/ is intentionally NOT listed — Dockerfile.software COPYs it so +# Gradle can stamp revision.txt from `git rev-parse`. diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index c8a9cac4..5e9c8293 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -2,9 +2,9 @@ name: Build unstable on: [push] -concurrency: +concurrency: group: gradle -# cancel-in-progress: true + cancel-in-progress: true jobs: @@ -22,14 +22,14 @@ jobs: - name: Build with Gradle run: ./gradlew build -x test -# - name: Test with Gradle Jacoco and Coveralls -# run: ./gradlew test jacocoTestReport coveralls --no-daemon -# -# - name: Coveralls GitHub Action -# uses: coverallsapp/github-action@v2 -# with: -# github-token: ${{ secrets.GITHUB_TOKEN }} -# format: jacoco + - name: Test with Gradle Jacoco and Coveralls + run: ./gradlew test jacocoTestReport coveralls --no-daemon + + - name: Coveralls GitHub Action + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + format: jacoco docker-build: needs: [ build ] diff --git a/Dockerfile.software b/Dockerfile.software index fb63fd73..9c61054a 100644 --- a/Dockerfile.software +++ b/Dockerfile.software @@ -87,6 +87,15 @@ ENV SOFTWARE_MENTIONS_OPTS="-Djava.library.path=/opt/grobid/grobid-home/lib/lin- CMD ["./software-mentions/bin/software-mentions", "server", "software-mentions/resources/config/config.yml"] +# Container-level liveness probe. 
/service/isalive returns "true" when the +# Dropwizard server is up; models are loaded lazily on first request so we +# don't need to wait for them before reporting healthy. +# Using python3 (required by JEP/DeLFT so guaranteed present) instead of +# curl/wget to avoid depending on packages that may not ship in the GROBID +# base image. +HEALTHCHECK --interval=30s --timeout=5s --start-period=90s --retries=3 \ + CMD python3 -c "import urllib.request,sys; sys.exit(0 if urllib.request.urlopen('http://localhost:8060/service/isalive', timeout=4).status == 200 else 1)" || exit 1 + LABEL \ authors="The contributors" \ org.label-schema.name="software-mentions" \ diff --git a/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java b/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java index 3d973314..c9ad7775 100644 --- a/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java +++ b/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java @@ -135,15 +135,10 @@ public boolean checkIfAlive() { url = new URL("http://" + nerd_host + "/service/isalive"); LOGGER.debug("Calling: " + url.toString()); -//System.out.println("Calling: " + url.toString()); - CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet get = new HttpGet(url.toString()); - CloseableHttpResponse response = null; - Scanner in = null; - try { - response = httpClient.execute(get); -//System.out.println(response.getStatusLine()); + try (CloseableHttpClient httpClient = HttpClients.createDefault(); + CloseableHttpResponse response = httpClient.execute(get)) { int code = response.getStatusLine().getStatusCode(); if (code != 200) { LOGGER.error("Failed isalive service: HTTP error code : " + code); @@ -151,11 +146,6 @@ public boolean checkIfAlive() { } else { result = true; } - } finally { - if (in != null) - in.close(); - if (response != null) - response.close(); } } catch (MalformedURLException e) { LOGGER.error("disambiguation service not available: 
MalformedURLException"); @@ -169,12 +159,11 @@ public boolean checkIfAlive() { } /** - * Check if the software customisation is ready on the entity-fishing server, if not load it + * Check if the software customisation is ready on the entity-fishing server, if not load it */ public void ensureCustomizationReady() { boolean result = false; URL url = null; - CloseableHttpResponse response = null; try { if ( (nerd_port != null) && (nerd_port.length() > 0) ) if (nerd_port.equals("443")) @@ -185,24 +174,15 @@ public void ensureCustomizationReady() { url = new URL("http://" + nerd_host + "/service/customisation/software"); LOGGER.debug("Calling: " + url.toString()); -//System.out.println("Calling: " + url.toString()); - CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet get = new HttpGet(url.toString()); - Scanner in = null; - try { - response = httpClient.execute(get); -//System.out.println(response.getStatusLine()); + try (CloseableHttpClient httpClient = HttpClients.createDefault(); + CloseableHttpResponse response = httpClient.execute(get)) { int code = response.getStatusLine().getStatusCode(); if (code != 200) { LOGGER.error("Failed customization lookup service: HTTP error code : " + code); } else { result = true; } - } finally { - if (in != null) - in.close(); - if (response != null) - response.close(); } } catch (MalformedURLException e) { LOGGER.error("disambiguation service not available: MalformedURLException"); @@ -230,37 +210,27 @@ public void ensureCustomizationReady() { cutomisationFile = new File(cutomisationFile.getAbsolutePath()); String json = FileUtils.readFileToString(cutomisationFile, "UTF-8"); - CloseableHttpClient httpClient = HttpClients.createDefault(); HttpPost post = new HttpPost(url.toString()); - //StringBody stringValue = new StringBody(json, ContentType.MULTIPART_FORM_DATA); - //StringBody stringName = new StringBody("software", ContentType.MULTIPART_FORM_DATA); MultipartEntityBuilder builder = 
MultipartEntityBuilder.create(); builder.setMode(HttpMultipartMode.BROWSER_COMPATIBLE); builder.addTextBody("value", json); builder.addTextBody("name", "software"); - //builder.addPart("value", stringValue); - //builder.addPart("name", stringName); HttpEntity entity = builder.build(); - try { - post.setEntity(entity); - response = httpClient.execute(post); -//System.out.println(response.getStatusLine()); - + post.setEntity(entity); + try (CloseableHttpClient httpClient = HttpClients.createDefault(); + CloseableHttpResponse response = httpClient.execute(post)) { int code = response.getStatusLine().getStatusCode(); if (code != 200) { LOGGER.error("Failed loading software customisation: HTTP error code : " + code); } else { LOGGER.info("Software customisation loaded"); } - } finally { - if (response != null) - response.close(); } } catch (MalformedURLException e) { - e.printStackTrace(); + LOGGER.warn("MalformedURLException while loading software customisation", e); } catch (IOException e) { - e.printStackTrace(); + LOGGER.warn("I/O error while loading software customisation", e); } } } @@ -504,8 +474,6 @@ public String runNerd(List entities, List subtokens url = new URL("http://" + nerd_host + ":" + nerd_port + "/service/" + RESOURCEPATH); else url = new URL("http://" + nerd_host + "/service/" + RESOURCEPATH); -//System.out.println("calling... 
" + url.toString()); - CloseableHttpClient httpClient = HttpClients.createDefault(); HttpPost post = new HttpPost(url.toString()); //post.addHeader("Content-Type", "application/json"); //post.addHeader("Accept", "application/json"); @@ -578,14 +546,9 @@ public String runNerd(List entities, List subtokens builder.addPart("query", stringBody); HttpEntity entity = builder.build(); - CloseableHttpResponse response = null; - Scanner in = null; - try { - //post.setEntity(new UrlEncodedFormEntity(params)); - post.setEntity(entity); - response = httpClient.execute(post); - // System.out.println(response.getStatusLine()); - + post.setEntity(entity); + try (CloseableHttpClient httpClient = HttpClients.createDefault(); + CloseableHttpResponse response = httpClient.execute(post)) { int code = response.getStatusLine().getStatusCode(); if (code != 200) { LOGGER.error("Failed annotating text segment: HTTP error code : " + code); @@ -593,22 +556,18 @@ public String runNerd(List entities, List subtokens } HttpEntity entityResp = response.getEntity(); - in = new Scanner(entityResp.getContent()); - while (in.hasNext()) { - output.append(in.next()); - output.append(" "); + try (Scanner in = new Scanner(entityResp.getContent())) { + while (in.hasNext()) { + output.append(in.next()); + output.append(" "); + } } EntityUtils.consume(entityResp); - } finally { - if (in != null) - in.close(); - if (response != null) - response.close(); } } catch (MalformedURLException e) { - e.printStackTrace(); + LOGGER.warn("MalformedURLException while calling entity-fishing", e); } catch (IOException e) { - e.printStackTrace(); + LOGGER.warn("I/O error while calling entity-fishing", e); } return output.toString().trim(); } diff --git a/src/main/java/org/grobid/core/utilities/XMLUtilities.java b/src/main/java/org/grobid/core/utilities/XMLUtilities.java index 3ff0af29..0999856a 100644 --- a/src/main/java/org/grobid/core/utilities/XMLUtilities.java +++ 
b/src/main/java/org/grobid/core/utilities/XMLUtilities.java @@ -1,6 +1,7 @@ package org.grobid.core.utilities; import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.Iterator; import java.util.List; import java.util.ArrayList; @@ -8,6 +9,7 @@ import java.util.Map; import java.util.TreeMap; +import javax.xml.XMLConstants; import javax.xml.parsers.*; import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; @@ -38,16 +40,111 @@ public class XMLUtilities { private static final Logger LOGGER = LoggerFactory.getLogger(XMLUtilities.class); - public static String toPrettyString(String xml, int indent) { + // Cached JAXP factories. JAXP factories are not guaranteed thread-safe for + // their new*() methods; synchronized accessors below produce per-call + // builders/parsers/transformers that are themselves single-thread only. + // Caching avoids repeated ServiceLoader discovery which, under sustained + // TEI load, left classloader-backed references accumulating on the heap. + // Each factory is also hardened against XXE/SSRF since callers parse + // user-supplied XML/TEI; features are set defensively so unsupported + // options on a given JAXP implementation do not break class init. 
+ private static final DocumentBuilderFactory DBF; + private static final SAXParserFactory SPF; + private static final XPathFactory XPF = XPathFactory.newInstance(); + private static final TransformerFactory TF; + + static { + DBF = DocumentBuilderFactory.newInstance(); + DBF.setNamespaceAware(true); + hardenDocumentBuilderFactory(DBF); + + SPF = SAXParserFactory.newInstance(); + hardenSAXParserFactory(SPF); + + TF = TransformerFactory.newInstance(); + hardenTransformerFactory(TF); + } + + private static void hardenDocumentBuilderFactory(DocumentBuilderFactory factory) { + trySetDBFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); + trySetDBFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true); + trySetDBFeature(factory, "http://xml.org/sax/features/external-general-entities", false); + trySetDBFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); + trySetDBFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + try { + factory.setXIncludeAware(false); + } catch (UnsupportedOperationException | AbstractMethodError ignore) { + // older JAXP impls + } + factory.setExpandEntityReferences(false); + } + + private static void trySetDBFeature(DocumentBuilderFactory factory, String feature, boolean value) { try { + factory.setFeature(feature, value); + } catch (ParserConfigurationException e) { + LOGGER.debug("Unsupported DocumentBuilderFactory feature: {}", feature); + } + } + + private static void hardenSAXParserFactory(SAXParserFactory factory) { + trySetSPFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); + trySetSPFeature(factory, "http://apache.org/xml/features/disallow-doctype-decl", true); + trySetSPFeature(factory, "http://xml.org/sax/features/external-general-entities", false); + trySetSPFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false); + trySetSPFeature(factory, 
"http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + } + + private static void trySetSPFeature(SAXParserFactory factory, String feature, boolean value) { + try { + factory.setFeature(feature, value); + } catch (Exception e) { + LOGGER.debug("Unsupported SAXParserFactory feature: {}", feature); + } + } + + private static void hardenTransformerFactory(TransformerFactory factory) { + try { + factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + } catch (TransformerConfigurationException e) { + LOGGER.debug("Unsupported TransformerFactory feature: FEATURE_SECURE_PROCESSING"); + } + trySetTFAttribute(factory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); + trySetTFAttribute(factory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + } + + private static void trySetTFAttribute(TransformerFactory factory, String attribute, Object value) { + try { + factory.setAttribute(attribute, value); + } catch (IllegalArgumentException e) { + LOGGER.debug("Unsupported TransformerFactory attribute: {}", attribute); + } + } + + private static synchronized DocumentBuilder newBuilder() throws ParserConfigurationException { + return DBF.newDocumentBuilder(); + } + + private static synchronized SAXParser newSAXParser() throws Exception { + return SPF.newSAXParser(); + } + + private static synchronized XPath newXPath() { + return XPF.newXPath(); + } + + private static synchronized Transformer newTransformer() throws TransformerConfigurationException { + return TF.newTransformer(); + } + + public static String toPrettyString(String xml, int indent) { + try (ByteArrayInputStream inputStream = new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))) { // Turn xml string into a document - org.w3c.dom.Document document = DocumentBuilderFactory.newInstance() - .newDocumentBuilder() - .parse(new InputSource(new ByteArrayInputStream(xml.getBytes("utf-8")))); + org.w3c.dom.Document document = newBuilder().parse(new InputSource(inputStream)); // Remove whitespaces 
outside tags document.normalize(); - XPath xPath = XPathFactory.newInstance().newXPath(); + XPath xPath = newXPath(); org.w3c.dom.NodeList nodeList = (org.w3c.dom.NodeList) xPath.evaluate("//text()[normalize-space()='']", document, XPathConstants.NODESET); @@ -58,17 +155,17 @@ public static String toPrettyString(String xml, int indent) { } // Setup pretty print options - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); + Transformer transformer = newTransformer(); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); // Return pretty print xml string - StringWriter stringWriter = new StringWriter(); - transformer.transform(new DOMSource(document), new StreamResult(stringWriter)); - return stringWriter.toString(); + try (StringWriter stringWriter = new StringWriter()) { + transformer.transform(new DOMSource(document), new StreamResult(stringWriter)); + return stringWriter.toString(); + } } catch (Exception e) { throw new RuntimeException(e); } @@ -115,14 +212,15 @@ public static BiblioItem parseTEIBiblioItem(org.w3c.dom.Document doc, org.w3c.do BiblStructSaxHandler handler = new BiblStructSaxHandler(); String teiXML = null; try { - SAXParserFactory spf = SAXParserFactory.newInstance(); - SAXParser p = spf.newSAXParser(); + SAXParser p = newSAXParser(); teiXML = serialize(doc, biblStructElement); - p.parse(new InputSource(new StringReader(teiXML)), handler); + try (StringReader reader = new StringReader(teiXML)) { + p.parse(new InputSource(reader), handler); + } } catch(Exception e) { if (teiXML != null) LOGGER.warn("The parsing of the biblStruct from TEI document failed for: " + teiXML); - else + else LOGGER.warn("The parsing of the biblStruct from TEI document 
failed for: " + biblStructElement.toString()); } return handler.getBiblioItem(); @@ -243,11 +341,10 @@ public static Pair getLeftRightTextContent(Element current) { public static String serialize(org.w3c.dom.Document doc, Node node) { // to avoid issues with space reamining from deleted nodes try { - XPathFactory xpathFactory = XPathFactory.newInstance(); // XPath to find empty text nodes. - XPathExpression xpathExp = xpathFactory.newXPath().compile( - "//text()[normalize-space(.) = '']"); - NodeList emptyTextNodes = (NodeList) + XPathExpression xpathExp = newXPath().compile( + "//text()[normalize-space(.) = '']"); + NodeList emptyTextNodes = (NodeList) xpathExp.evaluate(doc, XPathConstants.NODESET); // Remove each empty text node from document. @@ -256,21 +353,13 @@ public static String serialize(org.w3c.dom.Document doc, Node node) { emptyTextNode.getParentNode().removeChild(emptyTextNode); } } catch(Exception ex) { - ex.printStackTrace(); + LOGGER.warn("Failed to strip empty text nodes before serialize()", ex); } - DOMSource domSource = null; - String xml = null; - try { - if (node == null) { - domSource = new DOMSource(doc); - } else { - domSource = new DOMSource(node); - } - StringWriter writer = new StringWriter(); + DOMSource domSource = (node == null) ? 
new DOMSource(doc) : new DOMSource(node); + try (StringWriter writer = new StringWriter()) { StreamResult result = new StreamResult(writer); - TransformerFactory tf = TransformerFactory.newInstance(); - Transformer transformer = tf.newTransformer(); + Transformer transformer = newTransformer(); transformer.setOutputProperty(OutputKeys.METHOD, "xml"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); @@ -279,11 +368,11 @@ public static String serialize(org.w3c.dom.Document doc, Node node) { if (node != null) transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); transformer.transform(domSource, result); - xml = writer.toString(); - } catch(TransformerException ex) { - ex.printStackTrace(); + return writer.toString(); + } catch(TransformerException | IOException ex) { + LOGGER.warn("Failed to serialize DOM node", ex); + return null; } - return xml; } @@ -296,13 +385,11 @@ public static void cleanXMLCorpus(String documentPath) throws Exception { File outputFile = new File(documentPath.replace(".tei.xml", ".clean.tei.xml")); // we use a DOM parser - org.w3c.dom.Document document = DocumentBuilderFactory.newInstance() - .newDocumentBuilder() - .parse(documentFile); + org.w3c.dom.Document document = newBuilder().parse(documentFile); // remove tei entries with empty body document.normalize(); - XPath xPath = XPathFactory.newInstance().newXPath(); + XPath xPath = newXPath(); org.w3c.dom.NodeList nodeList = (org.w3c.dom.NodeList) xPath.evaluate("//tei/text/body", document, XPathConstants.NODESET); @@ -350,28 +437,26 @@ public static void cleanXMLCorpus(String documentPath) throws Exception { } // Setup pretty print options - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); + Transformer transformer = newTransformer(); transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); 
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); // Return pretty print xml string - StringWriter stringWriter = new StringWriter(); - transformer.transform(new DOMSource(document), new StreamResult(stringWriter)); - - // write result to file - FileUtils.writeStringToFile(outputFile, stringWriter.toString(), "UTF-8"); + try (StringWriter stringWriter = new StringWriter()) { + transformer.transform(new DOMSource(document), new StreamResult(stringWriter)); - // check again if everything is well-formed after the changes - try { - document = DocumentBuilderFactory.newInstance() - .newDocumentBuilder() - .parse(new InputSource(new ByteArrayInputStream(stringWriter.toString().getBytes("UTF-8")))); - } catch(Exception e) { - System.out.println("Problem with the final TEI XML"); - e.printStackTrace(); + // write result to file + FileUtils.writeStringToFile(outputFile, stringWriter.toString(), "UTF-8"); + + // check again if everything is well-formed after the changes + try (ByteArrayInputStream inputStream = + new ByteArrayInputStream(stringWriter.toString().getBytes(StandardCharsets.UTF_8))) { + document = newBuilder().parse(new InputSource(inputStream)); + } catch(Exception e) { + LOGGER.warn("Problem with the final TEI XML", e); + } } } @@ -481,10 +566,8 @@ public static void segment(org.w3c.dom.Document doc, Node node) { } String fullSent = "" + newSent + ""; boolean fail = false; - try { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(new StringReader(fullSent))); + try (StringReader reader = new StringReader(fullSent)) { + newBuilder().parse(new InputSource(reader)); } catch(Exception e) { fail = true; } @@ -508,16 +591,14 @@ public static void segment(org.w3c.dom.Document doc, Node node) { 
//System.out.println(sent); - try { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - org.w3c.dom.Document d = factory.newDocumentBuilder().parse(new InputSource(new StringReader(sent))); + try (StringReader reader = new StringReader(sent)) { + org.w3c.dom.Document d = newBuilder().parse(new InputSource(reader)); //d.getDocumentElement().normalize(); Node newNode = doc.importNode(d.getDocumentElement(), true); newNodes.add(newNode); //System.out.println(serialize(doc, newNode)); } catch(Exception e) { - + LOGGER.debug("Failed to re-parse segmented sentence", e); } } diff --git a/src/main/java/org/grobid/service/ModelLoadStatus.java b/src/main/java/org/grobid/service/ModelLoadStatus.java new file mode 100644 index 00000000..ac5ba55b --- /dev/null +++ b/src/main/java/org/grobid/service/ModelLoadStatus.java @@ -0,0 +1,54 @@ +package org.grobid.service; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Simple process-wide registry that tracks the outcome of eager model + * loading performed at service startup. + * + *

Software-mentions' DeLFT-based classifier models (context, type, + * used/created/shared) are instantiated directly via + * {@code DeLFTClassifierModel}, bypassing grobid-core's + * {@code TaggerFactory} bookkeeping. Because of this, + * {@code TaggerFactory.getLoadedModels()} / {@code getFailedModels()} alone + * cannot describe the health of the service. This class fills that gap for + * the classifier models and can also be used to track eager loading of + * CRF-based software parsers. + */ +public final class ModelLoadStatus { + + private static final Map LOADED = new LinkedHashMap<>(); + private static final Map FAILED = new LinkedHashMap<>(); + + private ModelLoadStatus() { + } + + public static synchronized void markLoaded(String modelName) { + FAILED.remove(modelName); + LOADED.put(modelName, "ok"); + } + + public static synchronized void markFailed(String modelName, String reason) { + LOADED.remove(modelName); + FAILED.put(modelName, reason == null ? "unknown error" : reason); + } + + public static synchronized Map getLoadedModels() { + return Collections.unmodifiableMap(new LinkedHashMap<>(LOADED)); + } + + public static synchronized Map getFailedModels() { + return Collections.unmodifiableMap(new LinkedHashMap<>(FAILED)); + } + + public static synchronized boolean hasFailures() { + return !FAILED.isEmpty(); + } + + public static synchronized void reset() { + LOADED.clear(); + FAILED.clear(); + } +} diff --git a/src/main/java/org/grobid/service/controller/HealthCheck.java b/src/main/java/org/grobid/service/controller/HealthCheck.java index 45a2600d..b74c1d1d 100644 --- a/src/main/java/org/grobid/service/controller/HealthCheck.java +++ b/src/main/java/org/grobid/service/controller/HealthCheck.java @@ -1,5 +1,9 @@ package org.grobid.service.controller; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.grobid.core.engines.tagging.TaggerFactory; +import org.grobid.service.ModelLoadStatus; 
import org.grobid.service.configuration.SoftwareServiceConfiguration; import com.google.inject.Inject; @@ -9,15 +13,37 @@ import jakarta.ws.rs.Produces; import jakarta.ws.rs.core.Response; +import java.util.Map; + import static jakarta.ws.rs.core.MediaType.APPLICATION_JSON; +/** + * Health / diagnostic endpoint for the software-mentions service. + * + *

In addition to the Dropwizard admin-style {@link #check()} method, this + * resource exposes a {@code GET /service/health} endpoint that returns a + * JSON document describing the state of the service: which models are + * loaded and which (if any) failed to load. The endpoint returns HTTP 500 + * whenever at least one model is known to have failed to load, so that + * orchestrators (Kubernetes, load-balancers, etc.) can take the instance + * out of rotation. + * + *

Loaded/failed status is populated by the model-loading code via + * {@link ModelLoadStatus#markLoaded(String)} / + * {@link ModelLoadStatus#markFailed(String, String)}. Until those call + * sites are wired up, the {@code models.loaded} map will be empty; the + * service is nevertheless reported healthy as long as no model-load + * failures have been recorded — see the readiness note inside + * {@link #alive()}. + */ @Path("health") @Singleton @Produces(APPLICATION_JSON) public class HealthCheck extends com.codahale.metrics.health.HealthCheck { - @Inject - private SoftwareServiceConfiguration configuration; + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private final SoftwareServiceConfiguration configuration; @Inject public HealthCheck(SoftwareServiceConfiguration configuration) { @@ -26,14 +52,73 @@ public HealthCheck(SoftwareServiceConfiguration configuration) { @GET public Response alive() { - return Response.ok().build(); + boolean grobidHomeConfigured = + configuration != null && configuration.getGrobidHome() != null; + + Map loadedClassifiers = ModelLoadStatus.getLoadedModels(); + Map failedClassifiers = ModelLoadStatus.getFailedModels(); + Map loadedCrf = TaggerFactory.getLoadedModels(); + Map failedCrf = TaggerFactory.getFailedModels(); + + boolean hasFailures = ModelLoadStatus.hasFailures() || TaggerFactory.hasFailures(); + // Note: unlike datastet, software-mentions does not yet call + // ModelLoadStatus.markLoaded() from its model-loading code, so the + // loaded maps may be empty even when everything is fine. We therefore + // derive `ready` from grobidHome + absence of recorded failures only. + // Once load call-sites are wired, tighten this to also require + // !loadedClassifiers.isEmpty() || !loadedCrf.isEmpty(). + boolean ready = grobidHomeConfigured && !hasFailures; + + ObjectNode root = MAPPER.createObjectNode(); + root.put("status", ready ? 
"healthy" : "unhealthy"); + root.put("ready", ready); + root.put("grobidHomeConfigured", grobidHomeConfigured); + + ObjectNode models = MAPPER.createObjectNode(); + + ObjectNode loadedNode = MAPPER.createObjectNode(); + for (Map.Entry entry : loadedClassifiers.entrySet()) { + loadedNode.put(entry.getKey(), entry.getValue()); + } + for (Map.Entry entry : loadedCrf.entrySet()) { + loadedNode.put(entry.getKey(), entry.getValue()); + } + models.set("loaded", loadedNode); + + ObjectNode failedNode = MAPPER.createObjectNode(); + for (Map.Entry entry : failedClassifiers.entrySet()) { + failedNode.put(entry.getKey(), entry.getValue()); + } + for (Map.Entry entry : failedCrf.entrySet()) { + failedNode.put(entry.getKey(), entry.getValue()); + } + models.set("failed", failedNode); + + root.set("models", models); + + Response.Status status = ready + ? Response.Status.OK + : Response.Status.INTERNAL_SERVER_ERROR; + + return Response.status(status) + .entity(root.toString()) + .type(APPLICATION_JSON) + .build(); } @Override protected Result check() throws Exception { - return configuration.getGrobidHome() != null ? 
Result.healthy() : - Result.unhealthy("Grobid home is null in the configuration"); + if (configuration == null || configuration.getGrobidHome() == null) { + return Result.unhealthy("Grobid home is null in the configuration"); + } + if (ModelLoadStatus.hasFailures() || TaggerFactory.hasFailures()) { + StringBuilder reason = new StringBuilder("One or more models failed to load: "); + reason.append(ModelLoadStatus.getFailedModels()); + if (TaggerFactory.hasFailures()) { + reason.append("; CRF: ").append(TaggerFactory.getFailedModels()); + } + return Result.unhealthy(reason.toString()); + } + return Result.healthy(); } } - - diff --git a/src/main/resources/web/grobid/grobid-software.js b/src/main/resources/web/grobid/grobid-software.js index 3e843b5a..faac4b6f 100644 --- a/src/main/resources/web/grobid/grobid-software.js +++ b/src/main/resources/web/grobid/grobid-software.js @@ -40,6 +40,37 @@ class SoftciteApp { this.setupBaseURL(); this.fetchConceptBaseUrl(); this.configurePdfJs(); + this.startHealthCheck(); + } + + /** + * Polls /service/isalive and reflects the result on the #healthIndicator + * span in the header. Intervalized so we don't block the event loop. + * The 30s cadence is a balance between responsiveness and server load — + * bump it up if many users keep the page open. + */ + startHealthCheck(intervalMs = 30000) { + const probe = async () => { + const $indicator = $('#healthIndicator'); + if ($indicator.length === 0) return; + $indicator.removeClass('health-unknown health-healthy health-unhealthy') + .addClass('health-checking'); + const url = this.defineBaseURL('isalive'); + try { + const resp = await fetch(url, { method: 'GET', cache: 'no-store' }); + const ok = resp.ok; + const label = ok ? 'reachable' : `HTTP ${resp.status}`; + $indicator.removeClass('health-checking') + .addClass(ok ? 
'health-healthy' : 'health-unhealthy') + .attr('title', `Service status: ${label} (checked ${new Date().toLocaleTimeString()})`); + } catch (err) { + const msg = (err && err.message) ? err.message : 'network error'; + $indicator.removeClass('health-checking').addClass('health-unhealthy') + .attr('title', `Service status: unreachable (${msg})`); + } + }; + probe(); + setInterval(probe, intervalMs); } setupEventListeners() { @@ -291,7 +322,7 @@ class SoftciteApp { this.resetMaps(); $("#pure-toggle-right, #toggle-group").hide(); - $('#infoResult').html('Requesting server...'); + $('#infoResult').html(' Requesting server\u2026'); $('#requestResult').html(''); const urlLocal = $('#gbdForm').attr('action'); @@ -312,7 +343,7 @@ class SoftciteApp { $("#pure-toggle-right, #toggle-group").hide(); this.resetExamplesClasses(); - $('#infoResult2').empty().html('Requesting server...'); + $('#infoResult2').empty().html(' Requesting server\u2026'); // initialize tabbed result area for PDF this.initializePdfTabs(); @@ -1282,7 +1313,7 @@ class SoftciteApp { const pdf_url = `resources/pdf-examples/${example.replace("/", "%2F")}.pdf`; $("#pure-toggle-right, #toggle-group").hide(); - $('#infoResult2').empty().html('Requesting server...'); + $('#infoResult2').empty().html(' Requesting server\u2026'); // initialize tabbed result area for PDF this.initializePdfTabs(); @@ -1802,14 +1833,14 @@ class SoftciteApp { } handleAjaxError(jqXHR) { - const errorMsg = "Error encountered while requesting the server.
" + jqXHR.responseText; - $('#infoResult, #infoResult2').html(`${errorMsg}`); + const responseText = jqXHR && jqXHR.responseText ? this.escapeHtml(jqXHR.responseText) : ""; + $('#infoResult, #infoResult2').html(`Error encountered while requesting the server.
${responseText}
`); this.entities = null; } handleAjaxError2(message = "") { - const errorMsg = message + " - The PDF document cannot be annotated. Please check the server logs."; - $('#infoResult, #infoResult2').html(`Error encountered while requesting the server.
${errorMsg}
`); + const safeMessage = this.escapeHtml(String(message)); + $('#infoResult, #infoResult2').html(`Error encountered while requesting the server.
${safeMessage} - The PDF document cannot be annotated. Please check the server logs.
`); this.entities = null; } diff --git a/src/main/resources/web/index.html b/src/main/resources/web/index.html index 7a30336e..274f10ab 100644 --- a/src/main/resources/web/index.html +++ b/src/main/resources/web/index.html @@ -35,6 +35,14 @@