diff --git a/src/main/java/org/grobid/core/data/ArticleBiblio.java b/src/main/java/org/grobid/core/data/ArticleBiblio.java
new file mode 100644
index 00000000..728e016c
--- /dev/null
+++ b/src/main/java/org/grobid/core/data/ArticleBiblio.java
@@ -0,0 +1,294 @@
+package org.grobid.core.data;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.grobid.core.utilities.LayoutTokensUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.NodeList;
+
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathConstants;
+import javax.xml.xpath.XPathFactory;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+/**
+ * Data class holding article-level metadata (DOI, title, authors) extracted from documents.
+ * It is kept separate from BiblioComponent, which is designed for reference components
+ * within software mentions.
+ */
+public class ArticleBiblio {
+    private static final Logger LOGGER = LoggerFactory.getLogger(ArticleBiblio.class);
+
+    // ObjectMapper is thread-safe once configured, so one shared instance replaces the
+    // allocation that toJson() used to perform on every call.
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    // NOTE(review): these expressions use unprefixed element names, which XPath 1.0 matches
+    // only against elements in no namespace -- confirm against how callers parse the TEI.
+    private static final String TITLE_XPATH =
+        "//teiHeader/fileDesc/titleStmt/title[@level='a'][@type='main']/text()";
+    private static final String DOI_TYPED_XPATH =
+        "//teiHeader/fileDesc/sourceDesc/biblStruct/idno[@type='DOI']/text()";
+    private static final String DOI_POSITIONAL_XPATH =
+        "//teiHeader/fileDesc/sourceDesc/biblStruct/idno[2]/text()";
+    private static final String AUTHORS_XPATH =
+        "//teiHeader/fileDesc/sourceDesc/biblStruct/analytic/author/persName";
+
+    private String doi;
+    private String title;
+    private String authors;
+
+    public ArticleBiblio() {
+    }
+
+    public ArticleBiblio(String doi, String title, String authors) {
+        this.doi = doi;
+        this.title = title;
+        this.authors = authors;
+    }
+
+    public String getDoi() {
+        return doi;
+    }
+
+    public void setDoi(String doi) {
+        this.doi = doi;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public void setTitle(String title) {
+        this.title = title;
+    }
+
+    public String getAuthors() {
+        return authors;
+    }
+
+    public void setAuthors(String authors) {
+        this.authors = authors;
+    }
+
+    /**
+     * Tells whether at least one of DOI, title or authors is non-blank.
+     *
+     * @return true if any of the three fields holds non-whitespace text
+     */
+    public boolean hasContent() {
+        return StringUtils.isNotBlank(doi)
+            || StringUtils.isNotBlank(title)
+            || StringUtils.isNotBlank(authors);
+    }
+
+    /**
+     * Serializes this object as a JSON member of the form {@code "biblio": {...}}, to be
+     * embedded by the caller in an enclosing JSON object. Blank fields are omitted.
+     *
+     * @return the JSON fragment, {@code "biblio": {}} when every field is blank
+     */
+    public String toJson() {
+        StringBuilder json = new StringBuilder("\"biblio\": {");
+        boolean firstField = true;
+        firstField = appendJsonField(json, "doi", doi, firstField);
+        firstField = appendJsonField(json, "title", title, firstField);
+        // authors goes through the same escaping as the other fields: concatenating it
+        // verbatim yields invalid JSON as soon as a name contains a quote or a backslash
+        appendJsonField(json, "authors", authors, firstField);
+        return json.append("}").toString();
+    }
+
+    /**
+     * Appends {@code "name": "value"} to the buffer when the value is non-blank, preceded
+     * by a separator unless it is the first field written.
+     *
+     * @param json       buffer receiving the output
+     * @param name       JSON member name
+     * @param value      member value, skipped when blank
+     * @param firstField true if no field has been written yet
+     * @return false once a field has been written, otherwise the flag received
+     */
+    private static boolean appendJsonField(StringBuilder json, String name, String value,
+                                           boolean firstField) {
+        if (StringUtils.isBlank(value)) {
+            return firstField;
+        }
+        if (!firstField) {
+            json.append(", ");
+        }
+        json.append("\"").append(name).append("\": ");
+        try {
+            json.append(MAPPER.writeValueAsString(value));
+        } catch (JsonProcessingException e) {
+            // keep the output well-formed even if this one value cannot be serialized
+            LOGGER.warn("Could not serialize field {}, writing an empty value", name, e);
+            json.append("\"\"");
+        }
+        return false;
+    }
+
+    /**
+     * Creates an ArticleBiblio from a BiblioItem.
+     *
+     * @param biblioItem the source item, may be null
+     * @return the metadata, or empty if the item is null or has no DOI, title or authors
+     */
+    public static Optional<ArticleBiblio> fromBiblioItem(BiblioItem biblioItem) {
+        if (biblioItem == null) {
+            LOGGER.debug("BiblioItem is null, cannot create MetadataArticle");
+            return Optional.empty();
+        }
+
+        LOGGER.debug("Creating MetadataArticle from BiblioItem");
+        ArticleBiblio metadata = new ArticleBiblio();
+
+        String doi = biblioItem.getDOI();
+        if (StringUtils.isNotBlank(doi)) {
+            metadata.setDoi(doi);
+            LOGGER.debug("Extracted DOI: {}", doi);
+        }
+
+        String title = biblioItem.getTitle();
+        if (StringUtils.isNotBlank(title)) {
+            metadata.setTitle(title);
+            LOGGER.debug("Extracted title: {}", title);
+        }
+
+        String authors = LayoutTokensUtil.normalizeText(biblioItem.getAuthors());
+        if (StringUtils.isNotBlank(authors)) {
+            metadata.setAuthors(authors);
+        }
+
+        return metadata.hasContent() ? Optional.of(metadata) : Optional.empty();
+    }
+
+    /**
+     * Extracts article metadata from a TEI XML document using XPath.
+     *
+     * @param teiDocument the parsed TEI document, may be null
+     * @return the metadata, or empty if the document is null, nothing was found or the
+     *         extraction failed
+     */
+    public static Optional<ArticleBiblio> fromTeiDocument(org.w3c.dom.Document teiDocument) {
+        if (teiDocument == null) {
+            return Optional.empty();
+        }
+
+        try {
+            XPath xpath = XPathFactory.newInstance().newXPath();
+
+            ArticleBiblio articleMetadata = new ArticleBiblio();
+            articleMetadata.setTitle(extractTitle(teiDocument, xpath));
+            articleMetadata.setDoi(extractDOI(teiDocument, xpath));
+
+            // NOTE(review): each author is rendered as "surname, forename" and authors are
+            // joined with ", " as well, so author boundaries are ambiguous in the result
+            List<String> authors = extractAuthors(teiDocument, xpath);
+            if (CollectionUtils.isNotEmpty(authors)) {
+                articleMetadata.setAuthors(String.join(", ", authors));
+            }
+
+            return articleMetadata.hasContent() ? Optional.of(articleMetadata) : Optional.empty();
+        } catch (Exception e) {
+            // metadata is best effort: report the failure but do not fail the processing
+            LOGGER.warn("Could not extract article metadata from the TEI document", e);
+            return Optional.empty();
+        }
+    }
+
+    /**
+     * Evaluates an XPath expression and returns the trimmed value of the first node found.
+     *
+     * @return the trimmed value, or an empty string if nothing matches or evaluation fails
+     */
+    private static String firstNodeValue(org.w3c.dom.Document doc, XPath xpath, String expression) {
+        try {
+            NodeList nodes = (NodeList) xpath.evaluate(expression, doc, XPathConstants.NODESET);
+            if (nodes != null && nodes.getLength() > 0) {
+                return nodes.item(0).getNodeValue().trim();
+            }
+        } catch (Exception e) {
+            LOGGER.warn("Could not evaluate XPath expression {}", expression, e);
+        }
+        return "";
+    }
+
+    private static String extractTitle(org.w3c.dom.Document doc, XPath xpath) {
+        return firstNodeValue(doc, xpath, TITLE_XPATH);
+    }
+
+    private static String extractDOI(org.w3c.dom.Document doc, XPath xpath) {
+        // prefer the identifier explicitly typed as DOI; the position of an idno says
+        // nothing about its type, so the second idno is only kept as a fallback
+        String doi = firstNodeValue(doc, xpath, DOI_TYPED_XPATH);
+        return doi.isEmpty() ? firstNodeValue(doc, xpath, DOI_POSITIONAL_XPATH) : doi;
+    }
+
+    private static List<String> extractAuthors(org.w3c.dom.Document doc, XPath xpath) {
+        List<String> authors = new ArrayList<>();
+        try {
+            NodeList authorNodes = (NodeList) xpath.evaluate(AUTHORS_XPATH, doc, XPathConstants.NODESET);
+            for (int i = 0; i < authorNodes.getLength(); i++) {
+                String author = formatAuthorFromNode(authorNodes.item(i));
+                // skip empty names and exact duplicates
+                if (!author.isEmpty() && !authors.contains(author)) {
+                    authors.add(author);
+                }
+            }
+        } catch (Exception e) {
+            LOGGER.warn("Could not extract the authors from the TEI document", e);
+        }
+        return authors;
+    }
+
+    /**
+     * Formats a persName node as "surname, forename(s)". Without a surname child, or for
+     * any other node, the trimmed text content of the node is returned.
+     */
+    private static String formatAuthorFromNode(org.w3c.dom.Node node) {
+        if (node == null) {
+            return "";
+        }
+
+        if (node.getNodeName().equals("persName")) {
+            String surname = "";
+            StringBuilder forenames = new StringBuilder();
+
+            NodeList childNodes = node.getChildNodes();
+            for (int i = 0; i < childNodes.getLength(); i++) {
+                org.w3c.dom.Node child = childNodes.item(i);
+                if (child.getNodeName().equals("surname")) {
+                    surname = child.getTextContent().trim();
+                } else if (child.getNodeName().equals("forename")) {
+                    // a persName may hold several forename elements (e.g. first and middle
+                    // name): keep them all instead of only the last one
+                    String forename = child.getTextContent().trim();
+                    if (!forename.isEmpty()) {
+                        if (forenames.length() > 0) {
+                            forenames.append(' ');
+                        }
+                        forenames.append(forename);
+                    }
+                }
+            }
+
+            if (!surname.isEmpty()) {
+                return forenames.length() == 0 ? surname : surname + ", " + forenames;
+            }
+        }
+
+        return node.getTextContent().trim();
+    }
+
+    @Override
+    public String toString() {
+        return String.format("MetadataArticle{doi='%s', title='%s', authors=%s}",
+            doi, title, authors);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/grobid/core/data/BiblioComponent.java b/src/main/java/org/grobid/core/data/BiblioComponent.java index f3aa366c..8756c125 100644 --- a/src/main/java/org/grobid/core/data/BiblioComponent.java +++ b/src/main/java/org/grobid/core/data/BiblioComponent.java @@ -1,17 +1,9 @@ package org.grobid.core.data; -import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.utilities.TextUtilities; -import org.grobid.core.utilities.OffsetPosition; -import org.grobid.core.lexicon.SoftwareLexicon; -import org.grobid.core.layout.BoundingBox; -import org.grobid.core.layout.LayoutToken; - import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; - -import java.util.List; - +import org.grobid.core.layout.BoundingBox; +import org.grobid.core.utilities.TextUtilities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,7 +74,7 @@ public String toJson() { } }*/ buffer.append(", \"refKey\": " + refKey); - + // knowledge information if (wikidataId != null) { buffer.append(", \"wikidataId\": \"" + wikidataId + "\""); diff --git a/src/main/java/org/grobid/core/engines/SoftwareContextClassifier.java b/src/main/java/org/grobid/core/engines/SoftwareContextClassifier.java index ecef6488..24065980 100644 --- a/src/main/java/org/grobid/core/engines/SoftwareContextClassifier.java +++ b/src/main/java/org/grobid/core/engines/SoftwareContextClassifier.java @@ -1,36 +1,21 @@ package org.grobid.core.engines; -import java.util.*; - -import
org.apache.commons.io.FileUtils; -import org.grobid.core.GrobidModels; -import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; -import org.grobid.core.utilities.*; -import org.grobid.core.jni.PythonEnvironmentConfig; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.grobid.core.data.SoftwareContextAttributes; +import org.grobid.core.data.SoftwareEntity; import org.grobid.core.jni.DeLFTClassifierModel; import org.grobid.core.utilities.GrobidConfig.ModelParameters; +import org.grobid.core.utilities.SoftwareConfiguration; import org.grobid.core.utilities.TextUtilities; -import org.grobid.core.data.SoftwareEntity; -import org.grobid.core.data.SoftwareContextAttributes; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.ArrayUtils; -import org.apache.commons.lang3.SystemUtils; -import org.apache.commons.lang3.tuple.Pair; - -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - -import static org.apache.commons.lang3.ArrayUtils.isEmpty; +import java.util.*; /** * Use a Deep Learning multiclass and multilabel classifier to characterize the context of a recognized software mention. 
@@ -132,7 +117,7 @@ public String classify(String text, MODEL_TYPE type) throws Exception { * @return list of predicted labels/scores pairs for each text */ public String classify(List texts, MODEL_TYPE type) throws Exception { - if (texts == null || texts.size() == 0) + if (CollectionUtils.isEmpty(texts)) return null; LOGGER.info("classify: " + texts.size() + " sentence(s) for type " + type.toString()); diff --git a/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java b/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java index 3d973314..d42971b7 100644 --- a/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java +++ b/src/main/java/org/grobid/core/engines/SoftwareDisambiguator.java @@ -1,77 +1,35 @@ package org.grobid.core.engines; -import nu.xom.Attribute; -import nu.xom.Element; +import com.fasterxml.jackson.core.io.JsonStringEncoder; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.io.FileUtils; -import org.grobid.core.GrobidModels; -import org.grobid.core.analyzers.SoftwareAnalyzer; +import org.apache.http.HttpEntity; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.conn.HttpHostConnectException; +import org.apache.http.entity.ContentType; +import org.apache.http.entity.mime.HttpMultipartMode; +import org.apache.http.entity.mime.MultipartEntityBuilder; +import org.apache.http.entity.mime.content.StringBody; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.util.EntityUtils; import org.grobid.core.data.SoftwareComponent; import org.grobid.core.data.SoftwareEntity; -import org.grobid.core.data.BiblioItem; -import org.grobid.core.document.Document; -import 
org.grobid.core.document.DocumentPiece; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.document.xml.XmlBuilderUtils; -import org.grobid.core.engines.config.GrobidAnalysisConfig; -import org.grobid.core.engines.label.SoftwareTaggingLabels; -import org.grobid.core.engines.label.SegmentationLabels; -import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.engines.label.TaggingLabels; -import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.features.FeaturesVectorSoftware; -import org.grobid.core.layout.BoundingBox; import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; import org.grobid.core.lexicon.SoftwareLexicon; -import org.grobid.core.sax.TextChunkSaxHandler; -import org.grobid.core.tokenization.TaggingTokenCluster; -import org.grobid.core.tokenization.TaggingTokenClusteror; import org.grobid.core.utilities.SoftwareConfiguration; -import org.grobid.core.utilities.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.InputSource; - -import com.fasterxml.jackson.core.*; -import com.fasterxml.jackson.databind.*; -import com.fasterxml.jackson.databind.node.*; -import com.fasterxml.jackson.annotation.*; -import com.fasterxml.jackson.core.io.*; - -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import java.io.*; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.*; -import java.net.HttpURLConnection; +import java.io.File; +import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; - -import org.apache.http.HttpResponse; -import org.apache.http.NameValuePair; -import org.apache.http.client.HttpClient; -import org.apache.http.client.entity.UrlEncodedFormEntity; -import org.apache.http.client.methods.HttpGet; -import org.apache.http.client.methods.HttpPost; -import 
org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.message.BasicNameValuePair; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.HttpClients; -import org.apache.http.client.methods.CloseableHttpResponse; -import org.apache.http.HttpEntity; -import org.apache.http.util.EntityUtils; -import org.apache.http.entity.mime.content.StringBody; -import org.apache.http.entity.ContentType; -import org.apache.http.entity.mime.MultipartEntityBuilder; -import org.apache.http.entity.mime.HttpMultipartMode; -import org.apache.http.conn.HttpHostConnectException; -import org.apache.commons.lang3.tuple.Pair; - -import static org.apache.commons.lang3.StringUtils.*; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; +import java.util.*; /** * Software entity disambiguator. Once software mentions are recognized and grouped @@ -273,7 +231,7 @@ public void ensureCustomizationReady() { * @return list of disambiguated software entities */ public List disambiguate(List entities, List tokens) { - if ( (entities == null) || (entities.size() == 0) ) + if (CollectionUtils.isEmpty(entities)) return entities; String json = null; try { diff --git a/src/main/java/org/grobid/core/engines/SoftwareParser.java b/src/main/java/org/grobid/core/engines/SoftwareParser.java index 4246b8ea..e8aa914d 100644 --- a/src/main/java/org/grobid/core/engines/SoftwareParser.java +++ b/src/main/java/org/grobid/core/engines/SoftwareParser.java @@ -6,6 +6,7 @@ import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.GrobidModels; import org.grobid.core.analyzers.SoftwareAnalyzer; import org.grobid.core.data.*; @@ -25,6 +26,7 @@ import org.grobid.core.features.FeatureFactory; import org.grobid.core.features.FeaturesVectorSoftware; import org.grobid.core.layout.BoundingBox; +import 
org.grobid.core.utilities.TextNormalizationUtils; import org.grobid.core.layout.LayoutToken; import org.grobid.core.layout.LayoutTokenization; import org.grobid.core.layout.PDFAnnotation; @@ -349,9 +351,11 @@ public List processText(String text, boolean disambiguate) throw /** * Extract all Software mentions from a pdf file */ - public Pair, Document> processPDF(File file, - boolean disambiguate, - boolean addParagraphContext) throws IOException { + public Pair, Document> processPDF( + File file, + boolean disambiguate, + boolean addParagraphContext + ) throws IOException { List entities = new ArrayList(); Document doc = null; try { @@ -380,7 +384,8 @@ public Pair, Document> processPDF(File file, List> selectedLayoutTokenSequences = new ArrayList<>(); // from the header, we are interested in title, abstract and keywords - BiblioItem resHeader = null; + BiblioItem resHeader = new BiblioItem(); + doc.setResHeader(resHeader); SortedSet documentParts = doc.getDocumentPart(SegmentationLabels.HEADER); if (documentParts != null) { try { @@ -390,8 +395,8 @@ public Pair, Document> processPDF(File file, String labeledResult = null; if ((header != null) && (header.trim().length() > 0)) { labeledResult = parsers.getHeaderParser().label(header); - resHeader = new BiblioItem(); - resHeader.generalResultMappingHeader(labeledResult, headerTokenization); + + resHeader = parsers.getHeaderParser().resultExtraction(labeledResult, headerTokenization, resHeader); // title List titleTokens = resHeader.getLayoutTokens(TaggingLabels.HEADER_TITLE); @@ -551,7 +556,7 @@ public Pair, Document> processPDF(File file, SoftwareComponent softwareComponent = entity.getSoftwareName(); String localRawForm = softwareComponent.getRawForm(); if (localRawForm.indexOf("-") != -1 && !localRawForm.endsWith("-")) { - localRawForm = localRawForm.replaceAll("-( |\\n)*", ""); + localRawForm = TextNormalizationUtils.removeDashPatterns(localRawForm); localRawForm = localRawForm.replace("-", ""); if 
(allRawForms.contains(localRawForm)) { softwareComponent.setNormalizedForm(localRawForm); @@ -825,12 +830,12 @@ public Pair, Document> processPDF(File file, */ private List processLayoutTokenSequence( List layoutTokens, - List entities, - boolean disambiguate, - boolean addParagraphContext, - boolean fromPDF, - boolean fromXML, - List pdfAnnotations + List entities, + boolean disambiguate, + boolean addParagraphContext, + boolean fromPDF, + boolean fromXML, + List pdfAnnotations ) { List layoutTokenizations = new ArrayList(); layoutTokenizations.add(new LayoutTokenization(layoutTokens)); @@ -842,11 +847,11 @@ private List processLayoutTokenSequence( */ private List processLayoutTokenSequenceMultiple( List> layoutTokenList, - List entities, - boolean disambiguate, - boolean addParagraphContext, - boolean fromPDF, - boolean fromXML) { + List entities, + boolean disambiguate, + boolean addParagraphContext, + boolean fromPDF, + boolean fromXML) { return processLayoutTokenSequenceMultiple( layoutTokenList, @@ -1166,7 +1171,7 @@ public List propagateLayoutTokenSequence(List layou } public List markDAS(List entities, List availabilityTokens) { - if (entities == null || entities.size() == 0) + if (CollectionUtils.isEmpty(entities)) return entities; for (SoftwareEntity entity : entities) { if (entity.isInDataAvailabilitySection()) @@ -1238,7 +1243,7 @@ public List groupByEntities(List components) int n = 0; // index in entities SoftwareEntity previousEntity = null; currentEntity = null; - if (entities.size() == 0) + if (CollectionUtils.isEmpty(entities)) return entities; if (entities.size() > 1) { previousEntity = entities.get(0); @@ -2454,10 +2459,10 @@ public int boostrapTrainingPDF(String inputDirectory, /** * Extract all software mentions from a publisher XML file */ - public Pair, List> processXML(File file, - boolean disambiguate, - boolean addParagraphContext) throws IOException { - Pair, List> resultExtraction = null; + public Triple, List, List> processXML(File 
file, + boolean disambiguate, + boolean addParagraphContext) throws IOException { + Triple, List, List> resultExtraction = null; try { String tei = processXML(file); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); @@ -2484,10 +2489,12 @@ public Pair, List> processXML(File file, /** * Extract all software mentions from a publisher XML file */ - public Pair, List> processTEI(File file, - boolean disambiguate, - boolean addParagraphContext) throws IOException { - Pair, List> resultExtraction = null; + public Triple, List, List> processTEI( + File file, + boolean disambiguate, + boolean addParagraphContext + ) throws IOException { + Triple, List, List> resultExtraction = null; try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); @@ -2546,9 +2553,11 @@ public String processXML(File file) throws Exception { /** * Extract all software mentions from a publisher XML file */ - public Pair, List> processTEIDocument(org.w3c.dom.Document doc, - boolean disambiguate, - boolean addParagraphContext) { + public Triple, List, List> processTEIDocument( + org.w3c.dom.Document doc, + boolean disambiguate, + boolean addParagraphContext + ) { List entities = new ArrayList<>(); List> selectedLayoutTokenSequencesRaw = new ArrayList<>(); @@ -2609,7 +2618,7 @@ public Pair, List> processTEIDocument(org.w3c.d if (tokens.size() > 512) { String tempText = LayoutTokensUtil.toText(tokens); List offsetPositions = SentenceUtilities.getInstance().runSentenceDetection(tempText); - List> splitTokens =offsetPositions.stream() + List> splitTokens = offsetPositions.stream() .map(op -> tempText.substring(op.start, op.end)) .map(s -> SoftwareAnalyzer.getInstance().tokenizeWithLayoutToken(s)) .collect(Collectors.toList()); @@ -2853,7 +2862,8 @@ public Pair, List> processTEIDocument(org.w3c.d // finally classify the context for predicting the role of the software mention entities = 
SoftwareContextClassifier.getInstance(softwareConfiguration).classifyDocumentContexts(entities); - return Pair.of(entities, resCitations); + Optional metadata = ArticleBiblio.fromTeiDocument(doc); + return Triple.of(metadata, entities, resCitations); } diff --git a/src/main/java/org/grobid/core/engines/SoftwareTypeParser.java b/src/main/java/org/grobid/core/engines/SoftwareTypeParser.java index 1a205426..90f57bd6 100644 --- a/src/main/java/org/grobid/core/engines/SoftwareTypeParser.java +++ b/src/main/java/org/grobid/core/engines/SoftwareTypeParser.java @@ -1,53 +1,30 @@ package org.grobid.core.engines; -import org.apache.commons.io.FileUtils; -import org.grobid.core.GrobidModels; +import org.apache.commons.collections4.CollectionUtils; import org.grobid.core.analyzers.SoftwareAnalyzer; -import org.grobid.core.data.SoftwareComponent; -import org.grobid.core.data.BiblioComponent; -import org.grobid.core.data.SoftwareEntity; import org.grobid.core.data.SoftwareType; -import org.grobid.core.lexicon.SoftwareLexicon.Software_Type; import org.grobid.core.document.TEIFormatter; -import org.grobid.core.document.Document; -import org.grobid.core.document.DocumentPiece; -import org.grobid.core.document.DocumentSource; -import org.grobid.core.engines.config.GrobidAnalysisConfig; import org.grobid.core.engines.label.SoftwareTaggingLabels; -import org.grobid.core.engines.label.SegmentationLabels; import org.grobid.core.engines.label.TaggingLabel; -import org.grobid.core.engines.label.TaggingLabels; import org.grobid.core.engines.tagging.GrobidCRFEngine; -import org.grobid.core.exceptions.GrobidException; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.features.FeaturesVectorSoftware; +import org.grobid.core.features.FeatureFactory; import org.grobid.core.layout.BoundingBox; import org.grobid.core.layout.LayoutToken; -import org.grobid.core.layout.LayoutTokenization; -import org.grobid.core.lexicon.SoftwareLexicon; import org.grobid.core.lexicon.Lexicon; 
+import org.grobid.core.lexicon.SoftwareLexicon; import org.grobid.core.tokenization.TaggingTokenCluster; import org.grobid.core.tokenization.TaggingTokenClusteror; -import org.grobid.core.utilities.*; -import org.grobid.core.utilities.counters.CntManager; -import org.grobid.core.utilities.counters.impl.CntManagerFactory; -import org.grobid.core.lexicon.FastMatcher; +import org.grobid.core.utilities.BoundingBoxCalculator; +import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.OffsetPosition; import org.grobid.core.utilities.SoftwareConfiguration; -import org.grobid.core.features.FeatureFactory; - +import org.grobid.core.utilities.counters.impl.CntManagerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.*; - -import static org.apache.commons.lang3.StringUtils.*; -import static org.grobid.core.document.xml.XmlBuilderUtils.teiElement; -import org.apache.commons.lang3.tuple.Pair; - -import static java.nio.charset.StandardCharsets.UTF_8; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; /** @@ -124,7 +101,7 @@ public List> processSentencesTokenSequenceMultiple(List layoutTokens : layoutTokenList) { layoutTokens = SoftwareAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens); - if ( (layoutTokens == null) || (layoutTokens.size() == 0) ) + if (CollectionUtils.isEmpty(layoutTokens)) continue; // positions for lexical match @@ -150,7 +127,7 @@ public List> processSentencesTokenSequenceMultiple(List layoutTokens : layoutTokenList) { layoutTokens = SoftwareAnalyzer.getInstance().retokenizeLayoutTokens(layoutTokens); - if ( (layoutTokens == null) || (layoutTokens.size() == 0) ) + if (CollectionUtils.isEmpty(layoutTokens)) continue; // text of the selected segment @@ -185,7 +162,7 @@ public List processFeatureInput(String text, String inputFeatures, * Process a list of already prepared input 
with features **/ public List> processFeatureInputs(List inputFeatures, List> layoutTokenList) throws Exception { - if (inputFeatures == null || inputFeatures.size() == 0) { + if (CollectionUtils.isEmpty(inputFeatures)) { // empty content, nothing more to do return null; } diff --git a/src/main/java/org/grobid/core/main/batch/SoftwareMain.java b/src/main/java/org/grobid/core/main/batch/SoftwareMain.java index 31b40ad3..74fda60f 100644 --- a/src/main/java/org/grobid/core/main/batch/SoftwareMain.java +++ b/src/main/java/org/grobid/core/main/batch/SoftwareMain.java @@ -4,6 +4,7 @@ import org.grobid.core.main.GrobidHomeFinder; import org.grobid.core.utilities.GrobidProperties; import org.grobid.core.utilities.SoftwareConfiguration; +import org.grobid.core.utilities.ConfigurationValidator; import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; import org.slf4j.Logger; @@ -79,8 +80,8 @@ protected static void initProcess(SoftwareConfiguration conf) { /*if (conf.getEngine().toUpperCase().equals("WAPITI")) LibraryLoader.load();*/ - if (conf != null && ( conf.getModel("software") != null || conf.getModel("software-type") != null)) - for (ModelParameters model : conf.getModels()) + if (ConfigurationValidator.hasValidSoftwareModels(conf)) + for (ModelParameters model : conf.getModels()) GrobidProperties.getInstance().addModel(model); LibraryLoader.load(); } @@ -91,8 +92,8 @@ protected static void initProcess(String grobidHome, SoftwareConfiguration conf) grobidHomeFinder.findGrobidHomeOrFail(); GrobidProperties.getInstance(grobidHomeFinder); - if (conf != null && ( conf.getModel("software") != null || conf.getModel("software-type") != null)) - for (ModelParameters model : conf.getModels()) + if (ConfigurationValidator.hasValidSoftwareModels(conf)) + for (ModelParameters model : conf.getModels()) GrobidProperties.getInstance().addModel(model); LibraryLoader.load(); diff --git 
a/src/main/java/org/grobid/core/utilities/ConfigurationValidator.java b/src/main/java/org/grobid/core/utilities/ConfigurationValidator.java
new file mode 100644
index 00000000..47469da5
--- /dev/null
+++ b/src/main/java/org/grobid/core/utilities/ConfigurationValidator.java
@@ -0,0 +1,39 @@
+package org.grobid.core.utilities;
+
+/**
+ * Static validation helpers for {@link SoftwareConfiguration} objects.
+ */
+public final class ConfigurationValidator {
+
+    private ConfigurationValidator() {
+        // static helpers only, not meant to be instantiated
+    }
+
+    /**
+     * Tells whether a configuration declares a "software" or a "software-type" model.
+     *
+     * @param configuration the configuration to check, may be null
+     * @return true if the configuration is not null and getModel returns a non-null
+     *         value for "software" or for "software-type"
+     */
+    public static boolean hasValidSoftwareModels(SoftwareConfiguration configuration) {
+        if (configuration == null) {
+            return false;
+        }
+        return configuration.getModel("software") != null
+            || configuration.getModel("software-type") != null;
+    }
+
+    /**
+     * Tells whether a configuration carries a non-empty collection of models.
+     *
+     * @param configuration the configuration to check, may be null
+     * @return true if the configuration is not null and getModels returns a non-null,
+     *         non-empty collection
+     */
+    public static boolean isValidConfiguration(SoftwareConfiguration configuration) {
+        return configuration != null
+            && configuration.getModels() != null
+            && !configuration.getModels().isEmpty();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/grobid/core/utilities/TextNormalizationUtils.java b/src/main/java/org/grobid/core/utilities/TextNormalizationUtils.java
new file mode 100644
index 00000000..1a677fb1
--- /dev/null
+++ b/src/main/java/org/grobid/core/utilities/TextNormalizationUtils.java
@@ -0,0 +1,106 @@
+package org.grobid.core.utilities;
+
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.regex.Pattern;
+
+/**
+ * Static helpers for text normalization and for null-safe emptiness checks on
+ * collections and strings.
+ */
+public final class TextNormalizationUtils {
+
+    // Compiled once: String.replaceAll compiles its regular expression on every call.
+    private static final Pattern NEWLINE_OR_TAB = Pattern.compile("[\\n\\t]");
+    private static final Pattern SPACE_RUN = Pattern.compile(" +");
+    private static final Pattern DASH_PATTERN = Pattern.compile("-( |\\n)*");
+
+    private TextNormalizationUtils() {
+        // static helpers only, not meant to be instantiated
+    }
+
+    /**
+     * Normalizes whitespace by replacing newlines and tabs with spaces, collapsing runs
+     * of spaces into a single space and trimming the result.
+     *
+     * @param text the input text to normalize
+     * @return the normalized text, or null if the input is null
+     */
+    public static String normalizeTextWhitespace(String text) {
+        if (text == null) {
+            return null;
+        }
+        String spaced = NEWLINE_OR_TAB.matcher(text).replaceAll(" ");
+        return SPACE_RUN.matcher(spaced).replaceAll(" ").trim();
+    }
+
+    /**
+     * Removes every dash together with the spaces and newlines that directly follow it.
+     *
+     * @param text the input text to process
+     * @return the text without dashes and their trailing spaces/newlines, or null if the
+     *         input is null
+     */
+    public static String removeDashPatterns(String text) {
+        if (text == null) {
+            return null;
+        }
+        return DASH_PATTERN.matcher(text).replaceAll("");
+    }
+
+    /**
+     * Checks whether a collection is null or empty.
+     *
+     * @param collection the collection to check
+     * @return true if the collection is null or empty
+     */
+    public static boolean isEmpty(Collection<?> collection) {
+        return CollectionUtils.isEmpty(collection);
+    }
+
+    /**
+     * Checks whether a collection is neither null nor empty.
+     *
+     * @param collection the collection to check
+     * @return true if the collection is not null and not empty
+     */
+    public static boolean isNotEmpty(Collection<?> collection) {
+        return CollectionUtils.isNotEmpty(collection);
+    }
+
+    /**
+     * Returns an immutable empty list when the input list is null or empty, and the
+     * input list itself otherwise.
+     *
+     * @param list the input list, may be null
+     * @param <T>  the type of elements in the list
+     * @return the input list if it is not empty, otherwise an empty list
+     */
+    public static <T> List<T> emptyIfNull(List<T> list) {
+        return CollectionUtils.isEmpty(list) ? Collections.<T>emptyList() : list;
+    }
+
+    /**
+     * Checks whether a string is null, empty, or contains only whitespace.
+     *
+     * @param text the string to check
+     * @return true if the string is null, empty, or whitespace-only
+     */
+    public static boolean isBlank(String text) {
+        return StringUtils.isBlank(text);
+    }
+
+    /**
+     * Checks whether a string contains at least one non-whitespace character.
+     *
+     * @param text the string to check
+     * @return true if the string is not null, not empty and not whitespace-only
+     */
+    public static boolean isNotBlank(String text) {
+        return StringUtils.isNotBlank(text);
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/grobid/service/controller/SoftwareProcessFile.java b/src/main/java/org/grobid/service/controller/SoftwareProcessFile.java index 004ca8f0..45e07bae 100644 --- a/src/main/java/org/grobid/service/controller/SoftwareProcessFile.java +++ b/src/main/java/org/grobid/service/controller/SoftwareProcessFile.java @@ -2,43 +2,35 @@ import com.google.inject.Inject; import com.google.inject.Singleton; - -import org.grobid.core.data.SoftwareComponent; -import org.grobid.core.data.SoftwareEntity; +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.core.Response.Status; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.Triple; import org.grobid.core.data.BibDataSet; +import org.grobid.core.data.BiblioItem; +import org.grobid.core.data.ArticleBiblio; +import org.grobid.core.data.SoftwareEntity; import org.grobid.core.document.Document; import org.grobid.core.engines.Engine; -import org.grobid.core.main.LibraryLoader; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.document.DocumentSource; import org.grobid.core.engines.SoftwareParser; import org.grobid.core.engines.config.GrobidAnalysisConfig; -import org.grobid.core.factory.GrobidPoolingFactory; -import org.grobid.core.utilities.*; - +import org.grobid.core.factory.GrobidFactory; +import org.grobid.core.layout.Page; +import org.grobid.core.utilities.IOUtilities; +import org.grobid.core.utilities.SoftwareConfiguration; +import org.grobid.core.utilities.Versioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import org.xml.sax.XMLReader; -import org.xml.sax.helpers.XMLReaderFactory;
-import jakarta.ws.rs.WebApplicationException; -import jakarta.ws.rs.core.MediaType; -import jakarta.ws.rs.core.Response; -import jakarta.ws.rs.core.Response.Status; -import jakarta.ws.rs.core.StreamingOutput; -import java.io.*; -import java.nio.charset.Charset; -import java.util.*; +import javax.xml.bind.DatatypeConverter; +import java.io.File; +import java.io.InputStream; import java.security.DigestInputStream; import java.security.MessageDigest; -import javax.xml.bind.DatatypeConverter; - -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.tuple.Pair; - -import org.grobid.core.layout.Page; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Optional; /** * @@ -101,6 +93,12 @@ public static Response processPDFAnnotation(final InputStream inputStream, String md5Str = DatatypeConverter.printHexBinary(digest).toUpperCase(); json.append(", \"md5\": \"" + md5Str + "\""); + // Add article metadata (biblio) from document header + BiblioItem resHeader = doc.getResHeader(); + Optional metadata = ArticleBiblio.fromBiblioItem(resHeader); + + metadata.ifPresent(articleBiblio -> json.append(", " + articleBiblio.toJson())); + // page height and width json.append(", \"pages\":["); List pages = doc.getPages(); @@ -286,18 +284,22 @@ public static Response extractXML(final InputStream inputStream, response = Response.status(Status.INTERNAL_SERVER_ERROR).build(); } else { long start = System.currentTimeMillis(); - Pair, List> extractionResult = + Triple, List, List> extractionResult = parser.processXML(originFile, disambiguate, addParagraphContext); long end = System.currentTimeMillis(); - List extractedEntities = extractionResult.getLeft(); + List extractedEntities = extractionResult.getMiddle(); + Optional metadata = extractionResult.getLeft(); StringBuilder json = new StringBuilder(); json.append("{ "); json.append(SoftwareServiceUtil.applicationDetails(Versioner.getVersion(), Versioner.getRevision())); - + String md5Str 
= DatatypeConverter.printHexBinary(digest).toUpperCase(); json.append(", \"md5\": \"" + md5Str + "\""); + + metadata.ifPresent(articleBiblio -> json.append(", " + articleBiblio.toJson())); + json.append(", \"mentions\":["); boolean first = true; if (extractedEntities != null) { @@ -372,20 +374,25 @@ public static Response extractTEI(final InputStream inputStream, response = Response.status(Status.INTERNAL_SERVER_ERROR).build(); } else { long start = System.currentTimeMillis(); - Pair, List> extractionResult = parser.processTEI(originFile, disambiguate, addParagraphContext); + Triple, List, List> extractionResult = parser.processTEI(originFile, disambiguate, addParagraphContext); long end = System.currentTimeMillis(); List extractedEntities = null; + Optional metadata = Optional.empty(); if (extractionResult != null) { - extractedEntities = extractionResult.getLeft(); + extractedEntities = extractionResult.getMiddle(); + metadata = extractionResult.getLeft(); } StringBuilder json = new StringBuilder(); json.append("{ "); json.append(SoftwareServiceUtil.applicationDetails(Versioner.getVersion(), Versioner.getRevision())); - + String md5Str = DatatypeConverter.printHexBinary(digest).toUpperCase(); json.append(", \"md5\": \"" + md5Str + "\""); + + metadata.ifPresent(articleBiblio -> json.append(", " + articleBiblio.toJson())); + json.append(", \"mentions\":["); boolean first = true; if (extractedEntities != null) { @@ -399,11 +406,9 @@ public static Response extractTEI(final InputStream inputStream, } json.append("], \"references\":["); - if (extractionResult != null) { - List bibDataSet = extractionResult.getRight(); - if (bibDataSet != null && bibDataSet.size()>0) { - SoftwareServiceUtil.serializeReferences(json, bibDataSet, extractedEntities); - } + List teiBibDataSet = extractionResult != null ? 
extractionResult.getRight() : null; + if (teiBibDataSet != null && teiBibDataSet.size()>0) { + SoftwareServiceUtil.serializeReferences(json, teiBibDataSet, extractedEntities); } json.append("], \"runtime\" :" + (end-start)); diff --git a/src/main/java/org/grobid/service/controller/SoftwareProcessString.java b/src/main/java/org/grobid/service/controller/SoftwareProcessString.java index 64458df8..9a125151 100644 --- a/src/main/java/org/grobid/service/controller/SoftwareProcessString.java +++ b/src/main/java/org/grobid/service/controller/SoftwareProcessString.java @@ -2,29 +2,22 @@ import com.google.inject.Inject; import com.google.inject.Singleton; - -import java.util.List; -import java.util.ArrayList; -import java.util.NoSuchElementException; - import jakarta.ws.rs.core.MediaType; import jakarta.ws.rs.core.Response; import jakarta.ws.rs.core.Response.Status; - -import org.grobid.core.data.SoftwareComponent; +import org.apache.commons.collections4.CollectionUtils; import org.grobid.core.data.SoftwareEntity; -import org.grobid.core.engines.SoftwareParser; import org.grobid.core.engines.SoftwareContextClassifier; -import org.grobid.core.engines.SoftwareContextClassifier.MODEL_TYPE; -import org.grobid.core.factory.GrobidPoolingFactory; -import org.grobid.core.utilities.GrobidProperties; +import org.grobid.core.engines.SoftwareParser; import org.grobid.core.utilities.SoftwareConfiguration; - +import org.grobid.core.utilities.TextNormalizationUtils; import org.grobid.core.utilities.Versioner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.lang3.StringUtils; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; /** * @@ -56,12 +49,12 @@ public static Response processText(String text, boolean disambiguate, SoftwareCo StringBuilder retVal = new StringBuilder(); SoftwareParser parser = SoftwareParser.getInstance(configuration); try { - if (text == null) { + if (TextNormalizationUtils.isBlank(text)) 
{ return Response.status(Status.BAD_REQUEST).build(); } - + List entities = null; - text = text.replaceAll("\\n", " ").replaceAll("\\t", " "); + text = TextNormalizationUtils.normalizeTextWhitespace(text); long start = System.currentTimeMillis(); entities = parser.processText(text, disambiguate); long end = System.currentTimeMillis(); @@ -69,7 +62,7 @@ public static Response processText(String text, boolean disambiguate, SoftwareCo if (entities != null) { retVal.append("{ "); retVal.append(SoftwareServiceUtil.applicationDetails(Versioner.getVersion(), Versioner.getRevision())); - if (entities.size() == 0) { + if (CollectionUtils.isEmpty(entities)) { retVal.append(", \"mentions\": []"); } else { boolean first = true; @@ -117,15 +110,15 @@ public static Response characterizeContext(String text, SoftwareConfiguration co SoftwareContextClassifier classifier = SoftwareContextClassifier.getInstance(configuration); try { LOGGER.debug(">> set raw text for stateless service'..."); - - text = text.replaceAll("\\n", " ").replaceAll("\\t", " "); + + text = TextNormalizationUtils.normalizeTextWhitespace(text); long start = System.currentTimeMillis(); String resultJson = null; List texts = new ArrayList<>(); texts.add(text); try { List results = classifier.classifyDocumentContextsBinaryString(texts); - if (results.size()>0) + if (CollectionUtils.isNotEmpty(results)) resultJson = results.get(0); } catch(Exception e) { LOGGER.error("fail to classify document's set of contexts", e); @@ -195,7 +188,7 @@ public static String methodLogOut() { * Check whether the result is null or empty. */ public static boolean isResultOK(String result) { - return StringUtils.isBlank(result) ? 
false : true; + return TextNormalizationUtils.isNotBlank(result); } } diff --git a/src/main/java/org/grobid/service/controller/SoftwareServiceUtil.java b/src/main/java/org/grobid/service/controller/SoftwareServiceUtil.java index 19ebc95a..fe2ac53e 100644 --- a/src/main/java/org/grobid/service/controller/SoftwareServiceUtil.java +++ b/src/main/java/org/grobid/service/controller/SoftwareServiceUtil.java @@ -1,19 +1,19 @@ package org.grobid.service.controller; -import java.util.*; -import java.text.DateFormat; -import java.text.SimpleDateFormat; - -import org.grobid.core.data.BibDataSet; -import org.grobid.core.data.SoftwareEntity; -import org.grobid.core.data.BiblioComponent; - import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; - +import org.grobid.core.data.BibDataSet; +import org.grobid.core.data.BiblioComponent; +import org.grobid.core.data.SoftwareEntity; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.List; +import java.util.TimeZone; + /** * Utility methods for GROBID Software service. 
* diff --git a/src/main/java/org/grobid/trainer/SoftwareTrainerRunner.java b/src/main/java/org/grobid/trainer/SoftwareTrainerRunner.java index 08746e52..c4515f25 100644 --- a/src/main/java/org/grobid/trainer/SoftwareTrainerRunner.java +++ b/src/main/java/org/grobid/trainer/SoftwareTrainerRunner.java @@ -3,6 +3,7 @@ import org.grobid.core.main.GrobidHomeFinder; import org.grobid.core.utilities.SoftwareConfiguration; import org.grobid.core.utilities.GrobidProperties; +import org.grobid.core.utilities.ConfigurationValidator; import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; @@ -94,8 +95,8 @@ public static void main(String[] args) { System.out.println("path2GbdHome=" + path2GbdHome); initProcess(path2GbdHome); - if (conf != null && (conf.getModel("software") != null || conf.getModel("software-type") != null)) - for (ModelParameters model : conf.getModels()) + if (ConfigurationValidator.hasValidSoftwareModels(conf)) + for (ModelParameters model : conf.getModels()) GrobidProperties.getInstance().addModel(model); LibraryLoader.load(); diff --git a/src/test/java/org/grobid/core/engines/SoftwareContextClassifierTest.java b/src/test/java/org/grobid/core/engines/SoftwareContextClassifierTest.java index e1ac3cea..4d1376d5 100644 --- a/src/test/java/org/grobid/core/engines/SoftwareContextClassifierTest.java +++ b/src/test/java/org/grobid/core/engines/SoftwareContextClassifierTest.java @@ -7,6 +7,7 @@ import org.grobid.core.factory.GrobidFactory; import org.grobid.core.utilities.GrobidProperties; import org.grobid.core.utilities.SoftwareConfiguration; +import org.grobid.core.utilities.TextNormalizationUtils; import org.grobid.core.main.GrobidHomeFinder; import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; @@ -76,7 +77,7 @@ public void getTestResourcePath() { @Test public void testSoftwareContextClassifierText() throws Exception { String text = 
IOUtils.toString(this.getClass().getResourceAsStream("/text.txt"), StandardCharsets.UTF_8.toString()); - text = text.replaceAll("\\n", " ").replaceAll("\\t", " "); + text = TextNormalizationUtils.normalizeTextWhitespace(text); List texts = new ArrayList<>(); texts.add(text); String json = SoftwareContextClassifier.getInstance(configuration).classify(texts, MODEL_TYPE.used); diff --git a/src/test/java/org/grobid/core/engines/SoftwareParserTest.java b/src/test/java/org/grobid/core/engines/SoftwareParserTest.java index 98d8990b..b4333a2c 100644 --- a/src/test/java/org/grobid/core/engines/SoftwareParserTest.java +++ b/src/test/java/org/grobid/core/engines/SoftwareParserTest.java @@ -1,32 +1,28 @@ package org.grobid.core.engines; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.tuple.Pair; import org.grobid.core.data.SoftwareEntity; -import org.grobid.core.data.SoftwareComponent; import org.grobid.core.document.Document; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.SoftwareConfiguration; import org.grobid.core.main.GrobidHomeFinder; -import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; +import org.grobid.core.utilities.GrobidConfig.ModelParameters; +import org.grobid.core.utilities.GrobidProperties; +import org.grobid.core.utilities.SoftwareConfiguration; +import org.grobid.core.utilities.TextNormalizationUtils; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import java.io.File; import java.nio.charset.StandardCharsets; -import java.util.List; import java.util.Arrays; - -import org.apache.commons.lang3.tuple.Pair; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; +import java.util.List; import static 
org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.hasSize; -import static org.junit.Assert.assertNotNull; /** * @author Patrice @@ -72,7 +68,7 @@ public void getTestResourcePath() { @Test public void testSoftwareParserText() throws Exception { String text = IOUtils.toString(this.getClass().getResourceAsStream("/text.txt"), StandardCharsets.UTF_8.toString()); - text = text.replaceAll("\\n", " ").replaceAll("\\t", " "); + text = TextNormalizationUtils.normalizeTextWhitespace(text); List entities = SoftwareParser.getInstance(configuration).processText(text, false); System.out.println(text); System.out.println(entities.size());