Skip to content

Commit

Permalink
Closes #200: use tesseract command line
Browse files (browse the repository at this point in the history)
  • Loading branch information
jendib committed Mar 23, 2018
1 parent be1c2a7 commit abde9b7
Show file tree
Hide file tree
Showing 14 changed files with 67 additions and 1,368 deletions.
2 changes: 0 additions & 2 deletions .travis.yml
Expand Up @@ -17,8 +17,6 @@ after_success:
- docker push $REPO
env:
global:
- TESSDATA_PREFIX=/usr/share/tesseract-ocr
- LC_NUMERIC=C
- secure: LRGpjWORb0qy6VuypZjTAfA8uRHlFUMTwb77cenS9PPRBxuSnctC531asS9Xg3DqC5nsRxBBprgfCKotn5S8nBSD1ceHh84NASyzLSBft3xSMbg7f/2i7MQ+pGVwLncusBU6E/drnMFwZBleo+9M8Tf96axY5zuUp90MUTpSgt0=
- secure: bCDDR6+I7PmSkuTYZv1HF/z98ANX/SFEESUCqxVmV5Gs0zFC0vQXaPJQ2xaJNRop1HZBFMZLeMMPleb0iOs985smpvK2F6Rbop9Tu+Vyo0uKqv9tbZ7F8Nfgnv9suHKZlL84FNeUQZJX6vsFIYPEJ/r7K5P/M0PdUy++fEwxEhU=
- secure: ewXnzbkgCIHpDWtaWGMa1OYZJ/ki99zcIl4jcDPIC0eB3njX/WgfcC6i0Ke9mLqDqwXarWJ6helm22sNh+xtQiz6isfBtBX+novfRt9AANrBe3koCMUemMDy7oh5VflBaFNP0DVb8LSCnwf6dx6ZB5E9EB8knvk40quc/cXpGjY=
Expand Down
3 changes: 0 additions & 3 deletions Dockerfile
Expand Up @@ -4,9 +4,6 @@ MAINTAINER b.gamard@sismics.com
RUN apt-get update && apt-get -y -q install ffmpeg mediainfo tesseract-ocr tesseract-ocr-fra tesseract-ocr-ita tesseract-ocr-kor tesseract-ocr-rus tesseract-ocr-ukr tesseract-ocr-spa tesseract-ocr-ara tesseract-ocr-hin tesseract-ocr-deu tesseract-ocr-pol tesseract-ocr-jpn tesseract-ocr-por tesseract-ocr-tha tesseract-ocr-jpn tesseract-ocr-chi-sim tesseract-ocr-chi-tra && \
apt-get clean && rm -rf /var/lib/apt/lists/*

ENV TESSDATA_PREFIX /usr/share/tesseract-ocr/4.00/
ENV LC_NUMERIC C

# Remove the embedded javax.mail jar from Jetty
RUN rm -f /opt/jetty/lib/jndi/javax.mail.glassfish-*.jar

Expand Down
35 changes: 15 additions & 20 deletions docs-core/pom.xml
Expand Up @@ -138,46 +138,41 @@
<artifactId>bcprov-jdk15on</artifactId>
</dependency>

<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.odftoolkit.odfdom.converter.pdf</artifactId>
</dependency>

<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
</dependency>

<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.odftoolkit.odfdom.converter.pdf</artifactId>
</dependency>

<!-- ImageIO plugins -->
<dependency>
<groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId>
</dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>org.apache.poi.xwpf.converter.pdf</artifactId>
</dependency>

<!-- ImageIO plugins -->
<dependency>
<groupId>com.twelvemonkeys.imageio</groupId>
<artifactId>imageio-jpeg</artifactId>
</dependency>

<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
<groupId>com.twelvemonkeys.imageio</groupId>
<artifactId>imageio-tiff</artifactId>
</dependency>

<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-jpeg2000</artifactId>
</dependency>

<dependency>
<groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId>
</dependency>

<!-- Only for connecting to PostgreSQL database -->
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>42.2.2.jre7</version>
</dependency>

<!-- Test dependencies -->
Expand Down
Expand Up @@ -5,6 +5,7 @@
import com.sismics.docs.core.util.action.Action;
import com.sismics.docs.core.util.action.AddTagAction;
import com.sismics.docs.core.util.action.RemoveTagAction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.json.JsonObject;
Expand All @@ -18,7 +19,7 @@ public class ActionUtil {
/**
* Logger.
*/
private static final org.slf4j.Logger log = LoggerFactory.getLogger(LuceneUtil.class);
private static final Logger log = LoggerFactory.getLogger(ActionUtil.class);

/**
* Find the action associated to an action type.
Expand Down
46 changes: 22 additions & 24 deletions docs-core/src/main/java/com/sismics/docs/core/util/FileUtil.java
@@ -1,44 +1,40 @@
package com.sismics.docs.core.util;

import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.io.CharStreams;
import com.sismics.docs.core.constant.Constants;
import com.sismics.docs.core.dao.jpa.FileDao;
import com.sismics.docs.core.dao.jpa.UserDao;
import com.sismics.docs.core.event.DocumentUpdatedAsyncEvent;
import com.sismics.docs.core.event.FileCreatedAsyncEvent;
import com.sismics.docs.core.model.jpa.File;
import com.sismics.docs.core.model.jpa.User;
import com.sismics.tess4j.Tesseract;
import com.sismics.util.ImageDeskew;
import com.sismics.util.Scalr;
import com.sismics.util.context.ThreadLocalContext;
import com.sismics.util.io.InputStreamReaderThread;
import com.sismics.util.mime.MimeTypeUtil;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.crypto.Cipher;
import javax.crypto.CipherInputStream;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.*;

/**
* File entity utilities.
*
* @author bgamard
*/
public class FileUtil {
/**
* Logger.
*/
private static final Logger log = LoggerFactory.getLogger(FileUtil.class);

/**
* File ID of files currently being processed.
*/
Expand All @@ -50,28 +46,30 @@ public class FileUtil {
* @param language Language to OCR
* @param image Buffered image
* @return Content extracted
* @throws Exception e
*/
public static String ocrFile(String language, BufferedImage image) {
public static String ocrFile(String language, BufferedImage image) throws Exception {
// Upscale, grayscale and deskew the image
String content = null;
BufferedImage resizedImage = Scalr.resize(image, Scalr.Method.AUTOMATIC, Scalr.Mode.AUTOMATIC, 3500, Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
image.flush();
ImageDeskew imageDeskew = new ImageDeskew(resizedImage);
BufferedImage deskewedImage = Scalr.rotate(resizedImage, - imageDeskew.getSkewAngle(), Scalr.OP_ANTIALIAS, Scalr.OP_GRAYSCALE);
resizedImage.flush();
image = deskewedImage;
Path tmpFile = ThreadLocalContext.get().createTemporaryFile();
ImageIO.write(deskewedImage, "tiff", tmpFile.toFile());

// OCR the file
try {
Tesseract instance = Tesseract.getInstance();
log.info("Starting OCR with TESSDATA_PREFIX=" + System.getenv("TESSDATA_PREFIX") + ";LC_NUMERIC=" + System.getenv("LC_NUMERIC"));
instance.setLanguage(language);
content = instance.doOCR(image);
} catch (Throwable e) {
log.error("Error while OCR-izing the image", e);
}
List<String> result = Lists.newLinkedList(Arrays.asList("tesseract", tmpFile.toAbsolutePath().toString(), "stdout", "-l", language));
ProcessBuilder pb = new ProcessBuilder(result);
Process process = pb.start();

return content;
// Consume the process error stream
final String commandName = pb.command().get(0);
new InputStreamReaderThread(process.getErrorStream(), commandName).start();

// Consume the data as text
try (InputStream is = process.getInputStream()) {
return CharStreams.toString(new InputStreamReader(is, Charsets.UTF_8));
}
}

/**
Expand Down
Expand Up @@ -17,7 +17,6 @@

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
Expand Down
Expand Up @@ -45,19 +44,16 @@ public boolean accept(String mimeType) {
}

@Override
public BufferedImage generateThumbnail(Path file) throws IOException {
public BufferedImage generateThumbnail(Path file) throws Exception {
try (InputStream inputStream = Files.newInputStream(file)) {
return ImageIO.read(inputStream);
}
}

@Override
public String extractContent(String language, Path file) {
public String extractContent(String language, Path file) throws Exception {
try (InputStream inputStream = Files.newInputStream(file)) {
return FileUtil.ocrFile(language, ImageIO.read(inputStream));
} catch (IOException e) {
log.error("Error reading the image", e);
return null;
}
}

Expand Down
Expand Up @@ -58,6 +58,7 @@ public String extractContent(String language, Path file) {
PDDocument pdfDocument = PDDocument.load(inputStream)) {
PDFRenderer renderer = new PDFRenderer(pdfDocument);
for (int pageIndex = 0; pageIndex < pdfDocument.getNumberOfPages(); pageIndex++) {
log.info("OCR page " + (pageIndex + 1) + "/" + pdfDocument.getNumberOfPages() + " of PDF file containing only images");
sb.append(" ");
sb.append(FileUtil.ocrFile(language, renderer.renderImage(pageIndex)));
}
Expand Down

0 comments on commit abde9b7

Please sign in to comment.