Permalink
Browse files

Revert tika streaming as this consumes the body that Lucene then attempts to read (and fails).

Raise Tika's 'to string' limit instead.

This reverts commit f038055.
  • Loading branch information...
1 parent 4c46e5d commit 7dd58cc220468117d01abb20432f53ebc54aa996 Robert Newson committed Jun 6, 2011
@@ -21,11 +21,9 @@
import java.io.IOException;
import java.io.InputStream;
-import org.apache.commons.io.IOUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.HttpHeaders;
@@ -43,6 +41,7 @@
private final org.apache.tika.Tika tika = new org.apache.tika.Tika();
private Tika() {
+ tika.setMaxStringLength(-1);
}
public void parse(final InputStream in, final String contentType, final String fieldName, final Document doc)
@@ -52,10 +51,13 @@ public void parse(final InputStream in, final String contentType, final String f
try {
// Add body text.
- doc.add(new Field(fieldName, tika.parse(in, md)));
+ doc.add(text(fieldName, tika.parseToString(in, md), false));
} catch (final IOException e) {
log.warn("Failed to index an attachment.", e);
return;
+ } catch (final TikaException e) {
+ log.warn("Failed to parse an attachment.", e);
+ return;
}
// Add DC attributes.
@@ -16,6 +16,7 @@
* limitations under the License.
*/
+import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.assertThat;
@@ -52,6 +53,7 @@ public void testXML() throws IOException {
public void testWord() throws IOException {
parse("example.doc", "application/msword", "bar");
assertThat(doc.getField("bar"), not(nullValue()));
+ assertThat(doc.get("bar"), containsString("576 dsf45 d56 dsgh"));
}
private void parse(final String resource, final String type, final String field) throws IOException {

0 comments on commit 7dd58cc

Please sign in to comment.