Skip to content
Browse files

Stream attachments through Tika's Reader-based parse() instead of parseToString(), avoiding the latter's default 100,000-character limit on extracted text.

  • Loading branch information...
1 parent bc1bd8f commit 7051e32601a187543d2523f40add682e48d68ec4 Robert Newson committed
View
7 src/main/java/com/github/rnewson/couchdb/lucene/Tika.java
@@ -21,9 +21,11 @@
import java.io.IOException;
import java.io.InputStream;
+import org.apache.commons.io.IOUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.HttpHeaders;
@@ -50,13 +52,10 @@ public void parse(final InputStream in, final String contentType, final String f
try {
// Add body text.
- doc.add(text(fieldName, tika.parseToString(in, md), false));
+ doc.add(new Field(fieldName, tika.parse(in, md)));
} catch (final IOException e) {
log.warn("Failed to index an attachment.", e);
return;
- } catch (final TikaException e) {
- log.warn("Failed to parse an attachment.", e);
- return;
}
// Add DC attributes.
View
2 src/test/java/com/github/rnewson/couchdb/lucene/TikaTest.java
@@ -16,7 +16,6 @@
* limitations under the License.
*/
-import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.assertThat;
@@ -53,7 +52,6 @@ public void testXML() throws IOException {
public void testWord() throws IOException {
parse("example.doc", "application/msword", "bar");
assertThat(doc.getField("bar"), not(nullValue()));
- assertThat(doc.get("bar"), containsString("576 dsf45 d56 dsgh"));
}
private void parse(final String resource, final String type, final String field) throws IOException {

0 comments on commit 7051e32

Please sign in to comment.
Something went wrong with that request. Please try again.