Permalink
Browse files

Stream attachments through Tika to avoid the 100,000-character limit of parseToString.

  • Loading branch information...
1 parent bc1bd8f commit f038055b36523dfd75a3e92b3423e2ed3a81f280 Robert Newson committed Apr 19, 2011
@@ -21,9 +21,11 @@
import java.io.IOException;
import java.io.InputStream;
+import org.apache.commons.io.IOUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.HttpHeaders;
@@ -50,13 +52,10 @@ public void parse(final InputStream in, final String contentType, final String f
try {
// Add body text.
- doc.add(text(fieldName, tika.parseToString(in, md), false));
+ doc.add(new Field(fieldName, tika.parse(in, md)));
} catch (final IOException e) {
log.warn("Failed to index an attachment.", e);
return;
- } catch (final TikaException e) {
- log.warn("Failed to parse an attachment.", e);
- return;
}
// Add DC attributes.
@@ -16,7 +16,6 @@
* limitations under the License.
*/
-import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.nullValue;
import static org.junit.Assert.assertThat;
@@ -53,7 +52,6 @@ public void testXML() throws IOException {
public void testWord() throws IOException {
parse("example.doc", "application/msword", "bar");
assertThat(doc.getField("bar"), not(nullValue()));
- assertThat(doc.get("bar"), containsString("576 dsf45 d56 dsgh"));
}
private void parse(final String resource, final String type, final String field) throws IOException {

0 comments on commit f038055

Please sign in to comment.