Skip to content

Commit

Permalink
Indexing docx file fails
Browse files Browse the repository at this point in the history
I use ElasticSearch 1.4.3 with mapper-attachment plugin 2.4.2 (TIKA 1.7).

I get an error when indexing a **specific** docx file:
> "[DEBUG][org.elasticsearch.index.mapper.attachment.AttachmentMapper] Failed to extract [-1] characters of text for [null]: [org.apache.poi.xwpf.usermodel.XWPFSDT.getContent()Lorg/apache/poi/xwpf/usermodel/ISDTContent;]"

But if I use mapper-attachment plugin 2.4.1 (TIKA 1.5), there is no error and the content is parsed successfully.

This was caused by the change in elastic#94.

Closes elastic#104.
  • Loading branch information
dadoonet committed Feb 20, 2015
1 parent 1e0f03b commit c3c9f66
Show file tree
Hide file tree
Showing 6 changed files with 131 additions and 38 deletions.
34 changes: 1 addition & 33 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
</scm>

<properties>
<poi.version>3.10.1</poi.version>
<!-- If we need to define any specific property -->
</properties>

<dependencies>
Expand All @@ -53,38 +53,6 @@
<artifactId>elasticsearch</artifactId>
</dependency>

<!-- Apache POI -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi.version}</version>
<exclusions>
<exclusion>
<groupId>stax</groupId>
<artifactId>stax-api</artifactId>
</exclusion>
<exclusion>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>${poi.version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${poi.version}</version>
</dependency>

<!-- Tika -->
<dependency>
<groupId>org.apache.tika</groupId>
Expand Down
4 changes: 0 additions & 4 deletions src/main/assemblies/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@
<useTransitiveFiltering>true</useTransitiveFiltering>
<includes>
<include>org.apache.tika:tika-parsers</include>
<include>org.apache.poi:poi</include>
<include>org.apache.poi:poi-ooxml</include>
<include>org.apache.poi:poi-ooxml-schemas</include>
<include>org.apache.poi:poi-scratchpad</include>
</includes>
</dependencySet>
</dependencySets>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ public void parse(ParseContext context) throws IOException {
throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
} else {
logger.debug("Failed to extract [{}] characters of text for [{}]: [{}]", indexedChars, name, e.getMessage());
logger.trace("exception caught", e);
}
return;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.mapper.xcontent;

import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;

import java.io.IOException;
import java.io.InputStream;

import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika;
import static org.hamcrest.Matchers.isEmptyOrNullString;
import static org.hamcrest.Matchers.not;

/**
 * Runs a set of sample documents through Tika directly ({@link #testTika})
 * and through the attachment mapper ({@link #testMapper}).
 */
public class VariousDocTest extends ElasticsearchTestCase {

    /** Classpath directory holding the mapping and the sample documents used by {@link #testMapper}. */
    private static final String RESOURCE_DIR = "/org/elasticsearch/index/mapper/xcontent/";

    /**
     * Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/104
     */
    @Test
    public void testWordDocxDocument104() throws Exception {
        testTika("issue-104.docx", false);
        testMapper("issue-104.docx", false);
    }

    /**
     * Test for encrypted PDF
     */
    @Test
    public void testEncryptedPDFDocument() throws Exception {
        testTika("encrypted.pdf", true);
        // TODO Remove when this will be fixed in Tika. See https://issues.apache.org/jira/browse/TIKA-1548
        System.clearProperty("sun.font.fontmanager");
        testMapper("encrypted.pdf", true);
    }

    /**
     * Test for HTML
     */
    @Test
    public void testHtmlDocument() throws Exception {
        testTika("htmlWithEmptyDateMeta.html", false);
        testMapper("htmlWithEmptyDateMeta.html", false);
    }

    /**
     * Test for XHTML
     */
    @Test
    public void testXHtmlDocument() throws Exception {
        testTika("testXHTML.html", false);
        testMapper("testXHTML.html", false);
    }

    /**
     * Test for TXT
     */
    @Test
    public void testTxtDocument() throws Exception {
        testTika("text-in-english.txt", false);
        testMapper("text-in-english.txt", false);
    }

    /**
     * Parses the given resource with Tika and asserts that some text was extracted.
     *
     * @param filename      resource name, resolved relative to this class
     * @param errorExpected when {@code true}, any failure raised while parsing or
     *                      checking the extracted content is tolerated; when
     *                      {@code false}, such a failure fails the test
     */
    protected void testTika(String filename, boolean errorExpected) {
        InputStream resource = VariousDocTest.class.getResourceAsStream(filename);
        // A missing fixture is a broken test setup, not a parsing error: it must never be
        // mistaken for an "expected" failure, so check it before entering the try block.
        if (resource == null) {
            fail("test resource not found on classpath: " + filename);
        }

        try (InputStream is = resource) {
            String parsedContent = tika().parseToString(is);
            assertThat(parsedContent, not(isEmptyOrNullString()));
            logger.debug("extracted content: {}", parsedContent);
        } catch (Throwable e) {
            // Throwable (not Exception) on purpose: parser problems can surface as Errors,
            // e.g. the NoSuchMethodError reported in issue 104.
            if (!errorExpected) {
                if (e instanceof AssertionError) {
                    // Let assertion failures through untouched so their message is not re-wrapped.
                    throw (AssertionError) e;
                }
                // Keep the original throwable as the cause so its stack trace is reported.
                throw new AssertionError("exception caught: " + e.getMessage(), e);
            }
            logger.debug("tolerated failure while parsing [{}]: [{}]", filename, e.getMessage());
        }
    }

    /**
     * Indexes the given resource through a document mapper built from
     * {@code test-mapping.json} with the attachment type parser registered.
     *
     * @param filename      name of the resource under {@link #RESOURCE_DIR}
     * @param errorExpected when {@code false}, asserts that the value stored under the
     *                      {@code file} field's index name is not empty; when
     *                      {@code true}, the parsed document is not inspected
     * @throws IOException if a resource cannot be read or the JSON source cannot be built
     */
    protected void testMapper(String filename, boolean errorExpected) throws IOException {
        DocumentMapperParser mapperParser = MapperTestUtils.newMapperParser(ImmutableSettings.builder().build());
        mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());

        String mapping = copyToStringFromClasspath(RESOURCE_DIR + "test-mapping.json");
        DocumentMapper docMapper = mapperParser.parse(mapping);
        byte[] content = copyToBytesFromClasspath(RESOURCE_DIR + filename);

        BytesReference json = jsonBuilder()
                .startObject()
                    .field("_id", 1)
                    .startObject("file")
                        .field("_name", filename)
                        .field("_content", content)
                    .endObject()
                .endObject().bytes();

        ParseContext.Document doc = docMapper.parse(json).rootDoc();
        if (!errorExpected) {
            // Resolve the index name once instead of repeating the lookup chain.
            String fieldName = docMapper.mappers().smartName("file").mapper().names().indexName();
            String extracted = doc.get(fieldName);
            assertThat(extracted, not(isEmptyOrNullString()));
            logger.debug("extracted content: {}", extracted);
        }
    }
}
2 changes: 1 addition & 1 deletion src/test/resources/log4j.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
</logger>

<logger name="org.elasticsearch.index.mapper">
<level value="debug" />
<level value="trace" />
</logger>

<root>
Expand Down
Binary file not shown.

0 comments on commit c3c9f66

Please sign in to comment.