Permalink
Browse files

include all DC attributes, if present.

  • Loading branch information...
Robert Newson
Robert Newson committed Mar 8, 2009
1 parent 44005d3 commit 46a3a3710165ae9ddca6c950238cf394f9c0602c
Showing with 55 additions and 10 deletions.
  1. +24 −3 README.md
  2. +31 −7 src/main/java/org/apache/couchdb/lucene/Tika.java
View
@@ -94,9 +94,30 @@ You can perform all types of queries using Lucene's default <a href="http://luce
<dt>_id<dd>The _id of the document.
<dt>_rev<dd>The _rev of the document.
<dt>_db<dd>The source database of the document.
-<dt>_body<dd>Any text extracted from any attachment (name may change).
-<dt>_author<dd>The author of any attachment (name may change).
-<dt>_title<dd>The title of any attachment (name may change).
+<dt>_body<dd>Any text extracted from any attachment.
+</dl>
+
+<h2>Dublin Core</h2>
+
+All Dublin Core attributes are indexed and stored if detected in the attachment. Descriptions of the fields come from the Tika javadocs.
+
+<dl>
+<dt>dc.contributor<dd>An entity responsible for making contributions to the content of the resource.
+<dt>dc.coverage<dd>The extent or scope of the content of the resource.
+<dt>dc.creator<dd>An entity primarily responsible for making the content of the resource.
+<dt>dc.date<dd>A date associated with an event in the life cycle of the resource.
+<dt>dc.description<dd>An account of the content of the resource.
+<dt>dc.format<dd>Typically, Format may include the media-type or dimensions of the resource.
+<dt>dc.identifier<dd>Recommended best practice is to identify the resource by means of a string or number conforming to a formal identification system.
+<dt>dc.language<dd>A language of the intellectual content of the resource.
+<dt>dc.modified<dd>Date on which the resource was changed.
+<dt>dc.publisher<dd>An entity responsible for making the resource available.
+<dt>dc.relation<dd>A reference to a related resource.
+<dt>dc.rights<dd>Information about rights held in and over the resource.
+<dt>dc.source<dd>A reference to a resource from which the present resource is derived.
+<dt>dc.subject<dd>The topic of the content of the resource.
+<dt>dc.title<dd>A name given to the resource.
+<dt>dc.type<dd>The nature or genre of the content of the resource.
</dl>
<h2>Examples</h2>
@@ -1,6 +1,7 @@
package org.apache.couchdb.lucene;
import static org.apache.couchdb.lucene.Utils.text;
+import static org.apache.couchdb.lucene.Utils.token;
import java.io.IOException;
import java.io.InputStream;
@@ -10,6 +11,7 @@
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
+import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParsingReader;
@@ -18,6 +20,8 @@
private static final Logger log = LogManager.getLogger(Tika.class);
// Namespace prefix that is concatenated in front of each Dublin Core attribute
// name before the result is handed to token(...) (see addAttribute).
+ private static final String DC = "dc.";
+
public void parse(final InputStream in, final String contentType, final Document doc) {
final AutoDetectParser parser = new AutoDetectParser();
final Metadata md = new Metadata();
@@ -36,16 +40,36 @@ public void parse(final InputStream in, final String contentType, final Document
return;
}
+ // Add body text.
doc.add(text(Config.BODY, body, false));
+ // Add DC attributes.
+ addDublinCoreAttributes(md, doc);
+
+ System.out.println(doc);
+ }
- if (md.get(Metadata.TITLE) != null) {
- doc.add(text(Config.TITLE, md.get(Metadata.TITLE), true));
- }
/**
 * Offers each of the sixteen Dublin Core attributes listed below to
 * {@code addAttribute}, always with the {@code DC} ("dc.") namespace prefix.
 * Nothing is filtered here: the decision whether an attribute actually ends
 * up in the document is made by the null check inside {@code addAttribute}.
 *
 * @param md  metadata the attribute values are read from
 * @param doc document that the resulting fields are added to
 */
+ private void addDublinCoreAttributes(final Metadata md, final Document doc) {
+ addAttribute(DC, DublinCore.CONTRIBUTOR, md, doc);
+ addAttribute(DC, DublinCore.COVERAGE, md, doc);
+ addAttribute(DC, DublinCore.CREATOR, md, doc);
+ addAttribute(DC, DublinCore.DATE, md, doc);
+ addAttribute(DC, DublinCore.DESCRIPTION, md, doc);
+ addAttribute(DC, DublinCore.FORMAT, md, doc);
+ addAttribute(DC, DublinCore.IDENTIFIER, md, doc);
+ addAttribute(DC, DublinCore.LANGUAGE, md, doc);
+ addAttribute(DC, DublinCore.MODIFIED, md, doc);
+ addAttribute(DC, DublinCore.PUBLISHER, md, doc);
+ addAttribute(DC, DublinCore.RELATION, md, doc);
+ addAttribute(DC, DublinCore.RIGHTS, md, doc);
+ addAttribute(DC, DublinCore.SOURCE, md, doc);
+ addAttribute(DC, DublinCore.SUBJECT, md, doc);
+ addAttribute(DC, DublinCore.TITLE, md, doc);
+ addAttribute(DC, DublinCore.TYPE, md, doc);
+ }
- if (md.get(Metadata.AUTHOR) != null) {
- doc.add(text(Config.AUTHOR, md.get(Metadata.AUTHOR), true));
/**
 * Adds a single metadata attribute to the document when the metadata holds a
 * value for it; does nothing otherwise.
 *
 * @param namespace     prefix concatenated in front of {@code attributeName}
 *                      to form the first argument passed to {@code token(...)}
 * @param attributeName key used to look the value up in {@code md}
 * @param md            metadata the value is read from
 * @param doc           document the result of {@code token(...)} is added to
 */
+ private void addAttribute(final String namespace, final String attributeName, final Metadata md, final Document doc) {
// Skip attributes for which the metadata lookup returns null.
+ if (md.get(attributeName) != null) {
// The value is looked up a second time here rather than reused from the check above.
// NOTE(review): the trailing 'true' presumably asks Utils.token to store the value
// (the README says DC attributes are "indexed and stored") - confirm against Utils.
+ doc.add(token(namespace + attributeName, md.get(attributeName), true));
}
-
}
-
}

0 comments on commit 46a3a37

Please sign in to comment.