move PDFxStream lucene integration to separate project

snowtide · Nov 24, 2014 · ca89c04 · ca89c04
commit ca89c04
Show file tree

Hide file tree

Showing 14 changed files with 704 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,16 @@
+# emacs + vi backup files
+*~
+.*.sw*
+
+# various IDE junk
+*.ipr
+*.iml
+*.iws
+.idea
+.project
+.classpath
+.settings
+
+target
+classes
+it-repo
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2004-2014 Snowtide Informatics Systems, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
diff --git a/project.clj b/project.clj
@@ -0,0 +1,50 @@
+(defproject com.snowtide/lucene-pdf "3.0.0-SNAPSHOT"
+  :description "A library enabling easy Lucene indexing of PDF text and metadata via integration with PDFxStream"
+  :url "http://github.com/snowtide/lucene-pdf"
+  :license {:name "MIT"
+            :url "http://opensource.org/licenses/MIT"}
+  :min-lein-version "2.0.0"
+
+  :dependencies [[com.snowtide/pdfxstream "3.1.1"]
+                 [org.apache.lucene/lucene-core "4.10.2"]]
+
+  :plugins [[lein-javadoc "0.1.1"]]
+  :javadoc-opts {:package-names "com.snowtide.pdf.lucene"
+                 :output-dir "target/javadoc"
+                 :additional-args ["-Xdoclint:-missing" "-version" "-charset" "UTF-8"
+                                   "-docencoding" "UTF-8" "-encoding" "UTF-8"]}
+
+  :repositories {"snowtide-releases" {:url "http://maven.snowtide.com/releases"}}
+
+  :profiles {:lucene-1 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "1.9.1"]]
+                                    :java-source-paths ["src/lucene-1"]}]
+             :lucene-2 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "2.9.4"]]
+                                    :java-source-paths ["src/lucene-2"]}]
+             :lucene-3 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "3.6.2"]]
+                                    :java-source-paths ["src/lucene-3"]}]
+             :lucene-4 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "4.10.2"]
+                                                   [org.apache.lucene/lucene-analyzers-common "4.10.2"]]
+                                    :java-source-paths ["src/lucene-4"]}]
+
+             :dev {:dependencies [[org.clojure/clojure "1.6.0"]]
+                   :plugins []}}
+
+  :javac-options ["-target" "1.5" "-source" "1.5"]
+  :java-source-paths ["src/java"]
+  :test-paths ["test"]
+
+  :aliases  {"compile+" ["with-profile" "lucene-1:lucene-2:lucene-3:lucene-4" "do" "javac," "test"]
+             "release" ["do" "clean," "compile+," "release"]}
+
+  :deploy-repositories {"releases" {:url "s3p://maven.snowtide.com/releases" :creds :gpg}}
+  #_#_
+  :deploy-repositories {"releases" {:url "https://oss.sonatype.org/service/local/staging/deploy/maven2/" :creds :gpg}
+                        "snapshots" {:url "https://oss.sonatype.org/content/repositories/snapshots/" :creds :gpg}}
+
+  ;; Maven Central requirements: SCM and developer metadata
+  :scm {:url "git@github.com:snowtide/lucene-pdf.git"}
+  :pom-addition [:developers [:developer
+                              [:name "Chas Emerick"]
+                              [:url "http://cemerick.com"]
+                              [:email "chas@cemerick.com"]
+                              [:timezone "-5"]]]  )
diff --git a/src/java/com/snowtide/pdf/lucene/LucenePDFConfiguration.java b/src/java/com/snowtide/pdf/lucene/LucenePDFConfiguration.java
@@ -0,0 +1,189 @@
+package com.snowtide.pdf.lucene;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Instances of this class are used to control the creation of Lucene Documents from PDF content
+ * through the {@link LucenePDFDocumentFactory} class.
+ *
+ * @see <a href="http://www.snowtide.com/help/indexing-pdf-documents-with-lucene-and-pdfxstream">Indexing PDF
+ * Documents with Lucene and PDFxStream</a> for usage details
+ *
+ * @version ©2004-2014 Snowtide, http://snowtide.com, licensed under MIT. See LICENSE in the top level of the
+ * <a href="https://github.com/snowtide/lucene-pdf">lucene-pdf</a> project directory.
+ */
+public class LucenePDFConfiguration {
+
+    /**
+     * The default name assigned to the Lucene Field containing the main body of text extracted from a PDF file:
+     * <code>"text"</code>.
+     */
+    public static final String DEFAULT_MAIN_TEXT_FIELD_NAME = "text";
+
+    /**
+     * Mapping from PDF metadata keys to Lucene document field names.  Starts out empty; entries are added
+     * only through {@link #setMetadataFieldMapping(String, String)}.
+     */
+    private final HashMap<String, String> metadataFieldMapping = new HashMap<String,String>();
+    private boolean copyAllPDFMetadata = true; // unmapped metadata attributes are copied by default
+    private boolean indexBodyText = true; // body text default: indexed
+    private boolean storeBodyText = false; // body text default: not stored
+    private boolean tokenizeBodyText = true; // body text default: tokenized
+    private boolean indexMetadata = true; // metadata default: indexed
+    private boolean storeMetadata = true; // metadata default: stored
+    private boolean tokenizeMetadata = true; // metadata default: tokenized
+    private String bodyTextFieldName = DEFAULT_MAIN_TEXT_FIELD_NAME;
+
+    /**
+     * Creates a new config object.  The resulting object retains the default configuration
+     * except for the name assigned to the Lucene Field that contains the main PDF text content.
+     * The name is applied via {@link #setBodyTextFieldName(String)} and is not validated here.
+     *
+     * @param mainTextFieldName - the name that should be assigned to Fields containing
+     * the main PDF text content.
+     */
+    public LucenePDFConfiguration (String mainTextFieldName) {
+        setBodyTextFieldName(mainTextFieldName);
+    }
+
+    /**
+     * Creates a new config object.  Fields containing the main text content of
+     * {@link com.snowtide.pdf.Document PDF documents} converted into
+     * Lucene Documents will be assigned a {@link LucenePDFConfiguration#DEFAULT_MAIN_TEXT_FIELD_NAME
+     * default name}.  Other configuration defaults are as follows:
+     * <ul>
+     * <li>All PDF metadata attributes are copied to the resulting Lucene documents</li>
+     * <li>The main text content is tokenized and indexed, but not stored</li>
+     * <li>The PDF metadata attributes are tokenized, stored, and indexed.</li>
+     * </ul>
+     */
+    public LucenePDFConfiguration () {
+        this(DEFAULT_MAIN_TEXT_FIELD_NAME);
+    }
+
+    /**
+     * Sets the name that will be assigned to Lucene Fields containing PDF body text content.
+     * The value is stored as given; no null or emptiness check is performed.
+     *
+     * @param bodyTextFieldName the field name to use for PDF body text
+     */
+    public void setBodyTextFieldName (String bodyTextFieldName) {
+        this.bodyTextFieldName = bodyTextFieldName;
+    }
+
+    /**
+     * Returns the name that will be assigned to Lucene Fields containing PDF body text content.
+     *
+     * @return the currently configured body text field name
+     */
+    public String getBodyTextFieldName () {
+        return bodyTextFieldName;
+    }
+
+    /**
+     * Returns a copy of the mapping between PDF metadata attributes and the names given to Lucene fields created for
+     * them.
+     *
+     * @return a new map holding the current mappings; changes made to the returned map do not affect
+     * this config object
+     */
+    public Map<String,String> getMetadataFieldMapping () {
+        return new HashMap<String,String>(metadataFieldMapping);
+    }
+
+    /**
+     * Returns the name that should be given to Lucene Fields created from the value of the named PDF metadata
+     * attribute.
+     *
+     * @param pdfMetadataAttr the PDF metadata attribute name to look up
+     * @return the Lucene field name mapped to the attribute, or <code>null</code> if no field name has been
+     * set for that attribute
+     */
+    public String getMetadataFieldMapping (String pdfMetadataAttr) {
+        return metadataFieldMapping.get(pdfMetadataAttr);
+    }
+
+    /**
+     * Sets the name that will be assigned to Lucene Fields corresponding to the provided PDF metadata attribute
+     * name (e.g. {@link com.snowtide.pdf.Document#ATTR_AUTHOR}, etc).  Any name previously set for the same
+     * attribute is replaced.
+     *
+     * @param pdfMetadataAttr the PDF metadata attribute name
+     * @param fieldName the Lucene field name to associate with that attribute
+     */
+    public void setMetadataFieldMapping (String pdfMetadataAttr, String fieldName) {
+        metadataFieldMapping.put(pdfMetadataAttr, fieldName);
+    }
+
+    /**
+     * Returns true if any PDF metadata attributes not explicitly {@link #getMetadataFieldMapping() mapped} will
+     * be added to generated Lucene Documents using their names as specified in the source PDFs.
+     * This setting is <code>true</code> unless changed via {@link #setCopyAllPDFMetadata(boolean)}.
+     */
+    public boolean copyAllPDFMetadata() {
+        return copyAllPDFMetadata;
+    }
+
+    /**
+     * Sets whether PDF metadata attributes that have no explicit field mapping are copied.
+     *
+     * @param b the new value to be reported by {@link LucenePDFConfiguration#copyAllPDFMetadata()}
+     * @see LucenePDFConfiguration#copyAllPDFMetadata()
+     */
+    public void setCopyAllPDFMetadata(boolean b) {
+        copyAllPDFMetadata = b;
+    }
+
+    /**
+     * Sets Field attributes that will be used when creating the Field object for the main text content of
+     * a PDF document.  These attributes correspond to the <code>store</code>,
+     * <code>index</code>, and <code>token</code> parameters of the {@link org.apache.lucene.document.Field}
+     * constructor before Lucene v4.x and the same-named attributes of {@link org.apache.lucene.document.FieldType}
+     * afterwards.
+     *
+     * @param store the value subsequently returned by {@link #storeBodyText()}
+     * @param index the value subsequently returned by {@link #indexBodyText()}
+     * @param token the value subsequently returned by {@link #tokenizeBodyText()}
+     */
+    public void setBodyTextSettings (boolean store, boolean index, boolean token) {
+        indexBodyText = index;
+        storeBodyText = store;
+        tokenizeBodyText = token;
+    }
+
+    /**
+     * Sets Field attributes that will be used when creating Field objects for the document attributes found in
+     * a PDF document.  These attributes correspond to the <code>store</code>,
+     * <code>index</code>, and <code>token</code> parameters of the {@link org.apache.lucene.document.Field}
+     * constructor before Lucene v4.x and the same-named attributes of {@link org.apache.lucene.document.FieldType}
+     * afterwards.
+     *
+     * @param store the value subsequently returned by {@link #storeMetadata()}
+     * @param index the value subsequently returned by {@link #indexMetadata()}
+     * @param token the value subsequently returned by {@link #tokenizeMetadata()}
+     */
+    public void setMetadataSettings (boolean store, boolean index, boolean token) {
+        indexMetadata = index;
+        storeMetadata = store;
+        tokenizeMetadata = token;
+    }
+
+    /**
+     * Returns true if the main body text of PDFs added to Lucene Documents created through
+     * {@link LucenePDFDocumentFactory} using this config object will be indexed.
+     * Defaults to <code>true</code>.
+     */
+    public boolean indexBodyText () {
+        return indexBodyText;
+    }
+
+    /**
+     * Returns true if the main body text of PDFs added to Lucene Documents created through
+     * {@link LucenePDFDocumentFactory} using this config object will be stored.
+     * Defaults to <code>false</code>.
+     */
+    public boolean storeBodyText () {
+        return storeBodyText;
+    }
+
+    /**
+     * Returns true if the main body text of PDFs added to Lucene Documents created through
+     * {@link LucenePDFDocumentFactory} using this config object will be tokenized.
+     * Defaults to <code>true</code>.
+     */
+    public boolean tokenizeBodyText () {
+        return tokenizeBodyText;
+    }
+
+    /**
+     * Returns true if the PDF metadata attributes added to Lucene Documents created through
+     * {@link LucenePDFDocumentFactory} using this config object will be indexed.
+     * Defaults to <code>true</code>.
+     */
+    public boolean indexMetadata () {
+        return indexMetadata;
+    }
+
+    /**
+     * Returns true if the PDF metadata attributes added to Lucene Documents created through
+     * {@link LucenePDFDocumentFactory} using this config object will be stored.
+     * Defaults to <code>true</code>.
+     */
+    public boolean storeMetadata () {
+        return storeMetadata;
+    }
+
+    /**
+     * Returns true if the PDF metadata attributes added to Lucene Documents created through
+     * {@link LucenePDFDocumentFactory} using this config object will be tokenized.
+     * Defaults to <code>true</code>.
+     */
+    public boolean tokenizeMetadata () {
+        return tokenizeMetadata;
+    }
+}