Skip to content

Commit

Permalink
move PDFxStream lucene integration to separate project
Browse files Browse the repository at this point in the history
  • Loading branch information
cemerick committed Nov 24, 2014
0 parents commit ca89c04
Show file tree
Hide file tree
Showing 14 changed files with 704 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .gitignore
@@ -0,0 +1,16 @@
# emacs + vi backup files
*~
.*.sw*

# various IDE junk
*.ipr
*.iml
*.iws
.idea
.project
.classpath
.settings

target
classes
it-repo
22 changes: 22 additions & 0 deletions LICENSE
@@ -0,0 +1,22 @@
The MIT License (MIT)

Copyright (c) 2004-2014 Snowtide Informatics Systems, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

50 changes: 50 additions & 0 deletions project.clj
@@ -0,0 +1,50 @@
;; Leiningen project definition for lucene-pdf: Java sources that integrate
;; PDFxStream with Lucene, built and tested against several Lucene versions.
(defproject com.snowtide/lucene-pdf "3.0.0-SNAPSHOT"
:description "A library enabling easy Lucene indexing of PDF text and metadata via integration with PDFxStream"
:url "http://github.com/snowtide/lucene-pdf"
:license {:name "MIT"
:url "http://opensource.org/licenses/MIT"}
:min-lein-version "2.0.0"

;; Default dependencies; the lucene-N profiles below substitute other
;; lucene-core versions.
:dependencies [[com.snowtide/pdfxstream "3.1.1"]
[org.apache.lucene/lucene-core "4.10.2"]]

;; Javadoc generation; :javadoc-opts is presumably read by the lein-javadoc
;; plugin declared here.
:plugins [[lein-javadoc "0.1.1"]]
:javadoc-opts {:package-names "com.snowtide.pdf.lucene"
:output-dir "target/javadoc"
:additional-args ["-Xdoclint:-missing" "-version" "-charset" "UTF-8"
"-docencoding" "UTF-8" "-encoding" "UTF-8"]}

:repositories {"snowtide-releases" {:url "http://maven.snowtide.com/releases"}}

;; One profile per supported Lucene major version. Each one composes :dev and
;; :base with a map that pins a specific lucene-core release and names a
;; version-specific Java source directory (src/lucene-N). The lucene-4 profile
;; additionally pulls in lucene-analyzers-common.
:profiles {:lucene-1 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "1.9.1"]]
:java-source-paths ["src/lucene-1"]}]
:lucene-2 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "2.9.4"]]
:java-source-paths ["src/lucene-2"]}]
:lucene-3 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "3.6.2"]]
:java-source-paths ["src/lucene-3"]}]
:lucene-4 [:dev :base {:dependencies [[org.apache.lucene/lucene-core "4.10.2"]
[org.apache.lucene/lucene-analyzers-common "4.10.2"]]
:java-source-paths ["src/lucene-4"]}]

;; Clojure is a :dev-only dependency (test code lives under "test"; it is
;; presumably written in Clojure -- confirm against the test directory).
:dev {:dependencies [[org.clojure/clojure "1.6.0"]]
:plugins []}}

;; Java sources are compiled at the 1.5 source/target level, so they must
;; avoid newer language features.
:javac-options ["-target" "1.5" "-source" "1.5"]
:java-source-paths ["src/java"]
:test-paths ["test"]

;; "compile+" runs javac followed by test under the four lucene-N profiles
;; (a colon-separated with-profile list runs the task once per profile).
;; "release" cleans, runs "compile+", then invokes the release task.
:aliases {"compile+" ["with-profile" "lucene-1:lucene-2:lucene-3:lucene-4" "do" "javac," "test"]
"release" ["do" "clean," "compile+," "release"]}

;; Active deployment target for releases; credentials are resolved via :gpg.
:deploy-repositories {"releases" {:url "s3p://maven.snowtide.com/releases" :creds :gpg}}
;; The doubled reader-discard macro below drops the next TWO forms (the
;; keyword and its map), leaving this alternate Sonatype OSS deployment
;; configuration disabled while keeping it in the file.
#_#_
:deploy-repositories {"releases" {:url "https://oss.sonatype.org/service/local/staging/deploy/maven2/" :creds :gpg}
"snapshots" {:url "https://oss.sonatype.org/content/repositories/snapshots/" :creds :gpg}}

;;maven central requirements
:scm {:url "git@github.com:snowtide/lucene-pdf.git"}
:pom-addition [:developers [:developer
[:name "Chas Emerick"]
[:url "http://cemerick.com"]
[:email "chas@cemerick.com"]
[:timezone "-5"]]] )
189 changes: 189 additions & 0 deletions src/java/com/snowtide/pdf/lucene/LucenePDFConfiguration.java
@@ -0,0 +1,189 @@
package com.snowtide.pdf.lucene;

import java.util.HashMap;
import java.util.Map;

/**
* Instances of this class are used to control the creation of Lucene Documents from PDF content
* through the {@link LucenePDFDocumentFactory} class.
*
* @see <a href="http://www.snowtide.com/help/indexing-pdf-documents-with-lucene-and-pdfxstream">Indexing PDF
* Documents with Lucene and PDFxStream</a> for usage details
*
* @version ©2004-2014 Snowtide, http://snowtide.com, licensed under MIT. See LICENSE in the top level of the
* <a href="https://github.com/snowtide/lucene-pdf">lucene-pdf</a> project directory.
*/
public class LucenePDFConfiguration {

    /**
     * The default name assigned to the Lucene Field containing the main body of text extracted from a PDF file:
     * <code>"text"</code>.
     */
    public static final String DEFAULT_MAIN_TEXT_FIELD_NAME = "text";

    /**
     * Mapping from PDF metadata keys to Lucene document field names.
     */
    private final Map<String, String> metadataFieldMapping = new HashMap<String, String>();
    private boolean copyAllPDFMetadata = true;
    private boolean indexBodyText = true;
    private boolean storeBodyText = false;
    private boolean tokenizeBodyText = true;
    private boolean indexMetadata = true;
    private boolean storeMetadata = true;
    private boolean tokenizeMetadata = true;
    private String bodyTextFieldName;

    /**
     * Creates a new config object. The resulting object retains the default configuration
     * except for the name assigned to the Lucene Field that contains the main PDF text content.
     *
     * @param mainTextFieldName - the name that should be assigned to Fields containing
     * the main PDF text content.
     */
    public LucenePDFConfiguration (String mainTextFieldName) {
        // Assign the field directly instead of going through the public setter: an
        // overridable method invoked from a constructor runs before any subclass
        // field initializers, so a subclass override could observe null state.
        this.bodyTextFieldName = mainTextFieldName;
    }

    /**
     * Creates a new config object. Fields containing the main text content of
     * {@link com.snowtide.pdf.Document PDF documents} converted into
     * Lucene Documents will be assigned a {@link LucenePDFConfiguration#DEFAULT_MAIN_TEXT_FIELD_NAME
     * default name}. Other configuration defaults are as follows:
     * <ul>
     * <li>All PDF metadata attributes are copied to the resulting Lucene documents</li>
     * <li>The main text content is tokenized and indexed, but not stored</li>
     * <li>The PDF metadata attributes are tokenized, stored, and indexed.</li>
     * </ul>
     */
    public LucenePDFConfiguration () {
        this(DEFAULT_MAIN_TEXT_FIELD_NAME);
    }

    /**
     * Sets the name that will be assigned to Lucene Fields containing PDF body text content.
     *
     * @param bodyTextFieldName the new field name; stored as given, without validation
     */
    public void setBodyTextFieldName (String bodyTextFieldName) {
        this.bodyTextFieldName = bodyTextFieldName;
    }

    /**
     * Returns the name that will be assigned to Lucene Fields containing PDF body text content.
     *
     * @return the name most recently supplied via the constructor or
     * {@link #setBodyTextFieldName(String)}
     */
    public String getBodyTextFieldName () {
        return bodyTextFieldName;
    }

    /**
     * Returns a copy of the mapping between PDF metadata attributes and the names given to Lucene fields created for
     * them. Changes made to the returned map do not affect this configuration.
     *
     * @return a new, independent map of PDF metadata attribute name to Lucene field name
     */
    public Map<String,String> getMetadataFieldMapping () {
        return new HashMap<String,String>(metadataFieldMapping);
    }

    /**
     * Returns the name that should be given to Lucene Fields created from the value of the named PDF metadata
     * attribute.
     *
     * @param pdfMetadataAttr the PDF metadata attribute name to look up
     * @return the mapped Lucene field name, or <code>null</code> if no mapping has been set for the attribute
     */
    public String getMetadataFieldMapping (String pdfMetadataAttr) {
        return metadataFieldMapping.get(pdfMetadataAttr);
    }

    /**
     * Sets the name that will be assigned to Lucene Fields corresponding to the provided PDF metadata attribute
     * name (e.g. {@link com.snowtide.pdf.Document#ATTR_AUTHOR}, etc). Any mapping previously set for the same
     * attribute is replaced.
     *
     * @param pdfMetadataAttr the PDF metadata attribute name
     * @param fieldName the Lucene field name to use for that attribute
     */
    public void setMetadataFieldMapping (String pdfMetadataAttr, String fieldName) {
        metadataFieldMapping.put(pdfMetadataAttr, fieldName);
    }

    /**
     * Returns true if any PDF metadata attributes not explicitly {@link #getMetadataFieldMapping() mapped} will
     * be added to generated Lucene Documents using their names as specified in the source PDFs.
     *
     * @return the current setting; <code>true</code> by default
     */
    public boolean copyAllPDFMetadata() {
        return copyAllPDFMetadata;
    }

    /**
     * Sets whether unmapped PDF metadata attributes are copied to generated Lucene Documents.
     *
     * @param b the new setting
     * @see LucenePDFConfiguration#copyAllPDFMetadata()
     */
    public void setCopyAllPDFMetadata(boolean b) {
        copyAllPDFMetadata = b;
    }

    /**
     * Sets Field attributes that will be used when creating the Field object for the main text content of
     * a PDF document. These attributes correspond to the <code>store</code>,
     * <code>index</code>, and <code>token</code> parameters of the {@link org.apache.lucene.document.Field}
     * constructor before Lucene v4.x and the same-named attributes of {@link org.apache.lucene.document.FieldType}
     * afterwards.
     *
     * @param store whether the body text should be stored
     * @param index whether the body text should be indexed
     * @param token whether the body text should be tokenized
     */
    public void setBodyTextSettings (boolean store, boolean index, boolean token) {
        storeBodyText = store;
        indexBodyText = index;
        tokenizeBodyText = token;
    }

    /**
     * Sets Field attributes that will be used when creating Field objects for the document attributes found in
     * a PDF document. These attributes correspond to the <code>store</code>,
     * <code>index</code>, and <code>token</code> parameters of the {@link org.apache.lucene.document.Field}
     * constructor before Lucene v4.x and the same-named attributes of {@link org.apache.lucene.document.FieldType}
     * afterwards.
     *
     * @param store whether metadata values should be stored
     * @param index whether metadata values should be indexed
     * @param token whether metadata values should be tokenized
     */
    public void setMetadataSettings (boolean store, boolean index, boolean token) {
        storeMetadata = store;
        indexMetadata = index;
        tokenizeMetadata = token;
    }

    /**
     * Returns true if the main body text of PDFs added to Lucene Documents created through
     * {@link LucenePDFDocumentFactory} using this config object will be indexed.
     */
    public boolean indexBodyText () {
        return indexBodyText;
    }

    /**
     * Returns true if the main body text of PDFs added to Lucene Documents created through
     * {@link LucenePDFDocumentFactory} using this config object will be stored.
     */
    public boolean storeBodyText () {
        return storeBodyText;
    }

    /**
     * Returns true if the main body text of PDFs added to Lucene Documents created through
     * {@link LucenePDFDocumentFactory} using this config object will be tokenized.
     */
    public boolean tokenizeBodyText () {
        return tokenizeBodyText;
    }

    /**
     * Returns true if the PDF metadata attributes added to Lucene Documents created through
     * {@link LucenePDFDocumentFactory} using this config object will be indexed.
     */
    public boolean indexMetadata () {
        return indexMetadata;
    }

    /**
     * Returns true if the PDF metadata attributes added to Lucene Documents created through
     * {@link LucenePDFDocumentFactory} using this config object will be stored.
     */
    public boolean storeMetadata () {
        return storeMetadata;
    }

    /**
     * Returns true if the PDF metadata attributes added to Lucene Documents created through
     * {@link LucenePDFDocumentFactory} using this config object will be tokenized.
     */
    public boolean tokenizeMetadata () {
        return tokenizeMetadata;
    }
}

0 comments on commit ca89c04

Please sign in to comment.