Permalink
Browse files

initial commit

  • Loading branch information...
sorenmacbeth committed Oct 18, 2011
0 parents commit 9e06746f22bac5be5f15e9b2d52681ea4480eb57
Showing with 74 additions and 0 deletions.
  1. +6 −0 .gitignore
  2. +13 −0 README
  3. +11 −0 project.clj
  4. +44 −0 src/clj/lucene_cascalog/core.clj
@@ -0,0 +1,6 @@
+pom.xml
+*jar
+/lib/
+/classes/
+.lein-failures
+.lein-deps-sum
13 README
@@ -0,0 +1,13 @@
+# lucene-cascalog
+
+FIXME: write description
+
+## Usage
+
+FIXME: write
+
+## License
+
+Copyright (C) 2011 FIXME
+
+Distributed under the Eclipse Public License, the same as Clojure.
@@ -0,0 +1,11 @@
+;; Leiningen project definition (pre-lein-2 layout: :source-path and
+;; :dev-dependencies are Leiningen 1.x keys).
+(defproject lucene-cascalog "1.0.0-SNAPSHOT"
+ :description "demo project using lucene from cascalog"
+ ;; Compile any Java sources with debug symbols in a forked JVM.
+ :javac-options {:debug "true" :fork "true"}
+ :source-path "src/clj"
+ ;; AOT-compile all namespaces — presumably so the Hadoop job jar
+ ;; ships .class files; TODO confirm this is required by the cluster.
+ :aot :all
+ :dependencies [[org.clojure/clojure "1.2.1"]
+ [cascalog "1.8.2"]
+ [org.apache.lucene/lucene-core "3.4.0"]]
+ ;; hadoop-core only needed locally; the cluster provides it at runtime.
+ :dev-dependencies [[org.apache.hadoop/hadoop-core "0.20.2-dev"]]
+ ;; Cascalog artifacts are published to the conjars repository.
+ :repositories {"conjars.org" "http://conjars.org/repo"})
+
@@ -0,0 +1,44 @@
+(ns lucene-cascalog.core
+ (:use cascalog.api)
+ (:import
+ org.apache.lucene.analysis.standard.StandardAnalyzer
+ org.apache.lucene.analysis.TokenStream
+ org.apache.lucene.util.Version
+ org.apache.lucene.analysis.tokenattributes.TermAttribute))
+
+(defn tokenizer-seq
+  "Lazily walk tokenizer, yielding the term text of each token.
+  term-att must be the TermAttribute registered on that same stream;
+  it is re-read after every successful incrementToken call."
+  [^TokenStream tokenizer ^TermAttribute term-att]
+  (lazy-seq
+    (if-not (.incrementToken tokenizer)
+      nil
+      (cons (.term term-att)
+            (tokenizer-seq tokenizer term-att)))))
+
+(defn load-analyzer
+  "Build a StandardAnalyzer that drops the given stopwords.
+  Pins Version/LUCENE_34 to match the lucene-core 3.4.0 dependency in
+  project.clj: LUCENE_CURRENT silently changes tokenization behavior
+  whenever the Lucene jar is upgraded, which Lucene's javadoc warns
+  against for anything whose output may be persisted."
+  [^java.util.Set stopwords]
+  (StandardAnalyzer. Version/LUCENE_34 stopwords))
+
+(defn tokenize-text
+  "Run page-text through the analyzer's token stream and return the
+  terms as a lazy seq of strings (via tokenizer-seq).
+  NOTE(review): the nil passed to .tokenStream is the Lucene field
+  name argument — assumed unused by StandardAnalyzer here; confirm.
+  The StringReader is deliberately not closed in this scope: the
+  returned seq is lazy, so the stream must stay open until consumed."
+  [^StandardAnalyzer analyzer page-text]
+ (let [reader (java.io.StringReader. page-text)
+ tokenizer (.tokenStream analyzer nil reader)
+ term-att (.addAttribute tokenizer TermAttribute)]
+ (tokenizer-seq tokenizer term-att)))
+
+(defn emit-tokens
+  "Wrap each token of tokens-seq in its own single-element group
+  (unigrams) — the per-tuple shape a cascalog mapcat op emits.
+  Fix: the original placed this docstring AFTER the arg vector, where
+  Clojure evaluates it as a discarded body expression instead of
+  attaching it as documentation; it also claimed to compute n-grams,
+  but (partition 1 1 ...) yields unigrams only."
+  [tokens-seq]
+  (partition 1 1 tokens-seq))
+
+(defmapcatop tokenize-string {:stateful true}
+  ;; 0-arg: per-task setup — one analyzer reused across all tuples.
+  ([] (load-analyzer StandardAnalyzer/STOP_WORDS_SET))
+  ;; 2-arg: per-tuple — emit one 1-tuple per token found in text.
+  ([analyzer text]
+     (emit-tokens (tokenize-text analyzer text)))
+  ;; 1-arg: per-task cleanup. Fix: the original returned nil without
+  ;; closing, leaking the analyzer's per-thread resources; Lucene's
+  ;; Analyzer is Closeable and should be closed when done.
+  ([analyzer] (.close ^StandardAnalyzer analyzer)))
+
+;; Cascalog job: read text lines from in-path (hfs textline tap),
+;; tokenize each line with tokenize-string, and write every
+;; (line, token) 2-tuple to out-path, replacing existing output.
+;; (:distinct false) keeps duplicate pairs rather than deduplicating.
+(defn tokenize-strings [in-path out-path]
+ (let [src (hfs-textline in-path)]
+ (?<- (hfs-textline out-path :sink-mode :replace)
+ [!line ?token]
+ (src !line)
+ (tokenize-string !line :> ?token)
+ (:distinct false))))
+

0 comments on commit 9e06746

Please sign in to comment.