From 15c338a9d07d262c52887ad644392c61d7952ef0 Mon Sep 17 00:00:00 2001
From: Richard Hull
Date: Tue, 31 May 2016 21:39:01 +0100
Subject: [PATCH] Initial port from .cljs -> .clj

---
 src/boyer_moore/core.clj       | 140 +++++++++++++++++++++++++++++++++
 test/boyer_moore/core_test.clj |  37 ++++++++
 2 files changed, 177 insertions(+)
 create mode 100644 src/boyer_moore/core.clj
 create mode 100644 test/boyer_moore/core_test.clj

diff --git a/src/boyer_moore/core.clj b/src/boyer_moore/core.clj
new file mode 100644
index 0000000..ebdf69e
--- /dev/null
+++ b/src/boyer_moore/core.clj
@@ -0,0 +1,140 @@
+(ns boyer-moore.core)
+
+; ported directly from the Java version at wikipedia:
+; http://en.wikipedia.org/wiki/Boyer_moore#Implementations
+
+(defn- char-code-at
+  "Returns the integer code of the character at index i of s."
+  [s i]
+  (int (get s i)))
+
+(defn- char=
+  "Compares the element at index i of s1 with the element at index j of s2
+  (or two positions of the same string in the three-argument form).
+  Out-of-range indices read as nil."
+  ([s i j]
+   (char= s s i j))
+
+  ([s1 s2 i j]
+   (= (get s1 i) (get s2 j))))
+
+(defn- prefix?
+  "Is needle[p:end] a prefix of needle? The empty tail (p = length of
+  needle) counts as a prefix."
+  [needle p]
+  (let [len (count needle)]
+    (loop [i p
+           j 0]
+      (cond
+        (>= i len)
+        true
+
+        ; a mismatch inside the tail means it is not a prefix
+        (not (char= needle i j))
+        false
+
+        :else
+        (recur (inc i) (inc j))))))
+
+(defn- suffix-length
+  "Returns the maximum length of the substring that ends at p and is a
+  suffix of needle."
+  [needle p]
+  (loop [i p
+         j (dec (count needle))
+         len 0]
+    ; i may reach 0: a match that runs to the start of needle still counts
+    (if (and (>= i 0) (char= needle i j))
+      (recur (dec i) (dec j) (inc len))
+      len)))
+
+(defn- make-char-table
+  "Makes the jump table based on the mismatched character information.
+  Returns a map from character code to shift distance, holding only the
+  characters of needle (its last character excluded); callers look up
+  any other character with a default of (count needle). A map rather
+  than a 256-slot vector keeps characters above code 255 working."
+  [needle]
+  (let [len (count needle)]
+    (loop [i 0
+           table (transient {})]
+      (if-not (< i (dec len))
+        (persistent! table)
+        (recur
+          (inc i)
+          (assoc!
+            table
+            (char-code-at needle i)
+            (- len 1 i)))))))
+
+(defn- calc-prefixes
+  "First pass of the offset table: for each count of matched trailing
+  characters, the shift that aligns the nearest suffix of needle which
+  is also a prefix of needle."
+  [needle]
+  (let [len (count needle)]
+    (loop [i (dec len)
+           last-posn len
+           table (transient (vec (repeat len 0)))]
+      (if-not (>= i 0)
+        (persistent! table)
+        ; the candidate prefix is the tail starting just after position i
+        (let [last-posn (if (prefix? needle (inc i)) (inc i) last-posn)]
+          (recur
+            (dec i)
+            last-posn
+            (assoc!
+              table
+              (- len 1 i)
+              (+ last-posn (- i) len -1))))))))
+
+(defn- make-offset-table
+  "Makes the jump table based on the scan offset at which the mismatch occurs."
+  [needle]
+  (let [len (count needle)]
+    (loop [i 0
+           table (transient (calc-prefixes needle))]
+      (if-not (< i (dec len))
+        (persistent! table)
+        (let [slen (suffix-length needle i)]
+          (recur
+            (inc i)
+            (assoc! table slen (+ len -1 (- i) slen))))))))
+
+(defn index-of
+  "Returns the index within the string of the first occurrence of the
+  specified substring. If it is not a substring, return nil.
+
+    haystack - the string to be scanned
+    needle   - the target string to search
+    offset   - index in haystack at which to start scanning (default 0)"
+  ([haystack needle]
+   (index-of haystack needle 0))
+
+  ([haystack needle offset]
+   (let [len (count needle)
+         m1 (dec len)]
+     (if (zero? len)
+       offset
+       (let [char-table (make-char-table needle)
+             offset-table (make-offset-table needle)
+             ; next scan position: the larger of the two table shifts
+             calc-offset (fn [i j] (+ i
+                                     (max
+                                       (get offset-table (- m1 j))
+                                       (get char-table (char-code-at haystack i) len))))]
+         (loop [i (+ offset m1)
+                j m1]
+           (cond
+             (>= i (count haystack))
+             nil
+
+             ; whole needle matched; i sits one before the match start
+             (neg? j)
+             (inc i)
+
+             (char= haystack needle i j)
+             (recur (dec i) (dec j))
+
+             :else
+             (recur (calc-offset i j) m1))))))))
\ No newline at end of file
diff --git a/test/boyer_moore/core_test.clj b/test/boyer_moore/core_test.clj
new file mode 100644
index 0000000..b884925
--- /dev/null
+++ b/test/boyer_moore/core_test.clj
@@ -0,0 +1,37 @@
+(ns boyer-moore.core-test
+  (:require
+    [clojure.test :refer :all]
+    [boyer-moore.core :refer [index-of]]))
+
+(deftest boyer-moore->string
+  (is (= (index-of "Hello world" "") 0))
+  (is (= (index-of "Hello world" "" 3) 3))
+  (is (= (index-of "Hello world" "Hello") 0))
+  (is (= (index-of "Hello world" "world") 6))
+  (is (= (index-of "Hello world" "world" 4) 6))
+  (is (= (index-of "Hello world" "o wo") 4))
+  (is (= (index-of "Hello world" "not") nil))
+  (is (= (index-of "svertices vertices" "vertices") 1))
+  (is (= (index-of "FFS :svertices :vertices" ":vertices") 15)))
+
+(deftest boyer-moore->repeated-patterns
+  (is (= (index-of "xcabab" "abab") 2))
+  (is (= (index-of "bbbabb" "abb") 3))
+  (is (= (index-of "aaa" "aa") 0)))
+
+(deftest boyer-moore->non-latin1
+  (is (= (index-of "\u03b1\u03b2\u03b3\u03b4" "\u03b3\u03b4") 2)))
+
+;(deftest boyer-moore->dataview
+;  (let [data (str
+;               "Rent a flat above a shop!\n"
+;               "Cut your hair and get a job!\n"
+;               "Smoke some fags and play some pool.")
+;        dataview (create-dataview (count data))]
+;
+;    (set-binary-data! dataview 0 (seq data))
+;
+;    (is= (index-of dataview "") 0 "Empty string")
+;    (is= (index-of dataview "Cut your") 26 "Match at start of 2nd line")
+;    (is= (index-of dataview "Common People") nil "No match")
+;    (is= (index-of dataview "above a shop" 26) nil "No match with offset")))
\ No newline at end of file