Skip to content

Commit

Permalink
str_utils2.clj: added codepoints and docodepoints
Browse files Browse the repository at this point in the history
  • Loading branch information
Stuart Sierra committed Jun 5, 2009
1 parent 66d0c22 commit bea244b
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 19 deletions.
93 changes: 74 additions & 19 deletions src/clojure/contrib/str_utils2.clj
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,60 @@
(:require [clojure.contrib.java-utils :as j])
(:import (java.util.regex Pattern)))

(defmacro dochars
  "bindings => [name string]
  Repeatedly executes body, with name bound to each character in
  string.  Does NOT handle Unicode supplementary characters (above
  U+FFFF)."
  [bindings & body]
  ;; vector? is the predicate; (vector x) builds a vector and is always
  ;; truthy, so it never rejected malformed bindings.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  ;; Evaluate the string expression once and hint it so .length/.charAt
  ;; are resolved without reflection.
  `(let [#^String s# ~(second bindings)]
     (dotimes [i# (.length s#)]
       (let [~(first bindings) (.charAt s# i#)]
         ~@body))))


(defmacro docodepoints
  "bindings => [name string]
  Repeatedly executes body, with name bound to the integer code point
  of each Unicode character in the string.  Handles Unicode
  supplementary characters (above U+FFFF) correctly.  An unpaired
  surrogate is passed to body as its own code point."
  [bindings & body]
  ;; vector? is the predicate; (vector x) is always truthy.
  (assert (vector? bindings))
  (assert (= 2 (count bindings)))
  (let [character (first bindings)
        string (second bindings)]
    `(let [#^String s# ~string
           len# (.length s#)]
       (loop [i# 0]
         (when (< i# len#)
           ;; codePointAt combines a valid surrogate pair and returns a
           ;; lone surrogate as-is; charCount then says how many chars
           ;; were consumed (2 or 1), so an unpaired high surrogate no
           ;; longer causes the following char to be skipped.
           (let [cp# (.codePointAt s# i#)
                 ~character cp#]
             ~@body
             (recur (+ i# (Character/charCount cp#)))))))))

(defn codepoints
  "Returns a lazy sequence of integer Unicode code points in s.  Handles
  Unicode supplementary characters (above U+FFFF) correctly.  An
  unpaired surrogate appears in the result as its own code point."
  [#^String s]
  (let [len (.length s)
        ;; Each step is wrapped in its own lazy-seq so the sequence is
        ;; realized one element at a time; wrapping only the outermost
        ;; call left the recursion eager and overflowed the stack on
        ;; long strings.
        step (fn step [i]
               (lazy-seq
                 (when (< i len)
                   (let [cp (.codePointAt s (int i))]
                     ;; charCount is 2 for a combined surrogate pair and
                     ;; 1 otherwise, so a lone high surrogate does not
                     ;; swallow the char after it.
                     (cons cp (step (+ i (Character/charCount cp))))))))]
    (step 0)))

(defn escape
"Escapes characters in string according to a cmap, a function or map
from characters to their replacements."
Expand All @@ -56,14 +98,10 @@
(.append buffer c)))
(.toString buffer)))

(defn escape-pattern
  "Returns s with each regex metacharacter preceded by a backslash, so
  the result can be compiled into a pattern matching s literally."
  [#^String s]
  ;; { } and | are metacharacters too (quantifier braces, alternation);
  ;; leaving them unescaped let literal text change the pattern's meaning.
  (escape s (fn [c] (when (#{\\ \[ \] \. \^ \$ \? \* \+ \( \) \{ \} \|} c)
                      (str \\ c)))))

(defn as-pattern
  "Returns re unchanged when it is already a java.util.regex.Pattern.
  Otherwise compiles a Pattern that matches the string form of re (as
  produced by j/as-str) literally."
  [re]
  (if (instance? Pattern re)
    re
    ;; Pattern/quote is the standard-library way to neutralize every
    ;; regex metacharacter in the text.
    (Pattern/compile (Pattern/quote (j/as-str re)))))

(defn blank?
"True if s is nil, empty, or contains only whitespace."
Expand Down Expand Up @@ -100,12 +138,20 @@
(.substring s (- (count s) n))))

;; Dispatches on the classes of pattern and replacement; the string
;; being searched takes no part in method selection.
(defmulti
  #^{:doc "Replaces all instances of pattern in string with replacement.
  Allowed argument types for pattern and replacement are:
   1. String and String
   2. Character and Character
   3. regex Pattern and String
      (Uses java.util.regex.Matcher.replaceAll)
   4. regex Pattern and function
      (Calls function with re-groups of each match, uses return
       value as replacement.)"
     :arglists '([string pattern replacement])}
  replace
  (fn [#^String string pattern replacement]
    [(class pattern) (class replacement)]))

(defmethod replace [String String] [#^String s #^String a #^String b]
(.replace s a b))
Expand All @@ -127,14 +173,23 @@
(.toString buffer)))))))

;; Dispatches on the classes of pattern and replacement; the string
;; being searched takes no part in method selection.
(defmulti
  #^{:doc "Replaces the first instance of pattern in s with replacement.
  Allowed argument types for pattern and replacement are:
   1. String and String
   2. regex Pattern and String
      (Uses java.util.regex.Matcher.replaceFirst)
   3. regex Pattern and function"
     :arglists '([s pattern replacement])}
  replace-first
  (fn [s pattern replacement]
    [(class pattern) (class replacement)]))

(defmethod replace-first [String String] [#^String s #^String pattern #^String replacement]
  ;; Pattern/quote returns a String, not a Pattern, so handing it to
  ;; re-matcher threw ClassCastException on every call.  String.replaceFirst
  ;; accepts the quoted regex text directly.  The replacement is quoted as
  ;; well so that "$" and "\" in it are inserted literally rather than read
  ;; as group references, matching the literal String/String contract.
  (.replaceFirst s
                 (Pattern/quote pattern)
                 (java.util.regex.Matcher/quoteReplacement replacement)))

;; Regex pattern with a String replacement: the replacement is passed to
;; java.util.regex.Matcher.replaceFirst as-is.
(defmethod replace-first [Pattern String] [#^String s #^Pattern re #^String replacement]
  (.replaceFirst (re-matcher re s) replacement))

(defmethod replace-first [Pattern clojure.lang.IFn] [#^String s #^Pattern re f]
Expand Down
5 changes: 5 additions & 0 deletions src/clojure/contrib/test_contrib/str_utils2.clj
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,8 @@
;; Only the leading "foo" may be rewritten; the trailing one must survive.
(deftest t-replace-first
  (let [input "foobarfoo"
        re #"foo"]
    ;; Pattern + String replacement.
    (is (= "barbarfoo" (s/replace-first input re "bar")))
    ;; Pattern + function replacement.
    (is (= "FOObarfoo" (s/replace-first input re s/upper-case)))))

;; \uD800\uDC00 is the surrogate pair for U+10000 (65536); it must come
;; back as one code point, not two chars.
(deftest t-codepoints
  (is (= (list 102 111 111 65536 98 97 114)
         (s/codepoints "foo\uD800\uDC00bar"))
      "Handles Unicode supplementary characters"))

0 comments on commit bea244b

Please sign in to comment.