Skip to content

Commit

Permalink
Unicode 7.0
Browse files Browse the repository at this point in the history
Closes PR 14971
  • Loading branch information
mflatt committed Feb 9, 2015
1 parent fe68c9a commit 9c7d0b8
Show file tree
Hide file tree
Showing 7 changed files with 5,282 additions and 3,105 deletions.
2 changes: 2 additions & 0 deletions pkgs/racket-doc/scribblings/reference/chars.scrbl
Expand Up @@ -21,6 +21,8 @@ reader are @tech{interned} in @racket[read-syntax] mode.

@see-read-print["character"]{characters}

@history[#:changed "6.1.1.8" @elem{Updated from Unicode 5.0.1 to Unicode 7.0.0.}]

@; ----------------------------------------
@section{Characters and Scalar Values}

Expand Down
2 changes: 1 addition & 1 deletion pkgs/racket-test-core/tests/racket/uni-norm.rktl
Expand Up @@ -16,7 +16,7 @@

(define (get-test-file)
(define name "NormalizationTest.txt")
(define base "http://www.unicode.org/Public/5.0.0/ucd/")
(define base "http://www.unicode.org/Public/7.0.0/ucd/")
(define here (current-load-relative-directory))
(or (for/or ([dir (list here (current-directory))])
(define path (build-path dir name))
Expand Down
14 changes: 9 additions & 5 deletions pkgs/racket-test-core/tests/racket/unicode.rktl
Expand Up @@ -1469,7 +1469,8 @@
#\u3000
;; Post SRFI-14?
#\u205F
#\u180E))
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
))

;; Punctuation in Latin-1:
(check-all-latin-1
Expand Down Expand Up @@ -1498,8 +1499,10 @@
#\{
#\}
#\u00A1
#\u00A7 ;; Made punctuation in Unicode 7.0
#\u00AB
;; #\u00AD ;; Treated as a control character now?
#\u00B6 ;; Made punctuation in Unicode 7.0
#\u00B7
#\u00BB
#\u00BF))
Expand All @@ -1521,7 +1524,7 @@
#\u00A4
#\u00A5
#\u00A6
#\u00A7
;; #\u00A7 ;; Made punctuation in Unicode 7.0
#\u00A8
#\u00A9
#\u00AC
Expand All @@ -1530,7 +1533,7 @@
#\u00B0
#\u00B1
#\u00B4
#\u00B6
;; #\u00B6 ;; Made punctuation in Unicode 7.0
#\u00B8
#\u00D7
#\u00F7))
Expand Down Expand Up @@ -1558,7 +1561,7 @@
#\u3000
;; Post SRFI-14?
#\u205F
#\u180E
;; #\u180E --- in Unicode 7, this code point changed from Zs to Cf
))


Expand All @@ -1579,7 +1582,8 @@
(if (char-numeric? c) 1 0)
(if (char-punctuation? c) 1 0)
(if (char-symbolic? c)
(if (char<=? #\u24B6 c #\u24E9)
(if (or (char<=? #\u24B6 c #\u24E9)
(char<=? #\U1F130 c #\U1F189)) ;; added in Unicode 6.0
0 ;; Those are both alphabetic and symbolic
1)
0))
Expand Down
50 changes: 42 additions & 8 deletions racket/src/racket/src/mk-uchar.rkt
Expand Up @@ -277,6 +277,21 @@
(hash-set! do-not-compose-ht code #t))))
(loop))))))

(define (composition-key a b)
;; If `a` and `b` are both in the BMP (i.e., both fit in 16 bits),
;; map to a 32-bit key.
(bitwise-ior (arithmetic-shift (bitwise-and a #xFFFF) 16)
(bitwise-and b #xFFFF)
(arithmetic-shift
(bitwise-ior (arithmetic-shift (arithmetic-shift a -16)
5)
(arithmetic-shift b -16))
32)))

(define (composition-key-first k)
(bitwise-ior (bitwise-and (arithmetic-shift k -16) #xFFFF)
(arithmetic-shift (arithmetic-shift k -37) 16)))

(define (extract-decomp decomp code)
(if (string=? decomp "")
#f
Expand All @@ -293,9 +308,9 @@
code
(lambda () #f))))
(hash-set! compose-initial-ht a #t)
(let ([key (bitwise-ior (arithmetic-shift a 16) b)])
(let ([key (composition-key a b)])
(when (hash-ref compose-map key (lambda () #f))
(error 'decomp "composition already mapped: ~e" key))
(error 'decomp "composition already mapped: ~x for: ~x" key code))
(hash-set! compose-map key code)))
(hash-set! decomp-ht code (cons a b))
#t)
Expand Down Expand Up @@ -423,7 +438,7 @@
;; 4.0, there are only four of these: U+0344, U+0F73,
;; U+0F75, and U+0F81.
(for-each (lambda (k)
(let ([a (arithmetic-shift k -16)])
(let ([a (composition-key-first k)])
(unless (zero? (hash-ref combining-class-ht a))
(hash-remove! compose-map k))))
(hash-map compose-map (lambda (k v) k)))
Expand Down Expand Up @@ -734,11 +749,21 @@


(let ()
(define (make-composes-table ps)
(list->vector (sort ps (lambda (a b) (< (car a) (car b))))))

(define canon-composes
(list->vector (sort (hash-map compose-map cons)
(lambda (a b) (< (car a) (car b))))))
(define count (hash-count compose-map))

(make-composes-table (for/list ([(k v) (in-hash compose-map)]
#:when (k . <= . #xFFFFFFFF))
(cons k v))))
(define count (vector-length canon-composes))

(define long-canon-composes
(make-composes-table (for/list ([(k v) (in-hash compose-map)]
#:when (k . > . #xFFFFFFFF))
(cons k v))))
(define long-count (vector-length long-canon-composes))

(define-values (all-composes decomp-vector long-composes)
(let ([decomp-pos-ht (make-hasheq)]
[counter count]
Expand All @@ -748,7 +773,7 @@
(hash-for-each decomp-ht
(lambda (k v)
;; Use table of composed shorts:
(let ([key (+ (arithmetic-shift (car v) 16) (cdr v))])
(let ([key (composition-key (car v) (cdr v))])
(let ([pos
(if (and ((car v) . <= . #xFFFF)
((cdr v) . <= . #xFFFF))
Expand Down Expand Up @@ -813,6 +838,15 @@
(printf " the mapped index, negate, then multiply by 2 to find the pair. */\n")
(print-compose-data "unsigned int" "compose_long_pairs" values long-composes (vector-length long-composes) #t 8)
(printf "\n")
(printf "/* utable_canon_compose_long_pairs repeats information from utable_compose_long_pairs,\n")
(printf " but for canonical compositions only. The two characters are combined by putting the\n")
(printf " lower 16 bits of the combined numbers in the low 32 bits, and then the next higher 10\n")
(printf " bits provide the remaining 5 bits of each character, and the array is sorted. The\n")
(printf " canon_compose_long_result array provides in parellel the composed character. */\n")
(printf "#define LONG_COMPOSE_TABLE_SIZE ~a\n\n" long-count)
(print-compose-data "mzlonglong" "canon_compose_long_pairs" car long-canon-composes long-count #t 8)
(print-compose-data "unsigned int" "canon_compose_long_result" cdr long-canon-composes long-count #t 8)
(printf "\n")
(printf "/* utable_decomp_keys identifies characters that have a canonical decomposition;\n")
(printf " it is sorted, so binary search can be used, but use scheme_needs_decompose()\n")
(printf " from scheme.h to first determine whether a character may have a mapping in this table.\n")
Expand Down

0 comments on commit 9c7d0b8

Please sign in to comment.