forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utf8.jl
104 lines (84 loc) · 3.09 KB
/
utf8.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
## from src/boot.jl:
#
# type UTF8String <: String
# data::Array{Uint8,1}
# end
#
## basic UTF-8 decoding & iteration ##
# Bias that `next` subtracts after summing the raw bytes of one sequence.
# Entry k+1 belongs to a lead byte followed by k trailing bytes and equals
# the marker bits of those bytes after the same 6-bit shifts (wrapped to
# 32 bits), e.g. (0xc0 << 6) + 0x80 == 0x3080 for a two-byte sequence.
const _jl_utf8_offset = [
    0x00000000,  # 0 trailing bytes
    0x00003080,  # 1 trailing byte
    0x000e2080,  # 2 trailing bytes
    0x03c82080,  # 3 trailing bytes
    0xfa082080,  # 4 trailing bytes
    0x82082080,  # 5 trailing bytes
]
# Number of trailing bytes announced by the first byte of a sequence.
# `next` reads this as `_jl_utf8_trailing[byte+1]`, so entry 1 is for byte
# 0x00 and entry 256 for byte 0xff; each row below covers 32 byte values.
# Bytes 0x00-0xbf map to 0, 0xc0-0xdf to 1, 0xe0-0xef to 2, 0xf0-0xf7 to 3,
# 0xf8-0xfb to 4 and 0xfc-0xff to 5.
const _jl_utf8_trailing = [
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]
# True unless `byte` has the bit pattern 10xxxxxx, i.e. unless it is a
# continuation byte that cannot begin a character.
function is_utf8_start(byte::Uint8)
    return (byte & 0xc0) != 0x80
end
## required core functionality ##
# The length of a UTF8String is the number of elements in its byte array
# `s.data`.
function length(s::UTF8String)
    return length(s.data)
end
# Hands the string's byte array to the C routine `u8_strlen` and returns
# its result as an Int.
function strlen(s::UTF8String)
    bytes = s.data
    return ccall(:u8_strlen, Int, (Ptr{Uint8},), bytes)
end
# Decode the character whose encoding starts at byte index `i`.
# Returns the pair (character, index of the byte following it).
# Raises an error when `i` points at a continuation byte, or when the
# sequence announced by the lead byte would run past the end of the data.
function next(s::UTF8String, i::Int)
    bytes = s.data
    lead = bytes[i]
    if !is_utf8_start(lead)
        error("invalid UTF-8 character index")
    end
    ntrail = _jl_utf8_trailing[lead+1]
    if i + ntrail > length(bytes)
        error("premature end of UTF-8 data")
    end
    # Fold in the lead byte and each trailing byte, six bits at a time;
    # shifting the initial zero on the first pass changes nothing.
    acc = uint32(0)
    for k = 1:(ntrail+1)
        acc <<= 6
        acc += bytes[i]
        i += 1
    end
    # Remove the marker bits that were summed together with the payload.
    acc -= _jl_utf8_offset[ntrail+1]
    return (char(acc), i)
end
# First byte of the UTF-8 encoding of `c`: the character itself below 0x80,
# otherwise its leading bits combined with the 0xc0, 0xe0 or 0xf0 prefix
# selected by the size of the code point.
function first_utf8_byte(c::Char)
    if c < 0x80
        return uint8(c)
    elseif c < 0x800
        return uint8((c >> 6) | 0xc0)
    elseif c < 0x10000
        return uint8((c >> 12) | 0xe0)
    else
        return uint8((c >> 18) | 0xf0)
    end
end
## overload methods for efficiency ##
# An index is valid when it lies inside the byte array and the byte there
# can begin a character.
function isvalid(s::UTF8String, i::Integer)
    bytes = s.data
    if i < 1 || i > length(bytes)
        return false
    end
    return is_utf8_start(bytes[i])
end
# Substring by byte range. A start that is not a valid character index is
# moved forward with `nextind`; the end is extended to the byte just before
# the index that follows `last(r)`, so the final character is kept whole.
function ref(s::UTF8String, r::Range1{Int})
    lo = first(r)
    if !isvalid(s, lo)
        lo = nextind(s, lo)
    end
    hi = nextind(s, last(r)) - 1
    return UTF8String(s.data[lo:hi])
end
# Search for character `c` in `s`, starting at byte index `i`.
# Characters below 0x80 are looked up directly with `memchr`. For any other
# character, `memchr` locates candidates by the character's first byte and
# each candidate is decoded and compared; on a mismatch the search resumes
# after that candidate. A result of 0 from `memchr` is returned as-is
# (presumably meaning "not found" -- confirm against `memchr`).
function strchr(s::UTF8String, c::Char, i::Integer)
    if c < 0x80
        return memchr(s.data, c, i)
    end
    lead = first_utf8_byte(c)
    pos = i
    while true
        pos = memchr(s.data, lead, pos)
        if pos == 0 || s[pos] == c
            return pos
        end
        pos = next(s, pos)[2]
    end
end
# Concatenate byte strings into a UTF8String. At least one argument must be
# UTF-8, otherwise the ASCII-only method would get called instead of this one.
function strcat(a::ByteString, b::ByteString, c::ByteString...)
    return UTF8String(memcat(a, b, c...))
end
# Build a new string by writing `f(c)` for every character `c` of `s`.
# `f` must return a Char (asserted on each call). The output is gathered by
# `sprint`, which is given `length(s)` as its first argument (presumably a
# size hint -- confirm against `sprint`).
function transform_to_utf8(s::String, f::Function)
    function emit(io)
        for c in s
            write(io, f(c)::Char)
        end
    end
    return sprint(length(s), emit)
end
# Upper-case a UTF8String by mapping `uppercase` over its characters.
function uppercase(s::UTF8String)
    return transform_to_utf8(s, uppercase)
end
# Lower-case a UTF8String by mapping `lowercase` over its characters.
function lowercase(s::UTF8String)
    return transform_to_utf8(s, lowercase)
end
# Upper-case the first character and append the rest of the string, taken
# as the byte range starting at index 2.
function ucfirst(s::UTF8String)
    head = uppercase(s[1])
    tail = s[2:]
    return string(head, tail)
end
# Lower-case the first character and append the rest of the string, taken
# as the byte range starting at index 2.
function lcfirst(s::UTF8String)
    head = lowercase(s[1])
    tail = s[2:]
    return string(head, tail)
end
## outputting UTF-8 strings ##
# Print a UTF8String by writing its byte array to `io`; the result of the
# write is discarded and `nothing` is returned.
function print(io::IO, s::UTF8String)
    write(io, s.data)
    return nothing
end
# Write a UTF8String by writing its byte array to `io`, returning whatever
# that underlying write returns.
function write(io, s::UTF8String)
    return write(io, s.data)
end
## transcoding to UTF-8 ##
# Shorthand for converting any value to a UTF8String.
function utf8(x)
    return convert(UTF8String, x)
end
# A UTF8String converts to itself; the same object is returned.
function convert(::Type{UTF8String}, s::UTF8String)
    return s
end
# An ASCIIString converts by wrapping its existing byte array in a
# UTF8String.
function convert(::Type{UTF8String}, s::ASCIIString)
    return UTF8String(s.data)
end
# A byte array converts by wrapping it in a UTF8String and passing the
# result through `check_utf8` (presumably a validity check -- confirm
# against its definition).
function convert(::Type{UTF8String}, a::Array{Uint8,1})
    wrapped = UTF8String(a)
    return check_utf8(wrapped)
end
# Any other String converts by first going through `cstring` and then
# through `utf8`.
function convert(::Type{UTF8String}, s::String)
    return utf8(cstring(s))
end