Commit: Initial push
quinnj committed Jun 5, 2015
1 parent 6314cba commit 20e0612
Showing 3 changed files with 199 additions and 3 deletions.
108 changes: 107 additions & 1 deletion src/Strings.jl
@@ -1,5 +1,111 @@
module Strings

# package code goes here
using Compat, Mmap

abstract Encoding
abstract DirectIndexedEncoding <: Encoding

immutable ASCII <: DirectIndexedEncoding end
immutable Latin1 <: DirectIndexedEncoding end

immutable UTF8 <: Encoding end
immutable UTF16LE <: Encoding end
immutable UTF32LE <: DirectIndexedEncoding end
immutable UCS2LE <: DirectIndexedEncoding end

immutable UTF16BE <: Encoding end
immutable UTF32BE <: DirectIndexedEncoding end
immutable UCS2BE <: DirectIndexedEncoding end

if ENDIAN_BOM == 0x01020304
typealias UTF16 UTF16BE
typealias UTF32 UTF32BE
typealias UCS2 UCS2BE
elseif ENDIAN_BOM == 0x04030201
typealias UTF16 UTF16LE
typealias UTF32 UTF32LE
typealias UCS2 UCS2LE
else
error("seriously? what is this machine?")
end

immutable String{T<:Encoding}
ptr::Ptr{UInt8}
len::Int
end

import Base: ==  # extend Base's == rather than shadowing it inside the module

function ==(a::String, b::String)
na = sizeof(a)
nb = sizeof(b)
na != nb && return false
c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt),
a.ptr, b.ptr, na)
return c == 0
end

@StefanKarpinski (Collaborator), Jun 5, 2015:

The encodings have to match for this to be correct. You probably want a fallback that iterates through the code points and compares them individually.
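A split along those lines might look like the following (just a sketch in this package's 0.4-era syntax; it assumes String gets code-point iteration, which this commit doesn't implement yet):

```julia
# fast path: memcmp is only valid when the two encodings match
function =={T<:Encoding}(a::String{T}, b::String{T})
    sizeof(a) == sizeof(b) || return false
    return ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt),
                 a.ptr, b.ptr, sizeof(a)) == 0
end

# fallback for mixed encodings: compare decoded code points one by one
function ==(a::String, b::String)
    length(a) == length(b) || return false
    for (ca, cb) in zip(a, b)   # assumes iteration yields code points
        ca == cb || return false
    end
    return true
end
```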

@quinnj (Author), Jun 5, 2015:

So have =={T}(a::String{T}, b::String{T})? The fallback would be to compare different encodings?

@StefanKarpinski (Collaborator), Jun 5, 2015:

Right. The same string looks different in memory in a different encoding, so you can't use memcmp.

@ScottPJones (Collaborator), Jun 5, 2015:

Actually, there are a number of different important optimizations that can be made.
ASCII strings can be compared with UTF8 and Latin1 simply by memcmp.
ASCII and Latin1 can also be compared via a widening operation with UTF16, UCS2, and UTF32.
(Unfortunately, the vectorizer in LLVM apparently can't handle those sorts of loops with an early exit, but that is something I used to write special SSE or AVX code for, to make them just about as fast as memcmp.)
UCS2 can also be compared via widening to UTF32. Between UCS2 and UTF16, at least equality would work, but < and > wouldn't.
Similar optimizations can be done for conversions as well (one of us would need to extend check_string... now I'm thinking it needs a first argument, your encoding type, which I love!).
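The widening trick could be sketched roughly like this (a hypothetical method, not in this commit; it relies on Latin-1 code points mapping directly onto the same Unicode scalar values, which they do):

```julia
# compare a Latin1 string against a UCS2 string by widening each byte
# to 16 bits -- no decoding, but the early exit defeats LLVM's vectorizer
function ==(a::String{Latin1}, b::String{UCS2})
    na = sizeof(a)
    na == div(sizeof(b), 2) || return false
    p16 = convert(Ptr{UInt16}, b.ptr)
    for i = 1:na
        UInt16(unsafe_load(a.ptr, i)) == unsafe_load(p16, i) || return false
    end
    return true
end
```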

@quinnj (Author), Jun 5, 2015:

Great thoughts @ScottPJones. I've added you both as collaborators here, so feel free to push and pull at will. I'm really just throwing some ideas around at this point and wanted to do a quick sanity test. The initial performance benchmarks are good, with concatenation being 3x faster, but I haven't figured out a good way to compare substring-ing. I'm thinking there may be a small subset of functionality here that would be great to integrate quickly with SQLite and ODBC, since I deal so much with 3rd-party library strings and whatnot; it's incredibly efficient to be able to cheaply build strings from pointers and lengths rather than having to make copies (from the recent mailing list thread about the Facebook Kaggle competition data, SQLite can load the CSV in about 20 seconds where our own readcsv and even R solutions were taking minutes/hours).

@ScottPJones (Collaborator), Jun 6, 2015:

Thanks! I'll help as much as I can, you are basically taking care of a lot of stuff I was afraid I'd have to do myself for my project (and probably much better than me anyway, with your much greater Julia knowledge).
I'm not sure how it should be done, to handle the byte-swapped versions... I feel it's usually better just to do the byte-swapping once, and not try to add a bunch of code to deal with things like == between a byte-swapped and non-byte-swapped version... It's good to have the byte swapped versions to indicate for conversions, but other than that, I'd throw an error if somebody tried to operate directly on one... what do both of you think?
Also, how do you want to handle validation? A lot of the shortcuts depend on knowing, for example, that an ASCII string really does only contain 7-bit ASCII characters, and so is compatible both with Latin1 and UTF-8.
Again, I think this is amazing! 👍 💯

@ScottPJones (Collaborator), Jun 6, 2015:

BTW, I'd also been thinking about adding new corresponding Char types, that would help Julia figure out optimizations... for example, CharA would only hold 7-bit ASCII characters, Char8 would hold 8-bit Latin1 characters, Char16 would only hold a valid 16-bit Unicode UCS2 character [not UTF-16 code points, i.e. no surrogates], and, CharU could contain valid 32-bit Unicode characters... Char could become an abstract type.

Also, your strings don't have implicit '\0' in them, are you going to take any flack for that decision? 😀

@tkelman, Jun 6, 2015:

Multiple Char types sounds like it would be better-served by a single Char type parameterized by the encoding.

@ScottPJones (Collaborator), Jun 6, 2015:

How would that work? I don't think it would be very convenient to use, and would you be able to have a different typemax value for each one? I was thinking that with CharA the compiler could optimize out checks like ch > 0x7f, with Char8 checks like ch > 0xff, and with Char16 checks like ch > 0xffff. UTF8String and UTF16String are logically collections of 32-bit Unicode code points, so you can't make those optimizations.

@quinnj (Author), Jun 8, 2015:

@ScottPJones, review the chapter on parametric types; @tkelman is right here. Having bitstype 32 Char{E<:Encoding} defined would be the way to go, which effectively makes Char an abstract type with Char{ASCII}, Char{UTF8}, etc. as concrete types of it.

@ScottPJones (Collaborator), Jun 8, 2015:

That makes the size of all of them 32 bits, which is not what you'd really want.
CharA (ASCII) would be 7-bit values stored in a UInt8 (typemax(CharA) -> 127),
Char8 (Latin1, Binary) would be 8-bit values stored in a UInt8,
Char16 (UCS2) would be 16-bit values stored in a UInt16 (0-0xd7ff, 0xe000-0xffff), and
CharU (UTF8, UTF16, UTF32) would be 21-bit values stored in a UInt32 (0-0xd7ff, 0xe000-0x10ffff).

@quinnj (Author), Jun 10, 2015:

Thinking more about this, I really think bitstype 32 Char{E<:Encoding} would be the right way to go. It's a single, simple definition that leverages parametric types and corresponds simply with the new String{E} definition. The question of size I think would not end up being that significant; how often are you really dealing with anything memory-constrained involving collections of individual characters (as opposed to actual strings)? And in terms of calculations, it may actually be more performant to have a single 32-bit Char value, since it's extremely predictable and allows better interop between differently encoded Chars. It'd be interesting to see cases where it would really make that much of a difference, but my guess is probably not.

@ScottPJones (Collaborator), Jun 10, 2015:

@quinnj Pretty please, no!
Characters do not have an encoding. An encoding is a way of representing logical characters in one or more code units. Many different encodings can be used to represent a particular character.
Using the Char types I described above:
CharA characters can be encoded with any of the encodings in Encodings.
Char8 characters can be encoded with any of the encodings except ASCII.
Char16 characters can be encoded with any of the encodings except ASCII, Binary, Latin1.
CharU characters can be encoded with any of the encodings except ASCII, Binary, Latin1, UCS2*.
CharU is really just a validated subset of Char

abstract AbstractChar
bitstype 8  CharA  <: AbstractChar
bitstype 8  Char8  <: AbstractChar
bitstype 16 Char16 <: AbstractChar
bitstype 32 CharU  <: AbstractChar
bitstype 32 Char   <: AbstractChar

typemin(::Type{CharA})  = CharA(0)
typemin(::Type{Char8})  = Char8(0)
typemin(::Type{Char16}) = Char16(0)
typemin(::Type{CharU})  = CharU(0)

typemax(::Type{CharA})  = CharA(0x7f)
typemax(::Type{Char8})  = Char8(0xff)
typemax(::Type{Char16}) = Char16(0xffff)
typemax(::Type{CharU})  = CharU(0x10ffff)

There are a ton of places in the string handling code where things check c < 0x80, but with these character types, Julia would be able to eliminate those checks (or ones like c > 0xffff if the character type is CharA, Char8, or Char16). That would help eliminate the need for explicitly optimized versions of string functions to get decent performance.
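For instance (a sketch only; AbstractChar and CharA don't exist in this commit), the elimination can come straight from dispatch rather than from the optimizer:

```julia
# for a 7-bit character type the c < 0x80 test is a tautology, so the
# method collapses to a constant and the branch disappears entirely
isascii(c::CharA)        = true
isascii(c::AbstractChar) = UInt32(c) < 0x80
```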

@ScottPJones (Collaborator), Jun 10, 2015:

Note also: CharA, Char8, Char16, and CharU are all subsets of Char.
CharA is a subset of Char8, which is a subset of Char16, which is a subset of CharU, which is a subset of Char.
I have to see how to set up promotion rules for those.

@quinnj (Author), Jun 10, 2015:

That makes sense on Chars not having an encoding, but I'm really not buying the idea that we'd need 5 separate Char types. Like I said, the memory argument is certainly not worth the tradeoff of 5 separate types and all the code/codegen that would come with them. And while the "auto-eliminated checks" idea sounds nice, it may be difficult to actually realize in simple, generic code. I'd love to see an actual use case where we can see the checks avoided, but my guess is that it may be trickier to achieve.

I think this calls for a step back to really consider the use of Char in general. Why do we really need it? Do we really need it? If Strings had a different internal representation, does that change the need for an individual Char type? This kind of goes back to my thoughts on the current situation with the .data internals of strings currently in Base; if we can avoid it, that could lead to much simpler code without limiting functionality too much.

@ScottPJones (Collaborator), Jun 10, 2015:

Julia already has Char, Cchar, Cuchar, Cwchar_t, and it uses UInt8 and UInt16 as if they were characters in places... (i.e. you have Vector{UInt8}, Vector{UInt16}, and Vector{Char}...).
If it could be agreed that Char was always validated, then CharU would not be needed.

@StefanKarpinski (Collaborator), Jun 10, 2015:

Not bothering to decode characters is an interesting idea, but I'm not really sure how I feel about it. At that point, one really has to start questioning whether there should be a Char type at all...

@quinnj (Author), Jun 10, 2015:

Cchar, Cuchar, Cwchar_t are just typealiases, not actual types, which is a big difference because typealiases don't actually lead to any different codegen and you can't dispatch on them.

To follow up on my last point about getting rid of Char completely, consider the alternative String structure in this package

immutable String{E}
  ptr::Ptr{UInt8}
  len::Int
end

In terms of indexing/iterating through a string's characters, we could potentially have indexing/iterating just return a string of length one. With the old strings, that's out of the question since it means creating a new array for each character, but with this new string model, you'd be effectively transitioning from a Char, which is a UInt32, to a 2-word String of length one, a 2x or 4x increase depending on platform. That cost may be worth the ability to get rid of Char completely, since my gut feeling is that the actual need to work directly with a Char type (as opposed to working with bytes directly as UInt values) is limited.
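Iteration under that model might be sketched like this for the single-byte case (hypothetical; it reuses the 0.3/0.4 start/done/next protocol and aliases the parent string's memory rather than copying):

```julia
# each "character" is a zero-copy, one-byte String view into the parent
Base.start(s::String{ASCII}) = 1
Base.done(s::String{ASCII}, i) = i > s.len
Base.next(s::String{ASCII}, i) =
    (String{ASCII}(s.ptr + UInt(i - 1), 1), i + 1)
```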

@StefanKarpinski (Collaborator), Jun 10, 2015:

I'm not sure if these types are really characters. They're a sort of intermediate representation of a small chunk of string that hasn't been decoded but could be. This is useful because there are a variety of operations that don't require full decoding of characters, such as character equality and some inequalities, depending on what the encodings of the characters in question are. (In fact, some of the only operations I can think of that require full decoding are things like addition or subtraction of an integer value to a character value, which aren't that common.) There are probably many algorithms that can be written generically in terms of undecoded single-character units. I'm not sure about calling these "characters" – it might be ok, but I'm not sure that's what they are. If we do decide to call them that, then I'm afraid that, heretical as it may sound, they are, in fact, characters with an encoding.

@StefanKarpinski (Collaborator), Jun 10, 2015:

Julia already has Char, Cchar, Cuchar, Cwchar_t, and it uses UInt8 and UInt16 as if they were characters in places.

As @quinnj said, Cchar, Cuchar and Cwchar_t are just aliases for integer types in C. Julia never treats them as characters. Julia also never treats UInt8 or UInt16 as a character type. Those are integer types which may be used to encode strings, but they are not characters.

@ScottPJones (Collaborator), Jun 10, 2015:

@StefanKarpinski You are missing the point: these are characters, just as UInt8, UInt16, UInt32, etc. are all Unsigned. They are NOT a "small chunk of string that hasn't been decoded". Just as it is useful in places to work with just UInt8 (and Julia can take advantage of knowing that the range of values is smaller), it is very useful to know whether you are dealing only with the set of ASCII characters, the set of Latin1 characters, valid 16-bit Unicode characters, or valid Unicode characters.
Characters are not at all the same as code units...

@ScottPJones (Collaborator), Jun 10, 2015:

@quinnj It is very useful to be able to work with Char values (and it would be useful to be able to work with the valid Unicode subsets which I defined). There are many operations which are really defined on the Char values... not on the encodings... In fact, a number of operations are defined by using map on each character of a string. Would you want to reimplement all of those?
Those really should just be changed to use AbstractChar, and then they could also generate more efficient code based on the concrete type of the characters in the string or array being processed.

@StefanKarpinski (Collaborator), Jun 10, 2015:

UInt8, UInt16, etc. are representations of unsigned integer values (arguably modular integer types, rather than natural numbers, really), and they have an encoding as such. We know they have an encoding because the choice of encoding changes the interpretation of the data: the same data can encode one value when interpreted as a UInt8 and a different one when interpreted as an Int8. These types all also have encodings, and you're interested in calling them characters; yet you're insisting that characters don't have encodings. How is that coherent? One could argue that our current character type also has an encoding, which happens to be UTF-32 in machine-native byte order.

@ScottPJones (Collaborator), Jun 10, 2015:

Well, if you want to look at that way, everything is represented by encoding them as bits or bytes or words...
(note: bytes aren't anything special, I used to use machines with 36-bit words that were only word addressable...)
I don't think that is really very useful.
You are confusing two very different things: characters and code units which are used to encode characters.
Dropping down another level, code units themselves are encoded in some fashion, either as one, 2, or 4 bytes, big- or little-endian, or on that machine in my past, packed as 5 7-bit characters per 36-bit word...

@StefanKarpinski (Collaborator), Jun 10, 2015:

I'm not confusing those things at all. It's incoherent to claim that these representations of character values are characters but that characters cannot have encodings, since these are precisely encodings of characters.

A different question is whether the normal terms for encodings of strings apply here. Does it make sense to say that a single character value is encoded as a 32-bit value using the UTF-8 encoding? On the face of it, it seems to me that with a few additional assumptions, it does. Namely, you have to specify which order the bytes are in, and whether the "trailing" bytes are to be ignored or must have some particular values such as zero.

@ScottPJones (Collaborator), Jun 10, 2015:

You are confusing things. The CharA, Char8, Char16 and CharU types do NOT have encodings, and neither does Char. They are simply sets of non-negative integer values. They can be stored in many different ways.
Why do you even need, say, UInt8 or UInt16, or anything except UInt128, if you apply your argument to Unsigned?

Code points are normally assigned to abstract characters. An abstract character is not a graphical glyph but a unit of textual data. However code points may also be left reserved for future assignment (most of the Unicode code space is unassigned), or given other designated functions.

The distinction between a code point and the corresponding abstract character is not pronounced in Unicode, but is evident for many other encoding schemes, where numerous code pages may exist for a single code space.

Please go look up the differences between characters, code points, and code units...

@quinnj (Author), Jun 10, 2015:

In fact, a number of operations are defined by using map on each character of a string.

map works just fine on any iterable. We could have iteration (and indexing) on this new String{E} type iterate over code points. The challenge is still out there to provide a compelling case for even needing a Char type if the String{E} type is so compact. From the uses I've reviewed in Base, it's mainly cases of indexing strings returning Chars and iterating over a string, both of which could be made to return strings of a single code point. This would remove the current, slightly weird distinction between

char = "hey there"[1]
string = "hey there"[1:1]

The one case that comes to mind is where you want to restrict an input to a single code point. In that case, I wonder about having

immutable String{E<:Encoding, N}
  ptr::Ptr{E}
  len::Int
end

where len refers to the number of bytes in the string and the N parameter represents the number of code points. Char then becomes

typealias Char{E} String{E,1}
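As a usage sketch of that idea (hypothetical; it assumes the two-parameter String above, and only works for one-byte-per-code-point encodings like ASCII or Latin1):

```julia
# indexing returns a zero-copy, single-codepoint view -- a Char{E} --
# instead of a decoded UInt32
getindex{E<:DirectIndexedEncoding}(s::String{E}, i::Int) =
    String{E,1}(s.ptr + UInt(i - 1), 1)
```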

@ScottPJones (Collaborator), Jun 10, 2015:

No, please! Go look at the code that would be generated... String{E,1} has much more overhead than Char...
Yes, it would work, and I've long experience with an ANSI standard language that did just that... but not nearly as efficiently as what Julia can do with Chars, particularly if you had subsets of valid characters also as types.

@StefanKarpinski (Collaborator), Jun 10, 2015:

This is getting juvenile and pointless. Funny how that's started happening in the last few months.

You are confusing things. The CharA, Char8, Char16 and CharU types do NOT have encodings, and neither does Char. They are simply sets of non-negative integer values. They can be stored in many different ways.

No, that is entirely wrong. These are all concrete Julia types: they are specific representations of sets of values; you cannot represent values without choosing an encoding. The type Char is not a set of non-negative integer values. Char is a way of representing a set of non-negative integer values. Integers are a concept, and they cannot be manifested directly in computers.

Why do you even need, for example, UInt8, or UInt16, or anything except for UInt128, if you apply your argument to Unsigned, for example.

This is precisely the point. Unsigned does not have an encoding because it isn’t a concrete type. Concrete types like UInt8 do have encodings.

Please go look up the differences between characters, code points, and code units…

Not sure who this is directed at, but I’m quite clear on these terms.

@quinnj (Author), Jun 10, 2015:

Comparison of current getindex code for ASCIIString vs. my idea of returning a string of length 1 with the new string representation
Current:

In  [25]: @code_llvm getindex("hey",1)

define i32 @julia_getindex_21886(%jl_value_t*, i64) {
top:
  %2 = getelementptr inbounds %jl_value_t* %0, i64 0, i32 0
  %3 = load %jl_value_t** %2, align 8
  %4 = add i64 %1, -1
  %5 = getelementptr inbounds %jl_value_t* %3, i64 1
  %6 = bitcast %jl_value_t* %5 to i64*
  %7 = load i64* %6, align 8
  %8 = icmp ult i64 %4, %7
  br i1 %8, label %idxend, label %oob

oob:                                              ; preds = %top
  %9 = alloca i64, align 8
  store i64 %1, i64* %9, align 8
  call void @jl_bounds_error_ints(%jl_value_t* %3, i64* %9, i64 1)
  unreachable

idxend:                                           ; preds = %top
  %10 = bitcast %jl_value_t* %3 to i8**
  %11 = load i8** %10, align 8
  %12 = getelementptr i8* %11, i64 %4
  %13 = load i8* %12, align 1
  %14 = icmp slt i8 %13, 0
  br i1 %14, label %L, label %if

if:                                               ; preds = %idxend
  %15 = zext i8 %13 to i32
  ret i32 %15

L:                                                ; preds = %idxend
  ret i32 65533
}

New String{ASCII} getindex

In  [24]: @code_llvm getindex(t,1)

define %String.15 @julia_getindex_21881(%String.14, i64) {
top:
  %2 = icmp slt i64 %1, 1
  br i1 %2, label %L4, label %L1

L1:                                               ; preds = %top
  %3 = extractvalue %String.14 %0, 1
  %phitmp = icmp sgt i64 %3, %1
  br i1 %phitmp, label %L5, label %L4

L4:                                               ; preds = %L1, %top
  %4 = call %jl_value_t* @alloc_2w()
  %5 = getelementptr inbounds %jl_value_t* %4, i64 -1, i32 0
  store %jl_value_t* inttoptr (i64 4363346672 to %jl_value_t*), %jl_value_t** %5, align 8
  %6 = getelementptr inbounds %jl_value_t* %4, i64 0, i32 0
  store %jl_value_t* null, %jl_value_t** %6, align 8
  %7 = getelementptr inbounds %jl_value_t* %4, i64 1, i32 0
  store %jl_value_t* null, %jl_value_t** %7, align 8
  call void @jl_throw_with_superfluous_argument(%jl_value_t* %4, i32 30)
  unreachable

L5:                                               ; preds = %L1
  %8 = extractvalue %String.14 %0, 0
  %9 = ptrtoint i8* %8 to i64
  %10 = add i64 %9, %1
  %11 = inttoptr i64 %10 to i8*
  %12 = insertvalue %String.15 undef, i8* %11, 0
  %13 = insertvalue %String.15 %12, i64 1, 1
  ret %String.15 %13
}

which would get even better, from my understanding, based on Yichao's recent work on throw.

@ScottPJones (Collaborator), Jun 10, 2015:

This is getting juvenile and pointless. Funny how that's started happening in the last few months.

@StefanKarpinski Please keep the snide innuendos to yourself. That is really juvenile and pointless.

CharA, Char8, Char16, CharU are all concrete types of AbstractChar.
That corresponds to the way UInt8, UInt16, and UInt32 are all concrete types of Unsigned.

They do not imply any particular encoding, in the sense that Char can be encoded with UTF8, UTF16LE, UTF16BE, UTF32LE, or UTF32BE in Jacob's Encoding scheme.

@ScottPJones (Collaborator), Jun 10, 2015:

@quinnj Try something like passing a character as an argument to a function, testing if it is lowercase...

@StefanKarpinski (Collaborator), Jun 10, 2015:

Yes, but Char itself has an encoding – it is essentially encoded as native-endian UTF-32. In fact, the entire reason why native-endian UTF-32 strings are so easy to work with is that the Char has this encoding – all you have to do is pull out values and they're already encoded the right way. What's interesting about this idea is that it could make it easier to work with other kinds of strings by allowing pulling out characters without having to convert each character to UTF-32.


const PAGESIZE = @compat Int(@unix ? ccall(:jl_getpagesize, Clong, ()) : ccall(:jl_getallocationgranularity, Clong, ()))

# this would be handled by native Julia GC, but we'll roll our own for now
type StringPool
pool::Vector{Vector{UInt8}}
ind::Int
pos::Int
end

const POOL = StringPool(Any[Mmap.mmap(UInt8,PAGESIZE)],1,1)

function ensureroom!(n::Int)
if POOL.pos + n < PAGESIZE
# we have enough room to allocate `n` bytes
return
elseif n < PAGESIZE
# we're hitting a page boundary
push!(POOL.pool,Mmap.mmap(UInt8,PAGESIZE))
POOL.ind += 1
POOL.pos = 1
return
else # n >= PAGESIZE: allocate a dedicated multi-page block
totalneededbytes = (div(n, PAGESIZE) + 1) * PAGESIZE
push!(POOL.pool,Mmap.mmap(UInt8,totalneededbytes))
POOL.ind += 1
POOL.pos = 1
return
end
end

function storebytes!(s::Ptr{UInt8},n::Int,offset::Int=0)
ensureroom!(n)
# unsafe_copy!(dest::Ptr{T}, src::Ptr{T}, N)
ptr = pointer(POOL.pool[POOL.ind])+UInt(POOL.pos)-1
unsafe_copy!(ptr, s+UInt(offset), n)
POOL.pos += n
return ptr, n
end
function storebytes!{N}(ptrs::NTuple{N,Ptr{UInt8}},lens::NTuple{N,Int})
@inbounds begin
n = sum(lens) # total bytes is the sum, not the product, of the lengths
ensureroom!(n)
# unsafe_copy!(dest::Ptr{T}, src::Ptr{T}, N)
starting_ptr = pointer(POOL.pool[POOL.ind])+UInt(POOL.pos)-1
ptr = starting_ptr
for i = 1:N
unsafe_copy!(ptr, ptrs[i], lens[i])
ptr += lens[i]
end
POOL.pos += n
end
return starting_ptr, n
end
function storebytes!(s::Vector{UInt8},n::Int,offset::Int=1)
ensureroom!(n)
# unsafe_copy!(dest::Array, do, src::Array, so, N)
unsafe_copy!(POOL.pool[POOL.ind],POOL.pos,s,offset,n)
ptr = pointer(POOL.pool[POOL.ind]) + UInt(POOL.pos) - 1
POOL.pos += n
return ptr, n
end
storebytes!(s::Vector{UInt16},n::Int) = storebytes!(reinterpret(UInt8,s),n)
storebytes!(s::Vector{UInt32},n::Int) = storebytes!(reinterpret(UInt8,s),n)
storebytes!(s::Vector{Char},n::Int) = storebytes!(reinterpret(UInt8,s),n)

include("ascii.jl")

end # module
62 changes: 62 additions & 0 deletions src/ascii.jl
@@ -0,0 +1,62 @@
function String(s::ASCIIString)
ptr, len = storebytes!(s.data,length(s))
return String{ASCII}(ptr,len)
end

function Base.show(io::IO, x::String{ASCII})
if x.len == 0
print(io, Char('"'), Char('"'))
else
print(io, Char('"'))
for i = 1:x.len
print(io, Char(unsafe_load(x.ptr, i)))
end
print(io, Char('"'))
end
return
end
Base.endof(x::String{ASCII}) = x.len
Base.length(x::String{ASCII}) = x.len
Base.sizeof(x::String{ASCII}) = x.len

function Base.getindex(x::String{ASCII}, i::Int)
0 < i <= x.len || throw(BoundsError())
c = unsafe_load(x.ptr,i)
return ifelse(c < 0x80, Char(c), '\ufffd')
end

# substring
function getindex(s::String{ASCII}, r::UnitRange{Int})
n = length(r)
(0 < first(r) <= s.len && last(r) <= s.len) || throw(BoundsError())
isempty(r) && return String{ASCII}(C_NULL,0)

if n < div(s.len,4) || n < 16 # < 25% of original string size or really small
# just make a copy
ptr, len = storebytes!(s.ptr, n, first(r)-1)
return String{ASCII}(ptr,len)
else
# share data with original string
return String{ASCII}(s.ptr+UInt(first(r)-1),n)
end
end

# concatenation
function string(strs::String{ASCII}...)
@inbounds begin
N = length(strs)
n = 0
for i = 1:N
n += strs[i].len
end
ensureroom!(n)
# unsafe_copy!(dest::Ptr{T}, src::Ptr{T}, N)
starting_ptr = pointer(POOL.pool[POOL.ind])+UInt(POOL.pos)
ptr = starting_ptr
for i = 1:N
unsafe_copy!(ptr, strs[i].ptr, strs[i].len)
ptr += strs[i].len
end
POOL.pos += n # reserve the bytes just written in the pool
end
return String{ASCII}(starting_ptr,n)
end
32 changes: 30 additions & 2 deletions test/runtests.jl
@@ -1,5 +1,33 @@
using Strings
reload("Strings")
using Base.Test

s = Strings.String("hey there")
@test s[1] == 'h'
@test s[2] == 'e'
@test s[3] == 'y'

@test endof(s) == 9
@test sizeof(s) == 9
@test length(s) == 9

@test s[1:3] == Strings.String("hey")
@test s[2:4] == Strings.String("ey ")

s = Strings.String("a fairly sizable string to test larger string sizes")
@test s[1:3] == Strings.String("a f")
@test s[1:20] == Strings.String("a fairly sizable str")

@test Strings.string(Strings.String("hey"),Strings.String(" "),Strings.String("ho")) == Strings.String("hey ho")


s = ""
@time for i = 1:1000
s *= " "
end

s = Strings.String("")
space = Strings.String(" ")
@time for i = 1:1000
s = Strings.string(s,space)
end

50 comments on commit 20e0612

@ScottPJones (Collaborator):

Would it be possible to split out the encoding types as a really small PR that might get put in Base, and that hopefully TPTB could approve quickly? The whole change will take some time and should be in a package for now at least (much easier to debug and update) [but I'd love to see this sort of stuff as part of Base...], but the simple addition of a nice type system that indicates encoding, separating it from the storage, would be good to have for lots of things to build upon.

@quinnj (Author), Jun 6, 2015:

Hmmm......possibly, what are the use cases you'd see for it immediately? I'd hesitate, because I think for most cases I can imagine, we're already using, or at least should be using, UTF8String, UTF16String, etc.

@ScottPJones (Collaborator):

I'd change my check_string functions to take an encoding as the first argument, instead of determining which encoding to check from the type of the container, which isn't really as extensible.
This would make my functions much more general.

@ScottPJones (Collaborator):

@tkelman Now that you've joined us, what do you think of @quinnj quickly putting in a PR to add just the encodings to Base? I'd immediately update check_string to use his Encoding types; they'd really allow me to improve the generality of the code. (Trying to conflate storage and encoding, which is what happens now throughout Julia, and which I unfortunately added to, is simply not correct, and with Encoding we can fix it.)

@tkelman, Jun 6, 2015:

Neutral on that idea, for now. If we were to go a little crazier and think about having a single String type parameterized by the encoding, then we might be in business. It seems like there's too much (mostly) copy-pasted code with small variations between these different encoding sizes, feels like it's missing abstraction. This might also be a terrible idea and an abuse of type parameters, dunno.

@ScottPJones
Collaborator

The point is, using the current string types to attempt to indicate encoding just isn't correct.
A Vector{UInt8} could contain any of the 9 different Encoding types (and those encoding types are just the ones that represent subsets of Unicode (i.e. ASCII, Latin1, UCS2) or encodings using 8-bit or 16-bit code units, plus byte-swapped variants). The check_string function really should be able to handle all of those, and also handle UTF-16 and UCS2 in a vector of 16-bit words, plus UTF-32 in a vector of 32-bit words. That's where having his encoding types would allow greater usefulness of the functions.

@ScottPJones
Collaborator

I think you are also still viewing the Encodings as being part of the String PR.
I view them as separate... the Encoding type is a fundamental building block for good string handling that was simply missing. Whatever happens to this Strings PR (which I like, but which needs more work ironing out all the issues, and maybe combining with some of Stefan's ByteVec ideas), I don't see any reason Encodings shouldn't go into Base now.

@quinnj
Owner Author

@quinnj quinnj commented on 20e0612 Jun 8, 2015

@ScottPJones, I agree that the Encoding idea sounds great (I mean, I did come up with it.... :) ), but let's not rush anything. I know you like to push at a good pace, but let's let things simmer a bit before being too hasty at cramming things in Base. Particularly with design decisions like this (as opposed to bug fixes, performance patches, etc.), we should really take some time to think through possible use cases of a separate Encoding type and all the corners of the current codebase it could/would touch. I hate the idea of having an isolated Encoding type that is used by a small family of methods while the larger string codebase exists without it.
I think there's starting to be some good momentum for a string overhaul (kicked off by @StefanKarpinski's ByteVec PR), so with a big, probably breaking change in strings on the horizon, that would be a great time to factor in an Encoding type and maybe Char{E<:Encoding} as part of the overhaul. Those are the exact things that hopefully we can play with in this repo to do a proof-of-concept of how things could look. Having the start of a great, simple, and performant string redesign that someone could play with would go a long way toward getting it into Base.

@ScottPJones
Collaborator

OK, but do you understand that having Encodings and my new & improved check function, instead of what's currently in #11575, will help fix a number of other Unicode related bugs?
There are a number of problems that have come from conflating the encoding and way it is put in storage...

@quinnj
Owner Author

@quinnj quinnj commented on 20e0612 Jun 8, 2015

I'm not sure I follow your comments:

new & improved check function

as @tkelman mentioned, I'm not sure we're really seeing a huge improvement by putting two methods together with a big if E <: UTF8 that splits most of the method in two. It smells of a need for further refactoring into other methods and use of dispatch.

will help fix a number of other Unicode related bugs?

You mean, your PR here has fixes that are not included in JuliaLang/julia#11575? Why can't they be included there?

There are a number of problems that have come from conflating the encoding and way it is put in storage...

Can you expound on this point? What's an example of a specific case where the encoding and the way it is stored get conflated? I'm not doubting it, I just don't know what this problem looks like, necessarily. How does the new Encoding type specifically address these issues?

@ScottPJones
Collaborator

No, this PR does not have any extra fixes.
This PR (placed in #11575) and merged into Base, would allow me to deal easily with the code that tries to look for a BOM, and possibly do a byte-swap operation. The current code is inconsistent between UTF-16 and UTF-32, and I pointed out elsewhere that this would help me be able to fix #11463, #11501, and #11502.

@ScottPJones
Collaborator

Can you expound on this point? What's an example of a specific case where the encoding and way it is stored?

For example, there are currently methods to convert to UTF16String / UTF32String from Vector{UInt8}, which assume that they will find either UTF16LE or UTF16BE for UTF16String, and UTF32LE or UTF32BE for UTF32String. That totally misses that you may really have UTF-16 (either endian) or UTF-8 stored in the Vector{UInt8} that needs to be converted to a UTF32String.
The other thing my new code handles is not just Vector{UInt8}, but also AbstractVector and AbstractArray [I haven't had a chance to try what you suggested about just being able to say AbstractArray{UInt8}; the old code actually does a reshape in some cases with AbstractArray..., so I assume there must have been some reason they kept those separate...]
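A hedged sketch of how an explicit source encoding could disambiguate such conversions (convert_encoding is a hypothetical name, not part of this commit or any PR):

```julia
# Hypothetical: the caller states what the bytes actually contain,
# rather than the conversion guessing from the container type alone.

function convert_encoding(::Type{UTF32String}, ::Type{UTF8}, dat::Vector{UInt8})
    # decode the UTF-8 byte sequence into 32-bit code points
end

function convert_encoding(::Type{UTF32String}, ::Type{UTF16BE}, dat::Vector{UInt8})
    # reinterpret as UInt16, bswap on a little-endian machine, then widen
end
```

With that shape, adding a new source encoding is just adding a method, instead of growing the BOM/endianness guessing inside one monolithic convert.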

@StefanKarpinski
Collaborator

The picture of how things ought to work isn't clear enough yet to move encodings into Base. That's just asking for multiple rounds of breaking changes.

@ScottPJones
Collaborator

What is it that you don't understand about Encodings that makes you think things are not clear enough?
Why would there be any breaking changes? I don't see that being the case at all.
First off, Encodings would only initially be used in Base by my checkstring function, nothing else.
The set of encodings that have been defined, except for Latin1, are all ones that are currently implicitly used in the current (broken) conversion code (even the opposite endian one, because there are conversions that try to handle a BOM...).
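For instance, BOM handling could be made explicit and consistent with something along these lines (sniff_bom is a hypothetical helper, not code from any existing PR):

```julia
# Hypothetical: examine leading bytes and return the Encoding type they imply.
# The UTF-32 checks must come before the UTF-16 checks, since a UTF-32LE BOM
# (FF FE 00 00) begins with the UTF-16LE BOM (FF FE).
function sniff_bom(dat::Vector{UInt8})
    n = length(dat)
    n >= 3 && dat[1] == 0xef && dat[2] == 0xbb && dat[3] == 0xbf && return UTF8
    n >= 4 && dat[1] == 0x00 && dat[2] == 0x00 &&
              dat[3] == 0xfe && dat[4] == 0xff && return UTF32BE
    n >= 4 && dat[1] == 0xff && dat[2] == 0xfe &&
              dat[3] == 0x00 && dat[4] == 0x00 && return UTF32LE
    n >= 2 && dat[1] == 0xfe && dat[2] == 0xff && return UTF16BE
    n >= 2 && dat[1] == 0xff && dat[2] == 0xfe && return UTF16LE
    return UTF8   # no BOM: assume UTF-8, per common convention
end
```

Returning an Encoding type lets one function do the sniffing once, with every conversion dispatching on the result, instead of each conversion re-implementing (slightly different) BOM logic.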

@StefanKarpinski
Collaborator

Plenty. What kinds of object should they be? Do they become type parameters of other types? If we stick them in Base now, I would bet cold hard cash on us not getting it 100% right and causing future breakage for people who used them.

@quinnj
Owner Author

@quinnj quinnj commented on 20e0612 Jun 12, 2015

Yeah, @ScottPJones, I don't think it's a matter of not understanding what Encodings are, but how we're going to use them exactly. Like I mentioned above, adding them right away would cause a little weirdness because we have UTF8String, UTF16String, etc., and with Encodings, we'd have UTF8, UTF16, etc.

Like I said before, this is a much better change to introduce with a larger overhaul where it can be grafted in all the various corners that encodings touch (as opposed to using it just for a check_string function).

@ScottPJones
Collaborator

@StefanKarpinski What do you mean "What kinds of object should they be?" They are simply types, used for parameterization, no values.
You seem to have little faith in the Julians' ability to get a design right at first... considering the breakage that happened with the "Tupocalypse", and the major breakage planned for "Arraymageddon", maybe I can understand that attitude.
If it is an approach that I approved of, such as Jacob's, I'll take your bet...
$10? $20? A round of drinks at the Muddy Charles? I've already done this twice (not because I got it wrong the first time: once for single-byte encodings back in '92, and then extending it for multi-byte encodings and Unicode in '96; nothing at all broke from the first design...). What I handled then was a lot more complex than the simple cases here, which are simply subsets and encodings of Unicode.

@quinnj Should they all be marked as abstract instead of immutable? Does it make any difference, since they are only used for parameterization?
Also, I disagree that Encodings should be introduced along with larger changes (I've had too much bad experience with trying to get a sweeping change in, even just to fix outright bugs).
Better to get it in, have it used in a limited case (i.e. check_string), let people "kick the tires" a bit,
before making it exported and documented, and trying to use it (and Strings.jl) to replace the current string handling mess...

@StefanKarpinski
Collaborator

Could you possibly write a single comment without being a dick?

@ScottPJones
Collaborator

Excuse me??? Just what is your problem?

@ScottPJones
Collaborator

BTW, your comments do not conform to the Julia community standards...

@StefanKarpinski
Collaborator

My problem is that you are continually passive aggressive, argumentative, and dismissive of other people's hard work and disrespectful of other people's time and energy. Now you are trash talking the design decisions that have gone into Julia and the process by which we decide on and handle breaking changes. This is beyond galling and is incredibly insulting.

@quinnj
Owner Author

@quinnj quinnj commented on 20e0612 Jun 12, 2015

You seem to have little faith in the Julians' ability to get a design right at first... considering the breakage that happened with the "Tupocalypse", and the major breakage planned for "Arraymageddon", maybe I can understand that attitude.

@ScottPJones, this comment is disrespectful and attacks @StefanKarpinski and his hard work directly. Stefan was sharing his thoughts on a design decision and you turned it into a personal questioning of his abilities and ideas. Not ok. You may disagree with another's opinion, but an appropriate response is focused on providing evidence or reasoning around the idea at hand, not making a personal, even if somewhat offhand or joking, comment.

On abstract vs. immutable, I've never really heard a good argument one way or the other, but there is somewhat of a precedent for making them immutable. The corollary is that Encoding is a true abstract type, while UTF8 is a "concrete" subtype, i.e. you wouldn't need/want to subtype UTF8.

On waiting to put Encodings into Base, I think the counter-argument goes back to Stefan's original comment here. Encodings will end up touching several corners/places in Base and string handling, so it's hard to know exactly how they should be structured until a larger integration is carried out. Sure, they seem to work fine for check_string now, but maybe when they're incorporated into some other code, we realize we actually need them to be structured a little differently (I've already been making tweaks and changes here and there as I've continued to piece things together).

This also isn't quite as drastic as a "complete sweeping change" since the user-facing API will stay relatively the same, and it's mainly a major reworking of the internals. In these cases, I think it's much better to incorporate all the appropriate changes at once (which can be thoroughly tested by CI and by pulling changes locally) to make sure the new internals are solid, and then releasing, hopefully only needing deprecations.

@ScottPJones
Collaborator

You seem to be living on another planet. Go read my comments - I have heaped loads of praise on both Julia, and the people who have come up with it, and have many many times thanked all of the time and energy that people have spent on constructive criticism. I have NOT ONCE made a derogatory comment about another person here.
Trash talking the design decisions? I don't think I've said anything that hasn't been said before, by any number of people, including yourself... (I dared criticize string interpolation... turns out Jeff is against it also... I criticized the choice of * for string concatenation... do I need to quote your very own words back at you???)

Many many people have complained about the fallout from the "tupocalypse"... maybe that could have been better handled? I agree that the new approach for tuples is better, but that just points out that the original design had a flaw so severe that it was decided that a very painful breaking change was necessary. Why do you feel commenting on that is so insulting? Were all of the design decisions I've criticized your personal decisions? Or do you feel threatened if I bring up obvious bugs in the code?

@ScottPJones
Collaborator

@quinnj I didn't think that comment was an attack on Stefan... he was the one that seemed disrespectful of your work, i.e.:

I would bet cold hard cash on us not getting it 100% right and causing future breakage for people who used them.

@ScottPJones
Collaborator

@quinnj Putting Encodings into Base would not mean that they are cast in stone right from the beginning.
What changes (except for removing Binary, which I think should go back, and moving them out of a module, for convenience) have you had to make to Encodings? I haven't seen any yet.

@quinnj
Owner Author

@quinnj quinnj commented on 20e0612 Jun 13, 2015

@ScottPJones, it was inappropriate because it was directed at Stefan personally ("You seem to have little faith....", my emphasis added), whereas Stefan's comment was focused on the design, not me personally. I didn't think twice about Stefan's comment because I can objectively view his opinion there and see his point of view that things might change. Your comment to Stefan, however, made me cringe a little because it turned his counter-argument against putting Encodings in Base right away into a questioning of his faith/trust in collaborators.

Putting things in Base is a way of casting things in stone, so to speak; we really want to feel comfortable that it's at least the best decision/design we can come up with at the moment.

@ScottPJones
Collaborator

@quinnj Read @StefanKarpinski's comment again... if you cringe at the "you", why don't you cringe at the "we" and "us", and at saying that "we'd" not get it right? How is that directed at the design? That is directed at the people who came up with the design, and the people wishing to have it moved into Base.
If you take my comment, which was "You seem to have little faith..." (my emphasis added), as an attack, which it wasn't meant to be (I wanted to point out the inconsistency in advocating major breaking changes while being afraid that something might cause future breakage), then you need to take that comment doubly as an attack. And what do you say about personal comments such as "Could you possibly write a single comment without being a dick?" Is that defensible behavior around here?

@ScottPJones
Collaborator

Putting things in Base is a way of casting things in stone, so to speak; we really want to feel comfortable that it's at least the best decision/design we can come up with at the moment.

I really hadn't seen much evidence of things in Base being cast in stone in the < 3 months since I became aware of Julia... It really would be nice if major breaking changes were staged in some better fashion, maybe with a separate branch or repository, with all registered packages being either fixed or removed before that branch becomes the canonical branch.
(I'm used to even minor breaking changes simply not being allowed... even things like much greater memory consumption or performance regressions were not allowed; can't have a hospital shut down because of an update.)

@tkelman

Just chill. @ScottPJones the frequency, length, and tone of your comments have managed to get under the skin of a significant fraction of the core contributors to Julia, in a short period of time.

There is a separate branch for breaking changes - in Julia it's called master. We're quite a bit more conservative about backports to the release branch, and work pretty hard to keep that stable. If a package starts failing on release-0.3, that's almost always due to changes in the package rather than changes between Julia patch releases.

We could however do a better job of tracking performance regressions, and overall statistics from our test suite. That's not getting any better because no one is working on the necessary infrastructure for it right now.

On the technical subject of this conversation, we're not convinced yet that Encodings bring enough of a benefit to the overall design to be worth bringing into base as-is. Out here in a package where you're more free to experiment would be a good place to keep working on it and see how the bigger picture fits together. Base Julia and the core team really need to focus on critical release-defining issues right now, and redesigning strings is not mature enough to be part of that.

For the open PR's with unicode-related fixes that are potentially on the table, we're also not seeing how Encodings, as-is, bring any major benefit. I've mentioned before that I don't think there's any great reason for UTF32String to even be part of base any more, except for Cwstring (which I haven't seen much use of in Julia code on unices anyway). Adding marker types for multiple obscure variants of it doesn't seem necessary for base either. I might be convinced otherwise, especially as part of a more substantial redesign of string internals, but it's going to take some time and technical justification (not just "What is it that you don't understand" and what essentially amounts to 'take my word for it' despite being still relatively new to Julia).

@ScottPJones
Collaborator

@tkelman The frequency of my comments has had to do with the many issues I've found. It's not a good thing if people let reports of problems "get under their skin". Just as I have learned a LOT from the thorough reviews of my changes to try to fix those bugs, the core contributors should take the opportunity to learn from "outsiders" reviewing their design and code, and questioning their design decisions. If the design and/or code can't stand up to a review, then instead of getting defensive, they should acknowledge that, and help get things fixed.
I haven't ever made derogatory statements about anybody, and have many times praised the overall work.
I think that maybe some other people need to take a deep breath and chill.

Yes, master is now how breaking changes are currently handled... my point is that maybe that is not the best way when dealing with a change that is known to be breaking in advance. Too many people have to use latest, simply because the functionality simply isn't present in the release branch, or because of bugs only fixed in latest. One of my colleagues has twice been bitten by packages broken by the tuple changes (ODBC.jl and StrPack.jl), the last one still broken a couple months after the breaking tuple changes...
Could a "breaking" branch be set up, and nightly builds made of that, as well as a "stable" latest?
Registered packages could then be (automatically?) tested against the "breaking" builds, and a scoreboard kept of which packages still need work to be able to function with the changes. When all the packages are either working, or it is decided that they would be removed from the registered packages, and Compat.jl has been updated (if possible) to handle the changes, then that breaking branch can become the new "stable" latest.

The current PRs with Unicode fixes don't need Encodings; I didn't say that they did. What I said was that Encodings would be needed in order to fix some of the issues I have brought up, specifically the ones about converting a Vector{UInt8} to a UTF16String or a UTF32String, which try to look for a BOM but can't handle a UTF-8 BOM, for example, or a UTF-16 BOM when converting to UTF32String, or vice versa.
The nice thing about Encodings is that they are extensible. If you wanted (in a package, of course) to define encodings for SJIS, EUC, CP1252, SCSU, or BOCU-1, you could. The "obscure" variants you refer to are already being handled in the current code in Base; they are just handled implicitly and inconsistently.

About moving UTF32String out of Base: it is needed for handling Cwstring, so at least conversions to and from UTF-32 encoding need to be handled. There are a number of string handling "features" that really could be moved out of Base, such as RopeString (I'd make a PR to move them out right away, but I'd want to check whether any registered packages use them first, and I haven't had time to do that yet).

Also, not having Encodings in Base greatly restricts how general purpose unsafe_checkstring can be, because it can't distinguish between different things stored in a Vector/Array/AbstractArray of UInt8.
I haven't just said "take my word for it", I'd already pointed out concrete examples of bugs in the current code, directly due to this conflation of encodings and storage representation.

I know I'm relatively new to Julia, and I've been taking all of the (very good) advice on how to better use Julia (I hope that my responsiveness to review suggestions has been noted), but I'm not at all new to software engineering or string handling / Unicode issues in particular. I'm not asking you to "take my word for it" at all, I believe strongly in "trust, but verify"... so even if in the future you gain some level of trust in my knowledge of string / Unicode issues, I would always want you to never simply take me at my word.
(I've done the same thing with people's advice on Julia... I've trusted them, but done my own testing, which has several times shown up problems with that advice, which has since been fixed... such as the performance of copy! with arrays of different element sizes, and the performance regressions that I reported, which Jeff fixed).

@ScottPJones
Collaborator

@quinnj I've come up with some examples that make me sure all Encodings should be abstract, never immutable.

CESU-8, for example, is a variant of UTF-8, as is Java's Modified UTF-8.
It should be possible to define those as:

abstract CESU8 <: UTF8
abstract ModUTF8 <: UTF8

Base64 encoding, as well as the Base64ws that I want to add (I already have my own C & Julia implementations), can be defined as a type of ASCII, i.e.

abstract BASE64 <: ASCII
abstract BASE64WS <: ASCII

I also think the basic encodings should be done as a hierarchy:

abstract Encoding
abstract DirectIndexedEncoding <: Encoding

abstract Binary <: DirectIndexedEncoding
abstract ASCII  <: DirectIndexedEncoding
abstract Latin1 <: DirectIndexedEncoding

abstract UTF8   <: Encoding
abstract UTF16  <: Encoding
abstract UTF32  <: DirectIndexedEncoding
abstract UCS2   <: DirectIndexedEncoding

# Opposite endian encodings of 16-bit and 32-bit encodings
abstract UTF16OE <: UTF16
abstract UTF32OE <: UTF32
abstract UCS2OE  <: UCS2

# This is easier to use (and not get the ordering mixed up) than ENDIAN_BOM
const BIG_ENDIAN = reinterpret(UInt32,UInt8[1:4;])[1] == 0x01020304

if BIG_ENDIAN
    abstract UTF16BE <: UTF16
    abstract UTF32BE <: UTF32
    abstract UCS2BE  <: UCS2
    abstract UTF16LE <: UTF16OE
    abstract UTF32LE <: UTF32OE
    abstract UCS2LE  <: UCS2OE
else
    abstract UTF16LE <: UTF16
    abstract UTF32LE <: UTF32
    abstract UCS2LE  <: UCS2
    abstract UTF16BE <: UTF16OE
    abstract UTF32BE <: UTF32OE
    abstract UCS2BE  <: UCS2OE
end

This allows one to look at the encoding, without having to know about what particular variant is being used (except where it matters). You can also then make parameterized functions to get the "traits" of a particular encoding, such as "nativeendian", "bigendian".
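Those trait functions might look like this (a hedged sketch; nativeendian and bigendian are the hypothetical names suggested above, built on the hierarchy in the code block of this comment):

```julia
# All encodings are native-endian unless they are one of the OE
# (opposite-endian) variants of the hierarchy above.
nativeendian{E<:Encoding}(::Type{E}) = true
nativeendian{E<:UTF16OE}(::Type{E})  = false
nativeendian{E<:UTF32OE}(::Type{E})  = false
nativeendian{E<:UCS2OE}(::Type{E})   = false

# An encoding is big-endian if it matches the machine's byte order when native,
# and the opposite byte order when it is an OE variant.
bigendian{E<:Encoding}(::Type{E}) = nativeendian(E) ? BIG_ENDIAN : !BIG_ENDIAN
```

So bigendian(UTF16BE) would be true on any machine, while nativeendian(UTF16BE) depends on where the code runs; generic code can query the trait instead of being cluttered with endianness conditionals.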

@tkelman

@ScottPJones we're not getting defensive, we're telling you that you're managing to annoy us and should try to find a way to communicate without doing so for things to ever move forward more productively. It's not the bugs that are annoying us, it's just your habits of discourse. To prove my point, holy crap your response was long and I'm way too busy to read it all.

@StefanKarpinski
Collaborator

Aside from interpersonal nonsense, it's pretty clear from the technical content of this conversation that the design of encodings is not nailed down enough to go into Base. This is not a complete design – you are still actively in the process of exploring the design. There is something very fishy about all encodings being abstract types. Whether that's right or not, however, while it's even a question, it's too early to put them into Base. It will be time when there's been a working design that everyone's happy with for at least a month.

@quinnj
Owner Author

@quinnj quinnj commented on 20e0612 Jun 13, 2015

@ScottPJones, I don't see any advantage to defining

UTF16LE <: UTF16

vs.

typealias UTF16 UTF16LE # on little-endian machine

I find the latter easier to reason about, personally.

I also don't even know if we need the UTF16LE/UTF16BE distinction; how is the actual code really going to be that different? I mean, maybe all we need is a simple if BIG_ENDIAN... in the code that does a simple switch. I suspect we just need to start working on the actual code this involves to really see how it pans out.

@ScottPJones
Collaborator

To prove my point, holy crap your response was long and I'm way too busy to read it all.

@tkelman I wish you would. Did you see my suggestion about how to handle things without as much grief as the tuple changes caused (and have still been causing, as of this week)? I think "Arraymageddon" won't be bad, because it will be at the very start of a cycle; the problem with the tuple changes is that they came after 9 months, when people were already depending on things in 0.4.

@StefanKarpinski That's fine, but I don't see it really taking an entire month to work out all of the angles.
What do you view as being fishy about the encodings being abstract types?

@ScottPJones
Collaborator

@quinnj By doing the way I described, you can define things based on a more abstract level of the encoding, i.e. UTF16, instead of big-endian vs. little-endian (which only is an issue when you have to store them into bytes). For example, next will return a Char, not a sequence of bytes, so you can write a function that handles UTF16 encoding, and pass it something that is UTF16OE, and it will handle the byte-swapping for you. That is the great advantage in making it a hierarchy in this fashion... you shouldn't have to have your code cluttered with if BIG_ENDIAN (it's not pretty, having had to do just that in C code,
and this is a way of handling things where Julia would shine).
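Concretely, the byte-swapping dispatch being described could be sketched as follows (codeunit16 is a hypothetical helper, assuming the abstract hierarchy where UTF16OE <: UTF16):

```julia
# Hypothetical: fetch one 16-bit code unit. Only the opposite-endian variants
# pay for a bswap; the more specific UTF16OE method wins dispatch for them,
# so code written against the abstract UTF16 never sees endianness.
codeunit16{E<:UTF16}(::Type{E},   d::Vector{UInt16}, i) = d[i]
codeunit16{E<:UTF16OE}(::Type{E}, d::Vector{UInt16}, i) = bswap(d[i])
```

A next method for UTF16 built on codeunit16 would then work unchanged for byte-swapped data, which is the "no if BIG_ENDIAN clutter" point being made here.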

@tkelman

I eventually did, but walls-of-text are not very effective. Discussions of how to handle breaking changes are largely unrelated to this "Strings redesign needs more thought" conversation.

Something that we should look into doing in the future is running PkgEvaluator on large breaking PR's before they get merged. We have all the infrastructure to do this, we just need another few people to work on hooking it all up to make it happen.

@StefanKarpinski
Collaborator

Something that we should look into doing in the future is running PkgEvaluator on large breaking PR's before they get merged. We have all the infrastructure to do this, we just need another few people to work on hooking it all up to make it happen.

+1 – I feel like we were doing that at some point and then stopped doing it.

@quinnj
Owner Author

@quinnj quinnj commented on 20e0612 Jun 13, 2015

@ScottPJones, alright, I'd be willing to structure the Encodings that way (let's leave them out of a module for now) if you could submit a PR. I'll probably try to play with all this some more next week as I find time (though I'm trying to overhaul ODBC in the meantime, so I may not be able to do much).

@tkelman

I don't think we ever ran PkgEvaluator on a PR before it got merged, the buildbots are much more capable now than they were prior to ~6 months ago. As you've probably noticed we've had much less of Elliot and Iain's time lately, they're the main people who would be able to facilitate this kind of thing easily. What they've put in place can largely be used by other people now though.

@StefanKarpinski
Collaborator

There are two different concepts that may be meant by UTF16:

  • the collection of encodings which are some kind of UTF-16 – little endian, big, native, opposite, etc.
  • the native-endian standard UTF-16 encoding on this particular machine.

The former being abstract makes a lot of sense. The latter seems pretty concrete to me.
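The commit under discussion resolves the second, concrete sense at load time by aliasing whichever byte-order variant matches the host. A condensed sketch of that pattern (0.4-era Julia syntax; type names taken from this commit):

```julia
abstract Encoding
immutable UTF16LE <: Encoding end
immutable UTF16BE <: Encoding end

# Pick the machine's native byte order once, at load time, so that
# `UTF16` always denotes "standard UTF-16 in this machine's endianness".
if ENDIAN_BOM == 0x04030201      # little-endian host
    typealias UTF16 UTF16LE
elseif ENDIAN_BOM == 0x01020304  # big-endian host
    typealias UTF16 UTF16BE
end
```

With this arrangement `UTF16` is concrete on any given machine, while the LE/BE variants remain available for data read in a foreign byte order.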

@ScottPJones
Collaborator

@tkelman I was just trying to respond to everything in your fairly large "wall-of-text" 😀 You did have two paragraphs there about the issues of breaking changes and regressions.
I wasn't aware of PkgEvaluator, that sounds like a very good idea.
What do you think of not doing known major breaking changes except right after a new release? (e.g. the coming "Arraymeggedon"; and maybe simply have new releases more frequently, to avoid the temptation to put in something like the tuples change at the end of a long cycle). I think @StefanKarpinski is on the right track with proposing a short 0.5 cycle focusing on breaking array changes.

@ScottPJones
Collaborator

String{UTF16} would be what you want for a concrete implementation of UTF-16 encoding.
I think doing it that way makes a nice clean separation between what is abstract, and what is concrete.
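One way to read this proposal (a sketch, not code from this commit; `UTF16Kind` and `nativeunits` are hypothetical names for illustration): keep each byte-order variant as its own concrete type under an abstract family, so `String{T}` stays concrete for any specific encoding while methods can still dispatch on the family as a whole:

```julia
abstract Encoding
abstract UTF16Kind <: Encoding        # abstract family: any flavor of UTF-16
immutable UTF16LE <: UTF16Kind end    # concrete little-endian variant
immutable UTF16BE <: UTF16Kind end    # concrete big-endian variant

immutable String{T<:Encoding}
    ptr::Ptr{UInt8}
    len::Int                          # length in bytes
end

# One method covers every UTF-16 variant: each code unit is 2 bytes...
ncodeunits{T<:UTF16Kind}(s::String{T}) = div(s.len, 2)
# ...while String{UTF16LE} itself remains a fully concrete type.
```

The separation is the point: abstract types carry the shared behavior, and the concrete leaf types carry the in-memory representation.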

@tkelman

Work happens as it happens, schedules and plans never work out very well. Short release cycle sounds like a great idea in theory but we've yet to be able to pull it off in practice.

@ScottPJones
Collaborator

One way to make it happen is to simply say that every 6 months, a new release will be cut, with everything that is currently stable in latest, instead of defining releases based on which features you want to get in.
If a feature isn't really ready by the release date, it doesn't go in and hopefully goes into the next one, but since the cycles are short, that's not too painful. We went to that approach after having some very long release cycles (1.5 years!), with customers then wanting ad hoc releases with features that they absolutely needed. It's not perfect, and the 6 months ended up stabilizing at about 8 months... but it works.

@tkelman

What you're stating there was almost exactly the plan as of 9-10 months ago. Turns out master isn't very stable right now, and cutting a release from its current state without vital features like precompilation doesn't make much sense.

@ScottPJones
Collaborator

@tkelman That "waiting for vital features" is exactly the problem. Was 0.4 relatively stable before the tuple changes? If so, then *that's* when a new release branch should have been cut (0.4-pre?). Then the tuple introduction would have gone a lot more smoothly, and 0.5 would have been about new tuples, precompilation (and Unicode bug fixes? 😀)... following 6 months later, 0.6 would have been about arrays...

@tkelman

Was 0.4 relatively stable before the tuple changes?

Not really. It's had intermittent segfaults and other issues all year.

@ScottPJones
Collaborator

That sounds like it's time to focus on tracking down bugs, instead of adding the next big thing... not nearly as much fun, but better for everybody in the long run.

@StefanKarpinski
Collaborator

I'd like to apologize for using the term "dick" in reference to @ScottPJones earlier in this thread (and for necromancing such a long-dead thread in the process). I regret losing my temper and using inappropriate language on @quinnj's repository.
