Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,111 @@ | ||
module Strings

# package code goes here
using Compat, Mmap

# An Encoding is a pure type-level tag: it carries no data and is only ever
# used as a type parameter to record how a String's bytes are interpreted.
abstract Encoding
# Encodings whose code units map 1:1 to characters, so that integer indexing
# is O(1) (no variable-length decoding needed).
abstract DirectIndexedEncoding <: Encoding

immutable ASCII <: DirectIndexedEncoding end
immutable Latin1 <: DirectIndexedEncoding end

# Little-endian variants of the multi-byte Unicode encodings.
immutable UTF8 <: Encoding end
immutable UTF16LE <: Encoding end
immutable UTF32LE <: DirectIndexedEncoding end
immutable UCS2LE <: DirectIndexedEncoding end

# Big-endian variants.
immutable UTF16BE <: Encoding end
immutable UTF32BE <: DirectIndexedEncoding end
immutable UCS2BE <: DirectIndexedEncoding end

# Alias the unsuffixed names to whichever byte order matches the host, so
# UTF16/UTF32/UCS2 always mean "native endianness".
if ENDIAN_BOM == 0x01020304
    typealias UTF16 UTF16BE
    typealias UTF32 UTF32BE
    typealias UCS2 UCS2BE
elseif ENDIAN_BOM == 0x04030201
    typealias UTF16 UTF16LE
    typealias UTF32 UTF32LE
    typealias UCS2 UCS2LE
else
    # unreachable on any supported platform, but fail with a useful message
    error("unsupported host byte order: ENDIAN_BOM is neither big- nor little-endian")
end

# A String is an unowned view of `len` bytes starting at `ptr`; the bytes
# live in the module's mmapped StringPool and are interpreted according to
# the encoding tag `T`.
immutable String{T<:Encoding}
    ptr::Ptr{UInt8}
    len::Int
end
|
||
# `==` is used at module top level above (the ENDIAN_BOM checks), so it must
# be explicitly imported before a method can be added to it; the original
# relied on implicit extension, which warns (and errors in later Julia).
import Base: ==

# Byte-wise equality: equal iff same byte length and identical contents.
# NOTE(review): encoding parameters are not compared, so strings of different
# encodings with identical bytes compare equal — confirm this is intended.
function ==(a::String, b::String)
    na = sizeof(a)
    nb = sizeof(b)
    na != nb && return false
    a.ptr == b.ptr && return true  # same memory, trivially equal
    # lengths are known equal here, so compare exactly `na` bytes
    # (the original passed min(na,nb), which is redundant at this point)
    c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt),
              a.ptr, b.ptr, na)
    return c == 0
end
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
quinnj
Author
Owner
|
||
|
||
# Granularity of pool allocations: the OS page size on unices, the
# allocation granularity on Windows.
const PAGESIZE = @compat Int(@unix ? ccall(:jl_getpagesize, Clong, ()) :
                                     ccall(:jl_getallocationgranularity, Clong, ()))

# Bump allocator backing all String data. `pool` holds the mmapped byte
# buffers, `ind` is the index of the buffer currently being filled, and
# `pos` is the 1-based index of the next free byte within that buffer.
# This would be handled by native Julia GC, but we'll roll our own for now.
type StringPool
    pool::Vector{Vector{UInt8}}
    ind::Int
    pos::Int
end

# Single global pool, seeded with one empty page.
const POOL = StringPool(Any[Mmap.mmap(UInt8, PAGESIZE)], 1, 1)
|
||
# Ensure the pool can accept `n` more bytes, mmapping a fresh buffer when the
# current one is too full.
# Fixes vs. original: a request of exactly PAGESIZE bytes fell through every
# branch without allocating anything; an exact fit at the end of the current
# buffer was needlessly rejected; and capacity is now measured against the
# actual current buffer length (multi-page buffers are larger than PAGESIZE).
function ensureroom!(n::Int)
    if POOL.pos + n - 1 <= length(POOL.pool[POOL.ind])
        # current buffer already has room for `n` bytes
        return
    end
    # need a fresh buffer: a single page if `n` fits, otherwise enough whole
    # pages to cover `n`
    sz = n <= PAGESIZE ? PAGESIZE : (div(n, PAGESIZE) + 1) * PAGESIZE
    push!(POOL.pool, Mmap.mmap(UInt8, sz))
    POOL.ind += 1
    POOL.pos = 1
    return
end
|
||
# Copy `n` bytes from `s + offset` (offset is a 0-based byte displacement,
# matching the `first(r)-1` callers pass) into the pool; return (ptr, n).
# Fixes vs. original: the default offset was 1, silently skipping the first
# source byte; the destination pointer was off by one (POOL.pos is a 1-based
# index, cf. the Vector method below); and POOL.pos was never advanced, so
# any subsequent allocation would overwrite the stored bytes.
function storebytes!(s::Ptr{UInt8},n::Int,offset::Int=0)
    ensureroom!(n)
    # unsafe_copy!(dest::Ptr{T}, src::Ptr{T}, N)
    ptr = pointer(POOL.pool[POOL.ind]) + POOL.pos - 1
    unsafe_copy!(ptr, s + offset, n)
    POOL.pos += n
    return ptr, n
end
# Store the concatenation of N byte buffers contiguously in the pool and
# return (pointer-to-start, total-byte-count).
# Fixes vs. original: the total size was computed with prod(lens) instead of
# sum(lens); the destination pointer was off by one (POOL.pos is 1-based);
# and POOL.pos was never advanced, so the stored bytes could be overwritten
# by later allocations.
function storebytes!{N}(ptrs::NTuple{N,Ptr{UInt8}},lens::NTuple{N,Int})
    @inbounds begin
        n = sum(lens)
        ensureroom!(n)
        # unsafe_copy!(dest::Ptr{T}, src::Ptr{T}, N)
        starting_ptr = pointer(POOL.pool[POOL.ind]) + POOL.pos - 1
        ptr = starting_ptr
        for i = 1:N
            unsafe_copy!(ptr, ptrs[i], lens[i])
            ptr += lens[i]
        end
        POOL.pos += n
    end
    return starting_ptr, n
end
# Copy `n` bytes out of `src` (starting at element `offset`, 1-based) into
# the pool; return (pointer-to-copy, n) and advance the pool cursor.
function storebytes!(src::Vector{UInt8}, n::Int, offset::Int=1)
    ensureroom!(n)
    buf = POOL.pool[POOL.ind]
    # unsafe_copy!(dest::Array, do, src::Array, so, N)
    unsafe_copy!(buf, POOL.pos, src, offset, n)
    p = pointer(buf) + POOL.pos - 1
    POOL.pos += n
    return p, n
end
# Wider code-unit vectors are stored by reinterpreting them as raw bytes.
# NOTE(review): `n` is forwarded unchanged, so it must already be a *byte*
# count; if callers pass an element count these should scale by
# sizeof(eltype(s)) — no call sites are visible in this file, so confirm.
storebytes!(s::Vector{UInt16},n::Int) = storebytes!(reinterpret(UInt8,s),n)
storebytes!(s::Vector{UInt32},n::Int) = storebytes!(reinterpret(UInt8,s),n)
storebytes!(s::Vector{Char},n::Int) = storebytes!(reinterpret(UInt8,s),n)
|
||
include("ascii.jl") | ||
|
||
end # module |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Copy an ASCIIString's bytes into the pool and wrap them as a String{ASCII}.
function String(str::ASCIIString)
    p, n = storebytes!(str.data, length(str))
    return String{ASCII}(p, n)
end
|
||
# Print the string surrounded by double quotes, decoding one byte per
# character. The original special-cased the empty string, but that branch
# produced exactly the same output ("" ) as the general path, so it has
# been removed.
function Base.show(io::IO, x::String{ASCII})
    print(io, Char('"'))
    for i = 1:x.len
        print(io, Char(unsafe_load(x.ptr, i)))
    end
    print(io, Char('"'))
    return
end
# ASCII is a direct-indexed encoding — one byte per character — so the last
# valid index, the character count, and the byte size all coincide.
Base.endof(x::String{ASCII}) = x.len
Base.length(x::String{ASCII}) = x.len
Base.sizeof(x::String{ASCII}) = x.len
|
||
# Return the character at byte index `i` (1-based). Bytes >= 0x80 are not
# valid ASCII and map to the replacement character U+FFFD.
# Fix vs. original: the bounds check was `0 < i < x.len`, which made the
# last character unreachable; x[endof(x)] is valid and now allowed.
function Base.getindex(x::String{ASCII}, i::Int)
    0 < i <= x.len || throw(BoundsError())
    c = unsafe_load(x.ptr,i)
    return ifelse(c < 0x80, Char(c), '\ufffd')
end
|
||
# substring | ||
# Substring via range indexing. Small results (< 25% of the original or
# < 16 bytes) are copied into the pool; larger ones share the original's
# memory to avoid the copy.
# Fixes vs. original: defined as `Base.getindex` (matching the Int method
# above) so that `s[a:b]` syntax dispatches here from outside the module,
# and the empty-range check now precedes the bounds check so empty ranges
# such as s[n+1:n] are accepted, as with Base strings.
function Base.getindex(s::String{ASCII}, r::UnitRange{Int})
    isempty(r) && return String{ASCII}(C_NULL,0)
    (0 < first(r) <= s.len && last(r) <= s.len) || throw(BoundsError())
    n = length(r)

    if n < div(s.len,4) || n < 16 # < 25% of original string size or really small
        # just make a copy
        ptr, len = storebytes!(s.ptr, n, first(r)-1)
        return String{ASCII}(ptr,len)
    else
        # share data with original string
        return String{ASCII}(s.ptr+UInt(first(r)-1),n)
    end
end
|
||
# concatenation | ||
# Concatenate any number of ASCII strings into one new pool-backed string.
# Fixes vs. original: POOL.pos was never advanced after the copies, so the
# next pool allocation would overwrite the concatenated data; the destination
# pointer is now POOL.pos - 1 (1-based index), matching storebytes!; and the
# method is defined as `Base.string` for consistency with the file's other
# Base extensions (it remains reachable as Strings.string).
function Base.string(strs::String{ASCII}...)
    @inbounds begin
        N = length(strs)
        n = 0
        for i = 1:N
            n += strs[i].len
        end
        ensureroom!(n)
        # unsafe_copy!(dest::Ptr{T}, src::Ptr{T}, N)
        starting_ptr = pointer(POOL.pool[POOL.ind]) + POOL.pos - 1
        ptr = starting_ptr
        for i = 1:N
            unsafe_copy!(ptr, strs[i].ptr, strs[i].len)
            ptr += strs[i].len
        end
        POOL.pos += n
    end
    return String{ASCII}(starting_ptr,n)
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,33 @@ | ||
using Strings
reload("Strings")
using Base.Test

# Fix vs. original: every comparison below was computed but never asserted
# (no @test), so the test suite could not fail; the `@test 1 == 1`
# placeholder has been dropped.
s = Strings.String("hey there")
@test s[1] == 'h'
@test s[2] == 'e'
@test s[3] == 'y'

@test endof(s) == 9
@test sizeof(s) == 9
@test length(s) == 9

@test s[1:3] == Strings.String("hey")
@test s[2:4] == Strings.String("ey ")

s = Strings.String("a fairly sizable string to test larger string sizes")
@test s[1:3] == Strings.String("a f")
@test s[1:20] == Strings.String("a fairly sizable str")

@test Strings.string(Strings.String("hey"),Strings.String(" "),Strings.String("ho")) == Strings.String("hey ho")

# crude timing comparison: Base string concatenation (allocates a new string
# each iteration) vs. pooled concatenation
s = ""
@time for i = 1:1000
    s *= " "
end

s = Strings.String("")
space = Strings.String(" ")
@time for i = 1:1000
    s = Strings.string(s,space)
end
50 comments
on commit 20e0612
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be possible to split out the encoding type, as a real small PR, that might get put in Base, that hopefully TPTB could approve really quickly? The whole change will take some time, and should be in a package, for now at least (much easier to debug and update) [but I'd love to see this sort of stuff as part of Base...], but the simple addition of a nice type system to indicate encoding, separating it from the storage, would be good to have for lots of things to build upon.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmmm......possibly, what are the use cases you'd see for it immediately? I'd hesitate, because I think for most cases I can imagine, we're already using, or at least should be using, UTF8String
, UTF16String
, etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd change my check_string functions to use an encoding for a first argument, instead of determine what encoding to check based on the type of the container, which isn't really as extensible...
This would make my functions much more general.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tkelman Now that you've joined us, what do you think of @quinnj quickly putting in a PR to add just the encoding to Base... I'd immediately update check_string to use his Encoding types, they'd really allow me to improve the generality of the code. (trying to conflate storage and encoding, which is what happens now throughout Julia,and I unfortunately added to, is simply not correct, and with Encoding, we can fix it).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Neutral on that idea, for now. If we were to go a little crazier and think about having a single String
type parameterized by the encoding, then we might be in business. It seems like there's too much (mostly) copy-pasted code with small variations between these different encoding sizes, feels like it's missing abstraction. This might also be a terrible idea and an abuse of type parameters, dunno.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The point is, using the current string types to attempt to indicate encoding just isn't correct.
Vector{UInt8} could contain any of the 9 different Encoding types (and those encoding types are just
the ones that represent subsets of Unicode (i.e. ASCII, Latin1, UCS2), or encodings using 8-bit or 16-bit code units, plus byte swapped variants. The check_string function really should be able to handle all of those, and also handle UTF-16 and UCS2 in a vector of 16-bit words, plus UTF-32 in a vector of 32-bit words. That's where having his encoding types would allow greater usefulness of the functions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you are also viewing the Encodings still as being part of the String PR.
I view them as separate... the Encoding type is a fundamental building block for good string handling, that was simply missing. Whatever happens to this Strings PR, which I like, but needs more work ironing out all the issues, and maybe combining with some of Stefan's ByteVec ideas, I don't see any reason Encodings shouldn't go into Base now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ScottPJones, I agree that the Encoding
idea sounds great (I mean, I did come up with it.... :) ), but let's not rush anything. I know you like to push at a good pace, but let's let things simmer a bit before being too hasty at cramming things in Base. Particularly with design decisions like this (as opposed to bug fixes, performance patches, etc.), we should really take some time to think through possible use cases of a separate Encoding
type and all the corners of the current codebase it could/would touch. I hate the idea of having an isolated Encoding
type that is used by a small family of methods while the larger string codebase exists without it.
I think there's starting to be some good momentum for a string overhaul (kicked off by @StefanKarpinski's ByteVec PR), so with a big, probably breaking change in strings on the horizon, that would be a great time to factor in an Encoding
type and maybe Char{E<:Encoding}
as a part of the overhaul. Those are the exact things that hopefully we can play with in this repo and do a proof-of-concept of how things could look. Having the start of a great, simple, and performant string redesign that someone could play with would go a long ways to getting it into Base.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, but do you understand that having Encodings
and my new & improved check
function, instead of what's currently in #11575, will help fix a number of other Unicode related bugs?
There are a number of problems that have come from conflating the encoding and way it is put in storage...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I follow your comments:
new & improved check function
as @tkelman mentioned, I'm not sure we're really seeing a huge improvement by putting two methods together with a big if E <: UTF8
to split most of the method in two. It smells of a need for further refactoring into other methods and use of dispatch.
will help fix a number of other Unicode related bugs?
You mean, your PR here has fixes that are not included in JuliaLang/julia#11575? Why can't they be included there?
There are a number of problems that have come from conflating the encoding and way it is put in storage...
Can you expound on this point? What's an example of a specific case where the encoding and the way it is stored are conflated? I'm not doubting, but I just don't know what this problem looks like necessarily. How does the new Encoding
type specifically address these issues?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, this PR does not have any extra fixes.
This PR (placed in #11575) and merged into Base, would allow me to deal easily with the code that tries to look for a BOM, and possibly do a byte-swap operation. The current code is inconsistent between UTF-16 and UTF-32, and I pointed out elsewhere that this would help me be able to fix #11463, #11501, and #11502.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you expound on this point? What's an example of a specific case where the encoding and way it is stored?
For example, there are currently methods to convert to UTF16String
/ UTF32String
, from Vector{UInt8}
,
which assume that they will find either UTF16LE
or UTF16BE
for UTF16String
, and UTF32LE
or UTF32BE
for UTF32String
. That totally misses that you may really have a UTF16
(either endian) or a UTF8
stored in the Vector{UInt8}
that needs to be converted to a UTF32String
.
The other thing that my new code handles is to handle not just Vector{UInt8}
, but also AbstractVector
and AbstractArray
[I haven't had a chance to try what you suggested about just being able to say AbstractArray{UInt8}
, the old code actually does a reshape in some cases with AbstractArray
...,
so I assume that there must have been some reason they kept those separate...]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The picture of how things ought to work isn't clear enough yet to move encodings into Base. That's just asking for multiple rounds of breaking changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is it that you don't understand about Encodings that makes you think things are not clear enough?
Why would there be any breaking changes? I don't see that being the case at all.
First off, Encodings would only initially be used in Base by my checkstring function, nothing else.
The set of encodings that have been defined, except for Latin1, are all ones that are currently implicitly used in the current (broken) conversion code (even the opposite endian one, because there are conversions that try to handle a BOM...).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Plenty. What kinds of object should they be? Do they become type parameters of other types? If we stick them in Base now, I would bet cold hard cash on us not getting it 100% right and causing future breakage for people who used them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, @ScottPJones, I don't think it's a matter of not understanding what Encodings
are, but how we're going to use them exactly. Like I mentioned above, adding them right away would cause a little weirdness because we have UTF8String
,UTF16String
,etc. and with Encodings
, we'd have UTF8
,UTF16
,etc.
Like I said before, this is a much better change to introduce with a larger overhaul where it can be grafted in all the various corners that encodings touch (as opposed to using it just for a check_string
function).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@StefanKarpinski What do you mean "What kinds of object should they be?" They are simply types, used for parameterization, no values.
You seem to have little faith in the Julians' ability to get a design right at first... considering the breakage that happened with the "Tupocalypse", and the major breakage planned for "Arraymeggedon", maybe I can understand that attitude.
If it is an approach that I approved of, such as Jacob's, I'll take your bet...
$10? $20? A round of drinks at the Muddy Charles? I've already done this, twice (not twice because I got it wrong at all the first time, but once for single-byte encodings, back in '92, and then extended it for multi-byte encodings and Unicode in '96, but nothing at all broke from the first design...) What I handled then was a lot more complex than the simple cases here, which are simply subsets and encodings of Unicode.
@quinnj Should they all be marked as abstract
instead of immutable
? Does it make any difference, since they are only used for parameterization?
Also, I disagree that Encodings should be introduced along with larger changes (I've had too much bad experience with trying to get a sweeping change in, even just to fix outright bugs).
Better to get it in, have it used in a limited case (i.e. check_string
), let people "kick the tires" a bit,
before making it exported and documented, and trying to use it (and Strings.jl
) to replace the current string handling mess...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you possibly write a single comment without being a dick?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Excuse me??? Just what is your problem?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW, your comments do not conform to the Julia community standards...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My problem is that you are continually passive aggressive, argumentative, and dismissive of other people's hard work and disrespectful of other people's time and energy. Now you are trash talking the design decisions that have gone into Julia and the process by which we decide on and handle breaking changes. This is beyond galling and is incredibly insulting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You seem to have little faith in the Julians' ability to get a design right at first... considering the breakage that happened with the "Tupocalpyse", and the major breakage planned for "Arraymeggedon", maybe I can understand that attitude.
@ScottPJones, this comment is disrespectful and attacks @StefanKarpinski and his hard work directly. Stefan was sharing his thoughts on a design decision and you turned it into a personal questioning of his abilities and ideas. Not ok. You may disagree with another's opinion, but an appropriate response is focused on providing evidence or reasoning around the idea at hand, not making a personal, even if somewhat offhand or joking, comment.
On abstract vs. immutable, I've never really heard a good argument one way or the other, but there is somewhat of precedent for making them immutable. The corollary is with Encoding
being a true abstract types, while UTF8
is a "concrete" subtype, i.e. you wouldn't need/want to subtype UTF8
.
On waiting to put Encodings
into Base, I think the counter-argument goes back to Stefan's original comment here. Encodings
will end up touching several corners/places in Base and string handling, so it's hard to know exactly how they should be structured until a larger integration is carried out. Sure, they seem to work fine for check_string
now, but maybe when they're incorporated into some other code, we realize we actually need them to be structured a little differently (I've already been making tweaks and changes here and there as I've continued to piece things together). This also isn't quite as drastic as a "complete sweeping change" since the user-facing API will stay relatively the same, and it's mainly a major reworking of the internals. In these cases, I think it's much better to incorporate all the appropriate changes at once (which can be thoroughly tested by CI and by pulling changes locally) to make sure the new internals are solid, and then releasing, hopefully only needing deprecations.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You seem to be living on another planet. Go read my comments - I have heaped loads of praise on both Julia, and the people who have come up with it, and have many many times thanked all of the time and energy that people have spent on constructive criticism. I have NOT ONCE made a derogatory comment about another person here.
Trash talking the design decisions? I don't think I've said anything that hasn't been said before, by any number of people, including yourself... (I dared criticize string interpolation... turns out Jeff is against it also... I criticized the choice of *
for string concatenation... do I need to quote your very own words back at you???). Many many people have complained about the fallout from the "tupocalypse"... maybe that could have been better handled? I agree that the new approach for tuples is better, but that just points out that the original design had a flaw so severe that it was decided that a very painful breaking change was necessary. Why do you feel commenting on that is so insulting? Were all of the design decisions I've criticized your personal decisions? Or do you feel threatened if I bring up obvious bugs in the code?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@quinnj I didn't think that comment was an attack on Stefan... he was the one that seemed disrespectful of your work, i.e.:
I would bet cold hard cash on us not getting it 100% right and causing future breakage for people who used them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@quinnj Putting Encodings into Base would not mean that they are cast in stone, right from the beginning.
What changes (except for removing Binary, which I think should go back, and moving it out of a module, for convenience) have you had to make to Encodings? I hadn't yet seen any.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ScottPJones, it was inappropriate because it was directed at Stefan personally, ("You seem to have little faith....", my emphasis added), whereas Stefan's comment was focused on the design, not me personally. I didn't think twice about Stefan's comment because I can objectively view his opinion there and see his point of view that things might change. Your comment to Stefan, however, made me cringe a little because it took his counter-argument to putting Encodings in Base right away to questioning his faith/trust in collaborators.
Putting things in Base is a way of casting things in stone, so to speak; we really want to feel comfortable that it's at least the best decision/design we can come up with at the moment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@quinnj Read @StefanKarpinski 's comment again... if you cringe at the "you", why don't you cringe at the "we" and "us", and saying that "we'd" not get it right? How is that directed at the design? That is directed at the people who came up with the design, and the people wishing to have it moved into Base.
If you take my comment, which was "You seem to have little faith...", my emphasis added) as an attack,
which it wasn't meant to be (I wanted to point out the inconsistency in the position of advocating major breaking changes, while being afraid that something might cause future breakage), then you need to take that comment doubly as an attack, and what do you say about personal comments such as "Could you possibly write a single comment without being a dick?". Is that defensible behavior around here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Putting things in Base is a way of casting things in stone, so to speak; we really want to feel comfortable that it's at least the best decision/design we can come up with at the moment.
I really hadn't seen much evidence of things being in Base being cast in stone, in the < 3 months since I became aware of Julia... It really would be nice if major breaking changes were staged in some better fashion, maybe with separate branch, or repository, with all registered packages being either fixed, or removed, before that branch becomes the canonical branch.
(I'm used to even minor breaking changes simply not being allowed... even things like much greater memory consumption or performance regressions were not allowed... can't have a hospital shut down because of an update).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just chill. @ScottPJones the frequency, length, and tone of your comments have managed to get under the skin of a significant fraction of the core contributors to Julia, in a short period of time.
There is a separate branch for breaking changes - in Julia it's called master. We're quite a bit more conservative about backports to the release branch, and work pretty hard to keep that stable. If a package starts failing on release-0.3, that's almost always due to changes in the package rather than changes between Julia patch releases.
We could however do a better job of tracking performance regressions, and overall statistics from our test suite. That's not getting any better because no one is working on the necessary infrastructure for it right now.
On the technical subject of this conversation, we're not convinced yet that Encodings bring enough of a benefit to the overall design to be worth bringing into base as-is. Out here in a package where you're more free to experiment would be a good place to keep working on it and see how the bigger picture fits together. Base Julia and the core team really need to focus on critical release-defining issues right now, and redesigning strings is not mature enough to be part of that.
For the open PR's with unicode-related fixes that are potentially on the table, we're also not seeing how Encodings, as-is, bring any major benefit. I've mentioned before that I don't think there's any great reason for UTF32String
to even be part of base any more, except for Cwstring
(which I haven't seen much use of in Julia code on unices anyway). Adding marker types for multiple obscure variants of it doesn't seem necessary for base either. I might be convinced otherwise, especially as part of a more substantial redesign of string internals, but it's going to take some time and technical justification (not just "What is it that you don't understand" and what essentially amounts to 'take my word for it' despite being still relatively new to Julia).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tkelman The frequency of my comments has had to do with the many issues I've found. It's not a good thing if people let reports of problems "get under their skin". Just as I have learned a LOT from the thorough reviews of my changes to try to fix those bugs, the core contributors should take the opportunity to learn from "outsiders" reviewing their design and code, and questioning their design decisions. If the design and/or code can't stand up to a review, then instead of getting defensive, they should acknowledge that, and help get things fixed.
I haven't ever made derogatory statements about anybody, and have many times praised the overall work.
I think that maybe some other people need to take a deep breath and chill.
Yes, master is now how breaking changes are currently handled... my point is that maybe that is not the best way when dealing with a change that is known to be breaking in advance. Too many people have to use latest, simply because the functionality simply isn't present in the release branch, or because of bugs only fixed in latest. One of my colleagues has twice been bitten by packages broken by the tuple changes (ODBC.jl
and StrPack.jl
), the last one still broken a couple months after the breaking tuple changes...
Could a "breaking" branch be set up, and nightly builds made of that, as well as a "stable" latest?
Registered packages could then be (automatically?) tested against the "breaking" builds, and a scoreboard kept of which packages still need work to be able to function with the changes. When all the packages are either working, or it is decided that they would be removed from the registered packages, and Compat.jl
has been updated (if possible) to handle the changes, then that breaking branch can become the new "stable" latest.
The current PRs with Unicode fixes, don't need Encodings
, I didn't say that they did. What I said was that in order to fix some of the Issues I have brought up, specifically the ones about converting Vector{UInt8}
to a UTF16String
or a UTF32String
, which try to look for a BOM, but can't handle a UTF-8 BOM, for example, or a UTF-16 BOM when converting to UTF32String
or vice-versa, Encodings
would be needed.
The nice thing about Encodings
is that it is extensible. If you wanted (in a package, of course) to define encodings for SJIS
, EUC
, CP1252
, SCSU
, or BOCU-1
, you could. The "obscure" variants that you refer to are already being handled in the current code in Base, they just are done so implicitly and inconsistently.
About moving UTF32String
out of Base, it is needed for handling Cwstring
, so at least conversions to and from UTF-32 encoding need to be handled. There are a large amount of string handling "features" that really could be moved out of Base, such as RopeString
(I'd make a PR to move them out right away, but I'd want to check to see if any of the registered packages use them first, and I haven't had time to do that yet).
Also, not having Encodings
in Base greatly restricts how general purpose unsafe_checkstring
can be, because it can't distinguish between different things stored in a Vector
/Array
/AbstractArray
of UInt8
.
I haven't just said "take my word for it", I'd already pointed out concrete examples of bugs in the current code, directly due to this conflation of encodings and storage representation.
I know I'm relatively new to Julia, and I've been taking all of the (very good) advice on how to better use Julia (I hope that my responsiveness to review suggestions has been noted), but I'm not at all new to software engineering or string handling / Unicode issues in particular. I'm not asking you to "take my word for it" at all, I believe strongly in "trust, but verify"... so even if in the future you gain some level of trust in my knowledge of string / Unicode issues, I would always want you to never simply take me at my word.
(I've done the same thing with people's advice on Julia... I've trusted them, but done my own testing, which has several times shown up problems with that advice, which has since been fixed... such as the performance of copy!
with arrays of different element sizes, and the performance regressions that I reported, which Jeff fixed).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@quinnj I've come up with some examples, that make me sure that all Encodings
should be abstract, never immutable.
CESU-8
, for example, is a variant of UTF-8
, as is Java's Modified UTF-8
.
It should be possible to define those as:
abstract CESU8 <: UTF8
abstract ModUTF8 <: UTF8
Base64 encoding, as well as the Base64ws that I want to add (I already have my own C & Julia implementations), can be defined as a type of ASCII
, i.e.
abstract BASE64 <: ASCII
abstract BASE64WS <: ASCII
I also think the basic encodings should be done as a hierarchy:
abstract Encoding
abstract DirectIndexedEncoding <: Encoding
abstract Binary <: DirectIndexedEncoding
abstract ASCII <: DirectIndexedEncoding
abstract Latin1 <: DirectIndexedEncoding
abstract UTF8 <: Encoding
abstract UTF16 <: Encoding
abstract UTF32 <: DirectIndexedEncoding
abstract UCS2 <: DirectIndexedEncoding
# Opposite endian encodings of 16-bit and 32-bit encodings
abstract UTF16OE <: UTF16
abstract UTF32OE <: UTF32
abstract UCS2OE <: UCS2
# This is easier to use (and not get the ordering mixed up) than ENDIAN_BOM
const BIG_ENDIAN = reinterpret(UInt32,UInt8[1:4;])[1] == 0x01020304
if BIG_ENDIAN
abstract UTF16BE <: UTF16
abstract UTF32BE <: UTF32
abstract UCS2BE <: UCS2
abstract UTF16LE <: UTF16OE
abstract UTF32LE <: UTF32OE
abstract UCS2LE <: UCS2OE
else
abstract UTF16LE <: UTF16
abstract UTF32LE <: UTF32
abstract UCS2LE <: UCS2
abstract UTF16BE <: UTF16OE
abstract UTF32BE <: UTF32OE
abstract UCS2BE <: UCS2OE
end
This allows one to look at the encoding, without having to know about what particular variant is being used (except where it matters). You can also then make parameterized functions to get the "traits" of a particular encoding, such as "nativeendian", "bigendian".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ScottPJones we're not getting defensive, we're telling you that you're managing to annoy us and should try to find a way to communicate without doing so for things to ever move forward more productively. It's not the bugs that are annoying us, it's just your habits of discourse. To prove my point, holy crap your response was long and I'm way too busy to read it all.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aside from interpersonal nonsense, it's pretty clear from the technical content of this conversation that the design of encodings is not nailed down enough to go into Base. This is not a complete design – you are still actively in the process of exploring the design. There is something very fishy about all encodings being abstract types. Whether that's right or not, however, while it's even a question, it's too early to put them into Base. It will be time when there's been a working design that everyone's happy with for at least a month.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ScottPJones, I don't see any advantage to defining
UTF16LE <: UTF16
vs.
typealias UTF16 UTF16LE # on little-endian machine
I find the latter easier to reason about, personally.
I also don't even know if we need UTF16LE
/UTF16BE
distinctions; how is the actual code really going to be that different? I mean, maybe all we need is a simple if BIG_ENDIAN...
in the code that does a simple switch and that's all that's needed. I suspect we just need to start working on the actual code this involves to really see how it pans out.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To prove my point, holy crap your response was long and I'm way too busy to read it all.
@tkelman I wish you would. Did you see my suggestion about how to handle things without as much grief as the tuple changes caused (and have still been causing, as of this week)? I think "Arraymeggedon" won't be bad, because it will be at the very start of a cycle, the problem with the tuple changes is that it was after 9 months, when people were already depending on things in 0.4.
@StefanKarpinski That's fine, but I don't see it really taking an entire month to work out all of the angles.
What do you view as being fishy about the encodings being abstract types?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@quinnj By doing the way I described, you can define things based on a more abstract level of the encoding, i.e. UTF16
, instead of big-endian vs. little-endian (which only is an issue when you have to store them into bytes). For example, next
will return a Char
, not a sequence of bytes, so you can write a function that handles UTF16
encoding, and pass it something that is UTF16OE
, and it will handle the byte-swapping for you. That is the great advantage in making it a hierarchy in this fashion... you shouldn't have to have your code cluttered with if BIG_ENDIAN
(it's not pretty, having had to do just that in C code,
and this is a way of handling things where Julia would shine).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I eventually did, but walls-of-text are not very effective. Discussions of how to handle breaking changes are largely unrelated from this "Strings redesign needs more thought" conversation.
Something that we should look into doing in the future is running PkgEvaluator on large breaking PR's before they get merged. We have all the infrastructure to do this, we just need another few people to work on hooking it all up to make it happen.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Something that we should look into doing in the future is running PkgEvaluator on large breaking PR's before they get merged. We have all the infrastructure to do this, we just need another few people to work on hooking it all up to make it happen.
+1 – I feel like we were doing that at some point and then stopped doing it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ScottPJones, alright, I'd be willing to structure the Encodings that way (let's leave them out of a module for now) if you could submit a PR. I'll probably try to play with all this some more next week as I find time (though I'm trying to overhaul ODBC in the mean time, so I may not be able to do much).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we ever ran PkgEvaluator on a PR before it got merged, the buildbots are much more capable now than they were prior to ~6 months ago. As you've probably noticed we've had much less of Elliot and Iain's time lately, they're the main people who would be able to facilitate this kind of thing easily. What they've put in place can largely be used by other people now though.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are two different concepts that may be meant by UTF16
:
- the collection of encodings which are some kind of UTF-16 – little endian, big, native, opposite, etc.
- the native-endian standard UTF-16 encoding on this particular machine.
The former being abstract makes a lot of sense. The latter seems pretty concrete to me.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tkelman I was just trying to respond to everything in your fairly large "wall-of-text" 😀 You did have two paragraphs there about the issues of breaking changes and regressions.
I wasn't aware of PkgEvaluator, that sounds like a very good idea.
What do you think of not doing known major breaking changes except right after a new release? (i.e. like the coming "Arraymeggedon", and maybe simply have new releases more frequently, to avoid the temptation to put in something like the tuples change at the end of a long cycle). I think @StefanKarpinski is on the right track with proposing a short 0.5 cycle focusing on breaking array changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
String{UTF16} would be what you want for a concrete implementation of UTF-16 encoding.
I think doing it that way makes a nice clean separation between what is abstract, and what is concrete.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Work happens as it happens, schedules and plans never work out very well. Short release cycle sounds like a great idea in theory but we've yet to be able to pull it off in practice.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One way to make it happen is to simply say that every 6 months, a new release will be cut, with everything that is currently stable in latest, instead of defining releases based on which features you want to get in.
If a feature isn't really ready by the release date, it doesn't go in, and hopefully will go into the next one, but since the cycles are short, that's not too painful. We went to that approach after having some very long release cycles (1.5 years!), with customers then wanting adhoc releases with features that they absolutely needed. It's not perfect, and the 6 months ended up stabilizing at about 8 months... but it works.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What you're stating there was almost exactly the plan as of 9-10 months ago. Turns out master isn't very stable right now, and cutting a release from its current state without vital features like precompilation doesn't make much sense.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@tkelman That "waiting for vital features" is exactly the problem. Was 0.4 relatively stable before the tuple changes? If so, then *that's* when a new release branch should have been cut (0.4-pre
?). Then the tuple introduction would have gone a lot more smoothly, and 0.5 would have been about new tuples, precompilation, (and Unicode bug fixes? 😀)... following 6 months later, 0.6 would have been about arrays...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Was 0.4 relatively stable before the tuple changes?
Not really. It's had intermittent segfaults and other issues all year.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That sounds like it's time to focus on tracking down bugs, instead of adding the next big thing... not nearly as much fun, but better for everybody in the long run.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd like to apologize for using the term "dick" in reference to @ScottPJones earlier in this thread (and for necromancing such a long-dead thread in the process). I regret losing my temper and using inappropriate language on @quinnj's repository.
The encodings have to match for this to be correct. You probably want a fallback that iterates through the code points and compares them individually.