Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions docs/src/man/sorting.md
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,58 @@ julia> unsort!(ds)
```

## `issorted`/`issorted!`

The `issorted` function checks whether a data set is sorted by the given column(s). The syntax is `issorted(ds, cols)`; by default the `mapformats` keyword argument is set to `true` and the `rev` keyword argument is set to `false`. When multiple columns are selected, `rev` may also be a vector with one value per column. The `issorted!` function performs the same check; however, if it returns `true`, it also marks the input data set as sorted, i.e. it attaches the corresponding meta information to the data set.

### Examples

```jldoctest
julia> ds = Dataset(x1 = [1, 4, 7], x2 = [3.0, 1.1, -10.0], x3 = ["one", "two", "three"])
3×3 Dataset
Row │ x1 x2 x3
│ identity identity identity
│ Int64? Float64? String?
─────┼──────────────────────────────
1 │ 1 3.0 one
2 │ 4 1.1 two
3 │ 7 -10.0 three

julia> issorted(ds, 1)
true

julia> issorted(ds, 2)
false

julia> issorted(ds, 2, rev = true)
true

julia> fmt(x) = x == "one" ? 1 : x=="two" ? 2 : 3
fmt (generic function with 1 method)

julia> setformat!(ds, :x3=>fmt)
3×3 Dataset
Row │ x1 x2 x3
│ identity identity fmt
│ Int64? Float64? String?
─────┼─────────────────────────────
1 │ 1 3.0 1
2 │ 4 1.1 2
3 │ 7 -10.0 3

julia> issorted(ds, 3)
true

julia> issorted!(ds, 1:3, rev = [false, true, false])
true

julia> ds
3×3 Sorted Dataset
Sorted by: x1, x2, x3
Row │ x1 x2 x3
│ identity identity fmt
│ Int64? Float64? String?
─────┼─────────────────────────────
1 │ 1 3.0 1
2 │ 4 1.1 2
3 │ 7 -10.0 3
```
1 change: 1 addition & 0 deletions src/InMemoryDatasets.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ export
groupby,
gatherby,
describe,
issorted!,
unsort!,
ungroup!,
modify,
Expand Down
81 changes: 81 additions & 0 deletions src/sort/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,84 @@ function unsort!(ds::Dataset)
ds
end
end

# Return `true` when `ds` is sorted by the columns selected by `cols`.
#
# `rev` and `mapformats` are forwarded unchanged to `_issorted`; only the first
# element of its result tuple (the sortedness flag) is returned. The integer type
# used for the internal group-start buffer is Int32 when the row count is below
# `typemax(Int32)` and Int64 otherwise.
function Base.issorted(ds::AbstractDataset, cols::MultiColumnIndex; rev = false, mapformats = true)
    inttype = nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64)
    outcome = _issorted(ds, cols, inttype; rev = rev, mapformats = mapformats)
    return first(outcome)
end
# Single-column convenience method: wraps `col` in a one-element vector and
# delegates to the multi-column `issorted` method.
function Base.issorted(ds::AbstractDataset, col::ColumnIndex; rev = false, mapformats = true)
    return issorted(ds, [col]; rev = rev, mapformats = mapformats)
end

# Check whether `ds` is sorted by `cols` and, if it is, record that fact on `ds`.
#
# Returns the Bool produced by `_issorted`. When the result is `false`, `ds` is
# left untouched. When it is `true`, the grouping info of `ds` is reset and the
# index fields `sortedcols`, `rev`, `perm`, `starts`, `ngroups` and `fmt` are
# filled from the values computed by `_issorted` (the permutation is the
# identity `1:nrow(ds)` because the rows are already in order).
function issorted!(ds::Dataset, cols::MultiColumnIndex; rev = false, mapformats = true)
    inttype = nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64)
    sorted, grpstarts, ngrps, selected, revflags, fmtflag =
        _issorted(ds, cols, inttype; rev = rev, mapformats = mapformats)
    # Nothing to record when the data set is not sorted.
    sorted || return sorted

    _reset_grouping_info!(ds)
    idx = index(ds)
    append!(idx.sortedcols, collect(selected))
    append!(idx.rev, revflags)
    append!(idx.perm, collect(1:nrow(ds)))
    append!(idx.starts, grpstarts)
    idx.ngroups[] = ngrps
    idx.fmt[] = fmtflag
    return sorted
end

# Single-column convenience method: wraps `col` in a one-element vector and
# delegates to the multi-column `issorted!` method.
function issorted!(ds::Dataset, col::ColumnIndex; rev = false, mapformats = true)
    return issorted!(ds, [col]; rev = rev, mapformats = mapformats)
end

# Shared implementation behind `issorted` and `issorted!`.
#
# Checks, column by column, whether the rows of `ds` are ordered by the columns
# selected by `cols`, and tracks the group boundaries induced by the columns
# processed so far.
#
# Arguments
# - `ds`: the data set to inspect.
# - `cols`: column selector, resolved through `index(ds)[cols]`.
# - `Val{T}`: integer type `T` used for the `starts` buffer.
# - `rev`: a single value applied to every column, or a vector with one entry per
#   selected column.
# - `mapformats`: when `true`, each column is compared through the format returned
#   by `getformat(parent(ds), col)`; otherwise `identity` is used.
#
# Returns the tuple `(res, starts, lastvalid, colsidx, revs, mapformats)`:
# - `res`: `true` when every selected column is sorted within the groups formed
#   by the preceding columns, `false` as soon as one group is out of order.
# - `starts`: buffer of length `nrow(ds)`; its first `lastvalid` entries are the
#   group start rows written by `_fill_starts_from_inbits!`.
# - `lastvalid`: number of groups found so far (0 for an empty data set).
# - `colsidx`, `revs`, `mapformats`: the resolved columns, per-column `rev`
#   values and the `mapformats` flag, returned for the caller's bookkeeping.
#
# Throws `ArgumentError` when `rev` is a vector whose length differs from the
# number of selected columns.
function _issorted(ds, cols::MultiColumnIndex, ::Val{T}; rev = false, mapformats = true) where T
    colsidx = index(ds)[cols]
    if rev isa AbstractVector
        # Validate user input explicitly; `@assert` may be disabled and is not
        # meant for argument checking.
        length(rev) == length(colsidx) ||
            throw(ArgumentError("length of rev and the number of selected columns must match"))
        revs = rev
    else
        revs = repeat([rev], length(colsidx))
    end
    by = Function[mapformats ? getformat(parent(ds), c) : identity for c in colsidx]

    n = nrow(ds)
    starts = Vector{T}(undef, n)
    # An empty data set is trivially sorted. Return before `starts[1]` and
    # `inbits[1]` are written, which would otherwise throw a BoundsError.
    n == 0 && return true, starts, 0, colsidx, revs, mapformats

    # Before any column is processed, all rows form a single group starting at row 1.
    starts[1] = 1
    lastvalid = 1
    inbits = zeros(Bool, n)
    inbits[1] = true
    for j in 1:length(colsidx)
        v = _columns(ds)[colsidx[j]]
        # The ordering depends only on the column, so build it once per column.
        ord = Base.Order.ord(isless, by[j], revs[j])
        # Column j must be sorted inside every group defined by columns 1:j-1.
        for rng in 1:lastvalid
            lo = starts[rng]
            hi = rng == lastvalid ? n : starts[rng+1] - 1
            _issorted_barrier(v, ord, lo, hi) ||
                return false, starts, lastvalid, colsidx, revs, mapformats
        end
        # Refine the grouping with column j and rebuild the list of group starts.
        _find_starts_of_groups!(v, 1:n, by[j], inbits)
        lastvalid = _fill_starts_from_inbits!(starts, inbits)
        # Every row is its own group: the remaining columns cannot break the order.
        lastvalid == n && return true, starts, lastvalid, colsidx, revs, mapformats
    end
    return true, starts, lastvalid, colsidx, revs, mapformats
end

# Write the positions of the `true` entries of `inbits` into the leading entries
# of `starts`, in increasing order, and return how many positions were written.
# Entries of `starts` beyond the returned count are left untouched.
# `starts` must be long enough to hold one entry per `true` value; bounds checks
# are disabled inside the loop.
function _fill_starts_from_inbits!(starts, inbits)
    ngroups = 0
    @inbounds for row in 1:length(inbits)
        inbits[row] || continue
        ngroups += 1
        starts[ngroups] = row
    end
    ngroups
end

# Return `true` when `v[lo:hi]` is ordered according to `_ord`, i.e. no element is
# strictly less (under `_ord`) than its predecessor. Ranges with fewer than two
# elements (`lo >= hi`) are considered sorted.
# Kept as a separate function so the comparison loop is compiled for the concrete
# types of `v` and `_ord`.
function _issorted_barrier(v, _ord, lo, hi)
    # Walk adjacent pairs (k, k + 1); the range is empty when lo >= hi.
    for k in lo:hi-1
        if Base.Order.lt(_ord, v[k+1], v[k])
            return false
        end
    end
    return true
end
136 changes: 136 additions & 0 deletions test/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,139 @@ end
@test sort(ds, :) == sort(ds[!, 1:2], :)
end
end

# Tests for `issorted` and `issorted!` on data sets, views, formatted columns,
# columns with missing values and PooledArray/CategoricalArray columns.
# NOTE(review): the randomized sections below do not seed the RNG inside this
# block, so failures may not be reproducible unless a seed is set elsewhere.
@testset "issorted/issorted!" begin
# Unordered integer and categorical columns next to an already ordered column.
dv1 = [9, 1, 8, missing, 3, 3, 7, missing]
dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
dv3 = Vector{Union{Int, Missing}}(1:8)
cv1 = CategoricalArray(dv1, ordered=true)

d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1)

@test !issorted(d, :cv1)
@test issorted(d, :dv3)
@test !issorted(d, :dv1)

# Ascending values followed by the missing values are expected to count as sorted.
dv1 = [1,3,3,7,8,9, missing, missing]
dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
dv3 = Vector{Union{Int, Missing}}(1:8)
cv1 = CategoricalArray(dv1, ordered=true)

d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1)
@test issorted(d, :cv1)
@test issorted(d, :dv1)
@test !issorted(d, :dv2)

# Large UInt64 values; row-reordered copies and views, with and without `rev`.
ds = Dataset(x = [0xfffffffffffffff3, 0xfffffffffffffff2, 0xfffffffffffffff4, 0xfffffffffffffff1], y = [1,1,2,2])
@test issorted(ds[[4,2,1,3],:],1)
@test issorted(view(ds, [4,2,1,3], :), 1)
@test issorted(ds[[3,1,2,4],:],1, rev = true)
# With the `isodd` format set on column 1 the checks use the formatted values
# (`mapformats` defaults to `true`).
setformat!(ds, 1=>isodd)
@test issorted(ds[[2,3,1,4],:],1)
@test issorted(view(ds, [2,3,1,4], :), 1)
@test issorted(ds[[1,4,2,3],:],1, rev=true)
# Multi-column checks, including a per-column `rev` vector.
@test issorted(ds[[2,3,1,4], :], 1:2)
@test issorted(view(ds, [2,3,1,4], :), 1:2)
@test issorted(ds[[3,2,4,1], :], 1:2, rev = [false, true])
@test issorted(view(ds, [3,2,4,1], :), 1:2, rev = [false, true])


# Random Int128 column: the output of `sort` must be reported as sorted.
x = rand(Int128, 1000)
y = rand(1:100, 1000)
ds = Dataset(x = x, y = y)
@test issorted(sort(ds, 1),1)
@test issorted(sort(ds, 1, rev = true), 1, rev=true)
setformat!(ds, 1=>isodd)
@test issorted(sort(ds, 1),1)
@test issorted(sort(ds, 1, rev = true), 1, rev = true)

# BigInt column, then the same column after a missing value is inserted.
ds = Dataset(x = big.([1,4,-1,1,100]), x2 = [45,3,98,100,10])
@test !issorted(ds, 1)
@test issorted(ds[[3,1,4,2,5], 1:1], 1)
@test issorted(view(ds, [5,2,1,4,3], [2,1]), 2, rev = true)
@test issorted(ds[[3, 1, 4, 2, 5], :], 1:2)
@test issorted(ds[[3,4,1,2,5],:], 1:2, rev = [false, true])
ds[2,1]=missing
@test !issorted(ds, 1)
@test issorted(ds[[3,1,4,5,2], :], 1)
@test issorted(view(ds, [2,5,1,4,3], :), 1, rev = true)

# Randomized round trip: `issorted` must accept whatever `sort` produces for
# 1 to 10 leading columns, ascending and descending.
# NOTE(review): `setformat!` is called inside the `j` loops on the same `ds`, so
# from the second iteration on the first two checks presumably also run with
# the format already set; confirm whether that is intended.
for i in 1:100
ds = Dataset(rand(1:10, 1000, 10), :auto)
for j in 1:10
@test issorted(sort(ds, 1:j), 1:j)
@test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
setformat!(ds, 1:10=>isodd)
@test issorted(sort(ds, 1:j), 1:j)
@test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
end
# Float columns in which roughly 10% of the values are replaced by `missing`.
ds = Dataset(rand(1:10., 1000, 10), :auto)
map!(ds, x->rand()<.1 ? missing : x, :)
for j in 1:10
@test issorted(sort(ds, 1:j), 1:j)
@test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
setformat!(ds, 1:10=>sign)
@test issorted(sort(ds, 1:j), 1:j)
@test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
end
# Same data shape, with every column converted to a PooledArray.
ds = Dataset(rand(1:10., 1000, 10), :auto)
map!(ds, x->rand()<.1 ? missing : x, :)
for j in 1:10
ds[!, j] = PooledArray(ds[!, j])
end
for j in 1:10
@test issorted(sort(ds, 1:j), 1:j)
@test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
setformat!(ds, 1:10=>sign)
@test issorted(sort(ds, 1:j), 1:j)
@test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
end
end
# `issorted!` after an in-place `sort!`: the recorded sorted columns must match
# the requested columns.
# NOTE(review): the return value of `issorted!` is not asserted in these loops,
# and `sort!` runs immediately before it on the same columns; verify that the
# `_sortedcols` checks would fail if `issorted!` did nothing.
for i in 1:100
ds = Dataset(rand(1:10, 1000, 10), :auto)
for j in 1:10
sort!(ds, 1:j)
issorted!(ds, 1:j)
@test IMD._sortedcols(ds) == 1:j
@test issorted(ds, 1:j)

setformat!(ds, 1:10=>isodd)
sort!(ds, 1:j, rev = true)
issorted!(ds, 1:j, rev = true)
@test IMD._sortedcols(ds) == 1:j
@test issorted(ds, 1:j, rev = true)
end
# Float columns with missing values, selecting every second column (1:2:j).
ds = Dataset(rand(1:10., 1000, 10), :auto)
map!(ds, x->rand()<.1 ? missing : x, :)
for j in 1:10
sort!(ds, 1:2:j)
issorted!(ds, 1:2:j)
@test IMD._sortedcols(ds) == collect(1:2:j)
@test issorted(ds, 1:2:j)

setformat!(ds, 1:10=>sign)
sort!(ds, 1:2:j, rev = true)
issorted!(ds, 1:2:j, rev = true)
@test IMD._sortedcols(ds) == collect(1:2:j)
@test issorted(ds, 1:2:j, rev = true)
end
# Same checks with every column converted to a PooledArray.
ds = Dataset(rand(1:10., 1000, 10), :auto)
map!(ds, x->rand()<.1 ? missing : x, :)
for j in 1:10
ds[!, j] = PooledArray(ds[!, j])
end
for j in 1:10
sort!(ds, 1:2:j)
issorted!(ds, 1:2:j)
@test IMD._sortedcols(ds) == collect(1:2:j)
@test issorted(ds, 1:2:j)

setformat!(ds, 1:10=>sign)
sort!(ds, 1:2:j, rev = true)
issorted!(ds, 1:2:j, rev = true)
@test IMD._sortedcols(ds) == collect(1:2:j)
@test issorted(ds, 1:2:j, rev = true)
end
end

end