From 9ceaa26fe8f934b2e85767e8addd577c9615e5d7 Mon Sep 17 00:00:00 2001 From: sl-solution Date: Fri, 10 Dec 2021 10:45:31 +1300 Subject: [PATCH] define issorted/! --- docs/src/man/sorting.md | 55 ++++++++++++++++ src/InMemoryDatasets.jl | 1 + src/sort/sort.jl | 81 ++++++++++++++++++++++++ test/sort.jl | 136 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 273 insertions(+) diff --git a/docs/src/man/sorting.md b/docs/src/man/sorting.md index c735df2e..ef463979 100644 --- a/docs/src/man/sorting.md +++ b/docs/src/man/sorting.md @@ -308,3 +308,58 @@ julia> unsort!(ds) ``` ## `issorted`/`issorted!` + +The `issorted` function checks if a data set is sorted by given column(s). The syntax for the function is `issorted(ds, cols)`, and by default the `mapformats` keyword argument is set to `true` and the `rev` keyword argument is set to `false`. The `issorted!` function does the same job, however, if it returns `true` it marks the input data set as a sorted data set, i.e. it attaches some meta information to the data set. + +### Examples + +```jldoctest +julia> ds = Dataset(x1 = [1, 4, 7], x2 = [3.0, 1.1, -10.0], x3 = ["one", "two", "three"]) +3×3 Dataset + Row │ x1 x2 x3 + │ identity identity identity + │ Int64? Float64? String? +─────┼────────────────────────────── + 1 │ 1 3.0 one + 2 │ 4 1.1 two + 3 │ 7 -10.0 three + +julia> issorted(ds, 1) +true + +julia> issorted(ds, 2) +false + +julia> issorted(ds, 2, rev = true) +true + +julia> julia> fmt(x) = x == "one" ? 1 : x=="two" ? 2 : 3 +fmt (generic function with 1 method) + +julia> setformat!(ds, :x3=>fmt) +3×3 Dataset + Row │ x1 x2 x3 + │ identity identity fmt + │ Int64? Float64? String? +─────┼───────────────────────────── + 1 │ 1 3.0 1 + 2 │ 4 1.1 2 + 3 │ 7 -10.0 3 + +julia> issorted(ds, 3) +true + +julia> issorted!(ds, 1:3, rev = [false, true, false]) +true + +julia> ds +3×3 Sorted Dataset + Sorted by: x1, x2, x3 + Row │ x1 x2 x3 + │ identity identity fmt + │ Int64? Float64? String? +─────┼───────────────────────────── + 1 │ 1 3.0 1 + 2 │ 4 1.1 2 + 3 │ 7 -10.0 3 +``` diff --git a/src/InMemoryDatasets.jl b/src/InMemoryDatasets.jl index 6fa55d32..2bddd07f 100644 --- a/src/InMemoryDatasets.jl +++ b/src/InMemoryDatasets.jl @@ -70,6 +70,7 @@ export groupby, gatherby, describe, + issorted!, unsort!, ungroup!, modify, diff --git a/src/sort/sort.jl b/src/sort/sort.jl index 3ed045e0..fbb9b091 100644 --- a/src/sort/sort.jl +++ b/src/sort/sort.jl @@ -133,3 +133,84 @@ function unsort!(ds::Dataset) ds end end + +function Base.issorted(ds::AbstractDataset, cols::MultiColumnIndex; rev = false, mapformats = true) + _issorted(ds, cols, nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64), rev = rev, mapformats = mapformats)[1] +end +Base.issorted(ds::AbstractDataset, col::ColumnIndex; rev = false, mapformats = true) = issorted(ds, [col], rev = rev, mapformats = mapformats) + +function issorted!(ds::Dataset, cols::MultiColumnIndex; rev = false, mapformats = true) + res, starts, lastvalid, colsidx, revs, mapformats = _issorted(ds, cols, nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64), rev = rev, mapformats = mapformats) + if res + _reset_grouping_info!(ds) + append!(index(ds).sortedcols, collect(colsidx)) + append!(index(ds).rev, revs) + append!(index(ds).perm, collect(1:nrow(ds))) + append!(index(ds).starts, starts) + index(ds).ngroups[] = lastvalid + index(ds).fmt[] = mapformats + end + res +end + +issorted!(ds::Dataset, col::ColumnIndex; rev = false, mapformats = true) = issorted!(ds, [col], rev = rev, mapformats = mapformats) + +function _issorted(ds, cols::MultiColumnIndex, ::Val{T}; rev = false, mapformats = true) where T + colsidx = index(ds)[cols] + if rev isa AbstractVector + @assert length(rev) == length(colsidx) "length of rev and the number of selected columns must match" + revs = rev + else + revs = repeat([rev], length(colsidx)) + end + by = Function[] + + if mapformats + for j in 1:length(colsidx) + push!(by, getformat(parent(ds), colsidx[j])) + end + else + for j in 1:length(colsidx) + push!(by, identity) + end + end + res = true + starts = Vector{T}(undef, nrow(ds)) + starts[1] = 1 + lastvalid = 1 + inbits = zeros(Bool, nrow(ds)) + inbits[1] = true + for j in 1:length(colsidx) + v = _columns(ds)[colsidx[j]] + for rng in 1:lastvalid + lo = starts[rng] + rng == lastvalid ? hi = nrow(ds) : hi = starts[rng+1] - 1 + part_res = _issorted_barrier(v, Base.Order.ord(isless, by[j], revs[j]), lo, hi) + !part_res && return false, starts, lastvalid, colsidx, revs, mapformats + end + _find_starts_of_groups!(_columns(ds)[colsidx[j]], 1:nrow(ds), by[j], inbits) + lastvalid = _fill_starts_from_inbits!(starts, inbits) + lastvalid == nrow(ds) && return true, starts, lastvalid, colsidx, revs, mapformats + # lastvalid = _fill_starts_v2!(starts, inbits, _columns(ds)[colsidx[j]], lastvalid, Base.Order.ord(isless, by[j], revs[j]), Val(T)) + end + res, starts, lastvalid, colsidx, revs, mapformats +end + +function _fill_starts_from_inbits!(starts, inbits) + lastvalid = 1 + @inbounds for i in 1:length(inbits) + if inbits[i] + starts[lastvalid] = i + lastvalid += 1 + end + end + lastvalid - 1 +end + +function _issorted_barrier(v, _ord, lo, hi) + lo >= hi && return true + for i in lo+1:hi + Base.Order.lt(_ord, v[i], v[i-1]) && return false + end + true +end diff --git a/test/sort.jl b/test/sort.jl index c7ea2cfd..c191b449 100644 --- a/test/sort.jl +++ b/test/sort.jl @@ -371,3 +371,139 @@ end @test sort(ds, :) == sort(ds[!, 1:2], :) end end + +@testset "issorted/issorted!" begin + dv1 = [9, 1, 8, missing, 3, 3, 7, missing] + dv2 = [9, 1, 8, missing, 3, 3, 7, missing] + dv3 = Vector{Union{Int, Missing}}(1:8) + cv1 = CategoricalArray(dv1, ordered=true) + + d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1) + + @test !issorted(d, :cv1) + @test issorted(d, :dv3) + @test !issorted(d, :dv1) + + dv1 = [1,3,3,7,8,9, missing, missing] + dv2 = [9, 1, 8, missing, 3, 3, 7, missing] + dv3 = Vector{Union{Int, Missing}}(1:8) + cv1 = CategoricalArray(dv1, ordered=true) + + d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1) + @test issorted(d, :cv1) + @test issorted(d, :dv1) + @test !issorted(d, :dv2) + + ds = Dataset(x = [0xfffffffffffffff3, 0xfffffffffffffff2, 0xfffffffffffffff4, 0xfffffffffffffff1], y = [1,1,2,2]) + @test issorted(ds[[4,2,1,3],:],1) + @test issorted(view(ds, [4,2,1,3], :), 1) + @test issorted(ds[[3,1,2,4],:],1, rev = true) + setformat!(ds, 1=>isodd) + @test issorted(ds[[2,3,1,4],:],1) + @test issorted(view(ds, [2,3,1,4], :), 1) + @test issorted(ds[[1,4,2,3],:],1, rev=true) + @test issorted(ds[[2,3,1,4], :], 1:2) + @test issorted(view(ds, [2,3,1,4], :), 1:2) + @test issorted(ds[[3,2,4,1], :], 1:2, rev = [false, true]) + @test issorted(view(ds, [3,2,4,1], :), 1:2, rev = [false, true]) + + + x = rand(Int128, 1000) + y = rand(1:100, 1000) + ds = Dataset(x = x, y = y) + @test issorted(sort(ds, 1),1) + @test issorted(sort(ds, 1, rev = true), 1, rev=true) + setformat!(ds, 1=>isodd) + @test issorted(sort(ds, 1),1) + @test issorted(sort(ds, 1, rev = true), 1, rev = true) + + ds = Dataset(x = big.([1,4,-1,1,100]), x2 = [45,3,98,100,10]) + @test !issorted(ds, 1) + @test issorted(ds[[3,1,4,2,5], 1:1], 1) + @test issorted(view(ds, [5,2,1,4,3], [2,1]), 2, rev = true) + @test issorted(ds[[3, 1, 4, 2, 5], :], 1:2) + @test issorted(ds[[3,4,1,2,5],:], 1:2, rev = [false, true]) + ds[2,1]=missing + @test !issorted(ds, 1) + @test issorted(ds[[3,1,4,5,2], :], 1) + @test issorted(view(ds, [2,5,1,4,3], :), 1, rev = true) + + for i in 1:100 + ds = Dataset(rand(1:10, 1000, 10), :auto) + for j in 1:10 + @test issorted(sort(ds, 1:j), 1:j) + @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true) + setformat!(ds, 1:10=>isodd) + @test issorted(sort(ds, 1:j), 1:j) + @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true) + end + ds = Dataset(rand(1:10., 1000, 10), :auto) + map!(ds, x->rand()<.1 ? missing : x, :) + for j in 1:10 + @test issorted(sort(ds, 1:j), 1:j) + @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true) + setformat!(ds, 1:10=>sign) + @test issorted(sort(ds, 1:j), 1:j) + @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true) + end + ds = Dataset(rand(1:10., 1000, 10), :auto) + map!(ds, x->rand()<.1 ? missing : x, :) + for j in 1:10 + ds[!, j] = PooledArray(ds[!, j]) + end + for j in 1:10 + @test issorted(sort(ds, 1:j), 1:j) + @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true) + setformat!(ds, 1:10=>sign) + @test issorted(sort(ds, 1:j), 1:j) + @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true) + end + end + for i in 1:100 + ds = Dataset(rand(1:10, 1000, 10), :auto) + for j in 1:10 + sort!(ds, 1:j) + issorted!(ds, 1:j) + @test IMD._sortedcols(ds) == 1:j + @test issorted(ds, 1:j) + + setformat!(ds, 1:10=>isodd) + sort!(ds, 1:j, rev = true) + issorted!(ds, 1:j, rev = true) + @test IMD._sortedcols(ds) == 1:j + @test issorted(ds, 1:j, rev = true) + end + ds = Dataset(rand(1:10., 1000, 10), :auto) + map!(ds, x->rand()<.1 ? missing : x, :) + for j in 1:10 + sort!(ds, 1:2:j) + issorted!(ds, 1:2:j) + @test IMD._sortedcols(ds) == collect(1:2:j) + @test issorted(ds, 1:2:j) + + setformat!(ds, 1:10=>sign) + sort!(ds, 1:2:j, rev = true) + issorted!(ds, 1:2:j, rev = true) + @test IMD._sortedcols(ds) == collect(1:2:j) + @test issorted(ds, 1:2:j, rev = true) + end + ds = Dataset(rand(1:10., 1000, 10), :auto) + map!(ds, x->rand()<.1 ? missing : x, :) + for j in 1:10 + ds[!, j] = PooledArray(ds[!, j]) + end + for j in 1:10 + sort!(ds, 1:2:j) + issorted!(ds, 1:2:j) + @test IMD._sortedcols(ds) == collect(1:2:j) + @test issorted(ds, 1:2:j) + + setformat!(ds, 1:10=>sign) + sort!(ds, 1:2:j, rev = true) + issorted!(ds, 1:2:j, rev = true) + @test IMD._sortedcols(ds) == collect(1:2:j) + @test issorted(ds, 1:2:j, rev = true) + end + end + +end