sl-solution · sl-solution · Dec 9, 2021 · Dec 9, 2021
diff --git a/docs/src/man/sorting.md b/docs/src/man/sorting.md
@@ -308,3 +308,58 @@ julia> unsort!(ds)
 ```
 
 ## `issorted`/`issorted!`
+
+The `issorted` function checks whether a data set is sorted by the given column(s). The syntax is `issorted(ds, cols)`. By default, the `mapformats` keyword argument is set to `true` and the `rev` keyword argument is set to `false`; `rev` may also be a vector of `true`/`false` values, one per selected column. The `issorted!` function performs the same check; however, when it returns `true` it also marks the input data set as a sorted data set, i.e. it attaches the corresponding meta information to the data set.
+
+### Examples
+
+```jldoctest
+julia> ds = Dataset(x1 = [1, 4, 7], x2 = [3.0, 1.1, -10.0], x3 = ["one", "two", "three"])
+3×3 Dataset
+ Row │ x1        x2        x3       
+     │ identity  identity  identity
+     │ Int64?    Float64?  String?  
+─────┼──────────────────────────────
+   1 │        1       3.0  one
+   2 │        4       1.1  two
+   3 │        7     -10.0  three
+
+julia> issorted(ds, 1)
+true
+
+julia> issorted(ds, 2)
+false
+
+julia> issorted(ds, 2, rev = true)
+true
+
+julia> fmt(x) = x == "one" ? 1 : x=="two" ? 2 : 3
+fmt (generic function with 1 method)
+
+julia> setformat!(ds, :x3=>fmt)
+3×3 Dataset
+ Row │ x1        x2        x3      
+     │ identity  identity  fmt     
+     │ Int64?    Float64?  String?
+─────┼─────────────────────────────
+   1 │        1       3.0  1
+   2 │        4       1.1  2
+   3 │        7     -10.0  3
+
+julia> issorted(ds, 3)
+true
+
+julia> issorted!(ds, 1:3, rev = [false, true, false])
+true
+
+julia> ds
+3×3 Sorted Dataset
+ Sorted by: x1, x2, x3
+ Row │ x1        x2        x3      
+     │ identity  identity  fmt     
+     │ Int64?    Float64?  String?
+─────┼─────────────────────────────
+   1 │        1       3.0  1
+   2 │        4       1.1  2
+   3 │        7     -10.0  3
+```
diff --git a/src/InMemoryDatasets.jl b/src/InMemoryDatasets.jl
@@ -70,6 +70,7 @@ export
       groupby,
       gatherby,
       describe,
+      issorted!,
       unsort!,
       ungroup!,
       modify,

diff --git a/src/sort/sort.jl b/src/sort/sort.jl
@@ -133,3 +133,84 @@ function unsort!(ds::Dataset)
         ds
     end
 end
+
+"""
+    issorted(ds::AbstractDataset, cols; rev = false, mapformats = true)
+
+Return `true` if `ds` is sorted by `cols`, and `false` otherwise.
+
+`rev` and `mapformats` are forwarded unchanged to the internal `_issorted` check; only the
+first element of the tuple it returns is handed back to the caller.
+"""
+function Base.issorted(ds::AbstractDataset, cols::MultiColumnIndex; rev = false, mapformats = true)
+    # Integer type for the group-start buffer, chosen from the number of rows.
+    idxtype = nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64)
+    outcome = _issorted(ds, cols, idxtype, rev = rev, mapformats = mapformats)
+    return first(outcome)
+end
+"""
+    issorted(ds::AbstractDataset, col::ColumnIndex; rev = false, mapformats = true)
+
+Single-column convenience method: wrap `col` in a one-element vector and call `issorted`
+with the same keyword arguments.
+"""
+function Base.issorted(ds::AbstractDataset, col::ColumnIndex; rev = false, mapformats = true)
+    return issorted(ds, [col], rev = rev, mapformats = mapformats)
+end
+
+"""
+    issorted!(ds::Dataset, cols; rev = false, mapformats = true)
+
+Check whether `ds` is sorted by `cols` and return the result of the check.
+
+When the check succeeds, the grouping information of `ds` is reset and its index is filled
+with the sorted columns, the reverse flags, an identity permutation, the group starts, the
+number of groups and the `mapformats` flag. When the check fails, `ds` is not modified.
+"""
+function issorted!(ds::Dataset, cols::MultiColumnIndex; rev = false, mapformats = true)
+    idxtype = nrow(ds) < typemax(Int32) ? Val(Int32) : Val(Int64)
+    ok, grpstarts, ngrp, sortedidx, revflags, fmtflag = _issorted(ds, cols, idxtype, rev = rev, mapformats = mapformats)
+    # Nothing to record unless the data set really is sorted.
+    ok || return ok
+    _reset_grouping_info!(ds)
+    idx = index(ds)
+    append!(idx.sortedcols, collect(sortedidx))
+    append!(idx.rev, revflags)
+    # The rows are already in sorted order, so the permutation is the identity.
+    append!(idx.perm, 1:nrow(ds))
+    append!(idx.starts, grpstarts)
+    idx.ngroups[] = ngrp
+    idx.fmt[] = fmtflag
+    return ok
+end
+
+# Single-column convenience method: wrap `col` in a one-element vector and defer to the
+# multi-column `issorted!` with the same keyword arguments.
+function issorted!(ds::Dataset, col::ColumnIndex; rev = false, mapformats = true)
+    return issorted!(ds, [col], rev = rev, mapformats = mapformats)
+end
+
+# Core of `issorted`/`issorted!`: check, column by column, whether `ds` is sorted by `cols`.
+#
+# `T` is the integer type of the group-start buffer. `rev` is either a single value applied
+# to every selected column or a vector with one entry per selected column. When `mapformats`
+# is `true` each column is compared through the format returned by `getformat`; otherwise
+# `identity` is used.
+#
+# Returns the tuple `(res, starts, lastvalid, colsidx, revs, mapformats)`:
+#   * `res`        - `true` when every examined column is sorted within the groups formed
+#                    by the preceding columns, `false` otherwise
+#   * `starts`     - buffer whose first `lastvalid` entries hold the first row of each group
+#   * `lastvalid`  - number of groups found so far
+#   * `colsidx`    - `cols` resolved through `index(ds)`
+#   * `revs`       - per-column reverse flags
+#   * `mapformats` - the keyword value, passed through
+# On a failed check the function returns immediately; `starts`/`lastvalid` then describe
+# only the grouping by the columns processed before the failing one.
+function _issorted(ds, cols::MultiColumnIndex, ::Val{T}; rev = false, mapformats = true) where T
+    colsidx = index(ds)[cols]
+    ncols = length(colsidx)
+    if rev isa AbstractVector
+        @assert length(rev) == ncols "length of rev and the number of selected columns must match"
+        revs = rev
+    else
+        revs = fill(rev, ncols)
+    end
+    n = nrow(ds)
+    starts = Vector{T}(undef, n)
+    # A data set without rows is trivially sorted. Return before `starts[1]`/`inbits[1]`
+    # are written, which would throw a BoundsError on zero-length buffers.
+    n == 0 && return true, starts, 0, colsidx, revs, mapformats
+    # NOTE(review): formats are fetched from `parent(ds)` using indices resolved through
+    # `index(ds)` - confirm the two agree for views whose columns are reordered.
+    by = Function[mapformats ? getformat(parent(ds), colsidx[j]) : identity for j in 1:ncols]
+    # Before any column is examined, all rows form one group starting at row 1.
+    starts[1] = 1
+    lastvalid = 1
+    inbits = zeros(Bool, n)
+    inbits[1] = true
+    for j in 1:ncols
+        v = _columns(ds)[colsidx[j]]
+        ord = Base.Order.ord(isless, by[j], revs[j])
+        # Column j only has to be sorted inside each group formed by columns 1:j-1.
+        for rng in 1:lastvalid
+            lo = starts[rng]
+            hi = rng == lastvalid ? n : starts[rng+1] - 1
+            _issorted_barrier(v, ord, lo, hi) || return false, starts, lastvalid, colsidx, revs, mapformats
+        end
+        # Refine the grouping with column j and rebuild the list of group starts.
+        _find_starts_of_groups!(v, 1:n, by[j], inbits)
+        lastvalid = _fill_starts_from_inbits!(starts, inbits)
+        # Every row is its own group: the remaining columns cannot break the order.
+        lastvalid == n && return true, starts, lastvalid, colsidx, revs, mapformats
+    end
+    return true, starts, lastvalid, colsidx, revs, mapformats
+end
+
+# Write the positions of the `true` entries of `inbits` into the leading entries of
+# `starts`, in increasing order, and return how many positions were written.
+# Entries of `starts` beyond that count are left as they were.
+function _fill_starts_from_inbits!(starts, inbits)
+    nfound = 0
+    @inbounds for (pos, flag) in enumerate(inbits)
+        flag || continue
+        nfound += 1
+        starts[nfound] = pos
+    end
+    return nfound
+end
+
+# Function barrier for the per-range check: return `true` when `v[lo:hi]` is sorted under
+# the ordering `_ord`, i.e. no element is strictly less (in the sense of `_ord`) than its
+# predecessor. Ranges with fewer than two elements are reported as sorted.
+function _issorted_barrier(v, _ord, lo, hi)
+    # `i` is the left element of each adjacent pair; the range is empty when lo >= hi.
+    for i in lo:(hi - 1)
+        if Base.Order.lt(_ord, v[i + 1], v[i])
+            return false
+        end
+    end
+    return true
+end
diff --git a/test/sort.jl b/test/sort.jl
@@ -371,3 +371,139 @@ end
         @test sort(ds, :) == sort(ds[!, 1:2], :)
     end
 end
+
+# Tests for `issorted` and `issorted!` on data sets, copies and views.
+@testset "issorted/issorted!" begin
+    # Columns containing missing values; only dv3 (1:8) is in ascending order.
+    dv1 = [9, 1, 8, missing, 3, 3, 7, missing]
+    dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
+    dv3 = Vector{Union{Int, Missing}}(1:8)
+    cv1 = CategoricalArray(dv1, ordered=true)
+
+    d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1)
+
+    @test !issorted(d, :cv1)
+    @test issorted(d, :dv3)
+    @test !issorted(d, :dv1)
+
+    # dv1 (and the categorical array built from it) is now ascending with the missing values last.
+    dv1 = [1,3,3,7,8,9, missing, missing]
+    dv2 = [9, 1, 8, missing, 3, 3, 7, missing]
+    dv3 = Vector{Union{Int, Missing}}(1:8)
+    cv1 = CategoricalArray(dv1, ordered=true)
+
+    d = Dataset(dv1=dv1, dv2=dv2, dv3=dv3, cv1=cv1)
+    @test issorted(d, :cv1)
+    @test issorted(d, :dv1)
+    @test !issorted(d, :dv2)
+
+    # Large UInt64 values, checked on row-reordered copies and views, first without and then with a format.
+    ds = Dataset(x = [0xfffffffffffffff3, 0xfffffffffffffff2, 0xfffffffffffffff4, 0xfffffffffffffff1], y = [1,1,2,2])
+    @test issorted(ds[[4,2,1,3],:],1)
+    @test issorted(view(ds, [4,2,1,3], :), 1)
+    @test issorted(ds[[3,1,2,4],:],1, rev = true)
+    setformat!(ds, 1=>isodd)
+    @test issorted(ds[[2,3,1,4],:],1)
+    @test issorted(view(ds, [2,3,1,4], :), 1)
+    @test issorted(ds[[1,4,2,3],:],1, rev=true)
+    @test issorted(ds[[2,3,1,4], :], 1:2)
+    @test issorted(view(ds, [2,3,1,4], :), 1:2)
+    @test issorted(ds[[3,2,4,1], :], 1:2, rev = [false, true])
+    @test issorted(view(ds, [3,2,4,1], :), 1:2, rev = [false, true])
+
+
+    # Random Int128 column: the output of sort must be reported as sorted, also under the isodd format.
+    x = rand(Int128, 1000)
+    y = rand(1:100, 1000)
+    ds = Dataset(x = x, y = y)
+    @test issorted(sort(ds, 1),1)
+    @test issorted(sort(ds, 1, rev = true), 1, rev=true)
+    setformat!(ds, 1=>isodd)
+    @test issorted(sort(ds, 1),1)
+    @test issorted(sort(ds, 1, rev = true), 1, rev = true)
+
+    # BigInt column; a missing value is written into it part-way through.
+    ds = Dataset(x = big.([1,4,-1,1,100]), x2 = [45,3,98,100,10])
+    @test !issorted(ds, 1)
+    @test issorted(ds[[3,1,4,2,5], 1:1], 1)
+    @test issorted(view(ds, [5,2,1,4,3], [2,1]), 2, rev = true)
+    @test issorted(ds[[3, 1, 4, 2, 5], :], 1:2)
+    @test issorted(ds[[3,4,1,2,5],:], 1:2, rev = [false, true])
+    ds[2,1]=missing
+    @test !issorted(ds, 1)
+    @test issorted(ds[[3,1,4,5,2], :], 1)
+    @test issorted(view(ds, [2,5,1,4,3], :), 1, rev = true)
+
+    # Randomised check: the output of sort must always pass issorted, for integer columns,
+    # float columns with missing values, and PooledArray columns.
+    for i in 1:100
+        ds = Dataset(rand(1:10, 1000, 10), :auto)
+        for j in 1:10
+            @test issorted(sort(ds, 1:j), 1:j)
+            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
+            setformat!(ds, 1:10=>isodd)
+            @test issorted(sort(ds, 1:j), 1:j)
+            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
+        end
+        ds = Dataset(rand(1:10., 1000, 10), :auto)
+        map!(ds, x->rand()<.1 ? missing : x, :)
+        for j in 1:10
+            @test issorted(sort(ds, 1:j), 1:j)
+            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
+            setformat!(ds, 1:10=>sign)
+            @test issorted(sort(ds, 1:j), 1:j)
+            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
+        end
+        ds = Dataset(rand(1:10., 1000, 10), :auto)
+        map!(ds, x->rand()<.1 ? missing : x, :)
+        for j in 1:10
+            ds[!, j] = PooledArray(ds[!, j])
+        end
+        for j in 1:10
+            @test issorted(sort(ds, 1:j), 1:j)
+            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
+            setformat!(ds, 1:10=>sign)
+            @test issorted(sort(ds, 1:j), 1:j)
+            @test issorted(sort(ds, 1:j, rev = true), 1:j, rev = true)
+        end
+    end
+    # Randomised check: after sort! followed by issorted!, the recorded sorted columns
+    # must match the requested ones and issorted must agree.
+    for i in 1:100
+        ds = Dataset(rand(1:10, 1000, 10), :auto)
+        for j in 1:10
+            sort!(ds, 1:j)
+            issorted!(ds, 1:j)
+            @test IMD._sortedcols(ds) == 1:j
+            @test issorted(ds, 1:j)
+
+            setformat!(ds, 1:10=>isodd)
+            sort!(ds, 1:j, rev = true)
+            issorted!(ds, 1:j, rev = true)
+            @test IMD._sortedcols(ds) == 1:j
+            @test issorted(ds, 1:j, rev = true)
+        end
+        ds = Dataset(rand(1:10., 1000, 10), :auto)
+        map!(ds, x->rand()<.1 ? missing : x, :)
+        for j in 1:10
+            sort!(ds, 1:2:j)
+            issorted!(ds, 1:2:j)
+            @test IMD._sortedcols(ds) == collect(1:2:j)
+            @test issorted(ds, 1:2:j)
+
+            setformat!(ds, 1:10=>sign)
+            sort!(ds, 1:2:j, rev = true)
+            issorted!(ds, 1:2:j, rev = true)
+            @test IMD._sortedcols(ds) == collect(1:2:j)
+            @test issorted(ds, 1:2:j, rev = true)
+        end
+        ds = Dataset(rand(1:10., 1000, 10), :auto)
+        map!(ds, x->rand()<.1 ? missing : x, :)
+        for j in 1:10
+            ds[!, j] = PooledArray(ds[!, j])
+        end
+        for j in 1:10
+            sort!(ds, 1:2:j)
+            issorted!(ds, 1:2:j)
+            @test IMD._sortedcols(ds) == collect(1:2:j)
+            @test issorted(ds, 1:2:j)
+
+            setformat!(ds, 1:10=>sign)
+            sort!(ds, 1:2:j, rev = true)
+            issorted!(ds, 1:2:j, rev = true)
+            @test IMD._sortedcols(ds) == collect(1:2:j)
+            @test issorted(ds, 1:2:j, rev = true)
+        end
+    end
+
+end