# Working with CategoricalArrays

In [None]:
using DataFrames # load package
using CategoricalArrays # CategoricalArrays.jl is independent from DataFrames.jl but it is often used in combination

## Constructor

In [None]:
x = categorical(["A", "B", "B", "C"]) # unordered

In [None]:
y = categorical(["A", "B", "B", "C"], ordered=true) # ordered, by default order is sorting order

In [None]:
z = categorical(["A","B","B","C", missing]) # unordered with missings

In [None]:
c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks

(We will cover grouping later, but let us use it here to analyze the results; we use Chain.jl for chaining.)

In [None]:
using Chain

In [None]:
@chain DataFrame(x=cut(randn(100000), 10)) begin
      groupby(:x)
      combine(nrow) # just to make sure cut works right
end

In [None]:
v = categorical([1,2,2,3,3]) # contains integers not strings

In [None]:
Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector

## Managing levels

In [None]:
arr = [x,y,z,c,v]

In [None]:
isordered.(arr) # check if each categorical array is ordered

In [None]:
ordered!(x, true), isordered(x) # make x ordered

In [None]:
ordered!(x, false), isordered(x) # and unordered again

In [None]:
levels.(arr) # list levels

In [None]:
unique.(arr) # missing will be included

In [None]:
y[1] < y[2] # can compare as y is ordered

In [None]:
v[1] < v[2] # not comparable, v is unordered although it contains integers

In [None]:
y[2] < "A" # comparison against type underlying categorical value is not allowed

In [None]:
y[2] < CategoricalValue("A", y) # you need to explicitly convert a value to a level

In [None]:
y[2] < CategoricalValue("Z", y) # but it is treated as a level, and thus only valid levels are allowed

In [None]:
levels!(y, ["C", "B", "A"]) # you can reorder levels, mostly useful for ordered CategoricalArrays

In [None]:
y[1] < y[2] # observe that the order is changed

In [None]:
levels!(z, ["A", "B"]) # you have to specify all levels that are present

In [None]:
levels!(z, ["A", "B"], allowmissing=true) # unless the array allows missing values and you pass allowmissing=true to force removal of levels

In [None]:
z[1] = "B"
z # now the only non-missing entries of z are "B" (the dropped "C" level became missing)

In [None]:
levels(z) # but it remembers the levels it had (the reason is mostly performance)

In [None]:
droplevels!(z) # this way we can clean it up
levels(z)

## Data manipulation

In [None]:
x, levels(x)

In [None]:
x[2] = "0"
x, levels(x) # new level added at the end (works only for unordered)

In [None]:
v, levels(v)

In [None]:
v[1] + v[2] # even though the underlying data is Int, we cannot operate on it

In [None]:
Vector{Int}(v) # you have either to retrieve the data by conversion (may be expensive)

In [None]:
unwrap(v[1]) + unwrap(v[2]) # or get a single value

In [None]:
unwrap.(v) # this will work for arrays without missings

In [None]:
unwrap.(z) # also works on missing values

In [None]:
Vector{Union{String, Missing}}(z) # or do the conversion

In [None]:
recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; there is also an in-place equivalent, recode!

In [None]:
recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # here we provided a default value for not mapped recodings

In [None]:
recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # to recode Missing you have to do it explicitly

In [None]:
t = categorical([1:5; missing])
t, levels(t)

In [None]:
recode!(t, [1,3]=>2)
t, levels(t) # note that the levels are dropped after recode

In [None]:
t = categorical([1,2,3], ordered=true)
levels(recode(t, 2=>0, 1=>-1)) # and if you introduce new levels they are added at the end in the order of appearance

In [None]:
t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level
levels(recode(t, 300, [1,2]=>100, 3=>200))

## Comparisons

In [None]:
x = categorical([1,2,3])
xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)]
levels!(xs[2], [3,2,1])
levels!(xs[4], [2,3,1])
[a == b for a in xs, b in xs] # all are equal - comparison only by contents

In [None]:
# the full signature of a CategoricalArray: its contents, its levels (in order)
# and whether it is ordered
function signature(x::CategoricalArray)
    contents = x
    lvls = levels(x)
    ord = isordered(x)
    return (contents, lvls, ord)
end
# all are different, notice that xs[1] and xs[2] are unordered but have a different order of levels
[signature(lhs) == signature(rhs) for lhs in xs, rhs in xs]

In [None]:
x[1] < x[2] # you cannot compare elements of unordered CategoricalArray

In [None]:
t[1] < t[2] # but you can do it for an ordered one

In [None]:
isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered

In [None]:
y = deepcopy(x) # but not across categorical arrays
isless(x[1], y[2])

In [None]:
isless(unwrap(x[1]), unwrap(y[2])) # you can use unwrap to make a comparison of the contents of CategoricalArray

In [None]:
x[1] == y[2] # equality tests work OK across CategoricalArrays

## Categorical columns in a DataFrame

In [None]:
df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"])

Convert all `String` columns to categorical in-place

In [None]:
# broadcast the pair (`.=>`) so that `categorical` is applied to each String column
# separately; a plain `=>` would pass all selected columns to a single `categorical` call,
# which only works when exactly one String column is present
transform!(df, names(df, String) .=> categorical, renamecols=false)

In [None]:
describe(df)