/
dataframe__frame.R
2490 lines (2305 loc) · 75.5 KB
/
dataframe__frame.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' Inner workings of the DataFrame-class
#'
#' @name DataFrame_class
#' @aliases RPolarsDataFrame
#' @description The `DataFrame`-class is simply two environments of respectively
#' the public and private methods/function calls to the polars Rust side. The
#' instantiated `DataFrame`-object is an `externalptr` to a low-level Rust
#' polars DataFrame object.
#'
#' The S3 method `.DollarNames.RPolarsDataFrame` exposes all public
#' `$foobar()`-methods which are callable onto the object. Most methods return
#' another `DataFrame`- class instance or similar which allows for method
#' chaining. This class system could be called "environment classes" (in lack
#' of a better name) and is the same class system `extendr` provides, except
#' here there are both a public and private set of methods. For implementation
#' reasons, the private methods are external and must be called from
#' `.pr$DataFrame$methodname()`. Also, all private methods must take any
#' `self` as an argument, thus they are pure functions. Having the private
#' methods as pure functions solved/simplified self-referential complications.
#'
#' @section Active bindings:
#'
#' ## columns
#'
#' `$columns` returns a character vector with the column names.
#'
#' ## dtypes
#'
#' `$dtypes` returns an unnamed list with the [data type][pl_dtypes] of each column.
#'
#' ## flags
#'
#' `$flags` returns a nested list with column names at the top level and
#' column flags in each sublist.
#'
#' Flags are used internally to avoid doing unnecessary computations, such as
#' sorting a variable that we know is already sorted. The number of flags
#' varies depending on the column type: columns of type `array` and `list`
#' have the flags `SORTED_ASC`, `SORTED_DESC`, and `FAST_EXPLODE`, while other
#' column types only have the former two.
#'
#' - `SORTED_ASC` is set to `TRUE` when we sort a column in increasing order, so
#' that we can use this information later on to avoid re-sorting it.
#' - `SORTED_DESC` is similar but applies to sort in decreasing order.
#'
#' ## height
#'
#' `$height` returns the number of rows in the DataFrame.
#'
#' ## schema
#'
#' `$schema` returns a named list with the [data type][pl_dtypes] of each column.
#'
#' ## shape
#'
#' `$shape` returns a numeric vector of length two with the number of rows and
#' the number of columns.
#'
#' ## width
#'
#' `$width` returns the number of columns in the DataFrame.
#'
#' @section Conversion to R data types considerations:
#' When converting Polars objects, such as [DataFrames][DataFrame_class]
#' to R objects, for example via the [`as.data.frame()`][as.data.frame.RPolarsDataFrame] generic function,
#' each type in the Polars object is converted to an R type.
#' In some cases, an error may occur because the conversion is not appropriate.
#' In particular, there is a high possibility of an error when converting
#' a [Datetime][DataType_Datetime] type without a time zone.
#' A [Datetime][DataType_Datetime] type without a time zone in Polars is converted
#' to the [POSIXct] type in R, which takes into account the time zone in which
#' the R session is running (which can be checked with the [Sys.timezone()]
#' function). In this case, if ambiguous times are included, a conversion error
#' will occur. In such cases, change the session time zone using
#' [`Sys.setenv(TZ = "UTC")`][base::Sys.setenv] and then perform the conversion, or use the
#' [`$dt$replace_time_zone()`][ExprDT_replace_time_zone] method on the Datetime type column to
#' explicitly specify the time zone before conversion.
#'
#' ```{r}
#' # Due to daylight savings, clocks were turned forward 1 hour on Sunday, March 8, 2020, 2:00:00 am
#' # so this particular date-time doesn't exist
#' non_existent_time = as_polars_series("2020-03-08 02:00:00")$str$strptime(pl$Datetime(), "%F %T")
#'
#' withr::with_envvar(
#' new = c(TZ = "America/New_York"),
#' {
#' tryCatch(
#' # This causes an error due to the time zone (the `TZ` env var is affected).
#' as.vector(non_existent_time),
#' error = function(e) e
#' )
#' }
#' )
#'
#' withr::with_envvar(
#' new = c(TZ = "America/New_York"),
#' {
#' # This is safe.
#' as.vector(non_existent_time$dt$replace_time_zone("UTC"))
#' }
#' )
#' ```
#' @details Check out the source code in
#' [R/dataframe__frame.R](https://github.com/pola-rs/r-polars/blob/main/R/dataframe__frame.R)
#' to see how public methods are derived from private methods. Check out
#' [extendr-wrappers.R](https://github.com/pola-rs/r-polars/blob/main/R/extendr-wrappers.R)
#' to see the `extendr`-auto-generated methods. These are moved to `.pr` and
#' converted into pure external functions in
#' [after-wrappers.R](https://github.com/pola-rs/r-polars/blob/main/R/after-wrappers.R).
#' In [zzz.R](https://github.com/pola-rs/r-polars/blob/main/R/zzz.R) (named
#' `zzz` to be last file sourced) the `extendr`-methods are removed and
#' replaced by any function prefixed `DataFrame_`.
#'
#' @keywords DataFrame
#'
#' @examples
#' # see all public exported method names (normally accessed via a class
#' # instance with $)
#' ls(.pr$env$RPolarsDataFrame)
#'
#' # see all private methods (not intended for regular use)
#' ls(.pr$DataFrame)
#'
#' # make an object
#' df = as_polars_df(iris)
#'
#' # call an active binding
#' df$shape
#'
#' # use a private method, which has mutability
#' result = .pr$DataFrame$set_column_from_robj(df, 150:1, "some_ints")
#'
#' # Column exists in both dataframes-objects now, as they are just pointers to
#' # the same object
#' # There are no public methods with mutability.
#' df2 = df
#'
#' df$columns
#' df2$columns
#'
#' # Show flags
#' df$sort("Sepal.Length")$flags
#'
#' # set_column_from_robj-method is fallible and returned a result which could
#' # be "ok" or an error.
#' # No public method or function will ever return a result.
#' # The `result` is very close to the same as output from functions decorated
#' # with purrr::safely.
#' # To use results on the R side, these must be unwrapped first such that
#' # potentially errors can be thrown. `unwrap(result)` is a way to communicate
#' # errors happening on the Rust side to the R side. `Extendr` default behavior
#' # is to use `panic!`(s) which would cause some unnecessarily confusing and
#' # some very verbose error messages on the inner workings of rust.
#' # `unwrap(result)` raises no error in this case and just returns NULL, because
#' # this mutable method does not return any ok-value.
#'
#' # Try unwrapping an error from polars due to unmatching column lengths
#' err_result = .pr$DataFrame$set_column_from_robj(df, 1:10000, "wrong_length")
#' tryCatch(unwrap(err_result, call = NULL), error = \(e) cat(as.character(e)))
NULL
## Active bindings
DataFrame_columns = method_as_active_binding(
  # Getter: column names as reported by the private `columns` method.
  function() .pr$DataFrame$columns(self),
  # Flag the binding as assignable (`df$columns = ...`).
  setter = TRUE
)
DataFrame_dtypes = method_as_active_binding(
  # Read-only binding forwarding to the private `dtypes` method.
  function() .pr$DataFrame$dtypes(self)
)
DataFrame_flags = method_as_active_binding(
  function() {
    # Build a list with one entry per column, keyed by column name. Each entry
    # is the `$flags` value of the single-column subset `self[, <name>]`.
    col_names = self$columns
    flags_of = function(col_name) self[, col_name]$flags
    per_column = lapply(col_names, flags_of)
    names(per_column) = col_names
    per_column
  }
)
DataFrame_height = method_as_active_binding(
  # First element of the shape vector, i.e. the number of rows.
  function() .pr$DataFrame$shape(self)[1L]
)
DataFrame_schema = method_as_active_binding(
  # Read-only binding forwarding to the private `schema` method.
  function() .pr$DataFrame$schema(self)
)
DataFrame_shape = method_as_active_binding(
  # Read-only binding forwarding to the private `shape` method
  # (rows first, columns second).
  function() .pr$DataFrame$shape(self)
)
DataFrame_width = method_as_active_binding(
  # Second element of the shape vector, i.e. the number of columns.
  function() .pr$DataFrame$shape(self)[2L]
)
#' @title auto complete $-access into a polars object
#' @description called by the interactive R session internally
#' @param x DataFrame
#' @param pattern code-stump as string to auto-complete
#' @return char vec
#' @export
#' @return Doesn't return a value. This is used for autocompletion in RStudio.
#' @noRd
.DollarNames.RPolarsDataFrame = function(x, pattern = "") {
  # `x` only drives S3 dispatch; the candidates are looked up on the class
  # environment, filtered by the partially typed `pattern`.
  class_env = RPolarsDataFrame
  get_method_usages(class_env, pattern = pattern)
}
#' @title auto complete $-access into a polars object
#' @description called by the interactive R session internally
#' @param x RPolarsVecDataFrame
#' @param pattern code-stump as string to auto-complete
#' @return char vec
#' @export
#' @inherit .DollarNames.RPolarsDataFrame return
#' @noRd
.DollarNames.RPolarsVecDataFrame = function(x, pattern = "") {
  # `x` only drives S3 dispatch; the candidates are looked up on the class
  # environment, filtered by the partially typed `pattern`.
  class_env = RPolarsVecDataFrame
  get_method_usages(class_env, pattern = pattern)
}
#' Create a new polars DataFrame
#'
#' @param ... One of the following:
#' - a list of mixed vectors and Series of equal length
#' - mixed vectors and/or Series of equal length
#' - a positional argument of a [data.frame] or a [DataFrame][DataFrame_class]
#' (not recommended use). In this case, the object will be passed to [as_polars_df()].
#'
#' Columns will be named as of named arguments or alternatively by names of
#' Series or given a placeholder name.
#'
#' @param make_names_unique If `TRUE` (default), any duplicated names will be
#' suffixed with a running number (e.g. `a`, `a_1`). If `FALSE`, duplicated
#' names raise an error.
#' @param schema A named list that will be used to convert a variable to a
#' specific DataType. See Examples.
#' @seealso
#' - [as_polars_df()]
#' @return [DataFrame][DataFrame_class]
#'
#' @examples
#' pl$DataFrame(
#' a = list(c(1, 2, 3, 4, 5)), # NB if first column should be a list, wrap it in a Series
#' b = 1:5,
#' c = letters[1:5],
#' d = list(1:1, 1:2, 1:3, 1:4, 1:5)
#' ) # directly from vectors
#'
#' # from a list of vectors
#' pl$DataFrame(list(
#' a = c(1, 2, 3, 4, 5),
#' b = 1:5,
#' c = letters[1:5],
#' d = list(1L, 1:2, 1:3, 1:4, 1:5)
#' ))
#'
#' # from a data.frame
#' pl$DataFrame(mtcars)
#'
#' # custom schema
#' pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$String))
pl_DataFrame = function(..., make_names_unique = TRUE, schema = NULL) {
  # Every failure below is reported with the same call-site context.
  uw = \(res) unwrap(res, "in $DataFrame():")

  # data.frame / DataFrame inputs are listed in `skip_classes` so that they can
  # be recognised below and handed over to `as_polars_df()` as a whole.
  skip_classes = c("data.frame", "RPolarsDataFrame")
  largs = unpack_list(..., skip_classes = skip_classes) |>
    result() |>
    uw()

  # pass to `as_polars_df()`
  if (length(largs) == 1L && is.null(names(largs)) &&
    (inherits(largs[[1]], skip_classes))) {
    # TODO: schema v.s. schema_overrides <https://github.com/pola-rs/r-polars/issues/897>
    out = as_polars_df(largs[[1]], make_names_unique = make_names_unique, schema_overrides = schema) |>
      result() |>
      uw()
    return(out)
  }

  # no args create empty DataFrame
  if (length(largs) == 0L) {
    if (is.null(schema)) {
      return(.pr$DataFrame$default())
    }
    # One zero-row column per schema entry, cast to the requested dtype.
    out = lapply(seq_along(schema), \(x) {
      pl$lit(numeric(0))$cast(schema[[x]])$alias(names(schema)[x])
    }) |>
      pl$select()
    return(out)
  }

  # keys are tentative new column names: the argument name when given,
  # otherwise the Series' own name, otherwise a placeholder.
  keys = names(largs)
  if (length(keys) == 0) keys = rep(NA_character_, length(largs))
  keys = vapply(seq_along(largs), \(i) {
    key = keys[[i]]
    if (is.na(key) || nchar(key) == 0) {
      column = largs[[i]]
      key = if (inherits(column, "RPolarsSeries")) column$name else "new_column"
    }
    key
  }, character(1))

  # check for conflicting names, to avoid silent overwrite
  keys = result({
    if (anyDuplicated(keys) > 0) {
      if (make_names_unique) {
        keys = make.unique(keys, sep = "_")
      } else {
        stop(
          paste(
            "conflicting column names not allowed:",
            paste(unique(keys[duplicated(keys)]), collapse = ", ")
          )
        )
      }
    }
    keys
  }) |>
    uw()

  # Validate `schema` against the *final* column names. Checking the raw
  # argument names instead would wrongly reject a schema entry that refers to
  # a column named after an unnamed Series.
  if (!is.null(schema) && !all(names(schema) %in% keys)) {
    Err_plain("Some columns in `schema` are not in the DataFrame.") |>
      uw()
  }

  ## pass each arg to pl$lit and all args to pl$select
  names(largs) = keys
  result({
    lapply(seq_along(largs), \(x) {
      varname = keys[x]
      out = pl$lit(largs[[x]])
      # Columns mentioned in `schema` are strictly cast to the requested dtype.
      if (!is.null(schema) && varname %in% names(schema)) {
        out = out$cast(schema[[varname]], strict = TRUE)
      }
      out$alias(varname)
    }) |>
      do.call(what = pl$select)
  }) |>
    uw()
}
#' S3 method to print a DataFrame
#'
#' @noRd
#' @param x DataFrame
#' @param ... not used
#'
#' @return self
#' @export
#'
#' @examples pl$DataFrame(iris)
print.RPolarsDataFrame = function(x, ...) {
  # Rendering is delegated to the object's own `$print()` method; `...` is
  # accepted for S3 compatibility and not used.
  x$print()
  # Hand `x` back invisibly so the value is not echoed a second time.
  invisible(x)
}
#' internal method print DataFrame
#' @noRd
#' @return self
#'
#' @examples pl$DataFrame(iris)
DataFrame_print = function() {
  # Printing is delegated to the private Rust-side `print` method.
  .pr$DataFrame$print(self)
  # Return the DataFrame itself, invisibly, to allow further chaining.
  invisible(self)
}
## Internal bookkeeping of methods which should behave as properties: setter
## functions are stored here by property name and looked up by
## `$<-.RPolarsDataFrame`. The environment has no parent (`emptyenv()`), so
## lookups never fall through to other scopes.
DataFrame.property_setters = new.env(parent = emptyenv())
#' Generic setter method
#' @noRd
#' @param self DataFrame
#' @param name Name of the method/property to set. A trailing `"<-"` (as
#' produced by some code-completion tools) is stripped before lookup.
#' @param value Value to insert
#'
#' @description Set the value of a settable property of a DataFrame.
#'
#' @return `self`, or a clone of it when the `strictly_immutable` option is set.
#' @keywords DataFrame
#' @details Settable polars object properties may appear to be R objects, but
#' they are not. See the `[[method_name]]` example.
#'
#' @export
#' @examples
#' # For internal use
#' # show what methods of DataFrame have active property setters
#' with(.pr$env, ls(DataFrame.property_setters))
#'
#' # specific use case for one object property 'columns' (names)
#' df = pl$DataFrame(iris)
#'
#' # get values
#' df$columns
#'
#' # set + get values
#' df$columns = letters[1:5] #<- is fine too
#' df$columns
#'
#' # Rstudio is not using the standard R code completion tool
#' # and it will backtick any special characters. It is possible
#' # to completely customize the R / Rstudio code completion except
#' # it will trigger Rstudio to backtick any completion! Also R does
#' # not support package isolated customization.
#'
#'
#' # Concrete example if tabbing on 'df$' the raw R suggestion is df$columns<-
#' # however Rstudio backticks it into df$`columns<-`
#' # to make life simple, this is valid polars syntax also, and can be used in fast scripting
#' df$`columns<-` = letters[5:1]
#'
#' # for stable code prefer e.g. df$columns = letters[5:1]
#'
#' # to verify inside code of a property, use the [[]] syntax instead.
#' df[["columns"]] # to see property code, .pr is the internal polars api into rust polars
#' DataFrame.property_setters$columns # and even more obscure to see setter code
"$<-.RPolarsDataFrame" = function(self, name, value) {
  # Accept both `columns` and the completion-generated `columns<-` spelling.
  name = sub("<-$", "", name)

  # Only properties carrying the "setter" class may be assigned to.
  is_settable = inherits(self[[name]], "setter")
  if (!is_settable) {
    pstop(err = paste("no setter method for", name))
  }

  # With `strictly_immutable` on, apply the setter to a clone so the object
  # passed in is left untouched.
  if (polars_options()$strictly_immutable) {
    self = self$clone()
  }

  # Look up the registered setter and apply it. Its return value is discarded;
  # the (possibly cloned) DataFrame is what gets returned.
  setter_fn = DataFrame.property_setters[[name]]
  setter_fn(self, value)
  self
}
#' @title Add a column for row indices
#' @description Add a new column at index 0 that counts the rows
#' @keywords DataFrame
#' @param name string name of the created column
#' @param offset positive integer offset for the start of the counter
#' @return A new `DataFrame` object with a counter column in front
#' @docType NULL
#' @examples
#' df = pl$DataFrame(mtcars)
#'
#' # by default, the index starts at 0 (to mimic the behavior of Python Polars)
#' df$with_row_index("idx")
#'
#' # but in R, we use a 1-index
#' df$with_row_index("idx", offset = 1)
DataFrame_with_row_index = function(name, offset = NULL) {
  # The private method returns a result object; unwrap it so that a failure
  # surfaces as an R error tagged with this method's name.
  res = .pr$DataFrame$with_row_index(self, name, offset)
  unwrap(res, "in $with_row_index():")
}
# Setter backing `df$columns = <names>`: forwards the new names to the private
# `set_column_names_mut` method and raises any error it reports.
DataFrame.property_setters$columns = function(self, names) {
  rename_result = .pr$DataFrame$set_column_names_mut(self, names)
  unwrap(rename_result)
}
#' Drop columns of a DataFrame
#'
#' @param ... Characters of column names to drop. Passed to [`pl$col()`][pl_col].
#'
#' @return DataFrame
#' @examples
#' pl$DataFrame(mtcars)$drop(c("mpg", "hp"))
#'
#' # equivalent
#' pl$DataFrame(mtcars)$drop("mpg", "hp")
DataFrame_drop = function(...) {
  # Run the drop through the lazy API, then materialise the result.
  lazy_frame = self$lazy()
  without_cols = lazy_frame$drop(...)
  without_cols$collect()
}
#' @title Drop nulls (missing values)
#' @description Drop all rows that contain nulls (which correspond to `NA` in R).
#' @keywords DataFrame
#' @param subset A character vector with the names of the column(s) for which
#' nulls are considered. If `NULL` (default), use all columns.
#'
#' @return DataFrame
#' @examples
#' tmp = mtcars
#' tmp[1:3, "mpg"] = NA
#' tmp[4, "hp"] = NA
#' tmp = pl$DataFrame(tmp)
#'
#' # number of rows in `tmp` before dropping nulls
#' tmp$height
#'
#' tmp$drop_nulls()$height
#' tmp$drop_nulls("mpg")$height
#' tmp$drop_nulls(c("mpg", "hp"))$height
DataFrame_drop_nulls = function(subset = NULL) {
  # Run the operation through the lazy API, then materialise the result.
  lazy_frame = self$lazy()
  without_nulls = lazy_frame$drop_nulls(subset)
  without_nulls$collect()
}
#' Drop duplicated rows
#'
#' @param subset A character vector with the names of the column(s) to use to
#' identify duplicates. If `NULL` (default), use all columns.
#' @param ... Not used.
#' @param keep Which of the duplicate rows to keep:
#' * `"any"` (default): Does not give any guarantee of which row is kept. This
#' allows more optimizations.
#' * `"first"`: Keep first unique row.
#' * `"last"`: Keep last unique row.
#' * `"none"`: Don’t keep duplicate rows.
#' @param maintain_order Keep the same order as the original data. Setting this
#' to `TRUE` makes it more expensive to compute and blocks the possibility to
#' run on the streaming engine.
#'
#' @return DataFrame
#' @examples
#' df = pl$DataFrame(
#' x = c(1:3, 1:3, 3:1, 1L),
#' y = c(1:3, 1:3, 1:3, 1L)
#' )
#' df$height
#'
#' df$unique()$height
#'
#' # subset to define unique, keep only last or first
#' df$unique(subset = "x", keep = "last")
#' df$unique(subset = "x", keep = "first")
#'
#' # only keep unique rows
#' df$unique(keep = "none")
DataFrame_unique = function(
    subset = NULL,
    ...,
    keep = "any",
    maintain_order = FALSE) {
  # `...` is accepted but not forwarded. The de-duplication itself is done by
  # the lazy `$unique()` method.
  deduplicated = self$lazy()$unique(
    subset = subset,
    keep = keep,
    maintain_order = maintain_order
  )
  # Collect through the private API so the error can be tagged with this
  # method's name.
  collected = .pr$LazyFrame$collect(deduplicated)
  unwrap(collected, "in $unique():")
}
#' Data types information
#' @name DataFrame_dtype_strings
#' @description Get the data type of all columns as strings. You can see all
#' available types with `names(pl$dtypes)`. The data type of each column is also
#' shown when printing the DataFrame.
#'
#' @docType NULL
#' @format NULL
#' @return A character vector with the data type of each column
#' @keywords DataFrame
#' @examples
#' pl$DataFrame(iris)$dtype_strings()
# NOTE(review): `use_extendr_wrapper` is defined elsewhere; presumably it marks
# this method as "keep the extendr-generated implementation" — confirm in
# after-wrappers.R / zzz.R.
DataFrame_dtype_strings = use_extendr_wrapper
#' Convert an existing DataFrame to a LazyFrame
#' @name DataFrame_lazy
#' @description Start a new lazy query from a DataFrame.
#'
#' @docType NULL
#' @format NULL
#' @return A LazyFrame
#' @aliases lazy
#' @keywords DataFrame LazyFrame_new
#' @examples
#' pl$DataFrame(iris)$lazy()
# NOTE(review): `use_extendr_wrapper` is defined elsewhere; presumably it marks
# this method as "keep the extendr-generated implementation" — confirm in
# after-wrappers.R / zzz.R.
DataFrame_lazy = use_extendr_wrapper
#' Clone a DataFrame
#'
#' This makes a very cheap deep copy/clone of an existing
#' [`DataFrame`][DataFrame_class]. Rarely useful as `DataFrame`s are nearly 100%
#' immutable. Any modification of a `DataFrame` should lead to a clone anyways,
#' but this can be useful when dealing with attributes (see examples).
#'
#' @return A DataFrame
#' @examples
#' df1 = pl$DataFrame(iris)
#'
#' # Make a function to take a DataFrame, add an attribute, and return a DataFrame
#' give_attr = function(data) {
#' attr(data, "created_on") = "2024-01-29"
#' data
#' }
#' df2 = give_attr(df1)
#'
#' # Problem: the original DataFrame also gets the attribute while it shouldn't!
#' attributes(df1)
#'
#' # Use $clone() inside the function to avoid that
#' give_attr = function(data) {
#' data = data$clone()
#' attr(data, "created_on") = "2024-01-29"
#' data
#' }
#' df1 = pl$DataFrame(iris)
#' df2 = give_attr(df1)
#'
#' # now, the original DataFrame doesn't get this attribute
#' attributes(df1)
DataFrame_clone = function() {
  # Delegates to the private `clone_in_rust` method and returns its value
  # as-is (no `unwrap()` is applied).
  .pr$DataFrame$clone_in_rust(self)
}
#' Get the DataFrame as a List of Series
#'
#' @return A list of [Series][Series_class]
#' @seealso
#' - [`<DataFrame>$to_list()`][DataFrame_to_list]:
#' Similar to this method but returns a list of vectors instead of [Series][Series_class].
#' @examples
#' df = pl$DataFrame(foo = 1L:3L, bar = 4L:6L)
#' df$get_columns()
#'
#' df = pl$DataFrame(
#' a = 1:4,
#' b = c(0.5, 4, 10, 13),
#' c = c(TRUE, TRUE, FALSE, TRUE)
#' )
#' df$get_columns()
# NOTE(review): `use_extendr_wrapper` is defined elsewhere; presumably it marks
# this method as "keep the extendr-generated implementation" — confirm in
# after-wrappers.R / zzz.R.
DataFrame_get_columns = use_extendr_wrapper
#' Get column (as one Series)
#' @name DataFrame_get_column
#' @description Extract a DataFrame column as a Polars series.
#'
#' @param name Name of the column to extract.
#'
#' @return Series
#' @aliases DataFrame_get_column
#' @keywords DataFrame
#' @examples
#' df = pl$DataFrame(iris[1:2, ])
#' df$get_column("Species")
DataFrame_get_column = function(name) {
  # Look the column up by name; a failed lookup becomes an R error tagged with
  # this method's name.
  .pr$DataFrame$get_column(self, name) |>
    unwrap("in $get_column():")
}
#' Get column by index
#'
#' @name DataFrame_to_series
#' @description Extract a DataFrame column (by index) as a Polars series. Unlike
#' `get_column()`, this method will not fail but will return a `NULL` if the
#' index doesn't exist in the DataFrame. Keep in mind that Polars is 0-indexed
#' so "0" is the first column.
#'
#' @param idx Index of the column to return as Series. Defaults to 0, which is
#' the first column.
#'
#' @return Series or NULL
#' @keywords DataFrame
#' @examples
#' df = pl$DataFrame(iris[1:10, ])
#'
#' # default is to extract the first column
#' df$to_series()
#'
#' # Polars is 0-indexed, so we use idx = 1 to extract the *2nd* column
#' df$to_series(idx = 1)
#'
#' # doesn't error if the column isn't there
#' df$to_series(idx = 8)
DataFrame_to_series = function(idx = 0) {
  # Reject non-numeric input and negative (scalar) indices up front.
  idx_is_valid = is.numeric(idx) && !isTRUE(idx < 0)
  if (!idx_is_valid) {
    pstop(err = "idx must be non-negative numeric")
  }
  # Read the `ok` field directly instead of unwrapping the result: when the
  # lookup did not succeed there is no ok-value and `NULL` is returned rather
  # than an error being raised.
  lookup = .pr$DataFrame$select_at_idx(self, idx)
  lookup$ok
}
#' Sort a DataFrame
#' @inherit LazyFrame_sort details description params
#' @inheritParams DataFrame_unique
#' @return DataFrame
#' @keywords DataFrame
#' @examples
#' df = mtcars
#' df$mpg[1] = NA
#' df = pl$DataFrame(df)
#' df$sort("mpg")
#' df$sort("mpg", nulls_last = TRUE)
#' df$sort("cyl", "mpg")
#' df$sort(c("cyl", "mpg"))
#' df$sort(c("cyl", "mpg"), descending = TRUE)
#' df$sort(c("cyl", "mpg"), descending = c(TRUE, FALSE))
#' df$sort(pl$col("cyl"), pl$col("mpg"))
DataFrame_sort = function(
    by,
    ...,
    descending = FALSE,
    nulls_last = FALSE,
    maintain_order = FALSE) {
  # Sorting is implemented by the lazy API; all arguments are forwarded
  # unchanged and the result is materialised afterwards.
  sorted_lazy = self$lazy()$sort(
    by, ...,
    descending = descending,
    nulls_last = nulls_last,
    maintain_order = maintain_order
  )
  sorted_lazy$collect()
}
#' Select and modify columns of a DataFrame
#' @name DataFrame_select
#' @description Similar to `dplyr::mutate()`. However, it discards unmentioned
#' columns (like `.()` in `data.table`).
#'
#' @param ... Columns to keep. Those can be expressions (e.g `pl$col("a")`),
#' column names (e.g `"a"`), or list containing expressions or column names
#' (e.g `list(pl$col("a"))`).
#'
#' @aliases select
#' @return DataFrame
#' @keywords DataFrame
#' @examples
#' pl$DataFrame(iris)$select(
#' pl$col("Sepal.Length")$abs()$alias("abs_SL"),
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' )
DataFrame_select = function(...) {
  # Flatten the arguments into one list of expressions / column names.
  exprs = unpack_list(..., .context = "in $select()")
  # Run the selection and surface any failure as an R error.
  res = .pr$DataFrame$select(self, exprs)
  unwrap(res, "in $select()")
}
#' @inherit DataFrame_select title params return
#'
#' @description
#' Similar to `dplyr::mutate()`. However, it discards unmentioned columns (like
#' `.()` in `data.table`).
#'
#' This will run all expressions sequentially instead of in parallel. Use this
#' when the work per expression is cheap. Otherwise, `$select()` should be
#' preferred.
#'
#' @examples
#' pl$DataFrame(iris)$select_seq(
#' pl$col("Sepal.Length")$abs()$alias("abs_SL"),
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' )
DataFrame_select_seq = function(...) {
  # Flatten the arguments into one list of expressions / column names.
  exprs = unpack_list(..., .context = "in $select_seq()")
  # Run the sequential selection and surface any failure as an R error.
  res = .pr$DataFrame$select_seq(self, exprs)
  unwrap(res, "in $select_seq()")
}
#' Drop in place
#' @name DataFrame_drop_in_place
#' @description Drop a single column in-place and return the dropped column.
#'
#' @param name string Name of the column to drop.
#' @return Series
#' @keywords DataFrame
#' @examples
#' dat = pl$DataFrame(iris)
#' x = dat$drop_in_place("Species")
#' x
#' dat$columns
DataFrame_drop_in_place = function(name) {
  # Delegates to the private `drop_in_place` method and returns its value
  # as-is (no `unwrap()` is applied here).
  .pr$DataFrame$drop_in_place(self, name)
}
#' Compare two DataFrames
#' @name DataFrame_equals
#' @description Check if two DataFrames are equal.
#'
#' @param other DataFrame to compare with.
#' @return A logical value
#' @keywords DataFrame
#' @examples
#' dat1 = pl$DataFrame(iris)
#' dat2 = pl$DataFrame(iris)
#' dat3 = pl$DataFrame(mtcars)
#' dat1$equals(dat2)
#' dat1$equals(dat3)
DataFrame_equals = function(other) {
  # Delegates the comparison to the private `equals` method and returns its
  # value as-is.
  .pr$DataFrame$equals(self, other)
}
#' Shift a DataFrame
#'
#' @description Shift the values by a given period. If the period (`n`) is positive,
#' then `n` rows will be inserted at the top of the DataFrame and the last `n`
#' rows will be discarded. Vice-versa if the period is negative. In the end,
#' the total number of rows of the DataFrame doesn't change.
#'
#' @keywords DataFrame
#' @param periods Number of periods to shift (can be negative).
#' @return DataFrame
#' @examples
#' pl$DataFrame(mtcars)$shift(2)
#'
#' pl$DataFrame(mtcars)$shift(-2)
DataFrame_shift = function(periods = 1) {
  # Run the shift through the lazy API, then materialise the result.
  lazy_frame = self$lazy()
  shifted = lazy_frame$shift(periods)
  shifted$collect()
}
#' @title Shift and fill
#'
#' @description Shift the values by a given period and fill the resulting null
#' values. See the docs of `$shift()` for more details on shifting.
#' @keywords DataFrame
#'
#' @param fill_value Fill new `NULL` values with this value. Must be of length 1.
#' A logical value will be converted to numeric.
#' @param periods Number of periods to shift (can be negative).
#' @return DataFrame
#' @examples
#' df = pl$DataFrame(mtcars)
#'
#' # insert two rows filled with 0 at the top of the DataFrame
#' df$shift_and_fill(0, 2)
#'
#' # automatic conversion of logical value to numeric
#' df$shift_and_fill(TRUE, 2)
DataFrame_shift_and_fill = function(fill_value, periods = 1) {
  # Run the shift-and-fill through the lazy API, then materialise the result.
  lazy_frame = self$lazy()
  shifted = lazy_frame$shift_and_fill(fill_value, periods)
  shifted$collect()
}
#' Modify/append column(s)
#'
#' Add columns or modify existing ones with expressions. This is
#' the equivalent of `dplyr::mutate()` as it keeps unmentioned columns (unlike
#' `$select()`).
#'
#' @name DataFrame_with_columns
#' @aliases with_columns
#' @param ... Any expressions or string column name, or same wrapped in a list.
#' If first and only element is a list, it is unwrapped as a list of args.
#' @keywords DataFrame
#' @return A DataFrame
#' @examples
#' pl$DataFrame(iris)$with_columns(
#' pl$col("Sepal.Length")$abs()$alias("abs_SL"),
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' )
#'
#' # same query
#' l_expr = list(
#' pl$col("Sepal.Length")$abs()$alias("abs_SL"),
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' )
#' pl$DataFrame(iris)$with_columns(l_expr)
#'
#' pl$DataFrame(iris)$with_columns(
#' pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length"
#' SW_add_2 = (pl$col("Sepal.Width") + 2)
#' )
DataFrame_with_columns = function(...) {
  # Flatten the arguments into one list of expressions / column names.
  exprs = unpack_list(..., .context = "in $with_columns()")
  # Add or replace the columns and surface any failure as an R error.
  res = .pr$DataFrame$with_columns(self, exprs)
  unwrap(res, "in $with_columns()")
}
#' @inherit DataFrame_with_columns title params return
#'
#' @description
#' Add columns or modify existing ones with expressions. This is
#' the equivalent of `dplyr::mutate()` as it keeps unmentioned columns (unlike
#' `$select()`).
#'
#' This will run all expressions sequentially instead of in parallel. Use this
#' when the work per expression is cheap. Otherwise, `$with_columns()` should be
#' preferred.
#'
#' @examples
#' pl$DataFrame(iris)$with_columns_seq(
#' pl$col("Sepal.Length")$abs()$alias("abs_SL"),
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' )
#'
#' # same query
#' l_expr = list(
#' pl$col("Sepal.Length")$abs()$alias("abs_SL"),
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' )
#' pl$DataFrame(iris)$with_columns_seq(l_expr)
#'
#' pl$DataFrame(iris)$with_columns_seq(
#' pl$col("Sepal.Length")$abs(), # not named expr will keep name "Sepal.Length"
#' SW_add_2 = (pl$col("Sepal.Width") + 2)
#' )
DataFrame_with_columns_seq = function(...) {
  # Flatten the arguments into one list of expressions / column names.
  exprs = unpack_list(..., .context = "in $with_columns_seq()")
  # Add or replace the columns sequentially and surface any failure as an
  # R error.
  res = .pr$DataFrame$with_columns_seq(self, exprs)
  unwrap(res, "in $with_columns_seq()")
}
#' @inherit LazyFrame_head title details
#' @param n Number of rows to return. If a negative value is passed,
#' return all rows except the last [`abs(n)`][abs].
#' @return A [DataFrame][DataFrame_class]
#' @examples
#' df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5])
#'
#' df$head(3)
#'
#' # Pass a negative value to get all rows except the last `abs(n)`.
#' df$head(-3)
DataFrame_head = function(n = 5L) {
  # A negative `n` means "everything except the last abs(n) rows": turn it
  # into a non-negative row count, clamped at zero.
  if (isTRUE(n < 0)) {
    n = max(0, self$height + n)
  }
  first_rows = self$lazy()$head(n)
  first_rows$collect()
}
#' @rdname DataFrame_head
# `$limit()` is an alias: it is bound to the same function object as `$head()`.
DataFrame_limit = DataFrame_head
#' @inherit LazyFrame_tail title
#' @param n Number of rows to return. If a negative value is passed,
#' return all rows except the first [`abs(n)`][abs].
#' @inherit DataFrame_head return
#' @examples
#' df = pl$DataFrame(foo = 1:5, bar = 6:10, ham = letters[1:5])
#'
#' df$tail(3)
#'
#' # Pass a negative value to get all rows except the first `abs(n)`.
#' df$tail(-3)
DataFrame_tail = function(n = 5L) {
  # A negative `n` means "everything except the first abs(n) rows": turn it
  # into a non-negative row count, clamped at zero.
  if (isTRUE(n < 0)) {
    n = max(0, self$height + n)
  }
  last_rows = self$lazy()$tail(n)
  last_rows$collect()
}
#' Filter rows of a DataFrame
#' @name DataFrame_filter
#'
#' @inherit LazyFrame_filter description params details
#'
#' @keywords DataFrame
#' @return A DataFrame with only the rows where the conditions are `TRUE`.
#' @examples
#' df = pl$DataFrame(iris)
#'
#' df$filter(pl$col("Sepal.Length") > 5)
#'
#' # This is equivalent to
#' # df$filter(pl$col("Sepal.Length") > 5 & pl$col("Petal.Width") < 1)
#' df$filter(pl$col("Sepal.Length") > 5, pl$col("Petal.Width") < 1)
#'
#' # rows where condition is NA are dropped
#' iris2 = iris
#' iris2[c(1, 3, 5), "Species"] = NA
#' df = pl$DataFrame(iris2)
#'
#' df$filter(pl$col("Species") == "setosa")
DataFrame_filter = function(...) {
  # Build a lazy frame through the private API, apply the predicates there,
  # then materialise the result.
  lazy_frame = .pr$DataFrame$lazy(self)
  filtered = lazy_frame$filter(...)
  filtered$collect()
}
#' Group a DataFrame
#' @inheritParams LazyFrame_group_by
#' @inherit LazyFrame_group_by description params
#' @details Within each group, the order of the rows is always preserved,
#' regardless of the `maintain_order` argument.
#' @return [GroupBy][GroupBy_class] (a DataFrame with special groupby methods like `$agg()`)
#' @seealso
#' - [`<DataFrame>$partition_by()`][DataFrame_partition_by]
#' @examples
#' df = pl$DataFrame(
#' a = c("a", "b", "a", "b", "c"),
#' b = c(1, 2, 1, 3, 3),
#' c = c(5, 4, 3, 2, 1)
#' )
#'
#' df$group_by("a")$agg(pl$col("b")$sum())
#'
#' # Set `maintain_order = TRUE` to ensure the order of the groups is consistent with the input.
#' df$group_by("a", maintain_order = TRUE)$agg(pl$col("c"))
#'
#' # Group by multiple columns by passing a list of column names.
#' df$group_by(c("a", "b"))$agg(pl$max("c"))
#'
#' # Or pass some arguments to group by multiple columns in the same way.
#' # Expressions are also accepted.
#' df$group_by("a", pl$col("b") %/% 2)$agg(
#' pl$col("c")$mean()
#' )
#'
#' # The columns will be renamed to the argument names.
#' df$group_by(d = "a", e = pl$col("b") %/% 2)$agg(
#' pl$col("c")$mean()
#' )
DataFrame_group_by = function(..., maintain_order = polars_options()$maintain_order) {
  # Flatten the grouping arguments into one list of expressions / names.
  grouping_exprs = unpack_list(..., .context = "$group_by():")
  # clone the DataFrame, bundle args as attributes. Non fallible.
  construct_group_by(
    self,
    groupby_input = grouping_exprs,
    maintain_order = maintain_order
  )
}
#' Return Polars DataFrame as R data.frame
#'
#' @param ... Any args passed to `as.data.frame()`.
#' @param int64_conversion How should Int64 values be handled when converting a
#' polars object to R?
#'
#' * `"double"` (default) converts the integer values to double.
#' * `"bit64"` uses `bit64::as.integer64()` to do the conversion (requires
#' the package `bit64` to be attached).
#' * `"string"` converts Int64 values to character.