/
highlevel.py
2722 lines (2187 loc) · 99.5 KB
/
highlevel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
# TODO in Array:
#
# - [ ] all docstrings are old
# - [ ] 'Mask' nested class and 'mask' property
# - [ ] `__array__`
# - [ ] `__array_ufunc__`
# - [ ] `__array_function__`
# - [X] `numba_type`
# - [ ] `__copy__`
# - [ ] `__deepcopy__`
# - [X] `__contains__`
#
# TODO in Record:
#
# - [ ] all docstrings are old
# - [ ] `__array_ufunc__`
# - [X] `numba_type`
# - [ ] `__copy__`
# - [ ] `__deepcopy__`
# - [X] `__contains__`
#
# TODO in ArrayBuilder: everything
import sys
import re
import keyword
from collections.abc import Iterable
from collections.abc import Sized
from collections.abc import Mapping
import awkward as ak
from awkward._v2._connect.numpy import NDArrayOperatorsMixin
# NumpyMetadata instance; used in this module for NumPy type names such as
# `np.ndarray` and `np.dtype` (see Array.__init__).
np = ak.nplike.NumpyMetadata.instance()
# Numpy nplike instance; passed to the layout's `raw(...)` in Array.__iter__.
numpy = ak.nplike.Numpy.instance()
# Matches strings shaped like an identifier: a letter or underscore followed
# by zero or more word characters.
_dir_pattern = re.compile(r"^[a-zA-Z_]\w*$")
# def _suffix(array):
# out = ak._v2.operations.convert.kernels(array)
# if out is None or out == "cpu":
# return ""
# else:
# return ":" + out
class Array(NDArrayOperatorsMixin, Iterable, Sized):
"""
Args:
data (#ak.layout.Content, #ak.Array, `np.ndarray`, `cp.ndarray`, `pyarrow.*`, str, dict, or iterable):
Data to wrap or convert into an array.
- If a NumPy array, the regularity of its dimensions is preserved
and the data are viewed, not copied.
- CuPy arrays are treated the same way as NumPy arrays except that
they default to `backend="cuda"`, rather than `backend="cpu"`.
- If a pyarrow object, calls #ak.from_arrow, preserving as much
metadata as possible, usually zero-copy.
- If a dict of str \u2192 columns, combines the columns into an
array of records (like Pandas's DataFrame constructor).
- If a string, the data are assumed to be JSON.
- If an iterable, calls #ak.from_iter, which assumes all dimensions
have irregular lengths.
behavior (None or dict): Custom #ak.behavior for this Array only.
with_name (None or str): Gives tuples and records a name that can be
used to override their behavior (see below).
check_valid (bool): If True, verify that the #layout is valid.
backend (None, `"cpu"`, or `"cuda"`): If `"cpu"`, the Array will be placed in
main memory for use with other `"cpu"` Arrays and Records; if `"cuda"`,
the Array will be placed in GPU global memory using CUDA; if None,
the `data` are left untouched. For `"cuda"`,
[awkward-cuda-kernels](https://pypi.org/project/awkward-cuda-kernels)
must be installed, which can be invoked with
`pip install awkward[cuda] --upgrade`.
High-level array that can contain data of any type.
For most users, this is the only class in Awkward Array that matters: it
is the entry point for data analysis with an emphasis on usability. It
intentionally has a minimum of methods, preferring standalone functions
like
ak.num(array1)
ak.combinations(array1)
ak.cartesian([array1, array2])
ak.zip({"x": array1, "y": array2, "z": array3})
instead of bound methods like
array1.num()
array1.combinations()
array1.cartesian([array2, array3])
array1.zip(...) # ?
because its namespace is valuable for domain-specific parameters and
functionality. For example, if records contain a field named `"num"`,
they can be accessed as
array1.num
instead of
array1["num"]
without any confusion or interference from #ak.num. The same is true
for domain-specific methods that have been attached to the data. For
instance, an analysis of mailing addresses might have a function that
computes zip codes, which can be attached to the data with a method
like
latlon.zip()
without any confusion or interference from #ak.zip. Custom methods like
this can be added with #ak.behavior, and so the namespace of Array
attributes must be kept clear for such applications.
See also #ak.Record.
Interfaces to other libraries
=============================
NumPy
*****
When NumPy
[universal functions](https://docs.scipy.org/doc/numpy/reference/ufuncs.html)
(ufuncs) are applied to an ak.Array, they are passed through the Awkward
data structure, applied to the numerical data at its leaves, and the output
maintains the original structure.
For example,
>>> array = ak.Array([[1, 4, 9], [], [16, 25]])
>>> np.sqrt(array)
<Array [[1, 2, 3], [], [4, 5]] type='3 * var * float64'>
See also #ak.Array.__array_ufunc__.
Some NumPy functions other than ufuncs are also handled properly in
NumPy >= 1.17 (see
[NEP 18](https://numpy.org/neps/nep-0018-array-function-protocol.html))
and if an Awkward override exists. That is,
np.concatenate
can be used on an Awkward Array because
ak.concatenate
exists. If your NumPy is older than 1.17, use `ak.concatenate` directly.
Pandas
******
Ragged arrays (list type) can be converted into Pandas
[MultiIndex](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html)
rows and nested records can be converted into MultiIndex columns. If the
Awkward Array has only one "branch" of nested lists (i.e. different record
fields do not have different-length lists, but a single chain of lists-of-lists
is okay), then it can be losslessly converted into a single DataFrame.
Otherwise, multiple DataFrames are needed, though they can be merged (with a
loss of information).
The #ak.to_pandas function performs this conversion; if `how=None`, it
returns a list of DataFrames; otherwise, `how` is passed to `pd.merge` when
merging the resultant DataFrames.
Numba
*****
Arrays can be used in [Numba](http://numba.pydata.org/): they can be
passed as arguments to a Numba-compiled function or returned as return
values. The only limitation is that Awkward Arrays cannot be *created*
inside the Numba-compiled function; to make outputs, consider
#ak.ArrayBuilder.
Arrow
*****
Arrays are convertible to and from [Apache Arrow](https://arrow.apache.org/),
a standard for representing nested data structures in columnar arrays.
See #ak.to_arrow and #ak.from_arrow.
NumExpr
*******
[NumExpr](https://numexpr.readthedocs.io/en/latest/user_guide.html) can
calculate expressions on a set of ak.Arrays, but only if the functions in
`ak.numexpr` are used, not the functions in the `numexpr` library directly.
Like NumPy ufuncs, the expression is evaluated on the numeric leaves of the
data structure, maintaining structure in the output.
See #ak.numexpr.evaluate to calculate an expression.
See #ak.numexpr.re_evaluate to recalculate an expression without
rebuilding its virtual machine.
Autograd
********
Derivatives of a calculation on a set of ak.Arrays can be calculated with
[Autograd](https://github.com/HIPS/autograd#readme), but only if the
function in `ak.autograd` is used, not the functions in the `autograd`
library directly.
Like NumPy ufuncs, the function and its derivatives are evaluated on the
numeric leaves of the data structure, maintaining structure in the output.
See #ak.autograd.elementwise_grad to calculate a function and its
derivatives elementwise on each numeric value in an ak.Array.
"""
def __init__(
    self,
    data,
    behavior=None,
    with_name=None,
    check_valid=False,
    backend=None,
):
    """
    Convert `data` into a layout and wrap it in this Array.

    See the class docstring for the meaning of each argument. The
    conversion is selected by testing `data` in this order:

       * an `ak._v2.contents.Content` is used as the layout directly;
       * another Array contributes its `_layout`;
       * a NumPy array whose dtype is not object goes through `from_numpy`;
       * objects from the `cupy` module go through `from_cupy`;
       * objects from the `pyarrow` module go through `from_arrow`;
       * a dict becomes a RecordArray with one field per key, each value
         being converted with `Array(value)`; all columns must have the
         same length;
       * a str goes through `from_json`;
       * anything else goes through `from_iter` with `allow_record=False`.

    After conversion, `with_name` and `backend` are applied if given, the
    layout and behavior are stored through their property setters, a
    string `"__doc__"` parameter on the layout (if any) becomes this
    object's `__doc__`, and the array is validated if `check_valid` is set.

    Raises:
        ValueError: if a dict's columns do not all have the same length.
        TypeError: if the conversion result is not a Content.
    """
    Content = ak._v2.contents.Content

    if isinstance(data, Content):
        layout = data

    elif isinstance(data, Array):
        layout = data._layout

    elif isinstance(data, np.ndarray) and data.dtype != np.dtype("O"):
        layout = ak._v2.operations.convert.from_numpy(data, highlevel=False)

    elif ak._v2._util.in_module(data, "cupy"):
        layout = ak._v2.operations.convert.from_cupy(data, highlevel=False)

    elif ak._v2._util.in_module(data, "pyarrow"):
        layout = ak._v2.operations.convert.from_arrow(data, highlevel=False)

    elif isinstance(data, dict):
        names = []
        columns = []
        expected = None
        for name, column in data.items():
            names.append(name)
            column_layout = Array(column).layout
            columns.append(column_layout)

            # The first column fixes the length every later column must match.
            found = len(column_layout)
            if expected is None:
                expected = found
            elif expected != found:
                raise ValueError(
                    "dict of arrays in ak.Array constructor must have arrays "
                    "of equal length ({} vs {})".format(expected, found)
                )
        layout = ak._v2.contents.RecordArray(columns, names)

    elif isinstance(data, str):
        layout = ak._v2.operations.convert.from_json(data, highlevel=False)

    else:
        layout = ak._v2.operations.convert.from_iter(
            data, highlevel=False, allow_record=False
        )

    if not isinstance(layout, Content):
        raise TypeError("could not convert data into an ak.Array")

    if with_name is not None:
        layout = ak._v2.operations.structure.with_name(
            layout, with_name, highlevel=False
        )

    if backend is not None:
        describe = ak._v2.operations.describe
        # Only move the data when it is not already on the requested backend.
        if backend != describe.backend(layout):
            layout = describe.to_backend(layout, backend, highlevel=False)

    # The behavior setter reads self._layout, so the layout must be set first.
    self.layout = layout
    self.behavior = behavior

    doc = layout.purelist_parameter("__doc__")
    if ak._v2._util.isstr(doc):
        self.__doc__ = doc

    if check_valid:
        ak._v2.operations.describe.validity_error(self, exception=True)
@property
def layout(self):
    """
    The #ak.layout.Content tree that defines how this Array is structured.

    This is the "low-level" view: two arrays with the same logical meaning
    (same JSON output and same high-level #type) can still have different
    layouts, differing in

       * node type, such as #ak.layout.ListArray64 versus
         #ak.layout.ListOffsetArray64,
       * integer type specialization, such as #ak.layout.ListArray64
         versus #ak.layout.ListArray32,
       * or specific values, such as gaps in a #ak.layout.ListArray64.

    Content nodes are fully composable; an Array is not. The high-level
    Array is a single-layer "shell" around its layout.

    A layout is rendered as XML rather than as a nested list. For example,
    the `array`

        ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]])

    is presented as

        <Array [[1.1, 2.2, 3.3], [], [4.4, 5.5]] type='3 * var * float64'>

    while `array.layout` is presented as

        <ListOffsetArray64>
            <offsets>
                <Index64 i="[0 3 3 5]" offset="0" length="4" at="0x55a26df62590"/>
            </offsets>
            <content>
                <NumpyArray format="d" shape="5" data="1.1 2.2 3.3 4.4 5.5" at="0x55a26e0c5f50"/>
            </content>
        </ListOffsetArray64>

    (with truncation for large arrays).

    Assigning to this property replaces the layout. The new value must be
    an `ak._v2.contents.Content`, otherwise TypeError is raised; on success
    `_numbaview` is reset to None.
    """
    return self._layout


@layout.setter
def layout(self, layout):
    # Reject anything that is not a Content before touching any state.
    if not isinstance(layout, ak._v2.contents.Content):
        raise TypeError("layout must be a subclass of ak._v2.contents.Content")

    self._layout = layout
    self._numbaview = None
@property
def behavior(self):
    """
    The `behavior` parameter that was passed into this Array's constructor.

       * If it is a dict, it overrides the global #ak.behavior. Keys that
         are in the global #ak.behavior but not in this `behavior` remain
         valid; keys present in both are taken from this `behavior`. A key
         whose value is None is equivalent to a missing key, so this
         `behavior` can effectively remove keys from the global
         #ak.behavior.
       * If it is None, the Array defaults to the global #ak.behavior.

    See #ak.behavior for a list of recognized key patterns and their
    meanings.

    Assigning to this property accepts None or a Mapping and raises
    TypeError for anything else. On success, `self.__class__` is replaced
    by the result of `ak._v2._util.arrayclass` for the current layout and
    the new behavior, and then the value is stored.
    """
    return self._behavior


@behavior.setter
def behavior(self, behavior):
    if behavior is not None and not isinstance(behavior, Mapping):
        raise TypeError("behavior must be None or a dict")

    # The class is re-derived first, then the new behavior is recorded.
    self.__class__ = ak._v2._util.arrayclass(self._layout, behavior)
    self._behavior = behavior
# class Mask(object):
# def __init__(self, array, valid_when):
# self._array = array
# self._valid_when = valid_when
# def __str__(self):
# return self._str()
# def __repr__(self):
# return self._repr()
# def _str(self, limit_value=85):
# return self._array._str(limit_value=limit_value)
# def _repr(self, limit_value=40, limit_total=85):
# suffix = _suffix(self)
# limit_value -= len(suffix)
# value = ak._v2._util.minimally_touching_string(
# limit_value, self._array.layout, self._array._behavior
# )
# try:
# name = super(Array, self._array).__getattribute__("__name__")
# except AttributeError:
# name = type(self._array).__name__
# limit_type = limit_total - (len(value) + len(name) + len("<.mask type=>"))
# typestr = repr(
# str(
# ak._v2._util.highlevel_type(
# self._array.layout, self._array._behavior, True
# )
# )
# )
# if len(typestr) > limit_type:
# typestr = typestr[: (limit_type - 4)] + "..." + typestr[-1]
# return "<{0}.mask{1} {2} type={3}>".format(name, suffix, value, typestr)
# def __getitem__(self, where):
# return ak._v2.operations.structure.mask(self._array, where, self._valid_when)
# @property
# def mask(self, valid_when=True):
# """
# Whereas
# array[array_of_booleans]
# removes elements from `array` in which `array_of_booleans` is False,
# array.mask[array_of_booleans]
# returns data with the same length as the original `array` but False
# values in `array_of_booleans` are mapped to None. Such an output
# can be used in mathematical expressions with the original `array`
# because they are still aligned.
# See <<filtering>> and #ak.mask.
# """
# return self.Mask(self, valid_when)
def tolist(self):
    """
    Converts this Array into Python objects; same as #ak.to_list
    (but without the underscore, like NumPy's
    [tolist](https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tolist.html)).

    Implemented as a direct call to this Array's `to_list` method.
    """
    return self.to_list()
def to_list(self):
    """
    Converts this Array into Python objects; same as #ak.to_list.

    The conversion is done by the layout's own `to_list`, which is given
    this Array's `behavior`.
    """
    return self._layout.to_list(self._behavior)
def to_numpy(self, allow_missing=True):
    """
    Args:
        allow_missing (bool): Forwarded unchanged to #ak.to_numpy;
            defaults to True.

    Converts this Array into a NumPy array, if possible; same as #ak.to_numpy.
    """
    return ak._v2.operations.convert.to_numpy(self, allow_missing=allow_missing)
@property
def nbytes(self):
    """
    The total number of bytes in all the #ak.layout.Index,
    #ak.layout.Identifier, and #ak.layout.NumpyArray buffers in this
    array tree.

    It does not count buffers that must be kept in memory because
    of ownership, but are not directly used in the array. Nor does it count
    the (small) C++ nodes or Python objects that reference the (large)
    array buffers.

    The value is read from the layout's `nbytes`.
    """
    return self._layout.nbytes
@property
def ndim(self):
    """
    Number of dimensions (nested variable-length lists and/or regular arrays)
    before reaching a numeric type or a record.

    There may be nested lists within the record, as field values, but this
    number of dimensions does not count those.

    (Some fields may have different depths than others, which is why they
    are not counted.)

    The value is read from the layout's `purelist_depth`.
    """
    return self._layout.purelist_depth
@property
def fields(self):
    """
    List of field names or tuple slot numbers (as strings) of the outermost
    record or tuple in this array.

    If the array contains nested records, only the fields of the outermost
    record are shown. If it contains tuples instead of records, its fields
    are string representations of integers, such as `"0"`, `"1"`, `"2"`, etc.
    The records or tuples may be within multiple layers of nested lists.

    If the array contains neither tuples nor records, it is an empty list.

    The value is read from the layout's `fields`.

    See also #ak.fields.
    """
    return self._layout.fields
def _ipython_key_completions_(self):
    """
    Returns the layout's `fields`, the same value as the #ak.Array.fields
    property.

    `_ipython_key_completions_` is the hook IPython calls to obtain
    candidate keys when tab-completing `array["<TAB>`.
    """
    return self._layout.fields
@property
def type(self):
    """
    High-level type of this Array; same as #ak.type.

    The result is always an #ak.types.ArrayType, which records the number
    of elements in the array around the element type. The element type is
    computed from the layout's form together with this Array's behavior.

    By contrast, the type of a #ak.layout.Content (from #ak.Array.layout)
    is not wrapped by an #ak.types.ArrayType.
    """
    array_type = ak._v2.types.ArrayType
    element_type = self._layout.form.type_from_behavior(self._behavior)
    length = len(self._layout)
    return array_type(element_type, length)
@property
def typestr(self):
    """
    The high-level type of this Array, presented as a string.

    Equivalent to `str(array.type)`.

    Note that the outermost element of an Array's type is always an
    #ak.types.ArrayType, which specifies the number of elements in the array.

    The type of a #ak.layout.Content (from #ak.Array.layout) is not
    wrapped by an #ak.types.ArrayType.
    """
    return str(self.type)
def __len__(self):
    """
    The length of this Array, only counting the outermost structure.

    For example, the length of

        ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]])

    is `3`, not `5`.

    Equivalent to `len(array.layout)`.
    """
    return len(self._layout)
def __iter__(self):
    """
    Iterates over this Array in Python.

    This is the *slowest* way to access data (even slower than native
    Python objects, like lists and dicts). Problems should usually be
    expressed as array-at-a-time operations instead.

    In other words, do this:

        >>> print(np.sqrt(ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]])))
        [[1.05, 1.48, 1.82], [], [2.1, 2.35]]

    not this:

        >>> for outer in ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]):
        ...     for inner in outer:
        ...         print(np.sqrt(inner))
        ...
        1.0488088481701516
        1.4832396974191326
        1.816590212458495
        2.0976176963403033
        2.345207879911715

    Iteration exists so that Arrays can be more easily inspected as
    Python objects.

    What is yielded depends on the layout:

       * If the layout itself is a NumpyArray, its raw buffer is iterated.
         With an `"__array__"` parameter of `"byte"` the buffer is first
         converted to bytes; with `"char"` it is converted to bytes and
         decoded (with `errors="surrogateescape"`); otherwise the raw array
         is iterated directly.
       * Otherwise each item of the layout is examined. A NumpyArray item
         marked `"byte"` or `"char"` is yielded whole as bytes or as a
         decoded string; any other NumpyArray item is yielded as is. Other
         Content or Record items are wrapped with `ak._v2._util.wrap` and
         this Array's behavior; everything else is yielded unchanged.

    See also #ak.to_list.
    """
    layout = self._layout
    NumpyArray = ak._v2.contents.NumpyArray

    if isinstance(layout, NumpyArray):
        raw = layout.raw(numpy)
        kind = layout.parameter("__array__")
        if kind == "byte":
            yield from ak._v2._util.tobytes(raw)
        elif kind == "char":
            yield from ak._v2._util.tobytes(raw).decode(errors="surrogateescape")
        else:
            yield from raw
        return

    for item in layout:
        if isinstance(item, NumpyArray):
            kind = item.parameter("__array__")
            if kind == "byte":
                out = ak._v2._util.tobytes(item.raw(numpy))
            elif kind == "char":
                out = ak._v2._util.tobytes(item.raw(numpy)).decode(
                    errors="surrogateescape"
                )
            else:
                out = item
        elif isinstance(item, (ak._v2.contents.Content, ak._v2.record.Record)):
            # Behavior is read per item, at the moment the item is produced.
            out = ak._v2._util.wrap(item, self._behavior)
        else:
            out = item
        yield out
def __getitem__(self, where):
"""
Args:
where (many types supported; see below): Index of positions to
select from this Array.
Select items from the Array using an extension of NumPy's (already
quite extensive) rules.
All methods of selecting items described in
[NumPy indexing](https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html)
are supported with one exception
([combining advanced and basic indexing](https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#combining-advanced-and-basic-indexing)
with basic indexes *between* two advanced indexes: the definition
NumPy chose for the result does not have a generalization beyond
rectilinear arrays).
The `where` parameter can be any of the following or a tuple of
the following.
* **An integer** selects one element. Like Python/NumPy, it is
zero-indexed: `0` is the first item, `1` is the second, etc.
Negative indexes count from the end of the list: `-1` is the
last, `-2` is the second-to-last, etc.
Indexes beyond the size of the array, either because they're too
large or because they're too negative, raise errors. In
particular, some nested lists might contain a desired element
while others don't; this would raise an error.
* **A slice** (either a Python `slice` object or the
`start:stop:step` syntax) selects a range of elements. The
`start` and `stop` values are zero-indexed; `start` is inclusive
and `stop` is exclusive, like Python/NumPy. Negative `step`
values are allowed, but a `step` of `0` is an error. Slices
beyond the size of the array are not errors but are truncated,
like Python/NumPy.
* **A string** selects a tuple or record field, even if its
position in the tuple is to the left of the dimension where the
tuple/record is defined. (See <<projection>> below.) This is
similar to NumPy's
[field access](https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#field-access),
except that strings are allowed in the same tuple with other
slice types. While record fields have names, tuple fields are
integer strings, such as `"0"`, `"1"`, `"2"` (always
non-negative). Be careful to distinguish these from non-string
integers.
* **An iterable of strings** (not the top-level tuple) selects
multiple tuple/record fields.
* **An ellipsis** (either the Python `Ellipsis` object or the
`...` syntax) skips as many dimensions as needed to put the
rest of the slice items to the innermost dimensions.
* **A np.newaxis** or its equivalent, None, does not select items
but introduces a new regular dimension in the output with size
`1`. This is a convenient way to explicitly choose a dimension
for broadcasting.
* **A boolean array** with the same length as the current dimension
(or any iterable, other than the top-level tuple) selects elements
corresponding to each True value in the array, dropping those
that correspond to each False. The behavior is similar to
NumPy's
[compress](https://docs.scipy.org/doc/numpy/reference/generated/numpy.compress.html)
function.
* **An integer array** (or any iterable, other than the top-level
tuple) selects elements like a single integer, but produces a
regular dimension of as many as are desired. The array can have
any length, any order, and it can have duplicates and incomplete
coverage. The behavior is similar to NumPy's
[take](https://docs.scipy.org/doc/numpy/reference/generated/numpy.take.html)
function.
* **An integer Array with missing (None) items** selects multiple
values by index, as above, but None values are passed through
to the output. This behavior matches pyarrow's
[Array.take](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.take)
which also manages arrays with missing values. See
<<option indexing>> below.
* **An Array of nested lists**, ultimately containing booleans or
integers and having the same lengths of lists at each level as
the Array to which they're applied, selects by boolean or by
integer at the deeply nested level. Missing items at any level
above the deepest level must broadcast. See <<nested indexing>> below.
A tuple of the above applies each slice item to a dimension of the
data, which can be very expressive. More than one flat boolean/integer
array are "iterated as one" as described in the
[NumPy documentation](https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#integer-array-indexing).
Filtering
*********
A common use of selection by boolean arrays is to filter a dataset by
some property. For instance, to get the odd values of the `array`
ak.Array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
one can put an array expression with True for each odd value inside
square brackets:
>>> array[array % 2 == 1]
<Array [1, 3, 5, 7, 9] type='5 * int64'>
This technique is so common in NumPy and Pandas data analysis that it
is often read as a syntax, rather than a consequence of array slicing.
The extension to nested arrays like
ak.Array([[[0, 1, 2], [], [3, 4], [5]], [[6, 7, 8], [9]]])
allows us to use the same syntax more generally.
>>> array[array % 2 == 1]
<Array [[[1], [], [3], [5]], [[7], [9]]] type='2 * var * var * int64'>
In this example, the boolean array is itself nested (see
<<nested indexing>> below).
>>> array % 2 == 1
<Array [[[False, True, False], ... [True]]] type='2 * var * var * bool'>
This also applies to data with record structures.
For nested data, we often need to select the first or first two
elements from variable-length lists. That can be a problem if some
lists are empty. A function like #ak.num can be useful for first
selecting by the lengths of lists.
>>> array = ak.Array([[1.1, 2.2, 3.3],
... [],
... [4.4, 5.5],
... [6.6],
... [],
... [7.7, 8.8, 9.9]])
...
>>> array[ak.num(array) > 0, 0]
<Array [1.1, 4.4, 6.6, 7.7] type='4 * float64'>
>>> array[ak.num(array) > 1, 1]
<Array [2.2, 5.5, 8.8] type='3 * float64'>
It's sometimes also a problem that "cleaning" the dataset by dropping
empty lists changes its alignment, so that it can no longer be used
in calculations with "uncleaned" data. For this, #ak.mask can be
useful because it inserts None in positions that fail the filter,
rather than removing them.
>>> print(ak.mask(array, ak.num(array) > 1))
[[1.1, 2.2, 3.3], None, [4.4, 5.5], None, None, [7.7, 8.8, 9.9]]
Note, however, that the `0` or `1` to pick the first or second
item of each nested list is in the second dimension, so the first
dimension of the slice must be a `:`.
>>> ak.mask(array, ak.num(array) > 1)[:, 0]
<Array [1.1, None, 4.4, None, None, 7.7] type='6 * ?float64'>
>>> ak.mask(array, ak.num(array) > 1)[:, 1]
<Array [2.2, None, 5.5, None, None, 8.8] type='6 * ?float64'>
Another syntax for
ak.mask(array, array_of_booleans)
is
array.mask[array_of_booleans]
(which is 5 characters away from simply filtering the `array`).
Projection
**********
The following `array`
ak.Array([[{"x": 1.1, "y": [1]}, {"x": 2.2, "y": [2, 2]}],
[{"x": 3.3, "y": [3, 3, 3]}],
[{"x": 0, "y": []}, {"x": 1.1, "y": [1, 1, 1]}]])
has records inside of nested lists:
>>> ak.type(array)
3 * var * {"x": float64, "y": var * int64}
In principle, one should select nested lists before record fields,
>>> array[2, :, "x"]
<Array [0, 1.1] type='2 * float64'>
>>> array[::2, :, "x"]
<Array [[1.1, 2.2], [0, 1.1]] type='2 * var * float64'>
but it's also possible to select record fields first.
>>> array["x"]
<Array [[1.1, 2.2], [3.3], [0, 1.1]] type='3 * var * float64'>
The string can "commute" to the left through integers and slices to
get the same result as it would in its "natural" position.
>>> array[2, :, "x"]
<Array [0, 1.1] type='2 * float64'>
>>> array[2, "x", :]
<Array [0, 1.1] type='2 * float64'>
>>> array["x", 2, :]
<Array [0, 1.1] type='2 * float64'>
This is analogous to selecting rows (integer indexes) before columns
(string names) or columns before rows, except that the rows are
more complex (like a Pandas
[MultiIndex](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html)).
This would be an expensive operation in a typical object-oriented
environment, in which the records with fields `"x"` and `"y"` are
akin to C structs, but for columnar Awkward Arrays, projecting
through all records to produce an array of nested lists of `"x"`
values just changes the metadata (no loop over data, and therefore
fast).
Thus, data analysts should think of records as fluid objects that
can be easily projected apart and zipped back together with
#ak.zip.
Note, however, that while a column string can "commute" with row
indexes to the left of its position in the tree, it can't commute
to the right. For example, it's possible to use slices inside
`"y"` because `"y"` is a list:
>>> array[0, :, "y"]
<Array [[1], [2, 2]] type='2 * var * int64'>
>>> array[0, :, "y", 0]
<Array [1, 2] type='2 * int64'>
but it's not possible to move `"y"` to the right
>>> array[0, :, 0, "y"]
ValueError: in NumpyArray, too many dimensions in slice
because the `array[0, :, 0, ...]` slice applies to both `"x"` and
`"y"` before `"y"` is selected, and `"x"` is a one-dimensional
NumpyArray that can't take more than its share of slices.
Finally, note that the dot (`__getattr__`) syntax is equivalent to a single
string in a slice (`__getitem__`) if the field name is a valid Python
identifier and doesn't conflict with #ak.Array methods or properties.
>>> array.x
<Array [[1.1, 2.2], [3.3], [0, 1.1]] type='3 * var * float64'>
>>> array.y
<Array [[[1], [2, 2]], ... [[], [1, 1, 1]]] type='3 * var * var * int64'>
Nested Projection
*****************
If records are nested within records, you can use a series of strings in
the selector to drill down. For instance, with the following `array`,
ak.Array([
{"a": {"x": 1, "y": 2}, "b": {"x": 10, "y": 20}, "c": {"x": 1.1, "y": 2.2}},
{"a": {"x": 1, "y": 2}, "b": {"x": 10, "y": 20}, "c": {"x": 1.1, "y": 2.2}},
{"a": {"x": 1, "y": 2}, "b": {"x": 10, "y": 20}, "c": {"x": 1.1, "y": 2.2}}])
we can go directly to the numerical data by specifying a string for the
outer field and a string for the inner field.
>>> array["a", "x"]
<Array [1, 1, 1] type='3 * int64'>
>>> array["a", "y"]
<Array [2, 2, 2] type='3 * int64'>
>>> array["b", "y"]
<Array [20, 20, 20] type='3 * int64'>
>>> array["c", "y"]
<Array [2.2, 2.2, 2.2] type='3 * float64'>
As with single projections, the dot (`__getattr__`) syntax is equivalent
to a single string in a slice (`__getitem__`) if the field name is a valid
Python identifier and doesn't conflict with #ak.Array methods or properties.
>>> array.a.x
<Array [1, 1, 1] type='3 * int64'>
You can even get every field of the same name within an outer record using
a list of field names for the outer record. The following selects the `"x"`
field of `"a"`, `"b"`, and `"c"` records:
>>> array[["a", "b", "c"], "x"].tolist()
[{'a': 1, 'b': 10, 'c': 1.1},
{'a': 1, 'b': 10, 'c': 1.1},
{'a': 1, 'b': 10, 'c': 1.1}]
You don't need to get all fields:
>>> array[["a", "b"], "x"].tolist()
[{'a': 1, 'b': 10},
{'a': 1, 'b': 10},
{'a': 1, 'b': 10}]
And you can select lists of field names at all levels:
>>> array[["a", "b"], ["x", "y"]].tolist()
[{'a': {'x': 1, 'y': 2}, 'b': {'x': 10, 'y': 20}},
{'a': {'x': 1, 'y': 2}, 'b': {'x': 10, 'y': 20}},
{'a': {'x': 1, 'y': 2}, 'b': {'x': 10, 'y': 20}}]
Option indexing
***************
NumPy arrays can be sliced by all of the above slice types except
arrays with missing values and arrays with nested lists, both of
which are inexpressible in NumPy. Missing values, represented by
None in Python, are called option types (#ak.types.OptionType) in
Awkward Array and can be used as a slice.
For example, an `array` like
ak.Array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9])
can be sliced with a boolean array
>>> array[[False, False, False, False, True, False, True, False, True]]
<Array [5.5, 7.7, 9.9] type='3 * float64'>
or a boolean array containing None values:
>>> array[[False, False, False, False, True, None, True, None, True]]
<Array [5.5, None, 7.7, None, 9.9] type='5 * ?float64'>
Similarly for arrays of integers and None:
>>> array[[0, 1, None, None, 7, 8]]
<Array [1.1, 2.2, None, None, 8.8, 9.9] type='6 * ?float64'>
This is the same behavior as pyarrow's
[Array.take](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.take),
which establishes a convention for how to interpret slice arrays
with option type:
>>> import pyarrow as pa
>>> array = pa.array([1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9])
>>> array.take(pa.array([0, 1, None, None, 7, 8]))
<pyarrow.lib.DoubleArray object at 0x7efc7f060210>
[
1.1,
2.2,
null,
null,
8.8,
9.9
]
Nested indexing
***************
Awkward Array's nested lists can be used as slices as well, as long
as the type at the deepest level of nesting is boolean or integer.
For example, the `array`
ak.Array([[[0.0, 1.1, 2.2], [], [3.3, 4.4]], [], [[5.5]]])
can be sliced at the top level with one-dimensional arrays:
>>> array[[False, True, True]]
<Array [[], [[5.5]]] type='2 * var * var * float64'>
>>> array[[1, 2]]
<Array [[], [[5.5]]] type='2 * var * var * float64'>
with singly nested lists:
>>> array[[[False, True, True], [], [True]]]
<Array [[[], [3.3, 4.4]], [], [[5.5]]] type='3 * var * var * float64'>
>>> array[[[1, 2], [], [0]]]
<Array [[[], [3.3, 4.4]], [], [[5.5]]] type='3 * var * var * float64'>
and with doubly nested lists:
>>> array[[[[False, True, False], [], [True, False]], [], [[False]]]]
<Array [[[1.1], [], [3.3]], [], [[]]] type='3 * var * var * float64'>
>>> array[[[[1], [], [0]], [], [[]]]]
<Array [[[1.1], [], [3.3]], [], [[]]] type='3 * var * var * float64'>
The key thing is that the nested slice has the same number of elements
as the array it's slicing at every level of nesting that it reproduces.
This is similar to the requirement that boolean arrays have the same
length as the array they're filtering.
This kind of slicing is useful because NumPy's
[universal functions](https://docs.scipy.org/doc/numpy/reference/ufuncs.html)
produce arrays with the same structure as the original array, which
can then be used as filters.
>>> print((array * 10) % 2 == 1)
[[[False, True, False], [], [True, False]], [], [[True]]]
>>> print(array[(array * 10) % 2 == 1])
[[[1.1], [], [3.3]], [], [[5.5]]]
Functions whose names start with "arg" return index positions, which
can be used with the integer form.
>>> print(np.argmax(array, axis=-1))
[[2, None, 1], [], [0]]
>>> print(array[np.argmax(array, axis=-1)])
[[[3.3, 4.4], None, []], [], [[5.5]]]
Here, the `np.argmax` returns the integer position of the maximum
element or None for empty arrays. It's a nice example of
<<option indexing>> with <<nested indexing>>.
When applying a nested index with missing (None) entries at levels
higher than the last level, the indexer must have the same dimension
as the array being indexed, and the resulting output will have missing