This repository has been archived by the owner on Jan 7, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 54
/
_utils.py
236 lines (194 loc) · 7.59 KB
/
_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import numpy as np
import operator
from .extern.six import string_types
from ._librootnumpy import _blockwise_inner_join
# Public API of this utility module.
__all__ = [
    'rec2array',
    'stack',
    'stretch',
    'dup_idx',
    'blockwise_inner_join',
]

# Vectorized len(): applied to an object-dtype column it returns the
# per-row lengths of variable-length array fields (used by stretch()).
VLEN = np.vectorize(len)
def rec2array(rec, fields=None):
    """Convert a record array into a ndarray with a homogeneous data type.

    Parameters
    ----------
    rec : NumPy record/structured array
        A NumPy structured array that will be cast into a homogenous data type.
    fields : list of strings, optional (default=None)
        The fields to include as columns in the output array. If None, then
        all columns will be included.

    Returns
    -------
    array : NumPy ndarray
        A new NumPy ndarray with homogeneous data types for all columns.
    """
    if fields is None:
        fields = rec.dtype.names
    # A single field needs no stacking: hand back that column directly.
    if len(fields) == 1:
        return rec[fields[0]]
    # Stacking row-wise then transposing yields one column per field,
    # copied and promoted to a common dtype.
    columns = [rec[name] for name in fields]
    return np.vstack(columns).T
def stack(recs, fields=None):
    """Stack common fields in multiple record arrays (concatenate them).

    Parameters
    ----------
    recs : list
        List of NumPy record arrays
    fields : list of strings, optional (default=None)
        The list of fields to include in the stacked array. If None, then
        include the fields in common to all the record arrays.

    Returns
    -------
    rec : NumPy record array
        The stacked array.
    """
    if fields is None:
        common = set.intersection(*(set(r.dtype.names) for r in recs))
        fields = list(common)
        # If every field of the first array survived the intersection,
        # keep that array's original field ordering.
        if common == set(recs[0].dtype.names):
            fields = list(recs[0].dtype.names)
    return np.hstack([r[fields] for r in recs])
def stretch(arr, fields=None):
    """Stretch an array.

    Stretch an array by ``hstack()``-ing multiple array fields while
    preserving column names and record array structure. If a scalar field is
    specified, it will be stretched along with array fields.

    Parameters
    ----------
    arr : NumPy structured or record array
        The array to be stretched.
    fields : list of strings, optional (default=None)
        A list of column names to stretch. If None, then stretch all fields.

    Returns
    -------
    ret : A NumPy structured array
        The stretched array.

    Raises
    ------
    ValueError
        If two array columns have mismatching per-row lengths.
    RuntimeError
        If no array column is present among the selected fields.

    Examples
    --------
    >>> import numpy as np
    >>> from root_numpy import stretch
    >>> arr = np.empty(2, dtype=[('scalar', np.int), ('array', 'O')])
    >>> arr[0] = (0, np.array([1, 2, 3], dtype=np.float))
    >>> arr[1] = (1, np.array([4, 5, 6], dtype=np.float))
    >>> stretch(arr, ['scalar', 'array'])
    array([(0, 1.0), (0, 2.0), (0, 3.0), (1, 4.0), (1, 5.0), (1, 6.0)],
        dtype=[('scalar', '<i8'), ('array', '<f8')])
    """
    dtype = []
    # Per-row lengths of the array columns; also serves as the flag that
    # at least one array column was seen.
    len_array = None
    if fields is None:
        fields = arr.dtype.names
    # First pass: construct the output dtype and check that all array
    # columns agree on per-row lengths.
    for field in fields:
        dt = arr.dtype[field]
        if dt == 'O' or len(dt.shape):
            if dt == 'O':
                # Variable-length array field: measure each row's length.
                lengths = VLEN(arr[field])
            else:
                # Fixed-length array field: every row has dt.shape[0] items.
                lengths = np.repeat(dt.shape[0], arr.shape[0])
            if len_array is None:
                len_array = lengths
            elif not np.array_equal(lengths, len_array):
                raise ValueError(
                    "inconsistent lengths of array columns in input")
            if dt == 'O':
                # Element dtype taken from the first row's array.
                # NOTE(review): assumes arr is non-empty — confirm callers.
                dtype.append((field, arr[field][0].dtype))
            else:
                # Keep any trailing (multidimensional) shape beyond the
                # first axis, which is the one being stretched.
                dtype.append((field, arr[field].dtype, dt.shape[1:]))
        else:
            # Scalar field: dtype carried over unchanged.
            dtype.append((field, dt))
    if len_array is None:
        raise RuntimeError("no array column in input")
    # Second pass: build the stretched output, one column at a time.
    ret = np.empty(np.sum(len_array), dtype=dtype)
    for field in fields:
        dt = arr.dtype[field]
        if dt == 'O' or len(dt.shape) == 1:
            # Variable-length or 1D fixed-length array field:
            # concatenate the per-row arrays end to end.
            ret[field] = np.hstack(arr[field])
        elif len(dt.shape):
            # Multidimensional fixed-length array field: stack along the
            # first axis, preserving the trailing dimensions.
            ret[field] = np.vstack(arr[field])
        else:
            # Scalar field: repeat each value to match its row's length.
            ret[field] = np.repeat(arr[field], len_array)
    return ret
def dup_idx(arr):
    """Return the indices of all duplicated array elements.

    Parameters
    ----------
    arr : array-like object
        An array-like object

    Returns
    -------
    idx : NumPy array
        An array containing the (sorted) indices of the duplicated elements

    Examples
    --------
    >>> from root_numpy import dup_idx
    >>> dup_idx([1, 2, 3, 4, 5])
    array([], dtype=int64)
    >>> dup_idx([1, 2, 3, 4, 5, 5])
    array([4, 5])
    >>> dup_idx([1, 2, 3, 4, 5, 5, 1])
    array([0, 4, 5, 6])
    """
    # counts[inverse] maps each element to the number of occurrences of
    # its value, so the test below is linear in len(arr). The previous
    # implementation broadcast an (n, n_duplicate_values) boolean matrix,
    # which was quadratic in time and memory for duplicate-heavy input.
    _, inverse, counts = np.unique(arr, return_inverse=True,
                                   return_counts=True)
    return np.nonzero(counts[inverse] > 1)[0]
def blockwise_inner_join(data, left, foreign_key, right,
                         force_repeat=None,
                         foreign_key_name=None):
    """Perform a blockwise inner join.

    Perform a blockwise inner join from names specified in ``left`` to
    ``right`` via ``foreign_key``: left->foreign_key->right.

    Parameters
    ----------
    data : array
        A structured NumPy array.
    left : array
        Array of left side column names.
    foreign_key : array or string
        NumPy array or string ``foreign_key`` column name. This column can be
        either an integer or an array of ints. If ``foreign_key`` is an array
        of int column, left column will be treated according to left column
        type:

        * Scalar columns or columns in ``force_repeat`` will be repeated
        * Array columns not in ``force_repeat`` will be assumed to the
          same length as ``foreign_key`` and will be stretched by index

    right : array
        Array of right side column names. These are array columns that each
        index ``foreign_key`` points to. These columns are assumed to have the
        same length.
    force_repeat : array, optional (default=None)
        Array of left column names that will be forced to stretch even if it's
        an array (useful when you want to emulate a multiple join).
    foreign_key_name : str, optional (default=None)
        The name of foreign key column in the output array.

    Examples
    --------
    >>> import numpy as np
    >>> from root_numpy import blockwise_inner_join
    >>> test_data = np.array([
    (1.0, np.array([11, 12, 13]), np.array([1, 0, 1]), 0, np.array([1, 2, 3])),
    (2.0, np.array([21, 22, 23]), np.array([-1, 2, -1]), 1, np.array([31, 32, 33]))],
    dtype=[('sl', np.float), ('al', 'O'), ('fk', 'O'), ('s_fk', np.int), ('ar', 'O')])

    >>> blockwise_inner_join(test_data, ['sl', 'al'], test_data['fk'], ['ar'])
    array([(1.0, 11, 2, 1), (1.0, 12, 1, 0), (1.0, 13, 2, 1), (2.0, 22, 33, 2)],
        dtype=[('sl', '<f8'), ('al', '<i8'), ('ar', '<i8'), ('fk', '<i8')])

    >>> blockwise_inner_join(test_data, ['sl', 'al'], test_data['fk'], ['ar'], force_repeat=['al'])
    array([(1.0, [11, 12, 13], 2, 1), (1.0, [11, 12, 13], 1, 0),
        (1.0, [11, 12, 13], 2, 1), (2.0, [21, 22, 23], 33, 2)],
        dtype=[('sl', '<f8'), ('al', '|O8'), ('ar', '<i8'), ('fk', '<i8')])
    """
    # A string names a column of ``data``; resolve it to the key array
    # before handing off to the compiled implementation.
    fk = (data[foreign_key]
          if isinstance(foreign_key, string_types)
          else foreign_key)
    return _blockwise_inner_join(
        data, left, fk, right, force_repeat, foreign_key_name)