/
index.py
229 lines (195 loc) · 7.78 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
from __future__ import annotations
import collections.abc as cabc
from collections.abc import Sequence
from functools import singledispatch
from itertools import repeat
import h5py
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix, issparse, spmatrix
from ..compat import AwkArray, DaskArray, Index, Index1D
def _normalize_indices(
    index: Index | None, names0: pd.Index, names1: pd.Index
) -> tuple[slice, slice]:
    """Resolve a 1- or 2-axis indexer into a pair of positional indexers.

    ``names0``/``names1`` are the label indexes for the two axes; the result
    is one positional indexer per axis (slice, int, or integer/boolean array).
    """
    # A 1-tuple wraps a single-axis indexer — unwrap so scalar handling applies.
    if isinstance(index, tuple) and len(index) == 1:
        index = index[0]
    # A bare pd.Series indexes by its values.
    if isinstance(index, pd.Series):
        index: Index = index.values
    if isinstance(index, tuple):
        if len(index) > 2:
            raise ValueError("AnnData can only be sliced in rows and columns.")
        # Series inside the tuple also index by their values.
        # TODO: The series should probably be aligned first
        if isinstance(index[1], pd.Series):
            index = index[0], index[1].values
        if isinstance(index[0], pd.Series):
            index = index[0].values, index[1]
    axis0, axis1 = unpack_index(index)
    return _normalize_index(axis0, names0), _normalize_index(axis1, names1)
def _normalize_index(
    indexer: slice
    | np.integer
    | int
    | str
    | Sequence[bool | int | np.integer]
    | np.ndarray
    | pd.Index,
    index: pd.Index,
) -> slice | int | np.ndarray:  # ndarray of int or bool
    """Convert a single-axis ``indexer`` into positional form against the
    label index ``index``: a slice, a single int, or an ndarray of int/bool.

    Raises ``IndexError`` for unsupported indexer types or mismatched boolean
    masks, and ``KeyError`` for names not present in ``index``.
    """
    # Numeric name indexes would make label- vs position-indexing ambiguous;
    # RangeIndex is exempt since it is the "no names" default.
    if not isinstance(index, pd.RangeIndex):
        assert (
            index.dtype != float and index.dtype != int
        ), "Don’t call _normalize_index with non-categorical/string names"

    # the following is insanely slow for sequences,
    # we replaced it using pandas below
    def name_idx(i):
        # Resolve one name to its integer position; pass non-strings through.
        if isinstance(i, str):
            i = index.get_loc(i)
        return i

    if isinstance(indexer, slice):
        start = name_idx(indexer.start)
        stop = name_idx(indexer.stop)
        # string slices can only be inclusive, so +1 in that case
        if isinstance(indexer.stop, str):
            stop = None if stop is None else stop + 1
        step = indexer.step
        return slice(start, stop, step)
    elif isinstance(indexer, (np.integer, int)):
        return indexer
    elif isinstance(indexer, str):
        return index.get_loc(indexer)  # int
    elif isinstance(indexer, (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)):
        # A (n, 1) or (1, n) column/row vector indexes like a flat vector.
        if hasattr(indexer, "shape") and (
            (indexer.shape == (index.shape[0], 1))
            or (indexer.shape == (1, index.shape[0]))
        ):
            if isinstance(indexer, spmatrix):
                indexer = indexer.toarray()
            indexer = np.ravel(indexer)
        if not isinstance(indexer, (np.ndarray, pd.Index)):
            indexer = np.array(indexer)
            # An empty sequence defaults to float dtype; force a valid
            # (integer) indexer dtype for the empty selection.
            if len(indexer) == 0:
                indexer = indexer.astype(int)
        if issubclass(indexer.dtype.type, (np.integer, np.floating)):
            return indexer  # Might not work for range indexes
        elif issubclass(indexer.dtype.type, np.bool_):
            if indexer.shape != index.shape:
                raise IndexError(
                    f"Boolean index does not match AnnData’s shape along this "
                    f"dimension. Boolean index has shape {indexer.shape} while "
                    f"AnnData index has shape {index.shape}."
                )
            return indexer
        else:  # indexer should be string array
            # get_indexer marks missing labels with -1; report them all at once.
            positions = index.get_indexer(indexer)
            if np.any(positions < 0):
                not_found = indexer[positions < 0]
                raise KeyError(
                    f"Values {list(not_found)}, from {list(indexer)}, "
                    "are not valid obs/ var names or indices."
                )
            return positions  # np.ndarray[int]
    else:
        raise IndexError(f"Unknown indexer {indexer!r} of type {type(indexer)}")
def _fix_slice_bounds(s: slice, length: int) -> slice:
"""The slice will be clipped to length, and the step won't be None.
E.g. infer None valued attributes.
"""
step = s.step if s.step is not None else 1
# slice constructor would have errored if step was 0
if step > 0:
start = s.start if s.start is not None else 0
stop = s.stop if s.stop is not None else length
elif step < 0:
# Reverse
start = s.start if s.start is not None else length
stop = s.stop if s.stop is not None else 0
return slice(start, stop, step)
def unpack_index(index: Index) -> tuple[Index1D, Index1D]:
    """Split an indexer into exactly two per-axis indexers.

    A non-tuple indexer addresses the first axis; missing axes get
    ``slice(None)``. Tuples longer than 2 are rejected.
    """
    if not isinstance(index, tuple):
        return index, slice(None)
    n_axes = len(index)
    if n_axes == 2:
        return index
    if n_axes == 1:
        return index[0], slice(None)
    raise IndexError("invalid number of indices")
@singledispatch
def _subset(a: np.ndarray | pd.DataFrame, subset_idx: Index):
# Select as combination of indexes, not coordinates
# Correcting for indexing behaviour of np.ndarray
if all(isinstance(x, cabc.Iterable) for x in subset_idx):
subset_idx = np.ix_(*subset_idx)
return a[subset_idx]
@_subset.register(DaskArray)
def _subset_dask(a: DaskArray, subset_idx: Index):
    """Outer-style subsetting for dask arrays, one fancy axis at a time."""
    outer = len(subset_idx) > 1 and all(
        isinstance(axis_idx, cabc.Iterable) for axis_idx in subset_idx
    )
    if not outer:
        return a[subset_idx]
    rows, cols = subset_idx[0], subset_idx[1]
    # CSC-backed chunks select columns cheaply, so take that axis first.
    if isinstance(a._meta, csc_matrix):
        return a[:, cols][rows, :]
    return a[rows, :][:, cols]
@_subset.register(spmatrix)
def _subset_spmatrix(a: spmatrix, subset_idx: Index):
    """Outer-style subsetting for scipy sparse matrices."""
    outer = len(subset_idx) > 1 and all(
        isinstance(axis_idx, cabc.Iterable) for axis_idx in subset_idx
    )
    if outer:
        rows = subset_idx[0]
        # spmatrix can't pair a boolean row mask with column indices —
        # turn the mask into integer positions first.
        if issubclass(rows.dtype.type, np.bool_):
            rows = np.where(rows)[0]
        # Column vector of row indices broadcasts against the column indexer.
        subset_idx = (rows.reshape(-1, 1), *subset_idx[1:])
    return a[subset_idx]
@_subset.register(pd.DataFrame)
def _subset_df(df: pd.DataFrame, subset_idx: Index):
    """Positional (row/column) subsetting of a DataFrame."""
    selection = df.iloc[subset_idx]
    return selection
@_subset.register(AwkArray)
def _subset_awkarray(a: AwkArray, subset_idx: Index):
    """Outer-style subsetting for awkward arrays (mirrors the ndarray case)."""
    outer = all(isinstance(axis_idx, cabc.Iterable) for axis_idx in subset_idx)
    return a[np.ix_(*subset_idx) if outer else subset_idx]
# Registration for SparseDataset occurs in sparse_dataset.py
@_subset.register(h5py.Dataset)
def _subset_dataset(d, subset_idx):
    """Subset an HDF5 dataset.

    h5py fancy indexing requires indices in increasing order, so each
    integer-array axis indexer is sorted before the read and the result is
    permuted back to the requested order afterwards.
    """
    if not isinstance(subset_idx, tuple):
        subset_idx = (subset_idx,)
    # Per-axis indexers, sorted where needed, for the h5py read.
    ordered = list(subset_idx)
    # Per-axis inverse permutations restoring the caller's requested order.
    rev_order = [slice(None) for _ in range(len(subset_idx))]
    for axis, axis_idx in enumerate(ordered.copy()):
        if isinstance(axis_idx, np.ndarray):
            if axis_idx.dtype == bool:
                # Positions of a boolean mask are already increasing.
                axis_idx = np.where(axis_idx)[0]
            order = np.argsort(axis_idx)
            ordered[axis] = axis_idx[order]
            # argsort of a permutation yields its inverse.
            rev_order[axis] = np.argsort(order)
    # from hdf5, then to real order
    return d[tuple(ordered)][tuple(rev_order)]
def make_slice(idx, dimidx, n=2):
    """Build an ``n``-tuple of full slices with ``idx`` placed at ``dimidx``."""
    full = [slice(None)] * n
    full[dimidx] = idx
    return tuple(full)
def get_vector(adata, k, coldim, idxdim, layer=None):
    """Fetch the 1D vector named ``k`` from ``adata``.

    ``k`` may name a column of ``.{coldim}`` (returned directly) or an entry
    of ``.{idxdim}_names`` (a row/column sliced out of X or ``layer``).
    Raises ``ValueError`` if ``k`` is ambiguous, ``KeyError`` if absent.
    """
    # adata could be self if Raw and AnnData shared a parent
    dims = ("obs", "var")
    columns = getattr(adata, coldim).columns
    names = getattr(adata, f"{idxdim}_names")

    in_col = k in columns
    in_idx = k in names

    if in_col and in_idx:
        raise ValueError(
            f"Key {k} could be found in both .{idxdim}_names and .{coldim}.columns"
        )
    if not (in_col or in_idx):
        raise KeyError(
            f"Could not find key {k} in .{idxdim}_names or .{coldim}.columns."
        )
    if in_col:
        return getattr(adata, coldim)[k].values

    # k names an entry along idxdim: slice the matching vector out of X.
    selected_dim = dims.index(idxdim)
    positional = adata._normalize_indices(make_slice(k, selected_dim))
    vec = adata._get_X(layer=layer)[positional]
    if issparse(vec):
        vec = vec.toarray()
    return np.ravel(vec)