-
Notifications
You must be signed in to change notification settings - Fork 580
/
_normalization.py
240 lines (208 loc) · 7.99 KB
/
_normalization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
from typing import Optional, Union, Iterable, Dict, Literal
from warnings import warn
import numpy as np
from anndata import AnnData
from scipy.sparse import issparse
from sklearn.utils import sparsefuncs
try:
from dask.array import Array as DaskArray
except ImportError:
class DaskArray:
pass
from scanpy import logging as logg
from scanpy._utils import view_to_actual
from scanpy.get import _get_obs_rep, _set_obs_rep
def _normalize_data(X, counts, after=None, copy=False):
X = X.copy() if copy else X
if issubclass(X.dtype.type, (int, np.integer)):
X = X.astype(np.float32) # TODO: Check if float64 should be used
if isinstance(counts, DaskArray):
counts_greater_than_zero = counts[counts > 0].compute_chunk_sizes()
else:
counts_greater_than_zero = counts[counts > 0]
after = np.median(counts_greater_than_zero, axis=0) if after is None else after
counts += counts == 0
counts = counts / after
if issparse(X):
sparsefuncs.inplace_row_scale(X, 1 / counts)
elif isinstance(counts, np.ndarray):
np.divide(X, counts[:, None], out=X)
else:
X = np.divide(X, counts[:, None]) # dask does not support kwarg "out"
return X
def normalize_total(
    adata: AnnData,
    target_sum: Optional[float] = None,
    exclude_highly_expressed: bool = False,
    max_fraction: float = 0.05,
    key_added: Optional[str] = None,
    layer: Optional[str] = None,
    layers: Union[Literal['all'], Iterable[str]] = None,
    layer_norm: Optional[str] = None,
    inplace: bool = True,
    copy: bool = False,
) -> Optional[Dict[str, np.ndarray]]:
    """\
    Normalize counts per cell.

    Normalize each cell by total counts over all genes,
    so that every cell has the same total count after normalization.
    If choosing `target_sum=1e6`, this is CPM normalization.

    If `exclude_highly_expressed=True`, very highly expressed genes are excluded
    from the computation of the normalization factor (size factor) for each
    cell. This is meaningful as these can strongly influence the resulting
    normalized values for all other genes [Weinreb17]_.

    Similar functions are used, for example, by Seurat [Satija15]_, Cell Ranger
    [Zheng17]_ or SPRING [Weinreb17]_.

    Params
    ------
    adata
        The annotated data matrix of shape `n_obs` × `n_vars`.
        Rows correspond to cells and columns to genes.
    target_sum
        If `None`, after normalization, each observation (cell) has a total
        count equal to the median of total counts for observations (cells)
        before normalization.
    exclude_highly_expressed
        Exclude (very) highly expressed genes for the computation of the
        normalization factor (size factor) for each cell. A gene is considered
        highly expressed, if it has more than `max_fraction` of the total counts
        in at least one cell. The not-excluded genes will sum up to
        `target_sum`.
    max_fraction
        If `exclude_highly_expressed=True`, consider a gene as highly expressed
        if it has more than `max_fraction` of the total counts
        in at least one cell.
    key_added
        Name of the field in `adata.obs` where the normalization factor is
        stored.
    layer
        Layer to normalize instead of `X`. If `None`, `X` is normalized.
    layers
        Deprecated — use `layer` instead. `'all'` normalizes every layer;
        an iterable of names normalizes those layers.
    layer_norm
        Deprecated — specify the target size factor via `target_sum` instead.
        `'after'` uses `target_sum`, `'X'` uses the median of the size factors
        of the main matrix, `None` lets each layer use its own median.
    inplace
        Whether to update `adata` or return dictionary with normalized copies of
        `adata.X` and `adata.layers`.
    copy
        Whether to modify copied input object. Not compatible with inplace=False.

    Returns
    -------
    Returns dictionary with normalized copies of `adata.X` and `adata.layers`
    or updates `adata` with normalized version of the original
    `adata.X` and `adata.layers`, depending on `inplace`.

    Example
    -------
    >>> from anndata import AnnData
    >>> import numpy as np
    >>> import scanpy as sc
    >>> sc.settings.verbosity = 2
    >>> np.set_printoptions(precision=2)
    >>> adata = AnnData(np.array([
    ...     [3, 3, 3, 6, 6],
    ...     [1, 1, 1, 2, 2],
    ...     [1, 22, 1, 2, 2],
    ... ]))
    >>> adata.X
    array([[ 3., 3., 3., 6., 6.],
    [ 1., 1., 1., 2., 2.],
    [ 1., 22., 1., 2., 2.]], dtype=float32)
    >>> X_norm = sc.pp.normalize_total(adata, target_sum=1, inplace=False)['X']
    >>> X_norm
    array([[0.14, 0.14, 0.14, 0.29, 0.29],
    [0.14, 0.14, 0.14, 0.29, 0.29],
    [0.04, 0.79, 0.04, 0.07, 0.07]], dtype=float32)
    >>> X_norm = sc.pp.normalize_total(
    ...     adata, target_sum=1, exclude_highly_expressed=True,
    ...     max_fraction=0.2, inplace=False
    ... )['X']
    The following highly-expressed genes are not considered during normalization factor computation:
    ['1', '3', '4']
    >>> X_norm
    array([[ 0.5, 0.5, 0.5, 1. , 1. ],
    [ 0.5, 0.5, 0.5, 1. , 1. ],
    [ 0.5, 11. , 0.5, 1. , 1. ]], dtype=float32)
    """
    # `copy=True` returns a modified copy, which only makes sense together
    # with in-place modification of that copy.
    if copy:
        if not inplace:
            raise ValueError("`copy=True` cannot be used with `inplace=False`.")
        adata = adata.copy()
    if max_fraction < 0 or max_fraction > 1:
        raise ValueError('Choose max_fraction between 0 and 1.')
    # Deprecated features
    if layers is not None:
        warn(
            FutureWarning(
                "The `layers` argument is deprecated. Instead, specify individual "
                "layers to normalize with `layer`."
            )
        )
    if layer_norm is not None:
        warn(
            FutureWarning(
                "The `layer_norm` argument is deprecated. Specify the target size "
                "factor directly with `target_sum`."
            )
        )
    # Expand the deprecated 'all' shorthand; any other bare string is an error.
    if layers == 'all':
        layers = adata.layers.keys()
    elif isinstance(layers, str):
        raise ValueError(
            f"`layers` needs to be a list of strings or 'all', not {layers!r}"
        )
    # Materialize `adata` if it is a view, so the in-place writes below are
    # safe (presumably detaches from the parent AnnData — see scanpy._utils).
    view_to_actual(adata)
    # Matrix to normalize: `adata.X` or the requested layer.
    X = _get_obs_rep(adata, layer=layer)
    gene_subset = None
    msg = 'normalizing counts per cell'
    if exclude_highly_expressed:
        counts_per_cell = X.sum(1)  # original counts per cell
        counts_per_cell = np.ravel(counts_per_cell)
        # at least one cell has more than max_fraction of counts per cell:
        # such genes are flagged; gene_subset keeps the genes that are NOT.
        gene_subset = (X > counts_per_cell[:, None] * max_fraction).sum(0)
        gene_subset = np.asarray(np.ravel(gene_subset) == 0)
        msg += (
            ' The following highly-expressed genes are not considered during '
            f'normalization factor computation:\n{adata.var_names[~gene_subset].tolist()}'
        )
        # Recompute the size factors using only the non-excluded genes.
        counts_per_cell = X[:, gene_subset].sum(1)
    else:
        counts_per_cell = X.sum(1)
    start = logg.info(msg)
    counts_per_cell = np.ravel(counts_per_cell)
    # Cells with zero counts cannot be normalized meaningfully; warn only.
    cell_subset = counts_per_cell > 0
    if not np.all(cell_subset):
        warn(UserWarning('Some cells have zero counts'))
    if inplace:
        if key_added is not None:
            # Store the per-cell size factors *before* normalization.
            adata.obs[key_added] = counts_per_cell
        _set_obs_rep(
            adata, _normalize_data(X, counts_per_cell, target_sum), layer=layer
        )
    else:
        # not recarray because need to support sparse
        dat = dict(
            X=_normalize_data(X, counts_per_cell, target_sum, copy=True),
            norm_factor=counts_per_cell,
        )
    # Deprecated features: translate `layer_norm` into a target sum (`after`)
    # shared by all extra layers normalized below.
    if layer_norm == 'after':
        after = target_sum
    elif layer_norm == 'X':
        # Median of the main matrix's size factors over non-empty cells.
        after = np.median(counts_per_cell[cell_subset])
    elif layer_norm is None:
        after = None
    else:
        raise ValueError('layer_norm should be "after", "X" or None')
    # Deprecated `layers` handling: recurse once per extra layer.
    for layer_to_norm in layers if layers is not None else ():
        res = normalize_total(
            adata, layer=layer_to_norm, target_sum=after, inplace=inplace
        )
        if not inplace:
            dat[layer_to_norm] = res["X"]
    logg.info(
        ' finished ({time_passed})',
        time=start,
    )
    if key_added is not None:
        logg.debug(
            f'and added {key_added!r}, counts per cell before normalization (adata.obs)'
        )
    if copy:
        return adata
    elif not inplace:
        return dat