[MRG+1] Use astype(.., copy=False) when possible #11973

Merged: 14 commits, Mar 1, 2019
@@ -376,6 +376,9 @@ Multiple modules
but this can be altered with the ``print_changed_only`` option in
:func:`sklearn.set_config`. :issue:`11705` by :user:`Nicolas Hug
<NicolasHug>`.
- |Efficiency| Memory copies are avoided when casting arrays to a different
  dtype in multiple estimators. :issue:`11973` by `Roman Yurchak`_.


Changes to estimator checks
---------------------------
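A note on the pattern this changelog entry refers to (plain NumPy behaviour, not specific to scikit-learn): `ndarray.astype(dtype, copy=False)` only skips the allocation when no conversion is actually required; otherwise a copy is made as usual.

```python
import numpy as np

x = np.arange(5, dtype=np.float64)

# dtype (and layout) already match: NumPy hands back the input array itself.
same = x.astype(np.float64, copy=False)
assert same is x

# A real conversion is needed: a new array is allocated despite copy=False.
converted = x.astype(np.float32, copy=False)
assert converted is not x
```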
@@ -22,6 +22,7 @@
from . import _hierarchical
from ._feature_agglomeration import AgglomerationTransform
from ..utils.fast_dict import IntFloatDict
from ..utils.fixes import _astype_copy_false

###############################################################################
# For non fully-connected graphs
@@ -87,7 +88,8 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters,
from scipy.sparse.csgraph import minimum_spanning_tree

# explicitly cast connectivity to ensure safety
connectivity = connectivity.astype('float64')
connectivity = connectivity.astype('float64',
**_astype_copy_false(connectivity))

# Ensure zero distances aren't ignored by setting them to "epsilon"
epsilon_value = np.finfo(dtype=connectivity.data.dtype).eps
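The `_astype_copy_false` helper imported above lives in `sklearn.utils.fixes`. Its body is not part of this diff; the sketch below shows the idea, with the scipy 1.1 threshold being an assumption (older scipy sparse matrices did not accept a `copy` keyword in `astype`):

```python
import scipy
import scipy.sparse as sp

# Real code lives in sklearn.utils.fixes; version parsing simplified here.
sp_version = tuple(int(p) for p in scipy.__version__.split('.')[:2])


def _astype_copy_false(X):
    """Return {'copy': False} when X.astype supports it, else {} (sketch)."""
    # Dense ndarrays always accept copy=False; sparse matrices only do so
    # in sufficiently recent scipy releases (assumed >= 1.1 here).
    if not sp.issparse(X) or sp_version >= (1, 1):
        return {'copy': False}
    return {}
```

With such a helper the same call works across scipy versions, e.g. `X.astype(np.float64, **_astype_copy_false(X))`.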
@@ -109,7 +111,7 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters,

# Convert edge list into standard hierarchical clustering format
single_linkage_tree = _hierarchical._single_linkage_label(mst_array)
children_ = single_linkage_tree[:, :2].astype(np.int)
children_ = single_linkage_tree[:, :2].astype(np.int, copy=False)

@GaelVaroquaux (Member) commented on Feb 28, 2019:

Actually, here I think that it's beneficial to have a copy. It's a somewhat subtle reason:

We are taking a view of the data that is smaller than the original data, via the slicing. This view references the original data. If no copy is made, the original data cannot be garbage collected, and memory is wasted.

@agramfort (Member) commented on Feb 28, 2019:

to fix or not?

@rth (Author, Member) commented on Feb 28, 2019:

Will fix tomorrow...

@rth (Author, Member) commented on Mar 1, 2019:

Agreed, removed the copy=False here.
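To make the view argument above concrete, here is a minimal sketch (plain NumPy, not the hierarchical-clustering code itself): when the dtype already matches, `astype(..., copy=False)` on a slice returns a view, and that view keeps the entire parent buffer alive.

```python
import numpy as np

big = np.zeros((100000, 10), dtype=np.intp)

# copy=False returns the slice itself because the dtype already matches;
# the view references `big`, so the full 10-column buffer cannot be
# garbage collected as long as the view exists.
first_two = big[:, :2].astype(np.intp, copy=False)
assert first_two.base is big

# An explicit copy detaches the result, letting the parent array be freed.
first_two_copy = big[:, :2].astype(np.intp, copy=True)
assert first_two_copy.base is None
```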


# Compute parents
parent = np.arange(n_nodes, dtype=np.intp)
@@ -229,7 +231,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False):
stacklevel=2)
X = np.require(X, requirements="W")
out = hierarchy.ward(X)
children_ = out[:, :2].astype(np.intp)
children_ = out[:, :2].astype(np.intp, copy=False)

@GaelVaroquaux (Member) commented on Feb 28, 2019:

Whether the argument about the view not being released applies here depends on whether return_distance is requested. I am not sure whether the copy should be kept.

This comment does not ask for a change in the diff. It's just a reflection as I pass by.

@rth (Author, Member) commented on Mar 1, 2019:

Removed copy=False here to use the defaults when in doubt.
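A hedged illustration of the trade-off discussed above (plain NumPy, not the `ward_tree` code itself): when distances are returned, a view on the distance column keeps the base array alive anyway, so skipping the copy of the child columns costs nothing extra; when they are not returned, an explicit copy lets the rest of the buffer be freed.

```python
import numpy as np

out = np.random.rand(10000, 4)  # stand-in for the scipy linkage output

# return_distance=True: this view alone keeps the whole `out` buffer alive,
# so whether children_ is a copy or a view changes little.
distances = out[:, 2]
assert distances.base is out

# return_distance=False: copying the two child columns detaches them from
# `out`, which can then be garbage collected.
children_ = out[:, :2].copy()
assert children_.base is None
```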


if return_distance:
distances = out[:, 2]
@@ -458,7 +460,7 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete',
i, j = np.triu_indices(X.shape[0], k=1)
X = X[i, j]
out = hierarchy.linkage(X, method=linkage, metric=affinity)
children_ = out[:, :2].astype(np.int)
children_ = out[:, :2].astype(np.int, copy=False)

if return_distance:
distances = out[:, 2]
@@ -477,7 +479,8 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete',
del diag_mask

if affinity == 'precomputed':
distances = X[connectivity.row, connectivity.col].astype('float64')
distances = X[connectivity.row, connectivity.col].astype(
'float64', **_astype_copy_false(X))
else:
# FIXME We compute all the distances, while we could have only computed
# the "interesting" distances
@@ -178,7 +178,7 @@ def _check_sample_weight(X, sample_weight):
% (n_samples, len(sample_weight)))
# normalize the weights to sum up to n_samples
scale = n_samples / sample_weight.sum()
return (sample_weight * scale).astype(X.dtype)
return (sample_weight * scale).astype(X.dtype, copy=False)


def k_means(X, n_clusters, sample_weight=None, init='k-means++',
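A side note on the `_check_sample_weight` change (standard NumPy behaviour, not specific to this PR): `sample_weight * scale` already allocates a fresh temporary that no caller holds a reference to, so casting it with `copy=False` is safe and avoids a second allocation whenever the dtypes already match.

```python
import numpy as np

sample_weight = np.ones(5)
scale = 2.0

tmp = sample_weight * scale                 # fresh float64 temporary
out = tmp.astype(np.float64, copy=False)    # dtype matches: reuse it
assert out is tmp

out32 = tmp.astype(np.float32, copy=False)  # dtype differs: a copy is made
assert out32 is not tmp
```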
@@ -618,7 +618,7 @@ def _labels_inertia_precompute_dense(X, sample_weight, x_squared_norms,
labels, mindist = pairwise_distances_argmin_min(
X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True})
# cython k-means code assumes int32 inputs
labels = labels.astype(np.int32)
labels = labels.astype(np.int32, copy=False)
if n_samples == distances.shape[0]:
# distances will be changed in-place
distances[:] = mindist
@@ -1194,9 +1194,10 @@ def _mini_batch_step(X, sample_weight, x_squared_norms, centers, weight_sums,
% n_reassigns)

if sp.issparse(X) and not sp.issparse(centers):
assign_rows_csr(X, new_centers.astype(np.intp),
np.where(to_reassign)[0].astype(np.intp),
centers)
assign_rows_csr(
X, new_centers.astype(np.intp, copy=False),
np.where(to_reassign)[0].astype(np.intp, copy=False),
centers)
else:
centers[to_reassign] = X[new_centers]
# reset counts of reassigned centers, but don't reset them too small
@@ -283,7 +283,7 @@ def test_scikit_vs_scipy():

out = hierarchy.linkage(X, method=linkage)

children_ = out[:, :2].astype(np.int)
children_ = out[:, :2].astype(np.int, copy=False)
children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

# Sort the order of child nodes per row for consistency
@@ -482,14 +482,14 @@ def test_connectivity_fixing_non_lil():

def test_int_float_dict():
rng = np.random.RandomState(0)
keys = np.unique(rng.randint(100, size=10).astype(np.intp))
keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
values = rng.rand(len(keys))

d = IntFloatDict(keys, values)
for key, value in zip(keys, values):
assert d[key] == value

other_keys = np.arange(50).astype(np.intp)[::2]
other_keys = np.arange(50, dtype=np.intp)[::2]
other_values = np.full(50, 0.5)[::2]
other = IntFloatDict(other_keys, other_values)
# Complete smoke test
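The second change in this test shows a related clean-up: rather than casting after creation, request the desired dtype up front, which sidesteps the copy question entirely (a minimal illustration in plain NumPy).

```python
import numpy as np

# One allocation, already np.intp:
keys = np.arange(50, dtype=np.intp)[::2]

# Equivalent result, but np.arange(50) allocates a default-int array first
# and astype() may then allocate a second one:
keys_old = np.arange(50).astype(np.intp)[::2]

assert keys.dtype == keys_old.dtype == np.intp
np.testing.assert_array_equal(keys, keys_old)
```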
@@ -545,7 +545,7 @@ def load_digits(n_class=10, return_X_y=False):
delimiter=',')
with open(join(module_path, 'descr', 'digits.rst')) as f:
descr = f.read()
target = data[:, -1].astype(np.int)
target = data[:, -1].astype(np.int, copy=False)
flat_data = data[:, :-1]
images = flat_data.view()
images.shape = (-1, 8, 8)
@@ -115,7 +115,7 @@ def fetch_covtype(data_home=None, download_if_missing=True,
remove(archive_path)

X = Xy[:, :-1]
y = Xy[:, -1].astype(np.int32)
y = Xy[:, -1].astype(np.int32, copy=False)

_joblib.dump(X, samples_path, compress=9)
_joblib.dump(y, targets_path, compress=9)
@@ -139,9 +139,9 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,
data = np.c_[data[s, :11], data[s, 12:]]
target = target[s]

data[:, 0] = np.log((data[:, 0] + 0.1).astype(float))
data[:, 4] = np.log((data[:, 4] + 0.1).astype(float))
data[:, 5] = np.log((data[:, 5] + 0.1).astype(float))
data[:, 0] = np.log((data[:, 0] + 0.1).astype(float, copy=False))
data[:, 4] = np.log((data[:, 4] + 0.1).astype(float, copy=False))
data[:, 5] = np.log((data[:, 5] + 0.1).astype(float, copy=False))

if subset == 'http':
s = data[:, 2] == b'http'
@@ -605,7 +605,7 @@ def fetch_openml(name=None, version='active', data_id=None, data_home=None,
elif all(is_classification):
y = np.hstack([np.take(np.asarray(nominal_attributes.pop(col_name),
dtype='O'),
y[:, i:i+1].astype(int))
y[:, i:i+1].astype(int, copy=False))
for i, col_name in enumerate(target_column)])
elif any(is_classification):
raise ValueError('Mix of nominal and non-nominal targets is not '
@@ -179,7 +179,7 @@ def fetch_rcv1(data_home=None, subset='all', download_if_missing=True,
# Training data is before testing data
X = sp.vstack([Xy[8], Xy[0], Xy[2], Xy[4], Xy[6]]).tocsr()
sample_id = np.hstack((Xy[9], Xy[1], Xy[3], Xy[5], Xy[7]))
sample_id = sample_id.astype(np.uint32)
sample_id = sample_id.astype(np.uint32, copy=False)

_joblib.dump(X, samples_path, compress=9)
_joblib.dump(sample_id, sample_id_path, compress=9)
@@ -190,7 +190,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,

# Build the polytope whose vertices become cluster centroids
centroids = _generate_hypercube(n_clusters, n_informative,
generator).astype(float)
generator).astype(float, copy=False)
centroids *= 2 * class_sep
centroids -= class_sep
if not hypercube:
@@ -446,7 +446,7 @@ def make_hastie_10_2(n_samples=12000, random_state=None):

shape = (n_samples, 10)
X = rs.normal(size=shape).reshape(shape)
y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.float64)
y = ((X ** 2.0).sum(axis=1) > 9.34).astype(np.float64, copy=False)
y[y == 0.0] = -1.0

return X, y
@@ -273,13 +273,13 @@ def test_dump():
assert_array_almost_equal(
X_input_dense, X2_dense, 4)
assert_array_almost_equal(
y_dense.astype(dtype), y2, 4)
y_dense.astype(dtype, copy=False), y2, 4)
else:
# allow a rounding error at the last decimal place
assert_array_almost_equal(
X_input_dense, X2_dense, 15)
assert_array_almost_equal(
y_dense.astype(dtype), y2, 15)
y_dense.astype(dtype, copy=False), y2, 15)


def test_dump_multilabel():
@@ -719,7 +719,8 @@ def test_pca_deterministic_output():

def check_pca_float_dtype_preservation(svd_solver):
# Ensure that PCA does not upscale the dtype when input is float32
X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64)
X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64,
copy=False)
X_32 = X_64.astype(np.float32)

pca_64 = PCA(n_components=3, svd_solver=svd_solver,
@@ -740,8 +741,8 @@ def check_pca_float_dtype_preservation(svd_solver):
def check_pca_int_dtype_upcast_to_double(svd_solver):
# Ensure that all int types will be upcast to float64
X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
X_i64 = X_i64.astype(np.int64)
X_i32 = X_i64.astype(np.int32)
X_i64 = X_i64.astype(np.int64, copy=False)
X_i32 = X_i64.astype(np.int32, copy=False)

pca_64 = PCA(n_components=3, svd_solver=svd_solver,
random_state=0).fit(X_i64)
@@ -810,7 +810,7 @@ def test_set_oob_score_label_encoding():


def replace(X):
X = X.copy().astype('float')
X = X.astype('float', copy=True)
X[~np.isfinite(X)] = 0
return X
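This helper change is about avoiding a double allocation rather than avoiding one altogether (a minimal sketch, assuming X may need a dtype conversion): `X.copy().astype('float')` can allocate twice, while `X.astype('float', copy=True)` allocates exactly once and still guarantees the caller's array is never modified in place.

```python
import numpy as np

X_int = np.array([[1, 2], [3, 4]], dtype=np.int64)

# Two allocations: copy() duplicates the int64 data, then astype() builds
# a float64 array and the intermediate copy is discarded.
a = X_int.copy().astype('float')

# One allocation: the float64 result is built directly from X_int, and
# copy=True still guarantees a new array even if X_int were float already.
b = X_int.astype('float', copy=True)

np.testing.assert_array_equal(a, b)
assert b is not X_int
```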

@@ -95,8 +95,8 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
'Please upgrade to scipy >=0.14')
.format(indptr[-1], '.'.join(sp_version)))
# both indices and indptr have the same dtype in CSR arrays
indices_a = indices_a.astype(np.int64)
indices_a = indices_a.astype(np.int64, copy=False)
else:
indptr_a = indptr_a.astype(np.int32)
indptr_a = indptr_a.astype(np.int32, copy=False)

return (indices_a, indptr_a, values[:size])
@@ -30,6 +30,7 @@
from .stop_words import ENGLISH_STOP_WORDS
from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
from ..utils import _IS_32BIT
from ..utils.fixes import _astype_copy_false


__all__ = ['HashingVectorizer',
@@ -1239,7 +1240,8 @@ def fit(self, X, y=None):

if self.use_idf:
n_samples, n_features = X.shape
df = _document_frequency(X).astype(dtype)
df = _document_frequency(X)
df = df.astype(dtype, **_astype_copy_false(df))

# perform idf smoothing if required
df += int(self.smooth_idf)
@@ -9,6 +9,7 @@
from ..neighbors import NearestNeighbors
from ..preprocessing import scale
from ..utils import check_random_state
from ..utils.fixes import _astype_copy_false
from ..utils.validation import check_X_y
from ..utils.multiclass import check_classification_targets

@@ -274,7 +275,7 @@ def _estimate_mi(X, y, discrete_features='auto', discrete_target=False,
with_mean=False, copy=False)

# Add small noise to continuous features as advised in Kraskov et. al.
X = X.astype(float)
X = X.astype(float, **_astype_copy_false(X))
means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0))
X[:, continuous_mask] += 1e-10 * means * rng.randn(
n_samples, np.sum(continuous_mask))
@@ -140,7 +140,8 @@ def test_binary_search_neighbors():

# Test that when we use all the neighbors the results are identical
k = n_samples
neighbors_nn = np.argsort(distances, axis=1)[:, 1:k].astype(np.int64)
neighbors_nn = np.argsort(distances, axis=1)[:, 1:k].astype(np.int64,
copy=False)
distances_nn = np.array([distances[k, neighbors_nn[k]]
for k in range(n_samples)])
P2 = _binary_search_perplexity(distances_nn, neighbors_nn,
@@ -152,7 +153,8 @@ def test_binary_search_neighbors():
for k in np.linspace(80, n_samples, 5):
k = int(k)
topn = k * 10 # check the top 10 *k entries out of k * k entries
neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)
neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64,
copy=False)
distances_nn = np.array([distances[k, neighbors_nn[k]]
for k in range(n_samples)])
P2k = _binary_search_perplexity(distances_nn, neighbors_nn,
@@ -176,7 +178,8 @@ def test_binary_perplexity_stability():
distances = np.abs(distances.dot(distances.T))
np.fill_diagonal(distances, 0.0)
last_P = None
neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64)
neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64,
copy=False)
for _ in range(100):
P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(),
3, verbose=0)
@@ -532,7 +535,7 @@ def _run_answer_test(pos_input, pos_output, neighbors, grad_output,
distances = pairwise_distances(pos_input).astype(np.float32)
args = distances, perplexity, verbose
pos_output = pos_output.astype(np.float32)
neighbors = neighbors.astype(np.int64)
neighbors = neighbors.astype(np.int64, copy=False)
pij_input = _joint_probabilities(*args)
pij_input = squareform(pij_input).astype(np.float32)
grad_bh = np.zeros(pos_output.shape, dtype=np.float32)
@@ -604,7 +607,7 @@ def test_64bit(method, dt):
# Ensure 64bit arrays are handled correctly.
random_state = check_random_state(0)

X = random_state.randn(50, 2).astype(dt)
X = random_state.randn(50, 2).astype(dt, copy=False)
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
random_state=0, method=method, verbose=0)
X_embedded = tsne.fit_transform(X)
@@ -28,8 +28,8 @@ def expected_mutual_information(contingency, int n_samples):
#cdef np.ndarray[int, ndim=2] start, end
R, C = contingency.shape
N = <DOUBLE>n_samples
a = np.ravel(contingency.sum(axis=1).astype(np.int32))
b = np.ravel(contingency.sum(axis=0).astype(np.int32))
a = np.ravel(contingency.sum(axis=1).astype(np.int32, copy=False))
b = np.ravel(contingency.sum(axis=0).astype(np.int32, copy=False))
# There are three major terms to the EMI equation, which are multiplied to
# and then summed over varying nij values.
# While nijs[0] will never be used, having it simplifies the indexing.
@@ -23,7 +23,7 @@

from .expected_mutual_info_fast import expected_mutual_information
from ...utils.validation import check_array
from ...utils.fixes import comb
from ...utils.fixes import comb, _astype_copy_false


def _comb2(n):
@@ -631,7 +631,8 @@ def mutual_info_score(labels_true, labels_pred, contingency=None):
log_contingency_nm = np.log(nz_val)
contingency_nm = nz_val / contingency_sum
# Don't need to calculate the full outer product, just for non-zeroes
outer = pi.take(nzx).astype(np.int64) * pj.take(nzy).astype(np.int64)
outer = (pi.take(nzx).astype(np.int64, copy=False)
* pj.take(nzy).astype(np.int64, copy=False))
log_outer = -np.log(outer) + log(pi.sum()) + log(pj.sum())
mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
contingency_nm * log_outer)
@@ -740,7 +741,8 @@ def adjusted_mutual_info_score(labels_true, labels_pred,
classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
contingency = contingency.astype(np.float64)
contingency = contingency.astype(np.float64,
**_astype_copy_false(contingency))
# Calculate the MI for the two clusterings
mi = mutual_info_score(labels_true, labels_pred,
contingency=contingency)
@@ -851,7 +853,8 @@ def normalized_mutual_info_score(labels_true, labels_pred,
classes.shape[0] == clusters.shape[0] == 0):
return 1.0
contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
contingency = contingency.astype(np.float64)
contingency = contingency.astype(np.float64,
**_astype_copy_false(contingency))
# Calculate the MI for the two clusterings
mi = mutual_info_score(labels_true, labels_pred,
contingency=contingency)
@@ -934,7 +937,8 @@ def fowlkes_mallows_score(labels_true, labels_pred, sparse=False):
n_samples, = labels_true.shape

c = contingency_matrix(labels_true, labels_pred,
sparse=True).astype(np.int64)
sparse=True)
c = c.astype(np.int64, **_astype_copy_false(c))
tk = np.dot(c.data, c.data) - n_samples
pk = np.sum(np.asarray(c.sum(axis=0)).ravel() ** 2) - n_samples
qk = np.sum(np.asarray(c.sum(axis=1)).ravel() ** 2) - n_samples
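One plausible reason for casting the contingency counts to int64 here (an assumption, not stated in the diff) is that the squared-count sums below can overflow 32-bit integers; a small illustration:

```python
import numpy as np

counts = np.array([60000, 60000], dtype=np.int32)

# 60000**2 * 2 = 7.2e9 does not fit in int32, so the int32 dot product
# silently wraps around ...
bad = np.dot(counts, counts)

# ... while casting to int64 first gives the exact value.
good = np.dot(counts.astype(np.int64), counts.astype(np.int64))
assert good == 7200000000
assert bad != good
```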