Large diffs are not rendered by default.

@@ -592,13 +592,15 @@ def _find_best_split(np.ndarray[DTYPE_t, ndim=2, mode="fortran"] X,
cdef int n_total_samples = X.shape[0]
cdef int n_features = X.shape[1]
cdef int i, a, b, best_i = -1
cdef Py_ssize_t feature_idx = -1
cdef int n_left = 0
cdef DTYPE_t t, initial_error, error
cdef DTYPE_t best_error = np.inf, best_t = np.inf
cdef DTYPE_t *y_ptr = <DTYPE_t *>y.data
cdef DTYPE_t *X_i = NULL
cdef int *X_argsorted_i = NULL
cdef BOOL_t *sample_mask_ptr = <BOOL_t *>sample_mask.data
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] features = None

# Compute the column strides (increment in pointer elements to get
# from column i to i + 1) for `X` and `X_argsorted`
@@ -622,11 +624,13 @@ def _find_best_split(np.ndarray[DTYPE_t, ndim=2, mode="fortran"] X,
# Features to consider
if max_features < 0 or max_features == n_features:
features = np.arange(n_features)
max_features = n_features
else:
features = random_state.permutation(n_features)[:max_features]

# Look for the best split
for i in features:
for feature_idx in range(max_features):
i = features[feature_idx]
# Get i-th col of X and X_sorted
X_i = (<DTYPE_t *>X.data) + X_stride * i
X_argsorted_i = (<int *>X_argsorted.data) + X_argsorted_stride * i
@@ -732,13 +736,15 @@ def _find_best_random_split(np.ndarray[DTYPE_t, ndim=2, mode="fortran"] X,
# Variables
cdef int n_total_samples = X.shape[0]
cdef int n_features = X.shape[1]
cdef int i, a, b, best_i = -1
cdef int i, a, b, c, n_left, best_i = -1
cdef Py_ssize_t feature_idx = -1
cdef DTYPE_t t, initial_error, error
cdef DTYPE_t best_error = np.inf, best_t = np.inf
cdef DTYPE_t *y_ptr = <DTYPE_t *>y.data
cdef DTYPE_t *X_i = NULL
cdef int *X_argsorted_i = NULL
cdef BOOL_t *sample_mask_ptr = <BOOL_t *>sample_mask.data
cdef np.ndarray[np.int64_t, ndim=1, mode='c'] features = None

This comment has been minimized.

Copy link
@glouppe

glouppe Jun 26, 2012

Member

I had the same issue locally. It can be fixed by replacing np.int64_t with long.

This comment has been minimized.

Copy link
@ogrisel

ogrisel Jun 26, 2012

Member

I think @larsmans or @jakevdp know which type should be used for indexing arrays in scipy sparse data-structures, assuming this is the cause of the problem.

This comment has been minimized.

Copy link
@pprett

pprett Jun 26, 2012

Author Member

@glouppe which numpy version do you have? Does np.arange return dtype long instead of np.int64?

This comment has been minimized.

Copy link
@glouppe

glouppe Jun 26, 2012

Member
In [9]: np.version.version
Out[9]: '1.5.1'

In [10]: np.arange(42).dtype
Out[10]: dtype('int32')

This comment has been minimized.

Copy link
@pprett

pprett via email Jun 26, 2012

Author Member

This comment has been minimized.

Copy link
@amueller

amueller Jun 26, 2012

Member

This causes a problem:

In [1]: np.version.version
Out[1]: '1.6.1'

In [2]: np.arange(42).dtype
Out[2]: dtype('int32')

This doesn't:

In [1]: np.version.version
Out[1]: '1.6.1'

In [2]: np.arange(42).dtype
Out[2]: dtype('int64')

The first is on a 32-bit arch, the second on a 64-bit arch. Shouldn't this be platform-independent? I don't like this :-/

Is there any way to test this using jenkins?

This comment has been minimized.

Copy link
@amueller

amueller Jun 26, 2012

Member

The result of

np.arange(np.array(42).astype(np.int32)).dtype

is platform-dependent, whereas the result of

np.arange(np.array(42).astype(np.int64)).dtype

is not.

This comment has been minimized.

Copy link
@ogrisel

ogrisel Jun 26, 2012

Member

The first is on a 32-bit arch, the second on a 64-bit arch. Shouldn't this be platform-independent? I don't like this :-/

I think this is good opportunity to experiment with cython 0.16 fused types:

http://docs.cython.org/src/userguide/fusedtypes.html

Is there any way to test this using jenkins?

We could have more VMs but that would require more credits. We just have 1h build per day.

This comment has been minimized.

Copy link
@jakevdp

jakevdp Jun 26, 2012

Member

You might try using Py_ssize_t for the index variables: I think this should be correct on each platform.

This comment has been minimized.

Copy link
@GaelVaroquaux

GaelVaroquaux via email Jun 26, 2012

Member

# Compute the column strides (increment in pointer elements to get
# from column i to i + 1) for `X` and `X_argsorted`
@@ -762,11 +768,13 @@ def _find_best_random_split(np.ndarray[DTYPE_t, ndim=2, mode="fortran"] X,
# Features to consider
if max_features == n_features:
features = np.arange(n_features)
max_features = n_features
else:
features = random_state.permutation(n_features)[:max_features]

# Look for the best random split
for i in features:
for feature_idx in range(max_features):
i = features[feature_idx]
# Get i-th col of X and X_sorted
X_i = (<DTYPE_t *>X.data) + X_stride * i
X_argsorted_i = (<int *>X_argsorted.data) + X_argsorted_stride * i
@@ -787,7 +795,8 @@ def _find_best_random_split(np.ndarray[DTYPE_t, ndim=2, mode="fortran"] X,
continue

# Draw a random threshold in [a, b)
t = X_i[X_argsorted_i[a]] + random_state.rand() * (X_i[X_argsorted_i[b]] - X_i[X_argsorted_i[a]])
t = X_i[X_argsorted_i[a]] + (random_state.rand() *
(X_i[X_argsorted_i[b]] - X_i[X_argsorted_i[a]]))
if t == X_i[X_argsorted_i[b]]:
t = X_i[X_argsorted_i[a]]