PERF/MAINT: switch to memoryviews, remove unnecessary declarations
break and return trial at the end
was missing the declaration for q
np.int is actually long; also, memoryviews don't have a partition method
remove declaration of i
use memoryview for a in _qn without requiring contiguity, remove unnecessary imports
esmucler authored and bashtage committed Aug 25, 2020
1 parent 1bb7f60 commit d176d4b
Showing 2 changed files with 60 additions and 61 deletions.
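The two NumPy/Cython pitfalls named in the commit message are easy to reproduce. A minimal sketch in plain NumPy (printed sizes are platform dependent): `np.int` aliased Python's built-in `int`, whose array dtype is a C `long`, while an `int[::1]` memoryview needs C-`int` buffers (`np.intc`); and typed memoryviews expose no ndarray methods, so the in-place `.partition()` call has to become a call to the `np.partition` function. Relatedly, a `double[:]` view accepts any strided 1-D buffer, whereas `double[::1]` requires C-contiguity.

    import numpy as np

    # np.int aliased the built-in int (a C long when used as a dtype);
    # np.intc is a C int, the element type an `int[::1]` view expects.
    print(np.dtype(np.int_).itemsize, np.dtype(np.intc).itemsize)  # e.g. 8 4 on 64-bit Linux

    # Typed memoryviews lack ndarray methods such as .partition, so the
    # in-place method is replaced by the np.partition function:
    x = np.array([5.0, 1.0, 4.0, 2.0, 3.0])
    x = np.partition(x, kth=x.shape[0] // 2)
    print(x[x.shape[0] // 2])  # 3.0 -- the middle order statistic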
96 changes: 45 additions & 51 deletions statsmodels/robust/_qn.pyx
@@ -1,71 +1,65 @@
 #!python
 #cython: wraparound=False, boundscheck=False, cdivision=True
 
-cimport cython
 import numpy as np
-cimport numpy as np
-
-DTYPE_INT = np.int
-DTYPE_DOUBLE = np.float64
-
-ctypedef np.float64_t DTYPE_DOUBLE_t
-ctypedef np.int_t DTYPE_INT_t
 
 
-def _high_weighted_median(np.ndarray[DTYPE_DOUBLE_t] a, np.ndarray[DTYPE_INT_t] weights):
+def _high_weighted_median(double[::1] a, int[::1] weights):
"""
Computes a weighted high median of a. This is defined as the
smallest a[j] such that the sum over all a[i]<=a[j] is strictly
greater than half the total sum of the weights
"""
cdef:
DTYPE_INT_t n = a.shape[0]
np.ndarray[DTYPE_DOUBLE_t] sorted_a = np.zeros((n,), dtype=DTYPE_DOUBLE)
np.ndarray[DTYPE_DOUBLE_t] a_cand = np.zeros((n,), dtype=DTYPE_DOUBLE)
np.ndarray[DTYPE_INT_t] weights_cand = np.zeros((n,), dtype=DTYPE_INT)
Py_ssize_t i= 0
DTYPE_INT_t kcand = 0
DTYPE_INT_t wleft, wright, wmid, wtot, wrest = 0
DTYPE_DOUBLE_t trial = 0
wtot = np.sum(weights)
double[::1] a_cp = np.copy(a)
int[::1] weights_cp = np.copy(weights)
int n = a_cp.shape[0]
double[::1] sorted_a = np.zeros((n,), dtype=np.double)
double[::1] a_cand = np.zeros((n,), dtype=np.double)
int[::1] weights_cand = np.zeros((n,), dtype=np.intc)
int kcand = 0
int wleft, wright, wmid, wtot, wrest = 0
double trial = 0
wtot = np.sum(weights_cp)
     while True:
         wleft = 0
         wmid = 0
         wright = 0
         for i in range(n):
-            sorted_a[i] = a[i]
-        sorted_a.partition(n//2)
+            sorted_a[i] = a_cp[i]
+        sorted_a = np.partition(sorted_a, kth=n//2)
         trial = sorted_a[n//2]
         for i in range(n):
-            if a[i] < trial:
-                wleft = wleft + weights[i]
-            elif a[i] > trial:
-                wright = wright + weights[i]
+            if a_cp[i] < trial:
+                wleft = wleft + weights_cp[i]
+            elif a_cp[i] > trial:
+                wright = wright + weights_cp[i]
             else:
-                wmid = wmid + weights[i]
+                wmid = wmid + weights_cp[i]
         kcand = 0
         if 2 * (wrest + wleft) > wtot:
             for i in range(n):
-                if a[i] < trial:
-                    a_cand[kcand] = a[i]
-                    weights_cand[kcand] = weights[i]
+                if a_cp[i] < trial:
+                    a_cand[kcand] = a_cp[i]
+                    weights_cand[kcand] = weights_cp[i]
                     kcand = kcand + 1
         elif 2 * (wrest + wleft + wmid) <= wtot:
             for i in range(n):
-                if a[i] > trial:
-                    a_cand[kcand] = a[i]
-                    weights_cand[kcand] = weights[i]
+                if a_cp[i] > trial:
+                    a_cand[kcand] = a_cp[i]
+                    weights_cand[kcand] = weights_cp[i]
                     kcand = kcand + 1
             wrest = wrest + wleft + wmid
         else:
-            return trial
+            break
         n = kcand
         for i in range(n):
-            a[i] = a_cand[i]
-            weights[i] = weights_cand[i]
+            a_cp[i] = a_cand[i]
+            weights_cp[i] = weights_cand[i]
+    return trial
 
 
-def _qn(np.ndarray[DTYPE_DOUBLE_t] a, DTYPE_DOUBLE_t c):
+def _qn(double[:] a, double c):
     """
     Computes the Qn robust estimator of scale, a more efficient alternative
     to the MAD. The implementation follows the algorithm described in Croux
@@ -85,22 +79,22 @@ def _qn(np.ndarray[DTYPE_DOUBLE_t] a, DTYPE_DOUBLE_t c):
         The Qn robust estimator of scale
     """
     cdef:
-        DTYPE_INT_t n = a.shape[0]
-        DTYPE_INT_t h = n/2 + 1
-        DTYPE_INT_t k = h * (h - 1) / 2
-        DTYPE_INT_t n_left = n * (n + 1) / 2
-        DTYPE_INT_t n_right = n * n
-        DTYPE_INT_t k_new = k + n_left
+        int n = a.shape[0]
+        int h = n/2 + 1
+        int k = h * (h - 1) / 2
+        int n_left = n * (n + 1) / 2
+        int n_right = n * n
+        int k_new = k + n_left
         Py_ssize_t i, j, jh, l = 0
-        DTYPE_INT_t sump, sumq = 0
-        DTYPE_DOUBLE_t trial, output = 0
-        np.ndarray[DTYPE_DOUBLE_t] a_sorted = np.sort(a)
-        np.ndarray[DTYPE_INT_t] left = np.array([n - i + 1 for i in range(0, n)], dtype=DTYPE_INT)
-        np.ndarray[DTYPE_INT_t] right = np.array([n if i <= h else n - (i - h) for i in range(0, n)], dtype=DTYPE_INT)
-        np.ndarray[DTYPE_INT_t] weights = np.zeros((n,), dtype=DTYPE_INT)
-        np.ndarray[DTYPE_DOUBLE_t] work = np.zeros((n,), dtype=DTYPE_DOUBLE)
-        np.ndarray[DTYPE_INT_t] p = np.zeros((n,), dtype=DTYPE_INT)
-        np.ndarray[DTYPE_INT_t] q = np.zeros((n,), dtype=DTYPE_INT)
+        int sump, sumq = 0
+        double trial, output = 0
+        double[::1] a_sorted = np.sort(a)
+        int[::1] left = np.array([n - i + 1 for i in range(0, n)], dtype=np.intc)
+        int[::1] right = np.array([n if i <= h else n - (i - h) for i in range(0, n)], dtype=np.intc)
+        int[::1] weights = np.zeros((n,), dtype=np.intc)
+        double[::1] work = np.zeros((n,), dtype=np.double)
+        int[::1] p = np.zeros((n,), dtype=np.intc)
+        int[::1] q = np.zeros((n,), dtype=np.intc)
     while n_right - n_left > n:
         j = 0
         for i in range(1, n):
@@ -121,7 +115,7 @@ def _qn(np.ndarray[DTYPE_DOUBLE_t] a, DTYPE_DOUBLE_t c):
                 j = j - 1
             q[i] = j
         sump = np.sum(p)
-        sumq = np.sum(q - 1)
+        sumq = np.sum(q) - n
         if k_new <= sump:
             right = np.copy(p)
             n_right = sump
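Two notes on this file. First, the `sumq` change in the last hunk is arithmetically neutral: a typed memoryview supports no elementwise `q - 1`, but `np.sum(q) - n` yields the same value. Second, the weighted high median that `_high_weighted_median` computes iteratively can be written directly from its docstring; a slow NumPy reference (a hypothetical helper, not part of statsmodels), handy for spot-checking the Cython version:

    import numpy as np

    def high_weighted_median_ref(a, weights):
        # The docstring's definition: the smallest a[j] whose cumulative
        # weight over all a[i] <= a[j] strictly exceeds half the total.
        a = np.asarray(a, dtype=float)
        w = np.asarray(weights)
        order = np.argsort(a)
        csum = np.cumsum(w[order])
        j = np.searchsorted(csum, 0.5 * csum[-1], side='right')
        return a[order][j]

    print(high_weighted_median_ref([3.0, 1.0, 2.0], [1, 1, 1]))  # 2.0

    # The sumq rewrite: summing first, then subtracting n, is equivalent.
    q = np.arange(1, 6, dtype=np.intc)
    assert np.sum(q - 1) == np.sum(q) - q.shape[0]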
25 changes: 15 additions & 10 deletions statsmodels/robust/scale.py
@@ -87,8 +87,10 @@ def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
 
 def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
     """
-    Computes the Qn robust estimator of scale, a more efficient alternative
-    to the MAD. The Qn estimator of the array a of length n is defined as
+    Computes the Qn robust estimator of scale
+
+    The Qn scale estimator is a more efficient alternative to the MAD.
+    The Qn scale estimator of an array a of length n is defined as
     c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2. Thus,
     the Qn estimator is the k-th order statistic of the absolute differences
     of the array. The optional constant is used to normalize the estimate
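Taken literally, the definition in this docstring is straightforward to transcribe. A brute-force O(n^2) sketch (a hypothetical cousin of the module's `_qn_naive`, not the O(n log n) Croux-Rousseeuw algorithm that `_qn` implements):

    import numpy as np

    def qn_naive_ref(a, c=2.219144):
        # k-th order statistic (1-based) of all pairwise absolute
        # differences |a[i] - a[j]|, i < j, with k = (n//2 + 1) choose 2.
        a = np.asarray(a, dtype=float)
        n = a.shape[0]
        h = n // 2 + 1
        k = h * (h - 1) // 2
        diffs = np.abs(a[:, None] - a[None, :])[np.triu_indices(n, k=1)]
        return c * np.partition(diffs, k - 1)[k - 1]

    print(qn_naive_ref([1.0, 2.0, 3.0, 4.0, 5.0]))  # 2.219144: k = 3, third-smallest gap is 1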
@@ -100,25 +102,28 @@ def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
     a : array_like
         Input array.
     c : float, optional
-        The normalization constant, used to get consistent estimates of the
-        standard deviation at the normal distribution. Defined as
-        1/(np.sqrt(2) * scipy.stats.norm.ppf(5/8)), which is 2.219144.
+        The normalization constant. The default value is used to get consistent
+        estimates of the standard deviation at the normal distribution.
     axis : int, optional
-        The default is 0. Can also be None.
+        The default is 0.
 
     Returns
     -------
-    The Qn robust estimator of scale
+    {float, ndarray}
+        The Qn robust estimator of scale
     """
-    a = array_like(a, 'a', ndim=None)
+    a = array_like(a, 'a', ndim=None, dtype=np.float64, contiguous=True,
+                   order='C')
     c = float_like(c, 'c')
-    a = a.astype(np.float64)
     if a.ndim == 0:
         raise ValueError("a should have at least one dimension")
     elif a.size == 0:
         return np.nan
     else:
-        return np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
+        out = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
+        if out.ndim == 0:
+            return float(out)
+        return out
 
 
 def _qn_naive(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8))):
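On the Python side, the tightened `array_like` call plus the scalar unwrapping at the end change what callers get back. A quick usage check, assuming a statsmodels build that includes this commit:

    import numpy as np
    from statsmodels.robust.scale import qn_scale

    rng = np.random.default_rng(0)
    x = rng.standard_normal(10000)
    # The default c makes Qn consistent for the standard deviation under
    # normality, so the estimate should be close to 1.0; a 1-D input now
    # returns a plain float rather than a 0-d ndarray.
    print(qn_scale(x))

    # 2-D input still reduces along the requested axis:
    X = rng.standard_normal((500, 3))
    print(qn_scale(X, axis=0).shape)  # (3,)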
