PERF/MAINT: switch to memoryviews, remove unnecessary declarations
break and return trial at the end
was missing the declaration for q
np.int is actually long; also, memoryviews don't have a partition method
remove declaration of i
use memoryview for a in _qn without requiring contiguity, remove unnecessary imports
esmucler authored and bashtage committed Aug 25, 2020
1 parent 1bb7f60 commit d176d4b
Showing 2 changed files with 60 additions and 61 deletions.
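The two NumPy/Cython pitfalls named in the commit message are easy to reproduce. A minimal sketch in plain NumPy (printed sizes are platform dependent): `np.int` aliased Python's built-in `int`, whose array dtype is a C `long`, while an `int[::1]` memoryview needs C-`int` buffers (`np.intc`); and typed memoryviews expose no ndarray methods, so the in-place `.partition()` call has to become a call to the `np.partition` function. Relatedly, a `double[:]` view accepts any strided 1-D buffer, whereas `double[::1]` requires C-contiguity.

    import numpy as np

    # np.int aliased the built-in int (a C long when used as a dtype);
    # np.intc is a C int, the element type an `int[::1]` view expects.
    print(np.dtype(np.int_).itemsize, np.dtype(np.intc).itemsize)  # e.g. 8 4 on 64-bit Linux

    # Typed memoryviews lack ndarray methods such as .partition, so the
    # in-place method is replaced by the np.partition function:
    x = np.array([5.0, 1.0, 4.0, 2.0, 3.0])
    x = np.partition(x, kth=x.shape[0] // 2)
    print(x[x.shape[0] // 2])  # 3.0 -- the middle order statistic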
96 changes: 45 additions & 51 deletions statsmodels/robust/_qn.pyx
@@ -1,71 +1,65 @@
 #!python
 #cython: wraparound=False, boundscheck=False, cdivision=True
 
-cimport cython
 import numpy as np
-cimport numpy as np
-
-DTYPE_INT = np.int
-DTYPE_DOUBLE = np.float64
-
-ctypedef np.float64_t DTYPE_DOUBLE_t
-ctypedef np.int_t DTYPE_INT_t
 
 
-def _high_weighted_median(np.ndarray[DTYPE_DOUBLE_t] a, np.ndarray[DTYPE_INT_t] weights):
+def _high_weighted_median(double[::1] a, int[::1] weights):
"""
Computes a weighted high median of a. This is defined as the
smallest a[j] such that the sum over all a[i]<=a[j] is strictly
greater than half the total sum of the weights
"""
cdef:
DTYPE_INT_t n = a.shape[0]
np.ndarray[DTYPE_DOUBLE_t] sorted_a = np.zeros((n,), dtype=DTYPE_DOUBLE)
np.ndarray[DTYPE_DOUBLE_t] a_cand = np.zeros((n,), dtype=DTYPE_DOUBLE)
np.ndarray[DTYPE_INT_t] weights_cand = np.zeros((n,), dtype=DTYPE_INT)
Py_ssize_t i= 0
DTYPE_INT_t kcand = 0
DTYPE_INT_t wleft, wright, wmid, wtot, wrest = 0
DTYPE_DOUBLE_t trial = 0
wtot = np.sum(weights)
double[::1] a_cp = np.copy(a)
int[::1] weights_cp = np.copy(weights)
int n = a_cp.shape[0]
double[::1] sorted_a = np.zeros((n,), dtype=np.double)
double[::1] a_cand = np.zeros((n,), dtype=np.double)
int[::1] weights_cand = np.zeros((n,), dtype=np.intc)
int kcand = 0
int wleft, wright, wmid, wtot, wrest = 0
double trial = 0
wtot = np.sum(weights_cp)
     while True:
         wleft = 0
         wmid = 0
         wright = 0
         for i in range(n):
-            sorted_a[i] = a[i]
-        sorted_a.partition(n//2)
+            sorted_a[i] = a_cp[i]
+        sorted_a = np.partition(sorted_a, kth=n//2)
         trial = sorted_a[n//2]
         for i in range(n):
-            if a[i] < trial:
-                wleft = wleft + weights[i]
-            elif a[i] > trial:
-                wright = wright + weights[i]
+            if a_cp[i] < trial:
+                wleft = wleft + weights_cp[i]
+            elif a_cp[i] > trial:
+                wright = wright + weights_cp[i]
             else:
-                wmid = wmid + weights[i]
+                wmid = wmid + weights_cp[i]
         kcand = 0
         if 2 * (wrest + wleft) > wtot:
             for i in range(n):
-                if a[i] < trial:
-                    a_cand[kcand] = a[i]
-                    weights_cand[kcand] = weights[i]
+                if a_cp[i] < trial:
+                    a_cand[kcand] = a_cp[i]
+                    weights_cand[kcand] = weights_cp[i]
                     kcand = kcand + 1
         elif 2 * (wrest + wleft + wmid) <= wtot:
             for i in range(n):
-                if a[i] > trial:
-                    a_cand[kcand] = a[i]
-                    weights_cand[kcand] = weights[i]
+                if a_cp[i] > trial:
+                    a_cand[kcand] = a_cp[i]
+                    weights_cand[kcand] = weights_cp[i]
                     kcand = kcand + 1
             wrest = wrest + wleft + wmid
         else:
-            return trial
+            break
         n = kcand
         for i in range(n):
-            a[i] = a_cand[i]
-            weights[i] = weights_cand[i]
+            a_cp[i] = a_cand[i]
+            weights_cp[i] = weights_cand[i]
+    return trial
 
 
-def _qn(np.ndarray[DTYPE_DOUBLE_t] a, DTYPE_DOUBLE_t c):
+def _qn(double[:] a, double c):
     """
     Computes the Qn robust estimator of scale, a more efficient alternative
     to the MAD. The implementation follows the algorithm described in Croux
@@ -85,22 +79,22 @@ def _qn(np.ndarray[DTYPE_DOUBLE_t] a, DTYPE_DOUBLE_t c):
         The Qn robust estimator of scale
     """
     cdef:
-        DTYPE_INT_t n = a.shape[0]
-        DTYPE_INT_t h = n/2 + 1
-        DTYPE_INT_t k = h * (h - 1) / 2
-        DTYPE_INT_t n_left = n * (n + 1) / 2
-        DTYPE_INT_t n_right = n * n
-        DTYPE_INT_t k_new = k + n_left
+        int n = a.shape[0]
+        int h = n/2 + 1
+        int k = h * (h - 1) / 2
+        int n_left = n * (n + 1) / 2
+        int n_right = n * n
+        int k_new = k + n_left
         Py_ssize_t i, j, jh, l = 0
-        DTYPE_INT_t sump, sumq = 0
-        DTYPE_DOUBLE_t trial, output = 0
-        np.ndarray[DTYPE_DOUBLE_t] a_sorted = np.sort(a)
-        np.ndarray[DTYPE_INT_t] left = np.array([n - i + 1 for i in range(0, n)], dtype=DTYPE_INT)
-        np.ndarray[DTYPE_INT_t] right = np.array([n if i <= h else n - (i - h) for i in range(0, n)], dtype=DTYPE_INT)
-        np.ndarray[DTYPE_INT_t] weights = np.zeros((n,), dtype=DTYPE_INT)
-        np.ndarray[DTYPE_DOUBLE_t] work = np.zeros((n,), dtype=DTYPE_DOUBLE)
-        np.ndarray[DTYPE_INT_t] p = np.zeros((n,), dtype=DTYPE_INT)
-        np.ndarray[DTYPE_INT_t] q = np.zeros((n,), dtype=DTYPE_INT)
+        int sump, sumq = 0
+        double trial, output = 0
+        double[::1] a_sorted = np.sort(a)
+        int[::1] left = np.array([n - i + 1 for i in range(0, n)], dtype=np.intc)
+        int[::1] right = np.array([n if i <= h else n - (i - h) for i in range(0, n)], dtype=np.intc)
+        int[::1] weights = np.zeros((n,), dtype=np.intc)
+        double[::1] work = np.zeros((n,), dtype=np.double)
+        int[::1] p = np.zeros((n,), dtype=np.intc)
+        int[::1] q = np.zeros((n,), dtype=np.intc)
     while n_right - n_left > n:
         j = 0
         for i in range(1, n):
@@ -121,7 +115,7 @@ def _qn(np.ndarray[DTYPE_DOUBLE_t] a, DTYPE_DOUBLE_t c):
                 j = j - 1
             q[i] = j
         sump = np.sum(p)
-        sumq = np.sum(q - 1)
+        sumq = np.sum(q) - n
         if k_new <= sump:
             right = np.copy(p)
             n_right = sump
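Two notes on this file. First, the `sumq` change in the last hunk is arithmetically neutral: a typed memoryview supports no elementwise `q - 1`, but `np.sum(q) - n` yields the same value. Second, the weighted high median that `_high_weighted_median` computes iteratively can be written directly from its docstring; a slow NumPy reference (a hypothetical helper, not part of statsmodels), handy for spot-checking the Cython version:

    import numpy as np

    def high_weighted_median_ref(a, weights):
        # The docstring's definition: the smallest a[j] whose cumulative
        # weight over all a[i] <= a[j] strictly exceeds half the total.
        a = np.asarray(a, dtype=float)
        w = np.asarray(weights)
        order = np.argsort(a)
        csum = np.cumsum(w[order])
        j = np.searchsorted(csum, 0.5 * csum[-1], side='right')
        return a[order][j]

    print(high_weighted_median_ref([3.0, 1.0, 2.0], [1, 1, 1]))  # 2.0

    # The sumq rewrite: summing first, then subtracting n, is equivalent.
    q = np.arange(1, 6, dtype=np.intc)
    assert np.sum(q - 1) == np.sum(q) - q.shape[0]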
25 changes: 15 additions & 10 deletions statsmodels/robust/scale.py
@@ -87,8 +87,10 @@ def iqr(a, c=Gaussian.ppf(3/4) - Gaussian.ppf(1/4), axis=0):
 
 def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
     """
-    Computes the Qn robust estimator of scale, a more efficient alternative
-    to the MAD. The Qn estimator of the array a of length n is defined as
+    Computes the Qn robust estimator of scale
+
+    The Qn scale estimator is a more efficient alternative to the MAD.
+    The Qn scale estimator of an array a of length n is defined as
     c * {abs(a[i] - a[j]): i<j}_(k), for k equal to [n/2] + 1 choose 2. Thus,
     the Qn estimator is the k-th order statistic of the absolute differences
     of the array. The optional constant is used to normalize the estimate
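Taken literally, the definition in this docstring is straightforward to transcribe. A brute-force O(n^2) sketch (a hypothetical cousin of the module's `_qn_naive`, not the O(n log n) Croux-Rousseeuw algorithm that `_qn` implements):

    import numpy as np

    def qn_naive_ref(a, c=2.219144):
        # k-th order statistic (1-based) of all pairwise absolute
        # differences |a[i] - a[j]|, i < j, with k = (n//2 + 1) choose 2.
        a = np.asarray(a, dtype=float)
        n = a.shape[0]
        h = n // 2 + 1
        k = h * (h - 1) // 2
        diffs = np.abs(a[:, None] - a[None, :])[np.triu_indices(n, k=1)]
        return c * np.partition(diffs, k - 1)[k - 1]

    print(qn_naive_ref([1.0, 2.0, 3.0, 4.0, 5.0]))  # 2.219144: k = 3, third-smallest gap is 1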
@@ -100,25 +102,28 @@ def qn_scale(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8)), axis=0):
     a : array_like
         Input array.
     c : float, optional
-        The normalization constant, used to get consistent estimates of the
-        standard deviation at the normal distribution. Defined as
-        1/(np.sqrt(2) * scipy.stats.norm.ppf(5/8)), which is 2.219144.
+        The normalization constant. The default value is used to get consistent
+        estimates of the standard deviation at the normal distribution.
     axis : int, optional
-        The default is 0. Can also be None.
+        The default is 0.
 
     Returns
     -------
-    The Qn robust estimator of scale
+    {float, ndarray}
+        The Qn robust estimator of scale
     """
-    a = array_like(a, 'a', ndim=None)
+    a = array_like(a, 'a', ndim=None, dtype=np.float64, contiguous=True,
+                   order='C')
     c = float_like(c, 'c')
-    a = a.astype(np.float64)
     if a.ndim == 0:
         raise ValueError("a should have at least one dimension")
     elif a.size == 0:
         return np.nan
     else:
-        return np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
+        out = np.apply_along_axis(_qn, axis=axis, arr=a, c=c)
+        if out.ndim == 0:
+            return float(out)
+        return out
 
 
 def _qn_naive(a, c=1 / (np.sqrt(2) * Gaussian.ppf(5 / 8))):
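On the Python side, the tightened `array_like` call plus the scalar unwrapping at the end change what callers get back. A quick usage check, assuming a statsmodels build that includes this commit:

    import numpy as np
    from statsmodels.robust.scale import qn_scale

    rng = np.random.default_rng(0)
    x = rng.standard_normal(10000)
    # The default c makes Qn consistent for the standard deviation under
    # normality, so the estimate should be close to 1.0; a 1-D input now
    # returns a plain float rather than a 0-d ndarray.
    print(qn_scale(x))

    # 2-D input still reduces along the requested axis:
    X = rng.standard_normal((500, 3))
    print(qn_scale(X, axis=0).shape)  # (3,)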
