Skip to content

Commit

Permalink
ENH: implement rank function in Cython for Series and DataFrame, per #…
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Dec 12, 2011
1 parent d665004 commit 9c20734
Show file tree
Hide file tree
Showing 12 changed files with 276 additions and 29 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Expand Up @@ -77,6 +77,7 @@ pandas 0.6.1
- Fix __doc__-related issue when converting py -> pyo with py2exe
- Bug fix in left join Cython code with duplicate monotonic labels
- Fix bug when unstacking multiple levels described in #451
- Exclude NA values in dtype=object arrays, regression from 0.5.0 (GH #469)

Thanks
------
Expand Down
18 changes: 18 additions & 0 deletions pandas/core/frame.py
Expand Up @@ -3052,6 +3052,24 @@ def clip_lower(self, threshold):
"""
return self.apply(lambda x: x.clip_lower(threshold))

def rank(self, axis=0):
    """
    Compute numerical data ranks (1 through n) along axis. Tied values
    share the average of the ranks they would otherwise occupy.

    Only the data returned by ``_get_numeric_data()`` is ranked, after
    being cast to float64.

    Parameters
    ----------
    axis : {0, 1}, default 0
        Ranks over columns (0) or rows (1)

    Returns
    -------
    ranks : DataFrame
        Carries the index and columns of the numeric data that was ranked
    """
    numeric = self._get_numeric_data()
    # The Cython routine works on a float64 2-d array
    as_float = numeric.values.astype('f8')
    ranked = lib.rank_2d_float64(as_float, axis=axis)
    return DataFrame(ranked, index=numeric.index, columns=numeric.columns)

#----------------------------------------------------------------------
# Plotting

Expand Down
15 changes: 15 additions & 0 deletions pandas/core/series.py
Expand Up @@ -1211,6 +1211,21 @@ def argsort(self, axis=0, kind='quicksort', order=None):
else:
return Series(np.argsort(values), index=self.index, name=self.name)

def rank(self):
    """
    Compute data ranks (1 through n). Tied values share the average of
    the ranks they would otherwise occupy.

    The float64 ranking routine is tried first; if it raises, the
    generic routine is used instead.

    Returns
    -------
    ranks : Series
        Carries the same index and name as this Series
    """
    data = self.values
    try:
        computed = lib.rank_1d_float64(data)
    except Exception:
        # Fall back to the routine that accepts arbitrary dtypes
        computed = lib.rank_1d_generic(data)
    return Series(computed, index=self.index, name=self.name)

def order(self, na_last=True, ascending=True):
"""
Sorts Series object, by value, maintaining index-value link
Expand Down
49 changes: 25 additions & 24 deletions pandas/src/engines.pyx
Expand Up @@ -6,30 +6,6 @@ cnp.import_array()

cimport util

cdef class IndexEngine:
    '''
    Base class offering key-based element access on a 1-dimensional ndarray.

    Both methods call ``self.get_loc(key)``, which is not defined in this
    class body -- presumably subclasses supply it (verify against callers).
    '''

    cpdef get_value(self, ndarray arr, object key):
        '''
        Return the element of `arr` at the position `get_loc` gives for `key`.

        arr : 1-dimensional ndarray
        '''
        cdef:
            Py_ssize_t loc

        loc = self.get_loc(key)
        return get_value_at(arr, loc)

    cpdef set_value(self, ndarray arr, object key, object value):
        '''
        Store `value` in `arr` at the position `get_loc` gives for `key`.

        arr : 1-dimensional ndarray
        '''
        cdef:
            Py_ssize_t loc

        loc = self.get_loc(key)
        set_value_at(arr, loc, value)

cpdef inline object get_value_at(ndarray arr, object loc):
cdef:
Py_ssize_t i
Expand Down Expand Up @@ -57,6 +33,31 @@ cpdef inline set_value_at(ndarray arr, object loc, object value):

util.assign_value_1d(arr, i, value)


cdef class IndexEngine:
    '''
    Base class offering key-based element access on a 1-dimensional ndarray.

    Both methods call ``self.get_loc(key)``, which is not defined in this
    class body -- presumably subclasses supply it (verify against callers).
    '''

    cpdef get_value(self, ndarray arr, object key):
        '''
        Return the element of `arr` at the position `get_loc` gives for `key`.

        arr : 1-dimensional ndarray
        '''
        cdef:
            Py_ssize_t loc

        loc = self.get_loc(key)
        return get_value_at(arr, loc)

    cpdef set_value(self, ndarray arr, object key, object value):
        '''
        Store `value` in `arr` at the position `get_loc` gives for `key`.

        arr : 1-dimensional ndarray
        '''
        cdef:
            Py_ssize_t loc

        loc = self.get_loc(key)
        set_value_at(arr, loc, value)

cdef class DictIndexEngine(IndexEngine):
'''
For accelerating low-level internal details of indexing
Expand Down
3 changes: 0 additions & 3 deletions pandas/src/parsing.pyx
@@ -1,8 +1,5 @@
cimport cpython

cdef extern from "math.h":
double fabs(double)

def to_object_array(list rows):
cdef:
Py_ssize_t i, j, n, k, tmp
Expand Down
129 changes: 129 additions & 0 deletions pandas/src/stats.pyx
@@ -0,0 +1,129 @@
# Absolute tolerance used by the rank functions in this file: consecutive
# sorted values whose difference is not greater than this are treated as ties.
cdef float64_t FP_ERR = 1e-13

cimport util

def rank_1d_float64(object in_arr):
    """
    Fast NaN-friendly version of scipy.stats.rankdata
    """
    # Returns a float64 array of 1-based ranks, one per input element.
    # Values tied within FP_ERR share the average of their sorted positions.
    # NaN inputs receive a NaN rank. The input is copied, not modified.
    #
    # NOTE(review): NaNs are replaced by +inf before sorting and every sorted
    # value equal to +inf is given a NaN rank, so genuine +inf inputs are
    # also ranked NaN.

    cdef:
        Py_ssize_t i, j, n, dups = 0
        ndarray[float64_t] sorted_data, ranks, values
        ndarray[int64_t] argsorted
        int32_t idx
        float64_t val, nan_value
        float64_t sum_ranks = 0

    # Work on a copy because the NaN fill below mutates the array
    values = np.asarray(in_arr).copy()

    # Fill NaNs with +inf so they sort to the end of the array
    nan_value = np.inf
    mask = np.isnan(values)
    np.putmask(values, mask, nan_value)

    n = len(values)
    ranks = np.empty(n, dtype='f8')
    # argsorted[p] is the original position of the p-th smallest value
    argsorted = values.argsort().astype('i8')
    sorted_data = values.take(argsorted)

    for i in range(n):
        # Accumulate the 1-based positions and the size of the current tie run
        sum_ranks += i + 1
        dups += 1
        val = sorted_data[i]
        if val == nan_value:
            # `nan` is not defined in this block; presumably a module-level
            # name supplied by the including file -- verify.
            ranks[argsorted[i]] = nan
            continue
        # The run ends at the last element or when the next value differs by
        # more than FP_ERR; every member of the run gets the average position.
        if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR:
            for j in range(i - dups + 1, i + 1):
                ranks[argsorted[j]] = sum_ranks / dups
            sum_ranks = dups = 0
    return ranks

def rank_2d_float64(object in_arr, axis=0):
    """
    Fast NaN-friendly version of scipy.stats.rankdata
    """
    # Returns a float64 2-d array of 1-based ranks with the same shape as the
    # input. With axis == 0 each column is ranked independently; otherwise
    # each row is. Values tied within FP_ERR share the average of their
    # sorted positions and NaN inputs receive a NaN rank.
    #
    # NOTE(review): NaNs are replaced by +inf before sorting and every sorted
    # value equal to +inf is given a NaN rank, so genuine +inf inputs are
    # also ranked NaN.

    cdef:
        Py_ssize_t i, j, z, k, n, dups = 0
        ndarray[float64_t, ndim=2] ranks, values
        ndarray[int64_t, ndim=2] argsorted
        int32_t idx
        float64_t val, nan_value
        float64_t sum_ranks = 0

    in_arr = np.asarray(in_arr)

    # The loop below always ranks along rows, so for axis == 0 the data is
    # transposed here and the result is transposed back before returning.
    # Both branches copy, because the NaN fill and in-place sort mutate.
    if axis == 0:
        values = in_arr.T.copy()
    else:
        values = in_arr.copy()

    # Fill NaNs with +inf so they sort to the end of each row
    nan_value = np.inf
    np.putmask(values, np.isnan(values), nan_value)

    n, k = (<object> values).shape
    ranks = np.empty((n, k), dtype='f8')
    # argsorted[i, p] is the original column of the p-th smallest value in
    # row i; `values` itself is then sorted in place along each row.
    argsorted = values.argsort(1).astype('i8')
    values.sort(axis=1)

    for i in range(n):
        # Tie-run accumulators start fresh for every row
        dups = sum_ranks = 0
        for j in range(k):
            sum_ranks += j + 1
            dups += 1
            val = values[i, j]
            if val == nan_value:
                # `nan` is not defined in this block; presumably a
                # module-level name supplied by the including file -- verify.
                ranks[i, argsorted[i, j]] = nan
                continue
            # The run ends at the last column or when the next value differs
            # by more than FP_ERR; each member gets the average position.
            if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR:
                for z in range(j - dups + 1, j + 1):
                    ranks[i, argsorted[i, z]] = sum_ranks / dups
                sum_ranks = dups = 0

    if axis == 0:
        return ranks.T
    else:
        return ranks

def rank_1d_generic(object in_arr):
    """
    NaN-friendly version of scipy.stats.rankdata for arrays of any dtype

    Returns a float64 array of 1-based ranks, one per input element. Values
    tied within FP_ERR share the average of their sorted positions. For
    floating-point input, NaN entries receive a NaN rank. The input is
    copied, not modified.

    Each element is read into a C double, so elements must be convertible
    to float.

    NOTE(review): NaNs are replaced by +inf before sorting and every sorted
    value equal to +inf is given a NaN rank, so genuine +inf inputs are also
    ranked NaN.
    """

    cdef:
        Py_ssize_t i, j, n, dups = 0
        ndarray[float64_t] ranks
        ndarray sorted_data, values
        ndarray[int64_t] argsorted
        float64_t val, nan_value
        float64_t sum_ranks = 0

    # Work on a copy because the NaN fill below mutates the array
    values = np.asarray(in_arr).copy()

    nan_value = np.inf

    # dtype.type is a class, so the check has to be issubclass; isinstance
    # was always False here and the NaN masking never ran.
    if issubclass(values.dtype.type, np.floating):
        mask = np.isnan(values)
        np.putmask(values, mask, nan_value)

    n = len(values)
    ranks = np.empty(n, dtype='f8')

    # argsorted[p] is the original position of the p-th smallest value
    argsorted = values.argsort().astype('i8')
    sorted_data = values.take(argsorted)

    for i in range(n):
        # Accumulate the 1-based positions and the size of the current tie run
        sum_ranks += i + 1
        dups += 1
        val = util.get_value_at(sorted_data, i)
        if val == nan_value:
            # `nan` is not defined in this block; presumably a module-level
            # name supplied by the including file -- verify.
            ranks[argsorted[i]] = nan
            continue
        # The run ends at the last element or when the next value differs by
        # more than FP_ERR; every member of the run gets the average position.
        if (i == n - 1 or
            fabs(util.get_value_at(sorted_data, i + 1) - val) > FP_ERR):
            for j in range(i - dups + 1, i + 1):
                ranks[argsorted[j]] = sum_ranks / dups
            sum_ranks = dups = 0
    return ranks
4 changes: 3 additions & 1 deletion pandas/src/tseries.pyx
Expand Up @@ -53,6 +53,7 @@ from util cimport is_integer_object

cdef extern from "math.h":
double sqrt(double x)
double fabs(double)

cdef extern from "datetime.h":

Expand Down Expand Up @@ -471,11 +472,12 @@ cdef class cache_readonly(object):
# Thin wrapper: returns the result of numpy's PyArray_Check for `o`.
cpdef is_array(object o):
    return np.PyArray_Check(o)


include "skiplist.pyx"
include "groupby.pyx"
include "moments.pyx"
include "reindex.pyx"
include "generated.pyx"
include "parsing.pyx"
include "reduce.pyx"

include "stats.pyx"
28 changes: 28 additions & 0 deletions pandas/src/util.pxd
@@ -1,6 +1,34 @@
from numpy cimport ndarray
cimport numpy as cnp

cdef extern from "numpy_helper.h":
inline int is_integer_object(object)
inline int is_float_object(object)
inline int assign_value_1d (ndarray, Py_ssize_t, object) except -1

# Return the element of the 1-dimensional array `arr` at position `loc`.
# A float `loc` that equals its integer conversion is treated as that
# integer, and a negative position has the array size added to it.
# No explicit bounds check is performed here.
cpdef inline object get_value_at(ndarray arr, object loc):
    cdef:
        Py_ssize_t i
        void* data_ptr
    if is_float_object(loc):
        # Accept integral floats such as 2.0 as positions
        casted = int(loc)
        if casted == loc:
            loc = casted
    # NOTE(review): a non-integral float reaches this cast unchanged --
    # confirm what the cast does in that case.
    i = <Py_ssize_t> loc
    if i < 0:
        # Negative positions count from the end of the array
        i += cnp.PyArray_SIZE(arr)
    data_ptr = cnp.PyArray_GETPTR1(arr, i)
    return cnp.PyArray_GETITEM(arr, data_ptr)

# Store `value` in the 1-dimensional array `arr` at position `loc`.
# A float `loc` that equals its integer conversion is treated as that
# integer, and a negative position has the array size added to it.
# No explicit bounds check is performed here.
cpdef inline set_value_at(ndarray arr, object loc, object value):
    cdef:
        Py_ssize_t i
    if is_float_object(loc):
        # Accept integral floats such as 2.0 as positions
        casted = int(loc)
        if casted == loc:
            loc = casted
    # NOTE(review): a non-integral float reaches this cast unchanged --
    # confirm what the cast does in that case.
    i = <Py_ssize_t> loc
    if i < 0:
        # Negative positions count from the end of the array
        i += cnp.PyArray_SIZE(arr)

    # assign_value_1d is declared `except -1` in the extern block of this file
    assign_value_1d(arr, i, value)
24 changes: 24 additions & 0 deletions pandas/tests/test_frame.py
Expand Up @@ -3465,6 +3465,30 @@ def test_cumprod(self):
df.cumprod(0)
df.cumprod(1)

def test_rank(self):
    """Check DataFrame.rank on both axes against scipy's rankdata."""
    from scipy.stats import rankdata

    # Put NaNs into each column at a different stride
    for col, step in zip('ABCD', (2, 3, 4, 5)):
        self.frame[col][::step] = np.nan

    by_column = self.frame.rank()
    by_row = self.frame.rank(1)

    nan_mask = np.isnan(self.frame.values)
    filled = self.frame.fillna(np.inf).values

    def _expected(axis):
        # Rank with NaNs replaced by +inf, then restore NaN at the
        # originally missing positions.
        out = np.apply_along_axis(rankdata, axis, filled)
        out[nan_mask] = np.nan
        return out

    assert_almost_equal(by_column.values, _expected(0))
    assert_almost_equal(by_row.values, _expected(1))

def test_describe(self):
desc = self.tsframe.describe()
desc = self.mixed_frame.describe()
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/test_series.py
Expand Up @@ -1032,6 +1032,22 @@ def test_order(self):
ordered = ts.order(ascending=False, na_last=False)
assert_almost_equal(expected, ordered.valid().values)

def test_rank(self):
    """Check Series.rank against scipy's rankdata, with NaNs present."""
    from scipy.stats import rankdata

    self.ts[::2] = np.nan
    # Repeated value, intended to introduce ties
    self.ts[:10][::3] = 4.

    result = self.ts.rank()

    # Rank with NaNs replaced by +inf, then restore NaN at the originally
    # missing positions.
    nan_mask = np.isnan(self.ts)
    expected = rankdata(self.ts.fillna(np.inf))
    expected[nan_mask] = np.nan

    assert_almost_equal(result, expected)

def test_to_csv(self):
self.ts.to_csv('_foo')

Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/test_tseries.py
Expand Up @@ -186,6 +186,22 @@ def test_convert_objects_ints():
result = lib.maybe_convert_objects(arr)
assert(issubclass(result.dtype.type, np.integer))

def test_rank():
    """Check lib.rank_1d_float64 against scipy's rankdata on NaN-laden data."""
    from scipy.stats import rankdata
    from numpy import nan

    def _check(arr):
        # Use ~ for boolean inversion: unary minus on a boolean array is
        # rejected by current NumPy.
        mask = ~np.isfinite(arr)
        arr = arr.copy()
        result = lib.rank_1d_float64(arr)
        # Rank with non-finite entries set to +inf, then expect NaN ranks
        # at those positions.
        arr[mask] = np.inf
        exp = rankdata(arr)
        exp[mask] = np.nan
        assert_almost_equal(result, exp)

    _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan]))
    _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan]))


class TestMoments(unittest.TestCase):
pass

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -278,7 +278,7 @@ def run(self):
cmdclass['sdist'] = CheckSDist

tseries_depends = ['reindex', 'groupby', 'skiplist', 'moments',
'generated', 'parsing', 'reduce']
'generated', 'parsing', 'reduce', 'stats']
def srcpath(name=None, suffix='.pyx', subdir='src'):
    """
    Build the path 'pandas/<subdir>/<name><suffix>' using ``pjoin``.

    ``pjoin`` is defined outside this block; presumably os.path.join --
    verify.
    """
    # NOTE(review): the name=None default cannot be used as-is, because
    # None + suffix raises TypeError; callers must pass a name.
    return pjoin('pandas', subdir, name+suffix)

Expand Down

0 comments on commit 9c20734

Please sign in to comment.