From 02c2029e5ad7c0f71fcd302ac410f813a4436455 Mon Sep 17 00:00:00 2001
From: Nick Becker
Date: Wed, 29 Aug 2018 21:48:57 +0100
Subject: [PATCH] Fast forward to gpuopenanalytics/pygdf master (#4)

* adding eq datetime ops for pygdf
* flake8 fixes
* Drop Python 2.7, Add Python 3.7
* removing int coercion for datetime
* Remove Python 3.7 build
* bumping numba
* forgot to commit meta.yaml changes
* flake8
* commutative addition
* commutative subtraction and multiplication
* reflected floordiv and truediv
* cleanup
* stray comment
* change rsub method
* further testing rsub
* rsub docstring
* revert back
* type coercion
* revert to pseudo-commutative implementation
* commutative ops tests
* test comment cleanup
* Feature/reflected ops noncommutative testing (#1)
* np array solution
* cleanup
* np solution for division
* full reflected ops tests
* cleanup
* switching lambda scalar to 2
* Update README.md

  Conda installation instructions needed changes for the new pygdf version.

* Feature/reflected ops update (#2)
* test binary_operator
* test one line
* essentially use _binaryop with a line flipped
* expand to all non commutative reflected ops
* revert rmul
* Feature/reflected ops update (#3)
* test binary_operator
* test one line
* essentially use _binaryop with a line flipped
* expand to all non commutative reflected ops
* revert rmul
* rbinaryop function for clarity
* add scalar to array generation to avoid division by zero behavior
* remove integer division test due to libgdf bug
* Fix timezone issue when converting from datetime object into datetime64
* Remove unused import to fix flake8
* Initial modifications for new join API
---
 .travis.yml                           |  1 -
 README.md                             |  2 +-
 conda-recipes/pygdf/meta.yaml         |  4 +-
 conda_environments/builddocs_py35.yml |  2 +-
 conda_environments/testing_py35.yml   |  2 +-
 pygdf/_gdf.py                         | 33 +++++++----
 pygdf/datetime.py                     | 80 +++++++++++++++++++++------
 pygdf/queryutils.py                   |  5 +-
 pygdf/series.py                       | 25 +++++++++
 pygdf/tests/test_binops.py            | 27 +++++++++
 pygdf/tests/test_datetime.py          | 50 ++++++++++++++++-
 11 files changed, 194 insertions(+), 37 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 7cf7683a787..620a1ab72e4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,7 +16,6 @@ matrix:
   include:
     - env: PYTHON=3.6
     - env: PYTHON=3.5
-    - env: PYTHON=2.7
 
 before_install:
   # install miniconda
diff --git a/README.md b/README.md
index e944efecc75..e44d84d7a34 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ You can get a minimal conda installation with [Miniconda](https://conda.io/minic
 You can install and update PyGDF using the conda command:
 
 ```bash
-conda install -c numba -c conda-forge -c gpuopenanalytics/label/dev -c defaults pygdf=0.1.0a2
+conda install -c numba -c conda-forge -c gpuopenanalytics/label/dev -c defaults pygdf=0.1.0a3
 ```
 
 You can create and activate a development environment using the conda command:
diff --git a/conda-recipes/pygdf/meta.yaml b/conda-recipes/pygdf/meta.yaml
index ac5bbaff3ac..429a9cd8c30 100644
--- a/conda-recipes/pygdf/meta.yaml
+++ b/conda-recipes/pygdf/meta.yaml
@@ -16,14 +16,14 @@ requirements:
   build:
     - python
     - setuptools
-    - numba >0.35
+    - numba>=0.40.0dev
   run:
     - python
     - setuptools
     - libgdf 0.1.0a3.*
     - libgdf_cffi 0.1.0a3.*
     - pandas 0.20.*
-    - numba >0.35
+    - numba>=0.40.0dev
 
 test:
   requires:
diff --git a/conda_environments/builddocs_py35.yml b/conda_environments/builddocs_py35.yml
index e0436b82445..7b8a3cb14f7 100644
--- a/conda_environments/builddocs_py35.yml
+++ b/conda_environments/builddocs_py35.yml
@@ -9,7 +9,7 @@ dependencies:
 - pytest
 - libgdf=0.1.0a3.*
 - libgdf_cffi=0.1.0a3.*
-- numba>=0.35.0
+- numba>=0.40.0dev
 - pandas=0.20.*
 - pip:
   - numpydoc
diff --git a/conda_environments/testing_py35.yml b/conda_environments/testing_py35.yml
index a2859d20cef..9cd307bd0a5 100644
--- a/conda_environments/testing_py35.yml
+++ b/conda_environments/testing_py35.yml
@@ -9,6 +9,6 @@ dependencies:
 - pytest
 - libgdf=0.1.0a3.*
 - libgdf_cffi=0.1.0a3.*
-- numba>=0.35.0
+- numba>=0.40.0dev
 - pandas=0.20.3
 - notebook>=0.5.0
diff --git a/pygdf/_gdf.py b/pygdf/_gdf.py
index 6254305846a..733f16cfa30 100644
--- a/pygdf/_gdf.py
+++ b/pygdf/_gdf.py
@@ -167,7 +167,6 @@ def apply_join(col_lhs, col_rhs, how, method='hash'):
         raise ValueError(msg)
 
     joiner = _join_how_api[how]
-    join_result_ptr = ffi.new("gdf_join_result_type**", None)
     method_api = _join_method_api[method]
 
     gdf_context = ffi.new('gdf_context*')
@@ -179,6 +178,9 @@ def apply_join(col_lhs, col_rhs, how, method='hash'):
         msg = "method not supported"
         raise ValueError(msg)
 
+    col_result_l = columnview(0, None, dtype=np.int32)
+    col_result_r = columnview(0, None, dtype=np.int32)
+
     if(how in ['left', 'inner']):
         list_lhs = []
         list_rhs = []
@@ -188,19 +190,28 @@ def apply_join(col_lhs, col_rhs, how, method='hash'):
            list_rhs.append(col_rhs[i].cffi_view)
 
         # Call libgdf
-        joiner(len(col_lhs), list_lhs, list_rhs, join_result_ptr, gdf_context)
+        joiner(len(col_lhs), list_lhs, list_rhs, col_result_l,
+               col_result_r, gdf_context)
     else:
-        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, join_result_ptr)
+        joiner(col_lhs[0].cffi_view, col_rhs[0].cffi_view, col_result_l,
+               col_result_r)
 
     # Extract result
-    join_result = join_result_ptr[0]
-    dataptr = libgdf.gdf_join_result_data(join_result)
-    datasize = libgdf.gdf_join_result_size(join_result)
-    ary = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t", dataptr)),
-                             nelem=datasize, dtype=np.int32)
-    ary = ary.reshape(2, datasize // 2)
-    yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))
-    libgdf.gdf_join_result_free(join_result)
+
+    # yield ((ary[0], ary[1]) if datasize > 0 else (ary, ary))
+
+    left = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t",
+                                                   col_result_l.data)),
+                              nelem=col_result_l.size, dtype=np.int32)
+
+    right = _as_numba_devarray(intaddr=int(ffi.cast("uintptr_t",
+                                                    col_result_r.data)),
+                               nelem=col_result_r.size, dtype=np.int32)
+
+    yield(left, right)
+
+    libgdf.gdf_column_free(col_result_l)
+    libgdf.gdf_column_free(col_result_r)
 
 
 def apply_prefixsum(col_inp, col_out, inclusive):
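The join rework above replaces the opaque `gdf_join_result_type` buffer (one packed array that had to be reshaped into two halves) with two caller-owned `gdf_column` results: one column of left row indices and one of right row indices. As a rough illustration of what that index pair encodes, here is a CPU-only sketch with plain NumPy and made-up keys; it is not part of the patch, libgdf computes the same pairing on the GPU:

```python
import numpy as np

# Hypothetical key columns for a small inner join.
lkeys = np.array([0, 1, 2, 3], dtype=np.int32)
rkeys = np.array([2, 3, 4], dtype=np.int32)

# Brute-force equivalent of the index pair libgdf fills into
# col_result_l / col_result_r: row left[i] of the left table
# matches row right[i] of the right table.
left, right = [], []
for i, lv in enumerate(lkeys):
    for j, rv in enumerate(rkeys):
        if lv == rv:
            left.append(i)
            right.append(j)

left = np.asarray(left, dtype=np.int32)
right = np.asarray(right, dtype=np.int32)

assert (lkeys[left] == rkeys[right]).all()
print(left, right)  # [2 3] [0 1]
```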
diff --git a/pygdf/datetime.py b/pygdf/datetime.py
index e27a7b1614d..28ea5d3180a 100644
--- a/pygdf/datetime.py
+++ b/pygdf/datetime.py
@@ -1,11 +1,24 @@
+import datetime as dt
+
 import numpy as np
-from . import columnops, _gdf
+import pandas as pd
+
+from . import columnops, _gdf, utils
 from .buffer import Buffer
-from .cudautils import compact_mask_bytes
 
 from libgdf_cffi import libgdf
 
+_unordered_impl = {
+    'eq': libgdf.gdf_eq_generic,
+    'ne': libgdf.gdf_ne_generic,
+}
+
+
 class DatetimeColumn(columnops.TypedColumnBase):
+    # TODO - we only support milliseconds (date64)
+    # we should support date32 and timestamp, but perhaps
+    # only after we move to arrow
+    # we also need to support other formats besides Date64
     funcs = {
         'year': libgdf.gdf_extract_datetime_year,
         'month': libgdf.gdf_extract_datetime_month,
@@ -14,29 +27,23 @@ class DatetimeColumn(columnops.TypedColumnBase):
         'minute': libgdf.gdf_extract_datetime_minute,
         'second': libgdf.gdf_extract_datetime_second,
     }
+    _npdatetime64_dtype = np.dtype('datetime64[ms]')
 
     def __init__(self, data, mask=None, null_count=None, dtype=None):
-        # currently libgdf datetime kernels fail if mask is null
-        if mask is None:
-            mask = np.ones(data.mem.size, dtype=np.bool)
-            mask = compact_mask_bytes(mask)
-            mask = Buffer(mask)
         super(DatetimeColumn, self).__init__(data=data,
                                              mask=mask,
                                              null_count=null_count,
                                              dtype=dtype
                                              )
-        # the column constructor removes mask if it's all true
-        self._mask = mask
+        self._precision = 1e-3
+        self._inverse_precision = 1e3
+        self._pandas_conversion_factor = 1e9 * self._precision
 
     @classmethod
     def from_numpy(cls, array):
-        # hack, coerce to int, then set the dtype
-        array = array.astype('datetime64[ms]')
-        dtype = np.int64
+        array = array.astype(cls._npdatetime64_dtype)
         assert array.dtype.itemsize == 8
-        buf = Buffer(array.astype(dtype, copy=False))
-        buf.dtype = array.dtype
+        buf = Buffer(array)
         return cls(data=buf, dtype=buf.dtype)
 
     @property
@@ -68,9 +75,50 @@ def get_dt_field(self, field):
             self,
             dtype=np.int16
         )
-        # force mask again
-        out._mask = self.mask
         _gdf.apply_unaryop(self.funcs[field], self, out)
         return out
+
+    def normalize_binop_value(self, other):
+        if isinstance(other, dt.datetime):
+            other = np.datetime64(other)
+
+        if isinstance(other, pd.Timestamp):
+            ary = utils.scalar_broadcast_to(
+                other.value * self._pandas_conversion_factor,
+                shape=len(self),
+                dtype=self._npdatetime64_dtype
+            )
+        elif isinstance(other, np.datetime64):
+            other = other.astype(self._npdatetime64_dtype)
+            ary = utils.scalar_broadcast_to(
+                other,
+                shape=len(self),
+                dtype=self._npdatetime64_dtype
+            )
+        else:
+            raise TypeError('cannot broadcast {}'.format(type(other)))
+
+        buf = Buffer(ary)
+        result = self.replace(data=buf, dtype=self.dtype)
+        return result
+
+    def unordered_compare(self, cmpop, rhs):
+        lhs, rhs = self, rhs
+        return binop(
+            lhs, rhs,
+            op=_unordered_impl[cmpop],
+            out_dtype=np.bool
+        )
+
+    def to_pandas(self, index):
+        return pd.Series(self.to_array().astype(self.dtype), index=index)
+
+
+def binop(lhs, rhs, op, out_dtype):
+    masked = lhs.has_null_mask or rhs.has_null_mask
+    out = columnops.column_empty_like(lhs, dtype=out_dtype, masked=masked)
+    null_count = _gdf.apply_binaryop(op, lhs, rhs, out)
+    out = out.replace(null_count=null_count)
+    return out
 
 
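`normalize_binop_value` is what lets a `DatetimeColumn` be compared against a plain `datetime.datetime`, a `pd.Timestamp`, or a `np.datetime64`: the scalar is converted to the column's `datetime64[ms]` representation and broadcast to the column's length before the generic libgdf comparison kernel runs. The NumPy side of that conversion can be sketched without a GPU; the names below are illustrative, not pygdf API:

```python
import datetime as dt

import numpy as np
import pandas as pd

column_dtype = np.dtype('datetime64[ms]')  # DatetimeColumn._npdatetime64_dtype

# A Python datetime and a pandas Timestamp should land on the same ms value.
scalar = dt.datetime(2000, 10, 21)
as_np = np.datetime64(scalar).astype(column_dtype)

ts = pd.Timestamp(scalar)                   # ts.value holds nanoseconds
from_ts = np.datetime64(ts.value, 'ns').astype(column_dtype)
assert as_np == from_ts

# Broadcast the scalar to a 4-element column, analogous to what
# utils.scalar_broadcast_to produces for the GPU path.
broadcast = np.full(4, as_np, dtype=column_dtype)
print(broadcast)
```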
diff --git a/pygdf/queryutils.py b/pygdf/queryutils.py
index 2b8ff4f8edd..7da014d4f34 100644
--- a/pygdf/queryutils.py
+++ b/pygdf/queryutils.py
@@ -1,7 +1,8 @@
+
 # Copyright (c) 2018, NVIDIA CORPORATION.
 
 import ast
-
+import datetime as dt
 import six
 
 import numpy as np
@@ -202,6 +203,8 @@ def query_execute(df, expr, callenv):
             name = name[len(ENVREF_PREFIX):]
         try:
             val = envdict[name]
+            if isinstance(val, dt.datetime):
+                val = np.datetime64(val)
         except KeyError:
             msg = '{!r} not defined in the calling environment'
             raise NameError(msg.format(name))
diff --git a/pygdf/series.py b/pygdf/series.py
index afdad47c771..d5e866186d1 100644
--- a/pygdf/series.py
+++ b/pygdf/series.py
@@ -269,6 +269,16 @@ def _binaryop(self, other, fn):
         outcol = self._column.binary_operator(fn, other._column)
         return self._copy_construct(data=outcol)
 
+    def _rbinaryop(self, other, fn):
+        """
+        Internal util to call a binary operator *fn* on operands *self*
+        and *other* for reflected operations.  Return the output Series.
+        The output dtype is determined by the input operands.
+        """
+        other = self._normalize_binop_value(other)
+        outcol = other._column.binary_operator(fn, self._column)
+        return self._copy_construct(data=outcol)
+
     def _unaryop(self, fn):
         """
         Internal util to call a unary operator *fn* on operands *self*.
@@ -281,18 +291,33 @@ def _unaryop(self, fn):
     def __add__(self, other):
         return self._binaryop(other, 'add')
 
+    def __radd__(self, other):
+        return self._rbinaryop(other, 'add')
+
     def __sub__(self, other):
         return self._binaryop(other, 'sub')
 
+    def __rsub__(self, other):
+        return self._rbinaryop(other, 'sub')
+
     def __mul__(self, other):
         return self._binaryop(other, 'mul')
 
+    def __rmul__(self, other):
+        return self._rbinaryop(other, 'mul')
+
     def __floordiv__(self, other):
         return self._binaryop(other, 'floordiv')
 
+    def __rfloordiv__(self, other):
+        return self._rbinaryop(other, 'floordiv')
+
     def __truediv__(self, other):
         return self._binaryop(other, 'truediv')
 
+    def __rtruediv__(self, other):
+        return self._rbinaryop(other, 'truediv')
+
     __div__ = __truediv__
 
     def _normalize_binop_value(self, other):
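The reflected dunders (`__radd__`, `__rsub__`, and friends) fire when the left operand is a plain scalar, e.g. `2 - s`. `_rbinaryop` reuses the `_binaryop` machinery but swaps which column is passed first to `binary_operator`, which is all the non-commutative operators need. A quick NumPy illustration of the semantics the new tests pin down (a sketch of the expected behavior, not pygdf code):

```python
import numpy as np

s = np.array([1.0, 2.0, 4.0])

# Forward vs. reflected: subtraction and floor division are the
# non-commutative cases that __rsub__ / __rfloordiv__ now cover.
assert ((s - 2) == np.array([-1.0, 0.0, 2.0])).all()
assert ((2 - s) == np.array([1.0, 0.0, -2.0])).all()
assert ((2 // s) == np.array([2.0, 1.0, 0.0])).all()
```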
diff --git a/pygdf/tests/test_binops.py b/pygdf/tests/test_binops.py
index ba370f89793..e7bc5980867 100644
--- a/pygdf/tests/test_binops.py
+++ b/pygdf/tests/test_binops.py
@@ -138,3 +138,30 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype):
     np.testing.assert_array_equal(cmpop(Series(lhs),
                                         Series(rhs)).to_array(),
                                   cmpop(lhs, rhs))
+
+
+_reflected_ops = [
+    lambda x: 1 + x,
+    lambda x: 2 * x,
+    lambda x: 2 - x,
+    lambda x: 2 // x,
+]
+
+
+@pytest.mark.parametrize('func, dtype', list(product(_reflected_ops, _dtypes)))
+def test_reflected_ops_scalar(func, dtype):
+    import pandas as pd
+
+    # create random series
+    np.random.seed(12)
+    random_series = pd.Series(np.random.sample(100) + 10, dtype=dtype)
+
+    # gpu series
+    gs = Series(random_series)
+    gs_result = func(gs)
+
+    # pandas
+    ps_result = func(random_series)
+
+    # verify
+    np.testing.assert_allclose(ps_result, gs_result)
diff --git a/pygdf/tests/test_datetime.py b/pygdf/tests/test_datetime.py
index 0fb0af164e2..b393bdc677d 100644
--- a/pygdf/tests/test_datetime.py
+++ b/pygdf/tests/test_datetime.py
@@ -1,7 +1,12 @@
 import pytest
+
+import datetime as dt
 import numpy as np
 import pandas as pd
-from pandas.util.testing import assert_index_equal, assert_series_equal
+from pandas.util.testing import (
+    assert_index_equal, assert_series_equal,
+    assert_frame_equal
+)
 
 from pygdf.dataframe import Series, DataFrame
 from pygdf.index import DatetimeIndex
@@ -19,8 +24,6 @@ def data2():
 
 fields = ['year', 'month', 'day', 'hour', 'minute', 'second']
 
-# fields = ['year']
-
 
 @pytest.mark.parametrize('data', [data1(), data2()])
 def test_series(data):
@@ -56,3 +59,44 @@ def test_setitem_datetime():
     a = DataFrame()
     a['a'] = pd.date_range('20010101', '20010105').values
     # TODO check some stuff
+
+
+def test_issue_165():
+    df_pandas = pd.DataFrame()
+    start_date = dt.datetime.strptime("2000-10-21", '%Y-%m-%d')
+    data = [(start_date + dt.timedelta(days=x)) for x in range(6)]
+    df_pandas["dates"] = data
+    df_pandas["num"] = [1, 2, 3, 4, 5, 6]
+    df_pygdf = DataFrame.from_pandas(df_pandas)
+
+    base = df_pandas.query("dates==@start_date")
+    test = df_pygdf.query("dates==@start_date")
+    assert_frame_equal(base, test.to_pandas())
+    assert len(test) > 0
+
+    mask = df_pygdf.dates == start_date
+    base_mask = df_pandas.dates == start_date
+    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
+    assert mask.to_pandas().sum() > 0
+
+    start_date_ts = pd.Timestamp(start_date)
+    test = df_pygdf.query("dates==@start_date_ts")
+    base = df_pandas.query("dates==@start_date_ts")
+    assert_frame_equal(base, test.to_pandas())
+    assert len(test) > 0
+
+    mask = df_pygdf.dates == start_date_ts
+    base_mask = df_pandas.dates == start_date_ts
+    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
+    assert mask.to_pandas().sum() > 0
+
+    start_date_np = np.datetime64(start_date_ts, 'ns')
+    test = df_pygdf.query("dates==@start_date_np")
+    base = df_pandas.query("dates==@start_date_np")
+    assert_frame_equal(base, test.to_pandas())
+    assert len(test) > 0
+
+    mask = df_pygdf.dates == start_date_np
+    base_mask = df_pandas.dates == start_date_np
+    assert_series_equal(mask.to_pandas(), base_mask, check_names=False)
+    assert mask.to_pandas().sum() > 0
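Taken together, the queryutils, datetime, and series changes make the three scalar spellings exercised by `test_issue_165` interchangeable: a `datetime.datetime` (coerced to `np.datetime64` by `query_execute`), a `pd.Timestamp`, and a `np.datetime64` all normalize to the column's millisecond representation. A condensed usage sketch, assuming a working pygdf + libgdf install:

```python
import datetime as dt

import numpy as np
import pandas as pd

from pygdf.dataframe import DataFrame

start_date = dt.datetime(2000, 10, 21)
pdf = pd.DataFrame({
    'dates': [start_date + dt.timedelta(days=x) for x in range(6)],
    'num': [1, 2, 3, 4, 5, 6],
})
gdf = DataFrame.from_pandas(pdf)

# Each scalar type selects the same single matching row, whether it goes
# through .query() with an @-reference or through a broadcast comparison.
for scalar in (start_date, pd.Timestamp(start_date), np.datetime64(start_date)):
    assert len(gdf.query("dates==@scalar")) == 1
    assert (gdf.dates == scalar).to_pandas().sum() == 1
```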