Skip to content

Commit

Permalink
Merge pull request #1439 from rstudio/pandas-na
Browse files Browse the repository at this point in the history
Preserve NAs when casting R data.frames to pandas.
  • Loading branch information
t-kalinowski committed Aug 15, 2023
2 parents e5794fe + d09c7f8 commit dd2529b
Show file tree
Hide file tree
Showing 5 changed files with 251 additions and 21 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# reticulate (development version)

- reticulate now supports casting R data.frames to Pandas DataFrames using nullable
data types, allowing users to preserve `NA`s from R atomic vectors. This feature is
opt-in and can be enabled by setting the R option `reticulate.pandas_use_nullable_dtypes`
to `TRUE`. (#1439)

# reticulate 1.31

## Python Installation Management
Expand Down
3 changes: 3 additions & 0 deletions src/libpython.h
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,9 @@ LIBPYTHON_EXTERN void **PyArray_API;
(*(PyObject * (*)(PyTypeObject *, int, npy_intp *, int, npy_intp *, void *, int, int, PyObject *)) \
PyArray_API[93])

// Shorthand for PyArray_New(): creates a new `nd`-dimensional array of base
// type PyArray_Type with shape `dims` and element type `typenum`, passing
// NULL / 0 for every remaining PyArray_New argument.
#define PyArray_SimpleNew(nd, dims, typenum) \
PyArray_New(&PyArray_Type, nd, dims, typenum, NULL, NULL, 0, 0, NULL)

// Returns the `data` member of the array object, i.e. the pointer to the
// array's element buffer, as an untyped pointer.
inline void* PyArray_DATA(PyArrayObject *arr) {
  PyArrayObject_fields* fields = reinterpret_cast<PyArrayObject_fields*>(arr);
  return fields->data;
}
Expand Down
203 changes: 182 additions & 21 deletions src/python.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,12 @@ std::string as_std_string(PyObject* str) {

#define as_utf8_r_string(str) Rcpp::String(as_std_string(str))

PyObject* as_python_str(SEXP strSEXP) {
PyObject* as_python_str(SEXP strSEXP, bool handle_na=false) {
if (handle_na && strSEXP == NA_STRING) {
Py_IncRef(Py_None);
return Py_None;
}

if (is_python3()) {
// python3 doesn't have PyString and all strings are unicode so
// make sure we get a unicode representation from R
Expand Down Expand Up @@ -909,12 +914,19 @@ bool is_pandas_na(PyObject* x) {

}

// Expands to the complete body of a module accessor function: imports
// `module` the first time the enclosing function runs (the result is kept in
// a function-local static), throws PythonException if the import failed, and
// returns the module. Because the expansion ends in a `return`, it has to be
// the last thing in the function that uses it.
#define STATIC_MODULE(module) \
const static PyObjectPtr mod(PyImport_ImportModule(module)); \
if (mod.is_null()) { \
throw PythonException(py_fetch_error()); \
} \
return mod;

// Returns the `numpy` module, importing it on first use and reusing the same
// module object on later calls. Throws PythonException when the import fails.
// The import/check/return sequence comes from STATIC_MODULE so this accessor
// stays in step with the other module accessors.
PyObject* numpy () {
  STATIC_MODULE("numpy")
}

// Returns the `pandas.arrays` module, importing it on first use (see
// STATIC_MODULE). Throws PythonException when the import fails.
PyObject* pandas_arrays () {
STATIC_MODULE("pandas.arrays")
}

bool is_pandas_na_like(PyObject* x) {
Expand Down Expand Up @@ -1631,7 +1643,7 @@ PyObject* r_to_py_numpy(RObject x, bool convert) {
void** pData = (void**)PyArray_DATA((PyArrayObject*)array);
R_xlen_t len = Rf_xlength(x);
for (R_xlen_t i = 0; i<len; i++) {
PyObject* pyStr = as_python_str(STRING_ELT(x, i));
PyObject* pyStr = as_python_str(STRING_ELT(x, i), /*handle_na=*/true);
pData[i] = pyStr;
}

Expand Down Expand Up @@ -3381,6 +3393,132 @@ SEXP py_convert_pandas_df(PyObjectRef df) {

}

// Builds a 1-d numpy boolean array with one element per element of `x`,
// set to true where the corresponding R value is missing.
//
// Missingness follows R's `is.na()` for logical, integer, double, complex and
// character vectors. Any other type yields an all-false mask.
//
// Returns a new reference owned by the caller. Throws PythonException if the
// numpy array cannot be created.
PyObject* na_mask (SEXP x) {

  // Rf_xlength() rather than LENGTH() so that long vectors are sized correctly.
  const R_xlen_t n = Rf_xlength(x);
  npy_intp dims = static_cast<npy_intp>(n);

  // Held in a PyObjectPtr so the array is released if we throw below.
  PyObjectPtr mask(PyArray_SimpleNew(1, &dims, NPY_BOOL));
  if (mask.is_null()) throw PythonException(py_fetch_error());

  // We write straight into the buffer of the numpy array instead of building
  // an R logical vector and converting it. The buffer belongs to `mask`, so
  // it must not be freed here.
  bool* data = (bool*) PyArray_DATA((PyArrayObject*) mask.get());
  if (!data) throw PythonException(py_fetch_error());

  R_xlen_t i;

  // This is modified from R primitive do_isna - backing the `is.na()`:
  // https://github.com/wch/r-source/blob/6b5d4ca5d1e3b4b9e4bbfb8f75577aff396a378a/src/main/coerce.c#L2221
  // Unfortunately couldn't find a simple way to find NA's for whichever atomic type.
  switch (TYPEOF(x)) {
  case LGLSXP:
    for (i = 0; i < n; i++)
      data[i] = (LOGICAL_ELT(x, i) == NA_LOGICAL);
    break;
  case INTSXP:
    for (i = 0; i < n; i++)
      data[i] = (INTEGER_ELT(x, i) == NA_INTEGER);
    break;
  case REALSXP:
    // ISNAN() is true for NaN as well as NA_real_, same as is.na().
    for (i = 0; i < n; i++)
      data[i] = ISNAN(REAL_ELT(x, i));
    break;
  case CPLXSXP:
    for (i = 0; i < n; i++) {
      Rcomplex v = COMPLEX_ELT(x, i);
      data[i] = (ISNAN(v.r) || ISNAN(v.i));
    }
    break;
  case STRSXP:
    for (i = 0; i < n; i++)
      data[i] = (STRING_ELT(x, i) == NA_STRING);
    break;
  default:
    // The array comes back from numpy uninitialised, so types we do not
    // recognise must be filled explicitly rather than returning garbage.
    for (i = 0; i < n; i++)
      data[i] = false;
    break;
  }

  return mask.detach();
}

// Converts an R atomic vector into a pandas nullable array so that R `NA`
// values are preserved.
//
// column:  an integer, double, logical or character vector. Any other type
//          raises an R error.
// convert: forwarded unchanged to r_to_py_numpy().
//
// Integer, double and logical vectors are built from a numpy array of values
// plus a boolean NA mask. Character vectors are built from a numpy object
// array only. If the pandas array class is unavailable, or the string array
// cannot be constructed, a warning is issued and the plain numpy conversion
// is returned instead.
//
// Returns a new reference owned by the caller. Throws PythonException when
// the pandas constructor call fails for a non-string column.
PyObject* r_to_py_pandas_nullable_series (const RObject& column, const bool convert) {

  // Each constructor is looked up once and cached. A failed lookup leaves the
  // cached pointer null, which is handled right after the switch.
  PyObject* constructor;
  switch (TYPEOF(column)) {
  case INTSXP: {
    const static PyObjectPtr IntArray(
        PyObject_GetAttrString(pandas_arrays(), "IntegerArray")
    );
    constructor = IntArray.get();
    break;
  }
  case REALSXP: {
    const static PyObjectPtr FloatArray(
        PyObject_GetAttrString(pandas_arrays(), "FloatingArray")
    );
    constructor = FloatArray.get();
    break;
  }
  case LGLSXP: {
    const static PyObjectPtr BoolArray(
        PyObject_GetAttrString(pandas_arrays(), "BooleanArray")
    );
    constructor = BoolArray.get();
    break;
  }
  case STRSXP: {
    const static PyObjectPtr StringArray(
        PyObject_GetAttrString(pandas_arrays(), "StringArray")
    );
    constructor = StringArray.get();
    break;
  }
  default:
    Rcpp::stop("R type not handled. Please supply one of int, double, logical or character");
  }

  if (!constructor) {
    // if the constructor is not available it means that the user doesn't have
    // the minimum pandas version.
    // The failed attribute lookup may have left a Python error pending; drop
    // it so it cannot surface in an unrelated later call.
    PyErr_Clear();

    // we show a warning and force the numpy construction.
    Rcpp::warning(
      "Nullable data types require pandas version >= 1.2.0. "
      "Forcing numpy cast. Use `options(reticulate.pandas_use_nullable_dtypes = FALSE)` "
      "to disable this warning."
    );

    return r_to_py_numpy(column, convert);
  }

  // strings are not built using np array + mask. Instead they take a
  // np array with OBJECT type, with None's in the place of NA's
  if (TYPEOF(column) == STRSXP) {
    PyObjectPtr args(PyTuple_New(2));
    if (args.is_null()) throw PythonException(py_fetch_error());

    PyTuple_SetItem(args, 0, (PyObject*)r_to_py_numpy(column, convert));
    // PyTuple_SetItem steals a reference, and Py_False is a shared singleton
    // we do not own, so take a reference for the tuple first.
    Py_IncRef(Py_False);
    PyTuple_SetItem(args, 1, Py_False); // copy=False

    PyObject* pd_col(PyObject_Call(constructor, args, NULL));

    if (!pd_col) {
      // it's likely that the error is caused by using an old version of pandas
      // that don't accept `None` as a `NA` value.
      // Clear the Python error raised by the failed call before falling back.
      PyErr_Clear();

      // we force the old cast method after a warning.
      Rcpp::warning(
        "String nullable data types require pandas version >= 1.5.0. "
        "Forcing numpy cast. Use `options(reticulate.pandas_use_nullable_dtypes = FALSE)` "
        "to disable this warning."
      );

      return r_to_py_numpy(column, convert);
    }

    return pd_col;
  }

  // tuples own the objects - thus we don't leak the value and mask
  PyObjectPtr args(PyTuple_New(3));
  if (args.is_null()) throw PythonException(py_fetch_error());

  PyTuple_SetItem(args, 0, (PyObject*)r_to_py_numpy(column, convert)); // value
  PyTuple_SetItem(args, 1, (PyObject*)na_mask(column));                // mask
  // Same as above: the tuple needs its own reference to Py_False.
  Py_IncRef(Py_False);
  PyTuple_SetItem(args, 2, Py_False);                                  // copy=False

  PyObject* pd_col(PyObject_Call(constructor, args, NULL));
  // Report a failed constructor call here instead of handing NULL back to
  // the caller.
  if (!pd_col) throw PythonException(py_fetch_error());

  return pd_col;
}

// [[Rcpp::export]]
PyObjectRef r_convert_dataframe(RObject dataframe, bool convert) {

Expand All @@ -3390,35 +3528,58 @@ PyObjectRef r_convert_dataframe(RObject dataframe, bool convert) {
PyObjectPtr dict(PyDict_New());

CharacterVector names = dataframe.attr("names");
// when this is set we cast R atomic vectors to pandas nullable dtypes, which
// can represent missing values, instead of plain numpy arrays.
bool nullable_dtypes = option_is_true("reticulate.pandas_use_nullable_dtypes");

for (R_xlen_t i = 0, n = Rf_xlength(dataframe); i < n; i++)
{
RObject column = VECTOR_ELT(dataframe, i);

// ensure name is converted to appropriate encoding
const char* name = is_python3()
? Rf_translateCharUTF8(names[i])
: Rf_translateChar(names[i]);
PyObjectPtr name(as_python_str(names[i]));

int status = 0;
if (OBJECT(column) == 0) {
if (is_convertible_to_numpy(column)) {
PyObjectPtr value(r_to_py_numpy(column, convert));
status = PyDict_SetItemString(dict, name, value);
} else {
PyObjectPtr value(r_to_py_cpp(column, convert));
status = PyDict_SetItemString(dict, name, value);
}
} else {

if (OBJECT(column) != 0) {
// An object with a class attribute, we dispatch to the S3 method
// and continue to the next column.
PyObjectRef ref(r_convert_dataframe_column(column, convert));
status = PyDict_SetItemString(dict, name, ref.get());
status = PyDict_SetItem(dict, name, ref.get());
if (status != 0)
throw PythonException(py_fetch_error());

continue;
}

if (!is_convertible_to_numpy(column)) {
// Not an atomic type supported by numpy, thus we use the default
// cast engine and continue to the next column.
PyObjectPtr value(r_to_py_cpp(column, convert));
status = PyDict_SetItem(dict, name, value);

if (status != 0)
throw PythonException(py_fetch_error());

continue;
}

// We are sure it's an atomic vector:
// Atomic values STRSXP, INTSXP, REALSXP and CPLSXP
if (!nullable_dtypes || TYPEOF(column) == CPLXSXP) {
PyObjectPtr value(r_to_py_numpy(column, convert));
status = PyDict_SetItem(dict, name, value);
} else {
// use Pandas nullable data types.
PyObjectPtr value(r_to_py_pandas_nullable_series(column, convert));
status = PyDict_SetItem(dict, name, value);
}

if (status != 0)
throw PythonException(py_fetch_error());
}

return py_ref(dict.detach(), convert);

}

namespace {
Expand Down
35 changes: 35 additions & 0 deletions tests/testthat/test-python-pandas.R
Original file line number Diff line number Diff line change
Expand Up @@ -273,3 +273,38 @@ test_that("NA in string columns don't prevent simplification", {
expect_equal(as.logical(is.na(r)), c(FALSE, TRUE, TRUE, TRUE))

})

test_that("NA's are preserved in pandas columns", {
  # The nullable string column needs pandas >= 1.5, so skip on anything older.
  pandas <- import("pandas")
  installed_version <- numeric_version(pandas$`__version__`)
  if (installed_version < "1.5")
    skip("Nullable data types require pandas version >= 1.5 to work fully.")

  # One column per supported atomic type, each starting with an NA.
  original <- data.frame(
    int = c(NA, 1:10),
    num = c(NA, rnorm(10)),
    bool = c(NA, rep(c(TRUE, FALSE), 5)),
    string = c(NA, letters[1:10])
  )

  # Enable the opt-in nullable dtypes for this one conversion only.
  converted <- withr::with_options(
    c(reticulate.pandas_use_nullable_dtypes = TRUE),
    r_to_py(original)
  )

  # Converting back to R must reproduce every column exactly, NA included.
  restored <- py_to_r(converted)

  expect_identical(restored$num, original$num)
  expect_identical(restored$int, original$int)
  expect_identical(restored$bool, original$bool)
  expect_identical(restored$string, original$string)
})

test_that("Round strip for string columns with NA's work correctly", {
  original <- data.frame(string = c(NA, letters[1:10]))
  converted <- r_to_py(original)

  # pandas itself must report the first element as missing ...
  first_is_missing <- py_to_r(converted$string$isna()[0])
  expect_true(first_is_missing)

  # ... and it must still be NA once the frame is converted back to R.
  restored <- py_to_r(converted)
  expect_true(is.na(restored$string[1]))
})
26 changes: 26 additions & 0 deletions vignettes/calling_python.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,32 @@ R data frames can be automatically converted to and from [Pandas](https://pandas

If the R data frame has row names, the generated Pandas DataFrame will be re-indexed using those row names (and vice versa). Special handling is also available for a `DatetimeIndex` associated with a Pandas DataFrame; however, because R only supports character vectors for row names they are converted to character first.

### Using Pandas nullable data types

Pandas has experimental support for nullable data types. Those data types have built-in
support for missing values, represented by `pd.NA` and using them allows us to better
represent R `NA` values.

Users can opt in to using Pandas nullable data types instead of numpy arrays by setting
the R option `reticulate.pandas_use_nullable_dtypes` to `TRUE`. For example:

```r
df <- data.frame(
int = c(NA, 1:4),
num = c(NA, rnorm(4)),
lgl = c(NA, rep(c(TRUE, FALSE), 2)),
string = c(NA, letters[1:4])
)
options(reticulate.pandas_use_nullable_dtypes = TRUE)
r_to_py(df)
#> int num lgl string
#> 0 <NA> <NA> <NA> <NA>
#> 1 1 -0.697855 True a
#> 2 2 -0.253042 False b
#> 3 3 0.385421 True c
#> 4 4 0.519933 False d
```

## Sparse Matrices

Sparse matrices created by [Matrix R package](https://CRAN.R-project.org/package=Matrix) can be converted [Scipy CSC matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html), and vice versa. This is often useful when you want to pass sparse matrices to Python functions that accepts Scipy CSC matrix to take advantage of this format, such as efficient column slicing and fast matrix vector products.
Expand Down

0 comments on commit dd2529b

Please sign in to comment.