Merge pull request #1439 from rstudio/pandas-na

Preserve NAs when casting R data.frames to pandas.
rstudio · Aug 15, 2023 · dd2529b · dd2529b
2 parents e5794fe + d09c7f8
commit dd2529b
Show file tree

Hide file tree

Showing 5 changed files with 251 additions and 21 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # reticulate (development version)
 
+- reticulate now supports casting R data.frames to Pandas DataFrames using nullable
+  data types, allowing users to preserve `NA`s from R atomic vectors. This feature is
+  opt-in and can be enabled by setting the R option `reticulate.pandas_use_nullable_dtypes`
+  to `TRUE`. (#1439)
+
 # reticulate 1.31
 
 ## Python Installation Management

diff --git a/src/libpython.h b/src/libpython.h
@@ -552,6 +552,9 @@ LIBPYTHON_EXTERN void **PyArray_API;
           (*(PyObject * (*)(PyTypeObject *, int, npy_intp *, int, npy_intp *, void *, int, int, PyObject *)) \
              PyArray_API[93])
 
+// Convenience wrapper around PyArray_New (NumPy ships a macro of the same
+// name): creates an array of the base `PyArray_Type` from just the number of
+// dimensions, the shape and the element type number, passing NULL/0 for every
+// remaining PyArray_New argument.
+#define PyArray_SimpleNew(nd, dims, typenum) \
+          PyArray_New(&PyArray_Type, nd, dims, typenum, NULL, NULL, 0, 0, NULL)
+
+// Return the raw pointer held in the `data` field of the array object,
+// untyped; callers cast it to the element type they expect.
 inline void* PyArray_DATA(PyArrayObject *arr) {
   return ((PyArrayObject_fields *)arr)->data;
 }

diff --git a/src/python.cpp b/src/python.cpp
@@ -285,7 +285,12 @@ std::string as_std_string(PyObject* str) {
 
 #define as_utf8_r_string(str) Rcpp::String(as_std_string(str))
 
-PyObject* as_python_str(SEXP strSEXP) {
+PyObject* as_python_str(SEXP strSEXP, bool handle_na=false) {
+  if (handle_na && strSEXP == NA_STRING) {
+    Py_IncRef(Py_None);
+    return Py_None;
+  }
+
   if (is_python3()) {
     // python3 doesn't have PyString and all strings are unicode so
     // make sure we get a unicode representation from R
@@ -909,12 +914,19 @@ bool is_pandas_na(PyObject* x) {
 
 }
 
+// Function-body helper for module accessors such as numpy() below.
+// Expands to: import `module` into a function-local static PyObjectPtr,
+// throw PythonException if that pointer is null, otherwise return it.
+// Because the static is initialized only once, the import is attempted on the
+// first call only; a failed import is not retried, later calls throw again.
+// The expansion ends in `return`, so it is meant to be the whole function body.
+#define STATIC_MODULE(module)                                      \
+  const static PyObjectPtr mod(PyImport_ImportModule(module));     \
+  if (mod.is_null()) {                                             \
+    throw PythonException(py_fetch_error());                       \
+  }                                                                \
+  return mod;
+
+// Return the "numpy" module object, imported on first use and cached for the
+// rest of the session. Throws PythonException when the import failed.
 PyObject* numpy () {
-  const static PyObjectPtr numpy(PyImport_ImportModule("numpy"));
-  if (numpy.is_null()) {
-    throw PythonException(py_fetch_error());
-  }
-  return numpy;
+  STATIC_MODULE("numpy")
+}
+
+// Return the "pandas.arrays" module object, imported on first use and cached.
+// Throws PythonException when the import failed.
+PyObject* pandas_arrays () {
+  STATIC_MODULE("pandas.arrays")
 }
 
 bool is_pandas_na_like(PyObject* x) {
@@ -1631,7 +1643,7 @@ PyObject* r_to_py_numpy(RObject x, bool convert) {
     void** pData = (void**)PyArray_DATA((PyArrayObject*)array);
     R_xlen_t len = Rf_xlength(x);
     for (R_xlen_t i = 0; i<len; i++) {
-      PyObject* pyStr = as_python_str(STRING_ELT(x, i));
+      PyObject* pyStr = as_python_str(STRING_ELT(x, i), /*handle_na=*/true);
       pData[i] = pyStr;
     }
 
@@ -3381,6 +3393,132 @@ SEXP py_convert_pandas_df(PyObjectRef df) {
 
 }
 
+// Build a one-dimensional numpy boolean array flagging the missing elements
+// of an R atomic vector: entry i is true when element i of `x` is NA. For
+// double and complex vectors NaN counts as missing too, as in R's `is.na()`.
+// Vector types not listed in the switch below produce an all-false mask.
+//
+// Returns a new reference owned by the caller. Throws PythonException when
+// the numpy array cannot be created.
+PyObject* na_mask (SEXP x) {
+
+  // Rf_xlength() reports the length as R_xlen_t, so the length does not go
+  // through the int-based LENGTH().
+  const R_xlen_t n = Rf_xlength(x);
+  npy_intp dims = n;
+
+  PyObject* mask = PyArray_SimpleNew(1, &dims, NPY_BOOL);
+  if (!mask) throw PythonException(py_fetch_error());
+
+  // Instead of building an R logical vector and converting it, write the
+  // flags straight into the numpy buffer. `data` points to memory owned by
+  // `mask`, so we don't need to free it.
+  bool* data = (bool*) PyArray_DATA((PyArrayObject*) mask);
+  if (!data) {
+    // release the array we just created before bailing out
+    Py_DecRef(mask);
+    throw PythonException(py_fetch_error());
+  }
+
+  // This is modified from R primitive do_isna - backing the `is.na()`:
+  // https://github.com/wch/r-source/blob/6b5d4ca5d1e3b4b9e4bbfb8f75577aff396a378a/src/main/coerce.c#L2221
+  // Unfortunately couldn't find a simple way to find NA's for whichever atomic type.
+  switch (TYPEOF(x)) {
+  case LGLSXP:
+    for (R_xlen_t i = 0; i < n; i++)
+      data[i] = (LOGICAL_ELT(x, i) == NA_LOGICAL);
+    break;
+  case INTSXP:
+    for (R_xlen_t i = 0; i < n; i++)
+      data[i] = (INTEGER_ELT(x, i) == NA_INTEGER);
+    break;
+  case REALSXP:
+    for (R_xlen_t i = 0; i < n; i++)
+      data[i] = ISNAN(REAL_ELT(x, i));
+    break;
+  case CPLXSXP:
+    for (R_xlen_t i = 0; i < n; i++) {
+      Rcomplex v = COMPLEX_ELT(x, i);
+      data[i] = (ISNAN(v.r) || ISNAN(v.i));
+    }
+    break;
+  case STRSXP:
+    for (R_xlen_t i = 0; i < n; i++)
+      data[i] = (STRING_ELT(x, i) == NA_STRING);
+    break;
+  default:
+    // PyArray_SimpleNew does not initialize the buffer, so never hand back
+    // whatever happened to be in memory: mark every element as not missing.
+    for (R_xlen_t i = 0; i < n; i++)
+      data[i] = false;
+    break;
+  }
+
+  return mask;
+}
+
+// Convert an R atomic vector (integer, double, logical or character) into the
+// matching pandas nullable array (IntegerArray, FloatingArray, BooleanArray
+// or StringArray) so that NA values survive the conversion.
+//
+// Falls back to the plain numpy conversion, after an R warning, when the
+// installed pandas lacks the required array class or rejects the string data.
+// Returns a new reference owned by the caller. Calls Rcpp::stop() for any
+// other R type and throws PythonException when the pandas constructor fails.
+PyObject* r_to_py_pandas_nullable_series (const RObject& column, const bool convert) {
+
+  // Each array class is looked up once and cached; a failed lookup leaves the
+  // cached pointer null, which is handled right after the switch.
+  PyObject* constructor = NULL;
+  switch (TYPEOF(column)) {
+  case INTSXP: {
+    const static PyObjectPtr IntArray(
+        PyObject_GetAttrString(pandas_arrays(), "IntegerArray")
+    );
+    constructor = IntArray.get();
+    break;
+  }
+  case REALSXP: {
+    const static PyObjectPtr FloatArray(
+        PyObject_GetAttrString(pandas_arrays(), "FloatingArray")
+    );
+    constructor = FloatArray.get();
+    break;
+  }
+  case LGLSXP: {
+    const static PyObjectPtr BoolArray(
+        PyObject_GetAttrString(pandas_arrays(), "BooleanArray")
+    );
+    constructor = BoolArray.get();
+    break;
+  }
+  case STRSXP: {
+    const static PyObjectPtr StringArray(
+        PyObject_GetAttrString(pandas_arrays(), "StringArray")
+    );
+    constructor = StringArray.get();
+    break;
+  }
+  default:
+    Rcpp::stop("R type not handled. Please supply one of int, double, logical or character");
+  }
+
+  if (!constructor) {
+    // if the constructor is not available it means that the user doesn't have
+    // the minimum pandas version.
+    // The failed attribute lookup may have left a Python error pending; clear
+    // it so it cannot surface later in an unrelated call.
+    PyErr_Clear();
+
+    // we show a warning and force the numpy construction.
+    Rcpp::warning(
+      "Nullable data types require pandas version >= 1.2.0. "
+      "Forcing numpy cast. Use `options(reticulate.pandas_use_nullable_dtypes = FALSE)` "
+      "to disable this warning."
+    );
+
+    return r_to_py_numpy(column, convert);
+  }
+
+  // strings are not built using np array + mask. Instead they take a
+  // np array with OBJECT type, with None's in the place of NA's
+  if (TYPEOF(column) == STRSXP) {
+    PyObjectPtr args(PyTuple_New(2));
+    PyTuple_SetItem(args, 0, (PyObject*)r_to_py_numpy(column, convert));
+    // PyTuple_SetItem steals a reference, so take one on the False singleton
+    // first; otherwise its refcount drops every time the tuple is destroyed.
+    Py_IncRef(Py_False);
+    PyTuple_SetItem(args, 1, Py_False);                                  // copy=False
+
+    PyObject* pd_col(PyObject_Call(constructor, args, NULL));
+
+    if (!pd_col) {
+      // it's likely that the error is caused by using an old version of pandas
+      // that don't accept `None` as a `NA` value.
+      // Drop the pending Python error before falling back, then
+      // we force the old cast method after a warning.
+      PyErr_Clear();
+      Rcpp::warning(
+        "String nullable data types require pandas version >= 1.5.0. "
+        "Forcing numpy cast. Use `options(reticulate.pandas_use_nullable_dtypes = FALSE)` "
+        "to disable this warning."
+      );
+
+      return r_to_py_numpy(column, convert);
+    }
+
+    return pd_col;
+  }
+
+  // tuples own the objects - thus we don't leak the value and mask
+  PyObjectPtr args(PyTuple_New(3));
+  PyTuple_SetItem(args, 0, (PyObject*)r_to_py_numpy(column, convert)); // value
+  PyTuple_SetItem(args, 1, (PyObject*)na_mask(column));                // mask
+  // as above: give the tuple its own reference to the False singleton
+  Py_IncRef(Py_False);
+  PyTuple_SetItem(args, 2, Py_False);                                  // copy=False
+
+  PyObject* pd_col(PyObject_Call(constructor, args, NULL));
+  // never hand a NULL back to the caller, which stores the result in a dict
+  if (!pd_col) throw PythonException(py_fetch_error());
+
+  return pd_col;
+}
+
 // [[Rcpp::export]]
 PyObjectRef r_convert_dataframe(RObject dataframe, bool convert) {
 
@@ -3390,35 +3528,58 @@ PyObjectRef r_convert_dataframe(RObject dataframe, bool convert) {
   PyObjectPtr dict(PyDict_New());
 
   CharacterVector names = dataframe.attr("names");
+  // when this option is TRUE, R atomic vectors are cast to pandas nullable
+  // dtypes (which can represent missing values) instead of plain numpy arrays.
+  bool nullable_dtypes = option_is_true("reticulate.pandas_use_nullable_dtypes");
+
   for (R_xlen_t i = 0, n = Rf_xlength(dataframe); i < n; i++)
   {
     RObject column = VECTOR_ELT(dataframe, i);
 
     // ensure name is converted to appropriate encoding
-    const char* name = is_python3()
-      ? Rf_translateCharUTF8(names[i])
-      : Rf_translateChar(names[i]);
+    PyObjectPtr name(as_python_str(names[i]));
 
     int status = 0;
-    if (OBJECT(column) == 0) {
-      if (is_convertible_to_numpy(column)) {
-        PyObjectPtr value(r_to_py_numpy(column, convert));
-        status = PyDict_SetItemString(dict, name, value);
-      } else {
-        PyObjectPtr value(r_to_py_cpp(column, convert));
-        status = PyDict_SetItemString(dict, name, value);
-      }
-    } else {
+
+    if (OBJECT(column) != 0) {
+      // An object with a class attribute, we dispatch to the S3 method
+      // and continue to the next column.
       PyObjectRef ref(r_convert_dataframe_column(column, convert));
-      status = PyDict_SetItemString(dict, name, ref.get());
+      status = PyDict_SetItem(dict, name, ref.get());
+      if (status != 0)
+        throw PythonException(py_fetch_error());
+
+      continue;
+    }
+
+    if (!is_convertible_to_numpy(column)) {
+      // Not an atomic type supported by numpy, thus we use the default
+      // cast engine and continue to the next column.
+      PyObjectPtr value(r_to_py_cpp(column, convert));
+      status = PyDict_SetItem(dict, name, value);
+
+      if (status != 0)
+        throw PythonException(py_fetch_error());
+
+      continue;
+    }
+
+    // We are sure it's an atomic vector:
+    // LGLSXP, INTSXP, REALSXP, CPLXSXP or STRSXP
+    if (!nullable_dtypes || TYPEOF(column) == CPLXSXP) {
+      PyObjectPtr value(r_to_py_numpy(column, convert));
+      status = PyDict_SetItem(dict, name, value);
+    } else {
+      // use Pandas nullable data types.
+      PyObjectPtr value(r_to_py_pandas_nullable_series(column, convert));
+      status = PyDict_SetItem(dict, name, value);
     }
 
     if (status != 0)
       throw PythonException(py_fetch_error());
   }
 
   return py_ref(dict.detach(), convert);
-
 }
 
 namespace {

diff --git a/tests/testthat/test-python-pandas.R b/tests/testthat/test-python-pandas.R
@@ -273,3 +273,38 @@ test_that("NA in string columns don't prevent simplification", {
   expect_equal(as.logical(is.na(r)), c(FALSE, TRUE, TRUE, TRUE))
 
 })
+
+# With `reticulate.pandas_use_nullable_dtypes` enabled, a data.frame holding an
+# NA in integer, double, logical and character columns must come back
+# unchanged after an R -> pandas -> R round trip.
+test_that("NA's are preserved in pandas columns", {
+  # skipped on pandas < 1.5, where the nullable types do not work fully
+  pd <- import("pandas")
+  if (numeric_version(pd$`__version__`) < "1.5") {
+    skip("Nullable data types require pandas version >= 1.5 to work fully.")
+  }
+
+  # every column starts with an NA followed by ten regular values
+  df <- data.frame(
+    int = c(NA, 1:10),
+    num = c(NA, rnorm(10)),
+    bool = c(NA, rep(c(TRUE, FALSE), 5)),
+    string = c(NA, letters[1:10])
+  )
+
+  # the option is only set while converting to Python
+  withr::with_options(c(reticulate.pandas_use_nullable_dtypes = TRUE), {
+    p_df <- r_to_py(df)
+  })
+
+  r_df <- py_to_r(p_df)
+
+  expect_identical(r_df$num, df$num)
+  expect_identical(r_df$int, df$int)
+  expect_identical(r_df$bool, df$bool)
+  expect_identical(r_df$string, df$string)
+})
+
+# Default conversion path (the nullable-dtypes option is not set here): an NA
+# in a character column must be seen as missing by pandas and must still be
+# NA after converting back to R.
+test_that("Round trip for string columns with NA's works correctly", {
+  df <- data.frame(string = c(NA, letters[1:10]))
+  p <- r_to_py(df)
+
+  # the first element is missing on the Python side
+  expect_true(py_to_r(p$string$isna()[0]))
+
+  # and it is still NA once converted back to R
+  r <- py_to_r(p)
+  expect_true(is.na(r$string[1]))
+})
diff --git a/vignettes/calling_python.Rmd b/vignettes/calling_python.Rmd
@@ -240,6 +240,32 @@ R data frames can be automatically converted to and from [Pandas](https://pandas
 
 If the R data frame has row names, the generated Pandas DataFrame will be re-indexed using those row names (and vice versa). Special handling is also available for a `DatetimeIndex` associated with a Pandas DataFrame; however, because R only supports character vectors for row names they are converted to character first.
 
+### Using Pandas nullable data types
+
+Pandas has experimental support for nullable data types. Those data types have built-in
+support for missing values, represented by `pd.NA` and using them allows us to better
+represent R `NA` values.
+
+Users can opt in to Pandas nullable data types instead of numpy arrays by setting
+the R option `reticulate.pandas_use_nullable_dtypes` to `TRUE`. For example:
+
+```r
+df <- data.frame(
+  int = c(NA, 1:4),
+  num = c(NA, rnorm(4)),
+  lgl = c(NA, rep(c(TRUE, FALSE), 2)),
+  string = c(NA, letters[1:4])
+)
+options(reticulate.pandas_use_nullable_dtypes = TRUE)
+r_to_py(df)
+#>     int       num    lgl string
+#> 0  <NA>      <NA>   <NA>   <NA>
+#> 1     1 -0.697855   True      a
+#> 2     2 -0.253042  False      b
+#> 3     3  0.385421   True      c
+#> 4     4  0.519933  False      d
+```
+
 ## Sparse Matrices
 
 Sparse matrices created by the [Matrix R package](https://CRAN.R-project.org/package=Matrix) can be converted to a [Scipy CSC matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csc_matrix.html), and vice versa. This is often useful when you want to pass sparse matrices to Python functions that accept a Scipy CSC matrix to take advantage of this format, such as efficient column slicing and fast matrix vector products.