Skip to content

Commit

Permalink
Add python rust compilation example (#2826)
Browse files Browse the repository at this point in the history
* start with examples

* add example showing custom compiled python functions
  • Loading branch information
ritchie46 committed Mar 4, 2022
1 parent 0265a86 commit 7ab9bda
Show file tree
Hide file tree
Showing 22 changed files with 355 additions and 57 deletions.
36 changes: 32 additions & 4 deletions .github/workflows/build-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,38 @@ on:
- 'polars/**'
jobs:

examples:
name: Examples
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install Rust
run: rustup update stable
- uses: Swatinem/rust-cache@v1
- name: Install carho-hack
run: |
cargo install cargo-hack
- name: "check"
run: cd examples && cargo check

features:
name: Features
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install Rust nightly
uses: actions-rs/toolchain@v1
with:
toolchain: nightly-2022-02-23
override: true
- uses: Swatinem/rust-cache@v1
- name: Install dependencies
run: |
cargo install cargo-hack
- name: Feature test
run: |
cd polars && cargo hack check --each-feature --no-dev-deps --features private
test-rust:
name: Build and test Rust
runs-on: ubuntu-latest
Expand Down Expand Up @@ -33,7 +65,6 @@ jobs:
key: ubuntu-x86-64-target-cache-nightly
- name: Install dependencies
run: |
cargo install cargo-hack
rm -f dprint-x86_64-unknown-linux-gnu.zip
- name: Run formatting checks
run: |
Expand All @@ -48,9 +79,6 @@ jobs:
-p polars-lazy \
-- -D warnings
cargo clippy -- -D warnings
- name: Feature test
run: |
cd polars && cargo hack check --each-feature --no-dev-deps --features private
- name: Run tests
run: |
export RUSTFLAGS="-C debuginfo=0"
Expand Down
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ members = [
"polars/polars-lazy",
"polars/polars-time",
"polars/polars-utils",
"examples/read_csv",
"examples/read_parquet",
"examples/python_rust_compiled_function",
]

[patch.crates-io]
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ Polars has transitioned to [arrow2](https://crates.io/crates/arrow2).
Arrow2 is a faster and safer implementation of the [Apache Arrow Columnar Format](https://arrow.apache.org/docs/format/Columnar.html).
Arrow2 also has a more granular code base, helping to reduce the compiler bloat.

## Use custom Rust function in python?
See [this example](./examples/python_rust_compiled_function).

## Acknowledgements

Expand Down
4 changes: 0 additions & 4 deletions examples/README.md

This file was deleted.

14 changes: 14 additions & 0 deletions examples/python_rust_compiled_function/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[package]
name = "python_rust_compiled_function"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "my_polars_functions"
crate-type = ["cdylib"]

[dependencies]
polars = { path = "../../polars" }
polars-arrow = { path = "../../polars/polars-arrow" }
pyo3 = "0.16"
11 changes: 11 additions & 0 deletions examples/python_rust_compiled_function/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Compile Custom Rust functions and use in python polars

## Compile a development binary in your current environment
`$ pip install -U maturin && maturin develop`

## Run
`$ python example.py`


## Compile a **release** build
`$ maturin develop --release`
12 changes: 12 additions & 0 deletions examples/python_rust_compiled_function/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import polars as pl
from my_polars_functions import hamming_distance

a = pl.Series("a", ["foo", "bar"])
b = pl.Series("b", ["fooy", "ham"])

dist = hamming_distance(a, b)
expected = pl.Series("", [None, 2], dtype=pl.UInt32)

print(hamming_distance(a, b))
assert dist.series_equal(expected, null_equal=True)

7 changes: 7 additions & 0 deletions examples/python_rust_compiled_function/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[build-system]
requires = ["maturin>=0.12,<0.13"]
build-backend = "maturin"

[project]
name = "my_polars_functions"
version = "0.1.0"
94 changes: 94 additions & 0 deletions examples/python_rust_compiled_function/src/ffi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
use arrow::{array::ArrayRef, ffi};
use polars::prelude::*;
use polars_arrow::export::arrow;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::{ffi::Py_uintptr_t, PyAny, PyObject, PyResult};

/// Take an arrow array from python and convert it to a rust arrow array.
/// This operation does not copy data.
fn array_to_rust(arrow_array: &PyAny) -> PyResult<ArrayRef> {
// prepare a pointer to receive the Array struct
let array = Box::new(ffi::ArrowArray::empty());
let schema = Box::new(ffi::ArrowSchema::empty());

let array_ptr = &*array as *const ffi::ArrowArray;
let schema_ptr = &*schema as *const ffi::ArrowSchema;

// make the conversion through PyArrow's private API
// this changes the pointer's memory and is thus unsafe. In particular, `_export_to_c` can go out of bounds
arrow_array.call_method1(
"_export_to_c",
(array_ptr as Py_uintptr_t, schema_ptr as Py_uintptr_t),
)?;

unsafe {
let field = ffi::import_field_from_c(schema.as_ref()).unwrap();
let array = ffi::import_array_from_c(array, field.data_type).unwrap();
Ok(array.into())
}
}

/// Arrow array to Python.
pub(crate) fn to_py_array(py: Python, pyarrow: &PyModule, array: ArrayRef) -> PyResult<PyObject> {
let array_ptr = Box::new(ffi::ArrowArray::empty());
let schema_ptr = Box::new(ffi::ArrowSchema::empty());

let array_ptr = Box::into_raw(array_ptr);
let schema_ptr = Box::into_raw(schema_ptr);

unsafe {
ffi::export_field_to_c(
&ArrowField::new("", array.data_type().clone(), true),
schema_ptr,
);
ffi::export_array_to_c(array, array_ptr);
};

let array = pyarrow.getattr("Array")?.call_method1(
"_import_from_c",
(array_ptr as Py_uintptr_t, schema_ptr as Py_uintptr_t),
)?;

unsafe {
Box::from_raw(array_ptr);
Box::from_raw(schema_ptr);
};

Ok(array.to_object(py))
}

pub fn py_series_to_rust_series(series: &PyAny) -> PyResult<Series> {
// rechunk series so that they have a single arrow array
let series = series.call_method0("rechunk")?;

let name = series.getattr("name")?.extract::<String>()?;

// retrieve pyarrow array
let array = series.call_method0("to_arrow")?;

// retrieve rust arrow array
let array = array_to_rust(array)?;

Series::try_from((name.as_str(), array)).map_err(|e| PyValueError::new_err(format!("{}", e)))
}

pub fn rust_series_to_py_series(series: &Series) -> PyResult<PyObject> {
// ensure we have a single chunk
let series = series.rechunk();
let array = series.to_arrow(0);

// acquire the gil
let gil = Python::acquire_gil();
let py = gil.python();
// import pyarrow
let pyarrow = py.import("pyarrow")?;

// pyarrow array
let pyarrow_array = to_py_array(py, pyarrow, array)?;

// import polars
let polars = py.import("polars")?;
let out = polars.call_method1("from_arrow", (pyarrow_array,))?;
Ok(out.to_object(py))
}
50 changes: 50 additions & 0 deletions examples/python_rust_compiled_function/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
mod ffi;

use polars::prelude::*;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;

#[pyfunction]
fn hamming_distance(series_a: &PyAny, series_b: &PyAny) -> PyResult<PyObject> {
let series_a = ffi::py_series_to_rust_series(series_a)?;
let series_b = ffi::py_series_to_rust_series(series_b)?;

let out = hamming_distance_impl(&series_a, &series_b)
.map_err(|e| PyValueError::new_err(format!("Something went wrong: {:?}", e)))?;
ffi::rust_series_to_py_series(&out.into_series())
}

/// This function iterates over 2 `Utf8Chunked` arrays and computes the hamming distance between the values .
fn hamming_distance_impl(a: &Series, b: &Series) -> Result<UInt32Chunked> {
Ok(a.utf8()?
.into_iter()
.zip(b.utf8()?.into_iter())
.map(|(lhs, rhs)| hamming_distance_strs(lhs, rhs))
.collect())
}

/// Compute the hamming distance between 2 string values.
fn hamming_distance_strs(a: Option<&str>, b: Option<&str>) -> Option<u32> {
match (a, b) {
(None, _) => None,
(_, None) => None,
(Some(a), Some(b)) => {
if a.len() != b.len() {
None
} else {
Some(
a.chars()
.zip(b.chars())
.map(|(a_char, b_char)| (a_char != b_char) as u32)
.sum::<u32>(),
)
}
}
}
}

#[pymodule]
fn my_polars_functions(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(hamming_distance)).unwrap();
Ok(())
}
13 changes: 13 additions & 0 deletions examples/read_csv/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "read_csv"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[features]
write_output = ["polars/ipc", "polars/parquet"]
default = ["write_output"]

[dependencies]
polars = { path = "../../polars", features = ["lazy", "csv-file", "pretty_fmt"] }
32 changes: 32 additions & 0 deletions examples/read_csv/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use polars::prelude::*;

fn main() -> Result<()> {
let mut df = LazyCsvReader::new("../datasets/foods1.csv".into())
.finish()?
.select([
// select all columns
all(),
// and do some aggregations
cols(["fats_g", "sugars_g"]).sum().suffix("_summed"),
])
.collect()?;

dbg!(&df);

write_other_formats(&mut df)?;
Ok(())
}

fn write_other_formats(df: &mut DataFrame) -> Result<()> {
let parquet_out = "../datasets/foods1.parquet";
if std::fs::metadata(&parquet_out).is_err() {
let f = std::fs::File::create(&parquet_out).unwrap();
ParquetWriter::new(f).with_statistics(true).finish(df)?;
}
let ipc_out = "../datasets/foods1.ipc";
if std::fs::metadata(&ipc_out).is_err() {
let f = std::fs::File::create(&ipc_out).unwrap();
IpcWriter::new(f).finish(df)?
}
Ok(())
}
9 changes: 9 additions & 0 deletions examples/read_parquet/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[package]
name = "read_parquet"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
polars = { path = "../../polars", features = ["lazy", "parquet", "pretty_fmt"] }
18 changes: 18 additions & 0 deletions examples/read_parquet/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
use polars::prelude::*;

fn main() -> Result<()> {
let df = LazyFrame::scan_parquet(
"../datasets/foods1.parquet".into(),
ScanArgsParquet::default(),
)?
.select([
// select all columns
all(),
// and do some aggregations
cols(["fats_g", "sugars_g"]).sum().suffix("_summed"),
])
.collect()?;

dbg!(df);
Ok(())
}
5 changes: 4 additions & 1 deletion polars/polars-lazy/src/dsl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ pub mod string;

use crate::logical_plan::Context;
use crate::prelude::*;
use crate::utils::{has_expr, has_root_literal_expr};
use crate::utils::has_expr;

#[cfg(feature = "is_in")]
use crate::utils::has_root_literal_expr;
use polars_arrow::prelude::QuantileInterpolOptions;
use polars_core::export::arrow::{array::BooleanArray, bitmap::MutableBitmap};
use polars_core::prelude::*;
Expand Down
3 changes: 2 additions & 1 deletion polars/polars-lazy/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,8 @@ pub fn all() -> Expr {
}

/// Select multiple columns by name
pub fn cols(names: Vec<String>) -> Expr {
pub fn cols<I: IntoVec<String>>(names: I) -> Expr {
let names = names.into_vec();
Expr::Columns(names)
}

Expand Down

0 comments on commit 7ab9bda

Please sign in to comment.