In [1]:
import pytest

import pyarrow as pa
import numpy as np

from vaex_arrow_ext import ext


def test_madd():
    """Check ``ext.madd`` on equal-length float64 arrays and on mismatched lengths.

    The expected values equal ``a + b * c`` element-wise:
    1 + 0*1 = 1, 2 + 2*1 = 4, 3 + 4*5 = 23.
    """
    a = pa.array([1, 2, 3], type=pa.float64())
    b = pa.array([0, 2, 4], type=pa.float64())
    c = pa.array([1, 1, 5], type=pa.float64())
    assert ext.madd(a, b, c).to_pylist() == [1, 4, 23]

    # A sliced first operand is shorter than the other two; the call itself is
    # expected to raise. No assertion on a result belongs inside the context
    # manager: if nothing is raised, pytest reports "DID NOT RAISE" directly.
    with pytest.raises(ValueError):
        ext.madd(a[1:], b, c)


def test_sum():
    """Compare ``ext.sum`` with ``np.sum`` on a full array and on a slice of it."""
    values = pa.array([1,2,3.1])
    # The second case is a slice that starts at index 1 of the same array.
    for candidate in (values, values[1:]):
        assert ext.sum(candidate) == np.sum(np.array(candidate))


In [1]:


import pyarrow as pa
import numpy as np

from vaex_arrow_ext import ext


# Smoke check of ext.getUnique on a string array with a duplicate ("1" occurs twice)
# paired with an int32 array of the same length.
# NOTE(review): judging by the recorded output [0, 2], this presumably returns the
# entries of `b` at the first occurrence of each distinct value in `a` -- confirm
# against the extension's implementation.
a = pa.array(["1", "1", "3"])
b = pa.array([0, 1, 2], type=pa.int32())
print(ext.getUnique(a, b).to_pylist())



[0, 2]


In [2]:
import pyarrow as pa

In [3]:
# Directory containing the headers shipped with the installed pyarrow package
# (needed on the include path when compiling an extension against Arrow).
pa.get_include()

'/home/zhewu/.local/lib/python3.8/site-packages/pyarrow/include'

In [4]:
# Library names to link against when building an extension that uses pyarrow.
pa.get_libraries()

['arrow', 'arrow_python']

In [1]:
import numpy as np
import pyarrow as pa
import pybind11
# Print the include directories reported by NumPy, PyArrow and pybind11,
# in that order, space-separated on a single line.
print(*(pkg.get_include() for pkg in (np, pa, pybind11)))

/home/zhewu/.local/lib/python3.8/site-packages/numpy/core/include /home/zhewu/.local/lib/python3.8/site-packages/pyarrow/include /home/zhewu/.local/lib/python3.8/site-packages/pybind11/include


In [1]:
%load_ext Cython

In [8]:
import pandas as pd
import numpy as np

In [2]:
%%cython
def f_plain(x):
    """Evaluate the integrand ``x * (x - 1)`` at ``x``."""
    shifted = x - 1
    return x * shifted
def integrate_f_plain(a, b, N):
    """Approximate the integral of ``f_plain`` over [a, b] with a left Riemann sum.

    The interval is split into ``N`` equal-width steps; the integrand is
    sampled at the left edge of each step and the samples are accumulated
    one by one before scaling by the step width.
    """
    width = (b - a) / N
    total = 0
    for step in range(N):
        total += f_plain(a + step * width)
    return total * width

In [12]:
def f(x):
    """Return ``x * (x - 1)``, the integrand used by ``integrate_f``."""
    x_minus_one = x - 1
    return x * x_minus_one


def integrate_f(a, b, N):
    """Approximate the integral of ``f`` over [a, b] with a left Riemann sum.

    ``N`` is the number of equal-width steps; ``f`` is evaluated at
    ``a + i * dx`` for ``i = 0 .. N - 1`` and the running total is scaled
    by the step width at the end.
    """
    step_width = (b - a) / N
    running_total = 0
    for index in range(N):
        sample_point = a + index * step_width
        running_total += f(sample_point)
    return running_total * step_width

In [4]:
# Quick sanity check of the compiled integrand: 12 * (12 - 1) == 132.
f_plain(12)

132

In [18]:
# Benchmark input: one row per integration problem. "a" and "b" are the
# interval bounds, "N" the number of steps (an integer in [100, 1000)), and
# "x" a constant string column.
N_ROWS = 10000
df = pd.DataFrame(
    {
        "a": np.random.randn(N_ROWS),
        "b": np.random.randn(N_ROWS),
        "N": np.random.randint(100, 1000, N_ROWS),
        "x": "x",
    }
)

In [20]:
%%timeit 
# Baseline timing: the pure-Python integrate_f applied to every row (axis=1).
df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]), axis=1)

1.34 s ± 31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%%timeit 
# Same row-wise apply, but calling the Cython-compiled (untyped) integrate_f_plain.
df.apply(lambda x: integrate_f_plain(x["a"], x["b"], x["N"]), axis=1)

845 ms ± 49.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%cython
# C-level integrand x * (x - 1) operating on C doubles.
# `except? -2` declares -2 as the return value after which Cython checks
# whether an exception is actually pending.
cdef double f_typed(double x) except? -2:
    return x * (x - 1)
cpdef double integrate_f_typed(double a, double b, int N):
    """Left Riemann sum of ``f_typed`` over [a, b] using ``N`` equal-width steps.

    The loop counter is a C int and the accumulator and step width are C doubles.
    """
    cdef int k
    cdef double step = (b - a) / N
    cdef double total = 0
    for k in range(N):
        total += f_typed(a + k * step)
    return total * step

In [21]:
# Same row-wise apply, timed with the statically typed Cython integrate_f_typed.
%timeit df.apply(lambda x: integrate_f_typed(x["a"], x["b"], x["N"]), axis=1)

172 ms ± 5.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
