# First, import the py11 module

The underlying technology that makes this work is pybind11 (https://github.com/pybind/pybind11).

It compiles individual functions in C++ to provide a more dynamic approach to C++ programming.

First step, import the following symbols...

In [1]:
# Start from a clean slate by deleting and recreating ~/tmp.
# WARNING: this removes everything that is already under ~/tmp.
# (The g++ commands echoed in later output place their .cpp/.so files in
# the author's ~/tmp directory.)
!rm -fr ~/tmp
!mkdir -p ~/tmp

In [2]:
from py11 import py11, smap, svec

To create a C++ function that's callable from Python, use the @py11 decorator and
put your C++ code in the docstring. You
can pass the decorator a list of the headers the code needs. You can modify the source and
re-execute the cell if you want different output.

In [3]:
# The docstring is not documentation: it is the C++ function body that
# @py11 compiles. <iostream> is requested because the body uses
# std::cout and std::cerr.
@py11(headers=['<iostream>'])
def hello():
    """
    std::cout << "Hello, world" << std::endl;
    std::cerr << "Goodbye, world" << std::endl;
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/hello_v1.so /home/stevenrbrandt/tmp/hello.cpp


We can now call this function directly from Python. It won't recompile again unless we change the source code.

In [4]:
hello()

Hello, world
Goodbye, world


We can pass arguments to the function and receive a return value back. To do this, we use
Python's syntax for declaring types.

In [7]:
# The Python annotations declare the types of the compiled function:
# two int arguments and an int return value. The docstring is the C++ body.
@py11()
def sumi(a : int,b : int)->int:
    """
    // add 2 args
    return a+b;
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/sumi_v1.so /home/stevenrbrandt/tmp/sumi.cpp


In [8]:
sumi(20,22)

42

Static variables can be used to remember state.

In [9]:
# The function-local static in the C++ body keeps its value between calls,
# so each call returns the previous count and then increments it
# (0, 1, 2, ... as the loop in the next cell shows).
@py11()
def count()->int:
    """
    static int counter = 0; // remember state
    return counter++;
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/count_v1.so /home/stevenrbrandt/tmp/count.cpp


In [10]:
for i in range(5):
    print(count())

0
1
2
3
4


If you want to pass a more complex data structure, like an std::vector, use svec.

In [11]:
# svec[int] is the annotation for a std::vector argument holding ints.
# The C++ body walks the vector with an explicit iterator loop and returns
# the sum of its elements.
@py11(headers=["<vector>"])
def sumv(v : svec[int])->int:
    """
    int sum=0;
    for(auto i=v.begin(); i != v.end(); ++i)
        sum += *i;
    return sum;
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/sumv_v1.so /home/stevenrbrandt/tmp/sumv.cpp


In [12]:
sumv([1,2,3,4])

10

Something similar works for maps...

In [13]:
# smap[str,int] is the map annotation; the C++ body iterates the map and
# adds up the mapped values (i->second). The header list is passed with the
# `headers=` keyword, the same spelling every other cell in this notebook uses.
@py11(headers=['<map>'])
def summ(m : smap[str,int])->int:
    """
    int sum = 0;
    for(auto i=m.begin();i != m.end();++i) {
        sum += i->second;
    }
    return sum;
    """

print(summ({"a":3,"b":10}))

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/summ_v1.so /home/stevenrbrandt/tmp/summ.cpp
13


It is possible to use pybind11 to throw an exception from C++ into Python.

In [14]:
# Keeps a function-static std::map<std::string,int> alive across calls.
#   p is True : store v under k and return the value k held before the
#               store (operator[] default-inserts 0 for a key not yet present).
#   p is False: return the value stored under k; if k is absent, throw
#               py::key_error, which arrives in Python as KeyError.
@py11()
def get_and_set(k : str, v : int, p : bool)->int:
    """
    static std::map<std::string,int> dict;
    if(!p)
        if(dict.find(k) == dict.end())
            throw py::key_error(k);
    int r = dict[k];
    if(p)dict[k] = v;
    return r;
        
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/get_and_set_v1.so /home/stevenrbrandt/tmp/get_and_set.cpp


In [15]:
get_and_set("a",3,True)

0

In [16]:
get_and_set("a",0,False)

3

In [17]:
get_and_set("b",0,False)

KeyError: 'b'

Recursion works...

In [18]:
# Naive doubly-recursive Fibonacci in C++; the compiled body calls itself
# by name. Used below as the fast side of the benchmark against fib2.
@py11()
def fib(n : int)->int:
    """
    if(n < 2) return n;
    return fib(n-1)+fib(n-2);
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/fib_v1.so /home/stevenrbrandt/tmp/fib.cpp


If we define a Python version of this same function, we can run a benchmark...

In [19]:
def fib2(n):
    """Pure-Python Fibonacci, written with the same naive double recursion
    as the compiled ``fib`` so the two can be timed against each other.

    Values of ``n`` below 2 (including negatives) are returned unchanged.
    """
    return n if n < 2 else fib2(n-1) + fib2(n-2)

In [20]:
def timer(fun,*args):
    """Call ``fun(*args)`` once and print how long the call took.

    The elapsed time is printed as ``time: <seconds>``. The return value of
    ``fun`` is discarded and ``timer`` itself returns ``None``.

    ``time.perf_counter`` is used instead of ``time.time``: it is monotonic
    and uses the highest-resolution clock available, so the measured
    interval cannot be skewed (or made negative) by wall-clock adjustments.
    """
    from time import perf_counter
    start = perf_counter()
    fun(*args)
    elapsed = perf_counter() - start
    print("time:",elapsed)

You should find that fib is much faster than fib2.

In [21]:
timer(fib,34)
timer(fib2,34)

time: 0.09941720962524414
time: 2.3330471515655518


If we want to call one @py11() function from another, we can. However, we need to specify what
we are doing by means of the funs parameter.

In [22]:
# funs=[fib] declares that this C++ body calls another @py11 function (fib).
# The body prints "fib(n) = <value>" to std::cout.
@py11(funs=[fib])
def print_fib(n:int)->None:
    """
    std::cout << "fib(" << n << ") = " << fib(n) << std::endl;
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/print_fib_v1.so /home/stevenrbrandt/tmp/print_fib.cpp


In [23]:
print_fib(15)

fib(15) = 610


Note that if we redefine fib, print_fib() will automatically
update to use the new version.

In [24]:
# Deliberate redefinition of fib with a different base case (n < 0 rather
# than n < 2), so it no longer produces Fibonacci numbers. Its only purpose
# is to make the change visible when print_fib is called again.
@py11()
def fib(n : int)->int:
    """
    if(n < 0) return n;
    return fib(n-1)+fib(n-2);
    """

g++ -std=c++14 -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/fib_v2.so /home/stevenrbrandt/tmp/fib.cpp


In [25]:
print_fib(15)

fib(15) = -3571


If we want to change the compile flags, we can.

In [26]:
from py11 import py11, create_type, set_flags

# Set the compile flags: C++17, plus the library search path (-L) and link
# flag (-l) for HPX. These replace the -std=c++14 seen in earlier g++ output.
set_flags("-std=c++17 -L/usr/local/lib64 -lhpx")

# Create your own types: after this call the name "func" can be used as an
# annotation standing for the C++ type std::function<void()>.
create_type("func","std::function<void()>")

Sometimes you need to set the LD_LIBRARY_PATH

In [27]:
import os

# Make sure the HPX library directory is on LD_LIBRARY_PATH. The directory
# is prepended rather than assigned so that entries already present in the
# environment are kept, and it is added only once so re-running the cell
# does not keep growing the variable.
hpx_lib_dir = "/usr/local/lib64"
ld_paths = [p for p in os.environ.get("LD_LIBRARY_PATH", "").split(os.pathsep) if p]
if hpx_lib_dir not in ld_paths:
    os.environ["LD_LIBRARY_PATH"] = os.pathsep.join([hpx_lib_dir] + ld_paths)

HPX is an advanced parallel threading library. However, to use it, you have to have a
special threading environment. To make this work, we will create a "wrapper function."
This function needs to take a std::function<void()> as an input argument.

In [28]:
# Wrapper that later cells select with wrap=hpx_wrapper. It receives the
# work to run as f (annotated with the "func" type, i.e. std::function<void()>)
# and hands it to hpx_global::submit_work (presumably declared in
# run_hpx.cpp -- not visible here). The thread count comes from the
# HPX_NUM_THREADS environment variable, falling back to 4 when it is unset.
@py11(headers=["<run_hpx.cpp>"],recompile=True)
def hpx_wrapper(f : func)->None:
    """
    const char *num = getenv("HPX_NUM_THREADS");
    int num_threads = num == 0 ? 4 : atoi(num);
    std::cout << "Using " << num_threads << " threads." << std::endl;
    hpx_global::submit_work(num_threads,f);
    """

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/hpx_wrapper_v1.so /home/stevenrbrandt/tmp/hpx_wrapper.cpp


In [29]:
# Smallest HPX example: executed through hpx_wrapper (wrap=...), it launches
# a trivial task with hpx::async and prints the value obtained from the future.
@py11(headers=["<hpx/hpx.hpp>"],recompile=True,wrap=hpx_wrapper)
def do_fut()->None:
    """
    auto f = hpx::async([](){ return 5; });
    std::cout << "f=" << f.get() << std::endl;
    """

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/do_fut_v1.so /home/stevenrbrandt/tmp/do_fut.cpp


In [30]:
do_fut()

Using 4 threads.
f=5


In [31]:
create_type("future","hpx::future",is_template=True)

In [32]:
# Fibonacci with an HPX task split. For n < 25 the body recurses with plain
# calls; from 25 upward, hpx_fib(n-1) is launched through hpx::async while
# hpx_fib(n-2) is computed by a direct call, and the two results are added
# once the future is ready.
@py11(headers=["<hpx/hpx.hpp>"],recompile=True,wrap=hpx_wrapper)
def hpx_fib(n : int)->int:
    """
    if(n < 2)
        return n;
    if(n < 25)
        return hpx_fib(n-1)+hpx_fib(n-2);
    hpx::future<int> f1 = hpx::async(hpx_fib,n-1);
    int f2 = hpx_fib(n-2);
    return f1.get() + f2;
    """

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/hpx_fib_v1.so /home/stevenrbrandt/tmp/hpx_fib.cpp


In [33]:
nfib = 34
(hpx_fib(nfib), fib2(nfib))

Using 4 threads.


(5702887, 5702887)

In [34]:
nfib = 34
timer(hpx_fib,nfib)
timer(fib2,    nfib)

Using 4 threads.
time: 0.06061148643493652
time: 2.433096170425415


In [35]:
def timer2(fun,args,zargs):
    """Print the time of ``fun(*args)`` minus the time of ``fun(*zargs)``.

    ``fun`` is called twice: first with ``args`` (the measurement), then with
    ``zargs`` (a baseline call). The baseline duration is subtracted from the
    measured one -- presumably to cancel per-call overhead that does not
    depend on the arguments -- and the difference is printed as
    ``time: <seconds>``. The difference can be negative if the baseline call
    happens to take longer. Returns ``None``.

    ``time.perf_counter`` is used because it is monotonic and high
    resolution, which matters when two short intervals are subtracted.
    """
    from time import perf_counter
    t1 = perf_counter()
    fun(*args)
    t2 = perf_counter()
    fun(*zargs)
    t3 = perf_counter()

    measured = t2-t1 # time with args
    baseline = t3-t2 # time with zargs

    print("time:",measured - baseline)

In [36]:
nfib = 34
timer2(hpx_fib,[nfib],[1])
timer2(fib2,   [nfib],[1])

Using 4 threads.
Using 4 threads.
time: 0.038722991943359375
time: 2.3845033645629883


Sometimes calling C++ can kill the notebook kernel. To avoid that problem, we can run the call in a forked child process.

In [37]:
def run_fork(f,*args):
    """Run ``f(*args)`` in a forked child process and print its wait status.

    The parent blocks until the child finishes, prints ``status: <wstatus>``
    and returns ``None``. ``wstatus`` is the raw status from ``os.waitpid``
    (0 means a clean exit; decode other values with ``os.WEXITSTATUS`` /
    ``os.WTERMSIG``). A crash inside ``f`` therefore takes down only the
    child, not the calling process.

    The child always terminates through ``os._exit``: with code 0 when ``f``
    returns, and with code 1 (after printing the traceback, best effort) when
    ``f`` raises. Without that, an exception would leave a second copy of the
    calling process running past this function.
    """
    import os
    import traceback

    pid = os.fork()
    if pid == 0:
        # Child: never return into the caller's code, whatever f does.
        exit_code = 1
        try:
            f(*args)
            exit_code = 0
        except BaseException:
            traceback.print_exc()
        finally:
            os._exit(exit_code)

    # Parent: wait for exactly this child so that the statuses of unrelated
    # child processes are not consumed.
    _, wstatus = os.waitpid(pid, 0)
    print("status:",wstatus)

In [38]:
# Deliberately faulty C++: it writes through a null pointer. It exists to
# demonstrate run_fork, which is how the next cell invokes it.
@py11()
def segv():
    """
    int *i=0;
    i[0]=1;
    """

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/segv_v1.so /home/stevenrbrandt/tmp/segv.cpp


In [39]:
run_fork(segv)

status: 65280


In [40]:
# Helper used by the later examples: prints every element of a vector of
# floats on one line, each followed by a space, then ends the line.
@py11()
def prvec(v : svec[float])->None:
    """
    for(auto i : v)
      std::cout << i << " ";
    std::cout << std::endl;
    """

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/prvec_v1.so /home/stevenrbrandt/tmp/prvec.cpp


Now for some fancy stuff... defining executors that (in principle) can work on more than one locality

In [42]:
# Register "executor" as an annotation name for the block_executor type
# that getexec returns.
create_type("executor","hpx::compute::host::block_executor<>")

# Builds a block_executor over the local host targets reported by HPX,
# prints the locality of each target, and returns the executor.
@py11(headers=["<hpx/hpx.hpp>","<hpx/include/compute.hpp>"])
def getexec()->executor:
    """
    auto host_targets = hpx::compute::host::get_local_targets();
    typedef hpx::compute::host::block_executor<> executor_type;
    executor_type exec(host_targets);
    
    for(auto host : host_targets)
        std::cout << host.get_locality() << std::endl;
        
    return exec;
    """

Fill an array

In [43]:
# Intended flow: build a vector of five 1.0f values, print it, overwrite
# every element with 2.0f using hpx::parallel::fill on the executor from
# getexec, and print it again.
# NOTE(review): the output recorded for this cell is a g++ diagnostic from
# the hpx::parallel::fill instantiation followed by
# "TypeError: 'NoneType' object is not callable", which suggests the compile
# failed and the example never ran. The message is truncated, so the cause
# is not visible here -- TODO investigate.
@py11(wrap=hpx_wrapper,funs=[prvec,getexec],headers=[
    "<hpx/hpx.hpp>",
    "<hpx/include/parallel_fill.hpp>",
    "<hpx/include/compute.hpp>",
    "<hpx/include/parallel_executors.hpp>"
    ])
def fill_example()->None:
    """
    auto exec = getexec();
    
    std::vector<float> vd;
    for(int i=0;i<5;i++) vd.push_back(1.f);
    prvec(vd);
    hpx::parallel::fill(hpx::parallel::execution::par.on(exec),vd.begin(),vd.end(),2.0f);
    prvec(vd);
    """

fill_example()

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/fill_example_v1.so /home/stevenrbrandt/tmp/fill_example.cpp
In file included from /usr/local/include/hpx/parallel/algorithm.hpp:22,
                 from /usr/local/include/hpx/algorithm.hpp:10,
                 from /usr/local/include/hpx/hpx.hpp:10,
                 from /home/stevenrbrandt/tmp/fill_example.cpp:4:
/usr/local/include/hpx/parallel/algorithms/fill.hpp: In instantiation of 'typename hpx::parallel::util::detail::algorithm_result<ExPolicy, void>::type hpx::parallel::v1::fill(ExPolicy&&, FwdIter, FwdIter, const T&) [with ExPolicy = hpx::execution::parallel_policy_shim<hpx::compute::host::block_executor<>, hpx::execution::static_chunk_size>; FwdIter = __gnu_cxx::__normal_iterator<float*, std::vector<float> >; T = float; int _concept_requires_217 = 42; typename std::enable_if<((_concept_requires_637 == 43) || (hpx::is_execution_policy<T>::v

TypeError: 'NoneType' object is not callable

In [44]:
# Builds the vector 0..9, prints it, reverses it in place with
# hpx::parallel::reverse using the parallel policy bound to the executor
# from getexec, and prints the reversed vector.
@py11(wrap=hpx_wrapper,funs=[prvec,getexec],headers=[
    "<hpx/hpx.hpp>",
    "<hpx/parallel/algorithms/reverse.hpp>",
    "<hpx/include/compute.hpp>",
    "<hpx/include/parallel_executors.hpp>"
    ])
def rev_example()->None:
    """
    auto exec = getexec();
    
    std::vector<float> vd;
    for(int i=0;i<10;i++) vd.push_back(i);
    prvec(vd);
    hpx::parallel::reverse(hpx::parallel::execution::par.on(exec),vd.begin(),vd.end());
    prvec(vd);
    """

rev_example()

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/rev_example_v1.so /home/stevenrbrandt/tmp/rev_example.cpp
Using 4 threads.
{0000000100000000, 0000000000000000}
{0000000100000000, 0000000000000000}
{0000000100000000, 0000000000000000}
{0000000100000000, 0000000000000000}
0 1 2 3 4 5 6 7 8 9 
9 8 7 6 5 4 3 2 1 0 


In [45]:
# Builds the vector 300..309, prints it, then prints its smallest and
# largest element found with hpx::parallel::min_element / max_element on
# the executor from getexec.
# Note the two call forms: min_element is given an iterator pair, while
# max_element is given the container itself.
@py11(wrap=hpx_wrapper,funs=[prvec,getexec],headers=[
    "<hpx/hpx.hpp>",
    "<hpx/include/parallel_minmax.hpp>",
    "<hpx/include/compute.hpp>",
    "<hpx/include/parallel_executors.hpp>"
    ])
def min_example()->None:
    """
    auto exec = getexec();
    
    std::vector<float> vd;
    for(int i=0;i<10;i++) vd.push_back(i+300);
    prvec(vd);
    auto ptr = hpx::parallel::min_element(hpx::parallel::execution::par.on(exec),vd.begin(),vd.end(),std::less<float>());
    std::cout << *ptr << std::endl;
    ptr = hpx::parallel::max_element(hpx::parallel::execution::par.on(exec),vd,std::less<float>());
    std::cout << *ptr << std::endl;
    """

min_example()

g++ -std=c++17 -L/usr/local/lib64 -lhpx -I/usr/include/python3.6m -I/usr/local/include -rdynamic -fPIC -shared -o /home/stevenrbrandt/tmp/min_example_v1.so /home/stevenrbrandt/tmp/min_example.cpp
Using 4 threads.
{0000000100000000, 0000000000000000}
{0000000100000000, 0000000000000000}
{0000000100000000, 0000000000000000}
{0000000100000000, 0000000000000000}
300 301 302 303 304 305 306 307 308 309 
300
309


Exercise: Make this routine run in parallel with HPX

In [47]:
# Starting point for the exercise: a serial quicksort written in C++.
# Vectors with fewer than two elements are returned as-is. Otherwise a
# random pivot is picked, the input is split into lo (< pivot), eq
# (== pivot) and hi (> pivot), lo and hi are sorted recursively, and the
# three parts are concatenated in that order.
# The call is routed through hpx_wrapper, but the body itself does not use
# any HPX parallel facility yet -- adding that is the exercise.
@py11(wrap=hpx_wrapper,headers=["<hpx/hpx.hpp>"])
def myqsort(v : svec[int])->svec[int]:
    """
    if(v.size() < 2)
        return v;
    int pivot = v[rand() % v.size()];
    std::vector<int> lo, hi, eq;
    for(int i=0;i<v.size();i++) {
        if(v[i] < pivot)
           lo.push_back(v[i]);
        else if(v[i] > pivot)
           hi.push_back(v[i]);
        else
           eq.push_back(v[i]);
    }
    lo = myqsort(lo);
    hi = myqsort(hi);
    std::vector<int> result;
    result.insert(result.end(),lo.begin(),lo.end());
    result.insert(result.end(),eq.begin(),eq.end());
    result.insert(result.end(),hi.begin(),hi.end());
    return result;
    """

from random import randint

# Sort 20 random integers in 1..100 with the compiled routine and show
# both the input and the result.
inp = [randint(1,100) for _ in range(20)]
out = myqsort(inp)
print(inp)
print(out)

# check result...
# 1) every neighbouring pair must be in non-decreasing order
for left, right in zip(out, out[1:]):
    assert left <= right

# 2) out must hold exactly the same values as inp: tally the input, then
#    cancel each output value against the tally
vals = {}
for item in inp:
    vals[item] = vals.get(item, 0) + 1
for item in out:
    # a KeyError here means the sort produced a value that was never in inp
    vals[item] -= 1
for remaining in vals.values():
    assert remaining == 0 # should have all the same values in output

Using 4 threads.
[88, 35, 82, 50, 48, 54, 68, 22, 88, 19, 58, 78, 36, 74, 54, 62, 34, 74, 98, 56]
[19, 22, 34, 35, 36, 48, 50, 54, 54, 56, 58, 62, 68, 74, 74, 78, 82, 88, 88, 98]
