In [20]:
import numpy as np
import numba
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
import time

In [104]:
@cuda.jit(device=True, inline=True)
def cuda_is_sorted(arr, length: 'arr.size'):
    """Device helper: tell whether the first `length` items of `arr` ascend.

    Returns True when no element is greater than its successor (equal
    neighbours are accepted), which is trivially the case for `length` < 2.
    Returns False at the first out-of-order adjacent pair.
    """
    # A hand-written while-loop is kept on purpose: it was measured to be
    # much faster than a for-loop here.
    right = 1
    while right < length:
        if arr[right - 1] > arr[right]:
            return False
        right += 1
    return True

In [11]:
@cuda.jit(device=True, inline=True)
def cuda_random_index(
    rng_states: 'pre-initialized random seeds for every thread',
    rng_idx: 'globally unique thread ID',
    max_idx: 'highest index (exclusive)'
) -> 'integer [0, max_idx)':
    """Device helper: draw a random integer index in [0, max_idx).

    Advances the generator state stored at `rng_states[rng_idx]` by one
    draw and scales the resulting uniform float to the requested range.
    `max_idx` is expected to be at least 1.
    """
    rand = xoroshiro128p_uniform_float32(rng_states, rng_idx)
    # int() truncates toward zero, which floors the non-negative product
    idx = int(max_idx*rand)
    # Defensive clamp: the result is used as an array index by the caller.
    # NOTE(review): the uniform draw is a float32; if rounding ever yields
    # exactly 1.0 the product equals max_idx, which would be out of bounds.
    if idx >= max_idx:
        idx = max_idx - 1
    return idx

In [40]:
# The expected number of swaps grows faster than exponentially:
# n! + 2(n-2)! + o((n-2)!) for n -> inf,
# as shown by Flatto/Odlyzko/Wales in
# http://www.dtc.umn.edu/~odlyzko/doc/arch/random.shuffles.pdf
# Therefore, only small arrays can be analyzed in practice.
MAX_ARR_SIZE = 16

@cuda.jit
def cuda_expectation(rng_states, arr, out):
    """Kernel: each thread sorts its own copy of `arr` by random swaps.

    Every thread copies `arr` into a private array, then repeatedly swaps
    two distinct, randomly chosen positions until the copy is in ascending
    order. The number of swaps it needed is written to `out[thread_id]`.

    rng_states -- per-thread generator states, indexed by the global thread ID
    arr        -- 1D input array; must hold at most MAX_ARR_SIZE elements
                  (not checked here -- the caller has to guarantee it)
    out        -- 1D output array receiving one swap count per thread
    """
    # globally unique index/ID
    thread_id = cuda.grid(1)

    # A launch may contain more threads than there are output slots or
    # generator states (e.g. when the thread count is rounded up to whole
    # blocks); surplus threads must not touch memory they do not own.
    if thread_id >= out.size or thread_id >= rng_states.size:
        return

    # thread-private working copy of the input (cuda.local.array)
    local_arr = cuda.local.array(MAX_ARR_SIZE, dtype=numba.int32)

    # initialize the local memory
    length = arr.size
    for idx in range(length):
        local_arr[idx] = arr[idx]

    swap_count = 0
    while not cuda_is_sorted(local_arr, length):

        # choose a random pair of distinct positions; an unsorted array has
        # at least two elements, so the retry loop always terminates
        i = cuda_random_index(rng_states, thread_id, length)
        j = cuda_random_index(rng_states, thread_id, length)
        while i==j:
            j = cuda_random_index(rng_states, thread_id, length)

        # swap the pair
        temp = local_arr[i]
        local_arr[i] = local_arr[j]
        local_arr[j] = temp

        swap_count += 1

    # done with this thread
    out[thread_id] = swap_count

In [None]:
def expectation(
    arr: 'Numpy 1D int32 array of at most MAX_ARR_SIZE elements',
    runs: 'number of iterations or a tuple of GPU block and threads',
    seed: 'random generator seed, default is reuse or current time' = None,
):
    """Estimate the mean number of random swaps needed to sort `arr`.

    Launches `cuda_expectation` with one GPU thread per run and averages
    the per-thread swap counts.

    arr  -- 1D np.int32 array with at most MAX_ARR_SIZE elements.
    runs -- either an integer number of runs (executed with 32 threads per
            block, rounded up to whole blocks) or a `(blocks, threads)`
            tuple, in which case blocks*threads runs are performed.
    seed -- integer seed for the per-thread generators. With None, the
            generator states of the previous call are reused when the
            launch size matches; otherwise they are seeded from the
            current time.

    Returns the mean swap count over the requested runs as a float.
    Raises AssertionError for an unsupported `arr` and ValueError when the
    requested number of runs is not positive.
    """
    assert (
        arr.ndim == 1 and
        arr.size <= MAX_ARR_SIZE and
        arr.dtype == np.int32
    )

    # calc grid, blocks, threads per block
    if isinstance(runs, (int, np.integer)):
        grid_size = int(runs)
        threads = 32
        blocks = (grid_size + threads - 1)//threads
    else:
        blocks, threads = runs
        grid_size = blocks*threads
    if grid_size <= 0:
        raise ValueError('runs must describe a positive number of iterations')

    # Whole blocks are launched, so there can be more threads than requested
    # runs. Every launched thread gets its own generator state and output
    # slot; only the first `grid_size` results are averaged.
    launched = blocks*threads

    # get random generator states
    if seed is None:
        prev_rng_states = getattr(expectation, 'prev_rng_states', None)
        # None on the first call, when no states have been cached yet
        prev_size = getattr(prev_rng_states, 'size', None)
        if prev_size == launched:
            rng_states = prev_rng_states
        else:
            seed = int(1000*time.time())
            rng_states = create_xoroshiro128p_states(launched, seed=seed)
    else:
        rng_states = create_xoroshiro128p_states(launched, seed=seed)
    expectation.prev_rng_states = rng_states

    # the good stuff: the kernel must be configured with [blocks, threads],
    # otherwise the computed launch geometry is never applied
    cuda_results = cuda.device_array(launched, dtype=np.int32)
    cuda_expectation[blocks, threads](rng_states, arr, cuda_results)
    results = cuda_results.copy_to_host()

    return results[:grid_size].mean()

In [8]:
# Fixed-seed generator so that Restart & Run All reproduces the same test
# array, without touching NumPy's global random state.
arr = np.random.RandomState(0).randint(0, 6, 10, dtype=np.int32)
arr

In [None]:
# Estimate the mean number of random swaps needed to sort `arr`,
# requesting 2**15 runs with a fixed generator seed.
expectation(arr, 2**15, seed=1)

In [None]:
# Time the same call; because a seed is given, fresh generator states are
# created on every repetition and that cost is part of the measurement.
%timeit expectation(arr, 2**15, seed=1)

In [2]:
# Record the GPU model, driver and CUDA version the notebook is running on.
!nvidia-smi

Sat May 25 07:27:53 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.104      Driver Version: 410.104      CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  On   | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [None]:
%%bash --proc nvpid --bg --out nvsmi
# Background GPU logger: the process handle is kept in `nvpid` and its
# stdout in `nvsmi` for the cells below. `-l 1` repeats the query every
# second, emitting one CSV row of utilisation and memory figures per GPU.
nvidia-smi --query-gpu=index,utilization.gpu,utilization.memory,memory.total,memory.used --format=csv -l 1

In [None]:
# Stop the background process whose handle was stored in `nvpid`.
nvpid.kill()

In [None]:
# Echo the logger's output one line at a time for as long as the
# background process is still alive ...
while True:
    if nvpid.poll() is not None:
        break
    line = nvsmi.readline()
    print(line.decode().strip())

# ... then print whatever is still buffered in the pipe.
remaining = nvsmi.read()
print(remaining.decode())