Benchmarking python-scripts/script.py

In [16]:
## compute_input.py
import sys, json, numpy as np, pandas as pd
import line_profiler
import numpy as np

import numba
from numba import cuda
import pyarrow as pa

@numba.jit(nopython=True)
def compute_bin(x, n, xmin, xmax):
    """Map the value x to a bin index among n equal-width bins over [xmin, xmax].

    Returns the bin index, or None when the computed index falls outside
    the range [0, n).
    """
    # Mirror NumPy: a value equal to the upper bound always lands in the last bin.
    if x == xmax:
        return n - 1

    # SPEEDTIP: the float64 casts are only needed to reproduce NumPy exactly;
    # remove them if that is not a requirement.
    offset = np.float64(x) - np.float64(xmin)
    span = np.float64(xmax) - np.float64(xmin)
    index = np.int32(n * offset / span)

    if index >= 0 and index < n:
        return index
    return None

@cuda.jit
def histogram(x, xmin, xmax, histogram_out):
    """CUDA kernel: count the elements of x into the bins of histogram_out.

    x             -- 1-D input array.
    xmin, xmax    -- value range covered by the bins.
    histogram_out -- 1-D counter array, updated in place; its length is the
                     number of bins. It is only incremented here, never reset.
    """
    nbins = histogram_out.shape[0]

    # Each thread starts at its own global index and advances by the total
    # number of threads in the grid, so any launch size covers all of x.
    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for i in range(start, x.shape[0], stride):
        # note that calling a numba.jit function from CUDA automatically
        # compiles an equivalent CUDA device function!
        bin_number = compute_bin(x[i], nbins, xmin, xmax)

        # Several threads may hit the same bin, hence the atomic increment.
        if bin_number >= 0 and bin_number < nbins:
            cuda.atomic.add(histogram_out, bin_number, 1)

@cuda.jit
def min_max(x, min_max_array):
    """CUDA kernel: reduce x to its minimum and maximum value.

    x             -- 1-D input array.
    min_max_array -- two-element array updated in place: index 0 receives the
                     minimum and index 1 the maximum. It must be seeded by the
                     caller with starting values appropriate for x's dtype.
    """
    nelements = x.shape[0]

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    # Start from whatever is currently stored in the array. It does not matter
    # if another thread has already updated it: the values are still valid
    # bounds candidates.
    local_min = min_max_array[0]
    local_max = min_max_array[1]

    for i in range(start, nelements, stride):
        element = x[i]
        local_min = min(element, local_min)
        local_max = max(element, local_max)

    # Now combine each thread's local min and max into the shared result.
    cuda.atomic.min(min_max_array, 0, local_min)
    cuda.atomic.max(min_max_array, 1, local_max)


def dtype_min_max(dtype):
    '''Return the (smallest, largest) representable value of a numeric dtype.

    Integer dtypes are looked up with np.iinfo, everything else with np.finfo.
    '''
    lookup = np.iinfo if np.issubdtype(dtype, np.integer) else np.finfo
    limits = lookup(dtype)
    return limits.min, limits.max


@numba.jit(nopython=True)
def get_bin_edges(a, nbins, a_min, a_max):
    """Return the nbins + 1 evenly spaced bin edges from a_min to a_max.

    The result is a float64 array. The parameter a is accepted but not used.
    """
    edge_count = nbins + 1
    edges = np.empty((edge_count,), dtype=np.float64)
    step = (a_max - a_min) / nbins
    for k in range(edge_count):
        edges[k] = a_min + k * step

    # Pin the final edge to a_max exactly to avoid accumulated roundoff.
    edges[nbins] = a_max
    return edges


def numba_gpu_histogram(a, bins):
    """Compute a histogram of the array a with the given number of bins on the GPU.

    Returns a tuple (counts, bin_edges): counts is an int32 array of length
    bins copied back from the device, bin_edges is the array produced by
    get_bin_edges.
    """
    # Copy the data to the device once; both kernels below read it.
    device_data = cuda.to_device(a)

    ### Pass 1: find the min and max value in the array
    lowest, highest = dtype_min_max(a.dtype)
    # Seed in reverse order ([max, min]) so that the seeds are replaced by
    # actual elements of the array.
    seed = np.array([highest, lowest], dtype=a.dtype)
    device_extrema = cuda.to_device(seed)
    min_max[64, 64](device_data, device_extrema)
    a_min, a_max = device_extrema.copy_to_host()

    # SPEEDTIP: Skip this step if you don't need to reproduce the NumPy histogram edge array
    edges = get_bin_edges(a, bins, a_min, a_max)  # Done on the CPU for now

    ### Pass 2: bin the data into a histogram
    empty_counts = np.zeros(shape=(bins,), dtype=np.int32)
    device_counts = cuda.to_device(empty_counts)
    histogram[64, 64](device_data, a_min, a_max, device_counts)

    return device_counts.copy_to_host(), edges


def histNumba(data,colName):
    """Build a histogram of data[colName] with numba_gpu_histogram.

    Uses one bin per row, capped at 64 bins. The bin edges and counts are
    converted to lists of strings under the keys 'A' and 'B'; emitting them
    is currently disabled, so the function returns None and only flushes
    stdout.
    """
    bins = min(data.shape[0], 64)
    counts, edges = numba_gpu_histogram(np.asarray(data[colName]), bins)

    # The string conversion is kept even though the payload is not printed,
    # so it remains part of whatever is being timed.
    dict_temp = {
        'A': list(edges.astype(str)),
        'B': list(counts.astype(str)),
    }

    # Output intentionally disabled:
    # print(json.dumps(dict_temp))
    sys.stdout.flush()

def histPandas(data,colName):
    """Build a histogram of data[colName] with np.histogram.

    Uses one bin per row, capped at 64 bins. The bin edges and counts are
    converted to lists of strings under the keys 'A' and 'B'; emitting them
    is currently disabled, so the function returns None and only flushes
    stdout.
    """
    bins = min(data.shape[0], 64)
    counts, edges = np.histogram(data[colName], bins=bins)

    # The string conversion is kept even though the payload is not printed,
    # so it remains part of whatever is being timed.
    dict_temp = {
        'A': list(edges.astype(str)),
        'B': list(counts.astype(str)),
    }

    # Output intentionally disabled:
    # print(json.dumps(dict_temp))
    sys.stdout.flush()

def columns(data):
    """Flush stdout; printing of the column names of data is disabled."""
    # print(list(data.columns))
    stream = sys.stdout
    stream.flush()

def readArrowToDF(source):
    """Read the Arrow record-batch stream at source + ".arrow" into a pandas DataFrame."""
    stream_path = source + ".arrow"
    table = pa.RecordBatchStreamReader(stream_path).read_all()
    return table.to_pandas()

def readDirect(source):
    """Load the CSV at source with pandas and return the resulting DataFrame."""
    return pd.read_csv(source)

def exec(file,type,processing,load_type,colName='A'):
    """Load a data file and run one operation on it.

    file       -- path passed to the loader (readArrowToDF appends ".arrow").
    type       -- operation: 'hist' or 'columns'. Other values do nothing.
    processing -- histogram backend for type == 'hist': 'numba' or 'pandas'.
                  Other values do nothing.
    load_type  -- loader to use: 'csv' or 'arrow'.
    colName    -- column to histogram (default 'A').

    Raises ValueError if load_type is not recognised.

    NOTE: the function name and the `type` parameter shadow Python builtins;
    they are kept because callers depend on them.
    """
    # Load the data into a DataFrame with the requested reader.
    if load_type == 'csv':
        data = readDirect(file)
    elif load_type == 'arrow':
        data = readArrowToDF(file)
    else:
        # Fail here with a clear message rather than later with an
        # UnboundLocalError on `data`.
        raise ValueError("unknown load_type: {!r}".format(load_type))

    if type == 'hist':
        if processing == 'numba':
            histNumba(data,colName)
        elif processing == 'pandas':
            histPandas(data,colName)
    elif type == 'columns':
        columns(data)


In [96]:
# Base names of the benchmark data sets, listed in benchmark order.
files = ['data-{}k'.format(rows) for rows in (0, 10, 100, 1000, 10000, 100000)]

In [97]:
# Timing tables, one per (processing, loader) combination.
# Each maps a data-set name to a list of measured times.
pandas_with_csv = dict()
pandas_with_arrow = dict()
numba_with_csv = dict()
numba_with_arrow = dict()

In [98]:
def initiate(file):
    """Reset the entry for `file` to an empty list in all four timing tables."""
    tables = (pandas_with_csv, pandas_with_arrow, numba_with_csv, numba_with_arrow)
    for table in tables:
        table[file] = []

In [99]:
def display():
    """Print each timing table as a DataFrame with rows 'getCols' and 'getHist'."""
    row_labels = ['getCols','getHist']
    tables = (
        ('pandas_with_csv', pandas_with_csv),
        ('pandas_with_arrow', pandas_with_arrow),
        ('numba_with_csv', numba_with_csv),
        ('numba_with_arrow', numba_with_arrow),
    )
    for title, timings in tables:
        print('\n' + title + '\n\n', pd.DataFrame(timings, index=row_labels))

In [100]:
def getHist(file,name):
    """Time the histogram operation for every processing/loader combination.

    file -- path handed to exec() for loading.
    name -- key under which the timing is appended in each timing table.

    Relies on the IPython %timeit magic, so it only runs inside IPython.
    Each measurement uses 2 loops per run (-n 2); -o returns the result
    object, whose .best value is appended to the matching table.
    """
    # pandas histogram, CSV loader
    pandas_csv = %timeit -o -n 2 exec(file,'hist','pandas','csv')
    pandas_with_csv[name].append(pandas_csv.best)
    # pandas histogram, Arrow loader
    pandas_arrow = %timeit -o -n 2 exec(file,'hist','pandas','arrow')
    pandas_with_arrow[name].append(pandas_arrow.best)
    # numba histogram, CSV loader
    numba_csv = %timeit -o -n 2 exec(file,'hist','numba','csv')
    numba_with_csv[name].append(numba_csv.best)
    # numba histogram, Arrow loader
    numba_arrow = %timeit -o -n 2 exec(file,'hist','numba','arrow')
    numba_with_arrow[name].append(numba_arrow.best)

In [101]:
def getCols(file,name):
    pandas_csv = %timeit -o -n 2 exec(file,'columns','pandas','csv');
    pandas_with_csv[name].append(pandas_csv.best)
    pandas_arrow = %timeit -o -n 2 exec(file,'hist','pandas','arrow');
    pandas_with_arrow[name].append(pandas_arrow.best)
    numba_csv = %timeit -o -n 2 exec(file,'columns','numba','csv');
    numba_with_csv[name].append(numba_csv.best)
    numba_arrow = %timeit -o -n 2 exec(file,'columns','numba','arrow');
    numba_with_arrow[name].append(numba_arrow.best)

In [102]:
def benchmark():
    """Run the column and histogram timings for every data set in `files`."""
    for name in files:
        print(name)
        # Data files are stored under the uploads/ directory.
        path = 'uploads/' + name
        initiate(name)
        getCols(path, name)
        getHist(path, name)

In [103]:
# Run all timings; %timeit prints one summary line per measurement.
benchmark()

data-0k
1.79 ms ± 371 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
3.4 ms ± 468 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
1.66 ms ± 356 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
2.8 ms ± 409 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
1.96 ms ± 548 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
3.01 ms ± 544 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
3.53 ms ± 834 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
4.23 ms ± 513 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
data-10k
3.12 ms ± 302 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
3.33 ms ± 611 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
3.22 ms ± 492 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
2.68 ms ± 393 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
3.71 ms ± 595 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
3.17 ms ± 455 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)
5.1 ms ± 765 µs per

In [104]:
# Print the collected timing tables.
display()


pandas_with_csv

           data-0k  data-100000k  data-10000k  data-1000k  data-100k  data-10k
getCols  0.001272     14.721829     1.426768    0.230918   0.029645  0.002447
getHist  0.001393     16.219119     1.571219    0.243362   0.031370  0.003078

pandas_with_arrow

           data-0k  data-100000k  data-10000k  data-1000k  data-100k  data-10k
getCols  0.002766       3.06889     0.294425    0.038988   0.006398  0.002618
getHist  0.002230       3.06346     0.291442    0.038613   0.006531  0.002527

numba_with_csv

           data-0k  data-100000k  data-10000k  data-1000k  data-100k  data-10k
getCols  0.001350     15.193975     1.412224    0.231728   0.034279  0.002484
getHist  0.002725     15.620342     1.472023    0.239145   0.031950  0.004284

numba_with_arrow

           data-0k  data-100000k  data-10000k  data-1000k  data-100k  data-10k
getCols  0.002051      1.757306     0.179952    0.028002   0.004530  0.002112
getHist  0.003716      2.363428     0.274073    0.035246   0.007