In [1]:
import numpy as np

In [2]:
# 50 uniform draws from [0, 1000), rounded down to whole numbers (kept as floats).
data = np.floor(np.random.uniform(low=0, high=1000, size=50))

In [3]:
# Sorted bin edges used to bucket the samples.
bins = np.array([0, 100, 1_000, 5_000, 10_000])

In [4]:
# Display the sampled values.
data

array([ 42., 646., 533., 716., 939., 395., 266., 557., 914.,  81., 451.,
       422., 425., 745., 544., 395.,  14., 899., 141., 108., 103., 442.,
       140., 352., 233., 485., 909., 338., 483., 259., 812., 637., 710.,
       282., 943., 984., 344., 433., 346., 221., 818., 254., 218., 489.,
       509., 644., 566., 320., 606., 376.])

In [5]:
# For every sample, find the index at which it would be inserted into the
# sorted bin edges; that index is used as the sample's bucket label.
labels = np.searchsorted(bins, data)

In [6]:
# Display the bucket label assigned to each sample.
labels

array([1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2], dtype=int64)

In [7]:
import pandas as pd

In [8]:
# Mean of the samples within each bucket label.
(
    pd.Series(data)
    .groupby(labels)
    .mean()
)

1     45.666667
2    496.851064
dtype: float64

### ***Writing fast NumPy functions with Numba***

In [9]:
def mean_distance(x,y):
    """Return the mean of the element-wise signed differences ``x[i] - y[i]``.

    Despite the name, no absolute value or norm is taken, so positive and
    negative differences cancel each other out.

    The explicit index loop is kept on purpose: it is the form that is
    compiled with Numba and timed against the vectorised expression.

    Parameters
    ----------
    x, y : indexable sequences of numbers
        Only the first ``len(x)`` entries of ``y`` are read.

    Returns
    -------
    float
        ``sum(x[i] - y[i] for i in range(len(x))) / len(x)``.

    Raises
    ------
    IndexError
        If ``y`` has fewer elements than ``x``.
    ZeroDivisionError
        If ``x`` is empty (plain-Python execution).
    """
    nx = len(x)
    # Validate the length before looping. Plain Python would raise IndexError
    # midway through the loop anyway, but compiled Numba code does not
    # bounds-check array indexing by default and would read invalid memory.
    if len(y) < nx:
        raise IndexError("y must have at least as many elements as x")
    result = 0.0
    for i in range(nx):
        result += x[i] - y[i]
    # The loop runs exactly nx times, so no separate counter is needed.
    return result / nx

In [10]:
# mean_distance (defined above) is very slow: it loops over every element in pure Python

In [11]:
# Ten million standard-normal samples for the timing comparison.
x = np.random.randn(10_000_000)

In [12]:
# A second, independent draw of ten million standard-normal samples.
y = np.random.randn(10_000_000)

In [13]:
# Time the pure-Python loop implementation.
%timeit mean_distance(x,y)

3.07 s ± 276 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Time the vectorised NumPy expression for comparison.
%timeit (x-y).mean()

54.5 ms ± 4.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
import numba as nb

In [16]:
# Compile the loop with Numba. njit is shorthand for jit(nopython=True), so an
# unsupported construct fails at compile time instead of silently falling back
# to the slow object mode that bare jit allowed on older Numba releases.
numba_mean_distance = nb.njit(mean_distance)

In [17]:
# Time the Numba-compiled version of the same loop.
%timeit numba_mean_distance(x,y)

10.7 ms ± 749 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


*Numba cannot compile all pure Python code, but it supports a significant subset of Python.*