# Advanced NumPy

In [15]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

## ndarray Object Internals

In [2]:
np.ones((10, 5)).shape

(10, 5)

In [3]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

### NumPy dtype Hierarchy

In [8]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

True

True

In [9]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [6]:
np.issubdtype(ints.dtype, np.number)

True

## Advanced Array Manipulation

### Reshaping Arrays

In [10]:
arr = np.arange(8)
arr
arr.reshape((4, 2))

array([0, 1, 2, 3, 4, 5, 6, 7])

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [11]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [12]:
arr = np.arange(15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [13]:
other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)

(3, 5)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [16]:
arr = np.arange(10).reshape((5, 2))
arr
arr.ravel()

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [17]:
arr.flatten()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### C Versus Fortran Order

In [19]:
arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')
arr.ravel('C')

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

### Concatenating and Splitting Arrays

In [20]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [21]:
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [22]:
arr = np.random.randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
second
third

array([[-0.2047,  0.4789],
       [-0.5194, -0.5557],
       [ 1.9658,  1.3934],
       [ 0.0929,  0.2817],
       [ 0.769 ,  1.2464]])

array([[-0.2047,  0.4789]])

array([[-0.5194, -0.5557],
       [ 1.9658,  1.3934]])

array([[ 0.0929,  0.2817],
       [ 0.769 ,  1.2464]])

#### Stacking helpers: r_ and c_

In [26]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr1
arr2 = np.random.randn(3, 2)
arr2
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]

array([[0, 1],
       [2, 3],
       [4, 5]])

array([[ 0.0009,  1.3438],
       [-0.7135, -0.8312],
       [-2.3702, -1.8608]])

array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [ 0.0009,  1.3438],
       [-0.7135, -0.8312],
       [-2.3702, -1.8608]])

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [ 0.0009,  1.3438,  3.    ],
       [-0.7135, -0.8312,  4.    ],
       [-2.3702, -1.8608,  5.    ]])

In [32]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: tile and repeat

In [33]:
arr = np.arange(3)
arr
arr.repeat(3)

array([0, 1, 2])

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [34]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [41]:
arr = np.random.randn(2, 2)
arr
arr.repeat(2)
arr.repeat(2, axis=0)

array([[-1.542 , -0.9707],
       [-1.307 ,  0.2863]])

array([-1.542 , -1.542 , -0.9707, -0.9707, -1.307 , -1.307 ,  0.2863,
        0.2863])

array([[-1.542 , -0.9707],
       [-1.542 , -0.9707],
       [-1.307 ,  0.2863],
       [-1.307 ,  0.2863]])

In [42]:
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)

array([[-1.542 , -0.9707],
       [-1.542 , -0.9707],
       [-1.307 ,  0.2863],
       [-1.307 ,  0.2863],
       [-1.307 ,  0.2863]])

array([[-1.542 , -1.542 , -0.9707, -0.9707, -0.9707],
       [-1.307 , -1.307 ,  0.2863,  0.2863,  0.2863]])

In [43]:
arr
np.tile(arr, 2)

array([[-1.542 , -0.9707],
       [-1.307 ,  0.2863]])

array([[-1.542 , -0.9707, -1.542 , -0.9707],
       [-1.307 ,  0.2863, -1.307 ,  0.2863]])

In [44]:
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))

array([[-1.542 , -0.9707],
       [-1.307 ,  0.2863]])

array([[-1.542 , -0.9707],
       [-1.307 ,  0.2863],
       [-1.542 , -0.9707],
       [-1.307 ,  0.2863]])

array([[-1.542 , -0.9707, -1.542 , -0.9707],
       [-1.307 ,  0.2863, -1.307 ,  0.2863],
       [-1.542 , -0.9707, -1.542 , -0.9707],
       [-1.307 ,  0.2863, -1.307 ,  0.2863],
       [-1.542 , -0.9707, -1.542 , -0.9707],
       [-1.307 ,  0.2863, -1.307 ,  0.2863]])

### Fancy Indexing Equivalents: take and put

In [49]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

In [50]:
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr

array([700, 100, 200, 600])

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [51]:
inds = [2, 0, 2, 1]
arr = np.random.randn(2, 4)
arr
arr.take(inds, axis=1)

array([[ 1.3272, -0.9193, -1.5491,  0.0222],
       [ 0.7584, -0.6605,  0.8626, -0.01  ]])

array([[-1.5491,  1.3272, -1.5491, -0.9193],
       [ 0.8626,  0.7584,  0.8626, -0.6605]])

## Broadcasting

In [52]:
arr = np.arange(5)
arr
arr * 4

array([0, 1, 2, 3, 4])

array([ 0,  4,  8, 12, 16])

In [53]:
arr = np.random.randn(4, 3)
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
demeaned.mean(0)

array([-0.1209,  0.038 , -0.5235])

array([[ 0.1709,  0.6322,  1.3764],
       [-0.8349, -0.0615, -1.7808],
       [-0.5315, -1.2563, -0.8091],
       [ 1.1955,  0.6856,  1.2135]])

array([ 0.,  0., -0.])

In [54]:
arr
row_means = arr.mean(1)
row_means.shape
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([[ 0.05  ,  0.6702,  0.853 ],
       [-0.9559, -0.0235, -2.3042],
       [-0.6525, -1.2183, -1.3326],
       [ 1.0746,  0.7236,  0.69  ]])

(4,)

array([[ 0.5244],
       [-1.0945],
       [-1.0678],
       [ 0.8294]])

array([-0., -0.,  0.,  0.])

### Broadcasting Over Other Axes

In [62]:
try:
    arr - arr.mean(1)
except ValueError as ve:
    print("ValueError: ", ve)

ValueError:  operands could not be broadcast together with shapes (4,3) (4,) 


In [56]:
arr - arr.mean(1).reshape((4, 1))

array([[-0.4744,  0.1458,  0.3286],
       [ 0.1387,  1.071 , -1.2097],
       [ 0.4153, -0.1505, -0.2648],
       [ 0.2452, -0.1058, -0.1394]])

In [58]:
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape
arr_1d = np.random.normal(size=3)
arr_1d
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]

(4, 1, 4)

array([-0.9212, -0.7262,  0.2229])

array([[-0.9212],
       [-0.7262],
       [ 0.2229]])

array([[-0.9212, -0.7262,  0.2229]])

In [79]:
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means
depth_means.shape
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[ 0.1076,  0.1269, -0.3188,  0.0174],
       [-0.2172,  1.0448,  0.038 , -0.4703],
       [ 0.2197, -0.2048,  0.0728,  0.0051]])

(3, 4)

array([[ 0., -0.,  0.,  0.],
       [ 0.,  0.,  0., -0.],
       [ 0., -0.,  0., -0.]])

In [91]:
arr.ndim
means = arr.mean(2)
means
indexer=[slice(None)]*3
indexer
indexer[2]=np.newaxis
means[slice(None), slice(None)]
indexer

3

array([[ 0.1076,  0.1269, -0.3188,  0.0174],
       [-0.2172,  1.0448,  0.038 , -0.4703],
       [ 0.2197, -0.2048,  0.0728,  0.0051]])

[slice(None, None, None), slice(None, None, None), slice(None, None, None)]

array([[ 0.1076,  0.1269, -0.3188,  0.0174],
       [-0.2172,  1.0448,  0.038 , -0.4703],
       [ 0.2197, -0.2048,  0.0728,  0.0051]])

[slice(None, None, None), slice(None, None, None), None]

In [None]:
def demean_axis(arr, axis=0):
    means = arr.mean(axis)
    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    #slice(None) is equivalent to ":"
    indexer[axis] = np.newaxis
    return arr - means[indexer]

### Setting Array Values by Broadcasting

In [60]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [61]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## Advanced ufunc Usage

### ufunc Instance Methods

In [92]:
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()

45

45

In [94]:
np.random.seed(12346) 
arr = np.random.randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([[-0.9815, -0.09  ,  0.3658,  0.7483,  0.7594],
       [-0.3154, -0.8661,  0.0279, -0.4556, -1.6019],
       [-0.8487, -0.5465, -0.3215,  0.0005,  0.2483],
       [ 0.2539,  1.9368, -0.7995, -0.5692,  0.0489],
       [-0.9535, -0.6491, -0.4795,  0.1754,  1.4225]])

array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]], dtype=bool)

array([ True, False,  True, False,  True], dtype=bool)

In [96]:
arr = np.arange(15).reshape((3, 5))
arr
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]], dtype=int32)

In [97]:
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))

array([0, 1, 1, 2, 2])

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [99]:
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape

(3, 4, 5)

In [101]:
arr = np.arange(10)
arr
np.add.reduceat(arr, [0, 5, 8])

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

array([10, 18, 17], dtype=int32)

In [102]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]], dtype=int32)

### Writing New ufuncs in Python

In [105]:
def add_elements(x, y):
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [106]:
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))

array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.])

In [107]:
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

100 loops, best of 3: 4.11 ms per loop
The slowest run took 4.51 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 9.29 µs per loop


## Structured and Record Arrays

In [108]:
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([( 1.5   ,  6), ( 3.1416, -2)], 
      dtype=[('x', '<f8'), ('y', '<i4')])

In [109]:
sarr[0]
sarr[0]['y']

( 1.5, 6)

6

In [110]:
sarr['x']

array([ 1.5   ,  3.1416])

### Nested dtypes and Multidimensional Fields

In [111]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)], 
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [112]:
arr[0]['x']

array([0, 0, 0], dtype=int64)

In [113]:
arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]], dtype=int64)

In [114]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']

array([( 1.,  2.), ( 3.,  4.)], 
      dtype=[('a', '<f8'), ('b', '<f4')])

array([5, 6])

array([ 1.,  3.])

### Why Use Structured Arrays?

## More About Sorting

In [115]:
arr = np.random.randn(6)
arr.sort()
arr

array([-1.1181, -0.329 , -0.2415,  0.7424,  0.8548,  1.038 ])

In [116]:
arr = np.random.randn(3, 5)
arr
arr[:, 0].sort()  # Sort first column values in-place
arr

array([[-2.0051,  0.7379, -1.0614,  0.5955, -0.2682],
       [ 1.3389, -0.1872,  0.9111, -0.3215,  1.0054],
       [-0.5168,  1.1925, -0.1989,  0.3969, -1.7638]])

array([[-2.0051,  0.7379, -1.0614,  0.5955, -0.2682],
       [-0.5168, -0.1872,  0.9111, -0.3215,  1.0054],
       [ 1.3389,  1.1925, -0.1989,  0.3969, -1.7638]])

In [117]:
arr = np.random.randn(5)
arr
np.sort(arr)
arr

array([ 0.6071, -0.2222, -0.2171, -1.2136, -0.8704])

array([-1.2136, -0.8704, -0.2222, -0.2171,  0.6071])

array([ 0.6071, -0.2222, -0.2171, -1.2136, -0.8704])

In [123]:
arr = np.random.randn(3, 5)
arr
arr.sort(axis=1)
arr

array([[-0.2118,  0.2474,  0.3075,  0.741 , -2.2347],
       [ 0.5593, -0.1566,  0.6461, -0.1385,  0.4751],
       [ 0.2539,  0.279 ,  0.9882,  0.7844,  0.9926]])

array([[-2.2347, -0.2118,  0.2474,  0.3075,  0.741 ],
       [-0.1566, -0.1385,  0.4751,  0.5593,  0.6461],
       [ 0.2539,  0.279 ,  0.7844,  0.9882,  0.9926]])

In [126]:
arr[:, ::-1]

array([[ 0.741 ,  0.3075,  0.2474, -0.2118, -2.2347],
       [ 0.6461,  0.5593,  0.4751, -0.1385, -0.1566],
       [ 0.9926,  0.9882,  0.7844,  0.279 ,  0.2539]])

### Indirect Sorts: argsort and lexsort

In [127]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]

array([1, 2, 4, 3, 0], dtype=int64)

array([0, 1, 2, 3, 5])

In [128]:
arr = np.random.randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]

array([[ 5.    ,  0.    ,  1.    ,  3.    ,  2.    ],
       [ 0.178 , -1.3388,  1.3942,  0.6435, -0.0379],
       [ 0.3375, -1.273 , -0.1547,  0.7817, -0.7927]])

array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [-1.3388,  1.3942, -0.0379,  0.6435,  0.178 ],
       [-1.273 , -0.1547, -0.7927,  0.7817,  0.3375]])

In [130]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
sorter
last_name[sorter]
first_name[sorter]
zip(last_name[sorter], first_name[sorter])

array([1, 2, 3, 0, 4], dtype=int64)

array(['Arnold', 'Arnold', 'Jones', 'Jones', 'Walters'], 
      dtype='<U7')

array(['Jane', 'Steve', 'Bill', 'Bob', 'Barbara'], 
      dtype='<U7')

<zip at 0x1d36a36c108>

### Alternative Sort Algorithms

In [131]:
values = np.array(['2:first', '2:second', '1:first', '1:second',
                   '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)

array([2, 3, 4, 0, 1], dtype=int64)

array(['1:first', '1:second', '1:third', '2:first', '2:second'], 
      dtype='<U8')

### Partially Sorting Arrays

In [132]:
np.random.seed(12345)
arr = np.random.randn(20)
arr
np.partition(arr, 3)

array([-0.2047,  0.4789, -0.5194, -0.5557,  1.9658,  1.3934,  0.0929,
        0.2817,  0.769 ,  1.2464,  1.0072, -1.2962,  0.275 ,  0.2289,
        1.3529,  0.8864, -2.0016, -0.3718,  1.669 , -0.4386])

array([-2.0016, -1.2962, -0.5557, -0.5194, -0.3718, -0.4386, -0.2047,
        0.2817,  0.769 ,  0.4789,  1.0072,  0.0929,  0.275 ,  0.2289,
        1.3529,  0.8864,  1.3934,  1.9658,  1.669 ,  1.2464])

In [133]:
indices = np.argpartition(arr, 3)
indices
arr.take(indices)

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9], dtype=int64)

array([-2.0016, -1.2962, -0.5557, -0.5194, -0.3718, -0.4386, -0.2047,
        0.2817,  0.769 ,  0.4789,  1.0072,  0.0929,  0.275 ,  0.2289,
        1.3529,  0.8864,  1.3934,  1.9658,  1.669 ,  1.2464])

### numpy.searchsorted: Finding Elements in a Sorted Array

In [134]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)

3

In [135]:
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5], dtype=int64)

In [136]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')

array([0, 3], dtype=int64)

array([3, 7], dtype=int64)

In [137]:
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([ 9940.,  6768.,  7908.,  1709.,   268.,  8003.,  9037.,   246.,
        4917.,  5262.,  5963.,   519.,  8950.,  7282.,  8183.,  5002.,
        8101.,   959.,  2189.,  2587.,  4681.,  4593.,  7095.,  1780.,
        5314.,  1677.,  7688.,  9281.,  6094.,  1501.,  4896.,  3773.,
        8486.,  9110.,  3838.,  3154.,  5683.,  1878.,  1258.,  6875.,
        7996.,  5735.,  9732.,  6340.,  8884.,  4954.,  3516.,  7142.,
        5039.,  2256.])

In [138]:
labels = bins.searchsorted(data)
labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3, 4,
       3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4, 4, 3,
       3, 4, 4, 3], dtype=int64)

In [139]:
pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

## Writing Fast NumPy Functions with Numba

In [140]:
import numpy as np

def mean_distance(x, y):
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count

In [141]:
x = np.random.randn(10000000)
y = np.random.randn(10000000)
%timeit mean_distance(x, y)
%timeit (x - y).mean()

1 loop, best of 3: 9.87 s per loop
10 loops, best of 3: 130 ms per loop


In [142]:
import numba as nb
numba_mean_distance = nb.jit(mean_distance)

In [143]:
@nb.jit
def mean_distance(x, y):
    nx = len(x)
    result = 0.0
    count = 0
    for i in range(nx):
        result += x[i] - y[i]
        count += 1
    return result / count

In [144]:
%timeit numba_mean_distance(x, y)

The slowest run took 12.15 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 32.3 ms per loop


In [145]:
from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def mean_distance(x, y):
    return (x - y).mean()

### Creating Custom numpy.ufunc Objects with Numba

In [22]:
from numba import vectorize,  float64, int32, int64, float32

@vectorize([int32(int32, int32),
            int64(int64, int64),
            float32(float32, float32),
            float64(float64, float64)])
def nb_add(x, y):
    return x + y

In [26]:
x = np.arange(10)
nb_add(x, x)
nb_add.accumulate(x)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45])

## Advanced Array Input and Output

### Memory-Mapped Files

In [3]:
import numpy as np
mmap = np.memmap('data\mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmap

memmap([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [4]:
section = mmap[:5]

In [5]:
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap

In [6]:
mmap = np.memmap('data\mymmap', dtype='float64', shape=(10000, 10000))
mmap

memmap([[-0.09642834, -1.5818114 ,  0.50319246, ..., -0.9537932 ,
        -0.41914176,  0.94785242],
       [-0.02435007, -0.11245394, -0.47509787, ...,  0.45445498,
        -0.92108942,  1.05797305],
       [-0.32291208,  0.2813427 , -1.80962685, ...,  0.21406753,
         0.76927045, -0.23431117],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [8]:
%xdel mmap
!DEL "data\mymmap"

NameError: name 'mmap' is not defined
C:\Users\gmanish\Desktop\openminds\code\data\mymmap


The process cannot access the file because it is being used by another process.


### HDF5 and Other Array Storage Options

## Performance Tips

### The Importance of Contiguous Memory

In [31]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous
b = np.ones((1000, 1000))
b.flags.f_contiguous

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

True

False

In [32]:
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)
%timeit b.sum(1)

1000 loops, best of 3: 2.01 ms per loop
1000 loops, best of 3: 1.33 ms per loop
1000 loops, best of 3: 1.94 ms per loop


In [11]:
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [12]:
arr_c[:50].flags.contiguous
arr_c[:, :50].flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [13]:
%xdel arr_c
%xdel arr_f

In [16]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS