### ndarray internals

In [1]:
# An ndarray is described by four pieces of information:
# 1. data (pointer to data)
# 2. dtype
# 3. shape
# 4. strides

In [3]:
import numpy as np
np.ones((10, 5)).shape

(10, 5)

In [4]:
# strides are counted in bytes: with 8-byte float64 elements, one step along
# each axis of this (3, 4, 5) array moves 4*5*8 = 160, 5*8 = 40 and 8 bytes
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

### numpy dtype hierarchy

In [5]:
# sample arrays: one unsigned 16-bit integer array and one 32-bit float array
ints, floats = (np.ones(10, dtype=kind) for kind in (np.uint16, np.float32))

In [7]:
np.issubdtype(ints.dtype, np.integer)  # all integer types

True

In [8]:
np.issubdtype(floats.dtype, np.floating)  # all float types

True

### reshape arrays

In [9]:
arr = np.arange(8)
arr

array([0, 1, 2, 3, 4, 5, 6, 7])

In [10]:
arr.reshape((4, 2))  # returns a view of the same data when possible, not a copy

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [12]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [13]:
arr = np.arange(15)

In [14]:
arr.reshape((5, -1))  # -1 for auto-detection

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [15]:
other_arr = np.ones((3, 5))
arr.reshape(other_arr.shape)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [18]:
arr = np.arange(15).reshape(5, 3)
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [19]:
arr.ravel()  # flattens without copying when possible; copies only if necessary

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [20]:
arr.flatten()  # always returns a copy

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [21]:
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

### C vs Fortran order

In [22]:
arr = np.arange(12).reshape((3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [23]:
arr.ravel()  # ravel by row major order

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [24]:
arr.ravel('F')  # ravel by column major order

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### concatenating and splitting arrays

In [25]:
# two 2x3 integer blocks to glue together: 0..5 and 7..12
arr1 = np.arange(6).reshape((2, 3))
arr2 = np.arange(7, 13).reshape((2, 3))
arr1, arr2

(array([[0, 1, 2],
        [3, 4, 5]]), array([[ 7,  8,  9],
        [10, 11, 12]]))

In [26]:
np.concatenate([arr1, arr2], axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 7,  8,  9],
       [10, 11, 12]])

In [27]:
np.concatenate([arr1, arr2], axis=1)

array([[ 0,  1,  2,  7,  8,  9],
       [ 3,  4,  5, 10, 11, 12]])

In [28]:
np.vstack((arr1, arr2))  # vertical stack, same as np.concatenate([arr1, arr2], axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 7,  8,  9],
       [10, 11, 12]])

In [29]:
np.hstack((arr1, arr2))  # horizontal stack, same as np.concatenate([arr1, arr2], axis=1)

array([[ 0,  1,  2,  7,  8,  9],
       [ 3,  4,  5, 10, 11, 12]])

In [31]:
arr = np.random.randn(5, 2)
arr

array([[-0.84402428,  0.39562633],
       [-0.92983791,  0.37878274],
       [-0.79381175,  2.18907876],
       [ 0.60319838,  0.9490111 ],
       [ 0.24968924,  0.242322  ]])

In [32]:
first, second, third = np.split(arr, [1, 3])
first, second, third

(array([[-0.84402428,  0.39562633]]), array([[-0.92983791,  0.37878274],
        [-0.79381175,  2.18907876]]), array([[ 0.60319838,  0.9490111 ],
        [ 0.24968924,  0.242322  ]]))

In [34]:
col_0, col_1 = np.split(arr, [1], axis=1)  # np.split also accepts an axis argument; here the columns are split at index 1
col_0, col_1

(array([[-0.84402428],
        [-0.92983791],
        [-0.79381175],
        [ 0.60319838],
        [ 0.24968924]]), array([[ 0.39562633],
        [ 0.37878274],
        [ 2.18907876],
        [ 0.9490111 ],
        [ 0.242322  ]]))

### stacking helpers

In [35]:
arr1 = np.arange(6).reshape((3, 2))
arr2 = np.random.randn(3, 2)
arr1, arr2

(array([[0, 1],
        [2, 3],
        [4, 5]]), array([[-1.03288723,  0.05534161],
        [-1.17394543, -0.1523022 ],
        [ 0.20070717, -0.14607779]]))

In [36]:
np.r_[arr1, arr2]  # row, notice the bracket syntax

array([[ 0.        ,  1.        ],
       [ 2.        ,  3.        ],
       [ 4.        ,  5.        ],
       [-1.03288723,  0.05534161],
       [-1.17394543, -0.1523022 ],
       [ 0.20070717, -0.14607779]])

In [37]:
np.c_[arr1, arr2]  # column, notice the bracket syntax

array([[ 0.        ,  1.        , -1.03288723,  0.05534161],
       [ 2.        ,  3.        , -1.17394543, -0.1523022 ],
       [ 4.        ,  5.        ,  0.20070717, -0.14607779]])

In [38]:
np.c_[1:6, -10:-5]  # slices are also accepted: each one is expanded to a range and becomes a column

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### repeating elements: repeat and tile

In [39]:
arr = np.arange(3)
arr

array([0, 1, 2])

In [40]:
arr.repeat(3)  # repeat() is an array method

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [41]:
arr.repeat([2, 3, 4])  # different repeat for each element

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [42]:
arr = np.random.randn(2, 2)
arr

array([[ 0.79525123, -2.57066905],
       [-0.56306707, -0.29890245]])

In [43]:
arr.repeat(2, axis=0)  # accepts axis argument

array([[ 0.79525123, -2.57066905],
       [ 0.79525123, -2.57066905],
       [-0.56306707, -0.29890245],
       [-0.56306707, -0.29890245]])

In [44]:
arr.repeat(2, axis=1)

array([[ 0.79525123,  0.79525123, -2.57066905, -2.57066905],
       [-0.56306707, -0.56306707, -0.29890245, -0.29890245]])

In [45]:
arr.repeat([2, 1], axis=0)

array([[ 0.79525123, -2.57066905],
       [ 0.79525123, -2.57066905],
       [-0.56306707, -0.29890245]])

In [46]:
arr.repeat([2, 3], axis=1)

array([[ 0.79525123,  0.79525123, -2.57066905, -2.57066905, -2.57066905],
       [-0.56306707, -0.56306707, -0.29890245, -0.29890245, -0.29890245]])

In [47]:
arr

array([[ 0.79525123, -2.57066905],
       [-0.56306707, -0.29890245]])

In [48]:
np.tile(arr, 2)  # tile is in the numpy namespace

array([[ 0.79525123, -2.57066905,  0.79525123, -2.57066905],
       [-0.56306707, -0.29890245, -0.56306707, -0.29890245]])

In [49]:
np.tile(arr, (2, 1))  # can be visualized as tiles

array([[ 0.79525123, -2.57066905],
       [-0.56306707, -0.29890245],
       [ 0.79525123, -2.57066905],
       [-0.56306707, -0.29890245]])

In [50]:
np.tile(arr, (3, 2))

array([[ 0.79525123, -2.57066905,  0.79525123, -2.57066905],
       [-0.56306707, -0.29890245, -0.56306707, -0.29890245],
       [ 0.79525123, -2.57066905,  0.79525123, -2.57066905],
       [-0.56306707, -0.29890245, -0.56306707, -0.29890245],
       [ 0.79525123, -2.57066905,  0.79525123, -2.57066905],
       [-0.56306707, -0.29890245, -0.56306707, -0.29890245]])

### fancy indexing faster equivalents: take and put

In [51]:
# multiples of 100 make it easy to see which positions get picked
arr = 100 * np.arange(10)
arr

array([  0, 100, 200, 300, 400, 500, 600, 700, 800, 900])

In [52]:
inds = [7, 1, 2, 6]
arr[inds]  # fancy indexing

array([700, 100, 200, 600])

In [53]:
arr.take(inds)

array([700, 100, 200, 600])

In [55]:
arr.put(inds, 42)
arr

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

In [57]:
arr.put(inds, [40, 41, 42, 43])
arr

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [58]:
arr = np.random.randn(2, 4)
arr

array([[ 1.38684578,  0.66672365,  0.42978577,  0.16386163],
       [-0.37866849, -0.05083861, -0.58683124,  0.50841731]])

In [59]:
inds = [2, 0, 2, 1]
arr.take(inds, axis=1)  # accepts other axis

array([[ 0.42978577,  1.38684578,  0.42978577,  0.66672365],
       [-0.58683124, -0.37866849, -0.58683124, -0.05083861]])

In [62]:
# ndarray.put() works on the flattened array and has no `axis` argument, so
# this call raises TypeError. The error is caught and printed so the notebook
# still runs top to bottom; use fancy indexing for per-axis assignment instead.
try:
    arr.put(inds, 333, axis=0)
except TypeError as exc:
    print('TypeError:', exc)

TypeError: 'axis' is an invalid keyword argument for this function

In [65]:
arr = np.random.randn(1000, 50)

In [66]:
inds = np.random.permutation(1000)[:500]  # permutation(n) shuffles arange(n); keep the first 500 as random row indices

In [67]:
%timeit arr[inds]

The slowest run took 4.72 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 44.3 µs per loop


In [68]:
%timeit arr.take(inds, axis=0)  # roughly the same speed here, so plain fancy indexing is fine

The slowest run took 4.00 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 40.2 µs per loop


### broadcasting

In [70]:
arr = np.random.randn(4, 3)
arr.mean(0)

array([ 0.58034771,  0.50904362, -0.35474597])

In [71]:
arr.shape, arr.mean(0).shape

((4, 3), (3,))

In [72]:
demeaned = arr - arr.mean(0)  # broadcasting
demeaned

array([[-0.89333671, -0.47772175, -0.77293452],
       [ 0.16049524,  2.0282579 ,  0.1841532 ],
       [ 0.52309198, -1.15666466, -0.81016138],
       [ 0.20974949, -0.39387149,  1.3989427 ]])

In [73]:
demeaned.mean(0)

array([ -5.55111512e-17,   1.38777878e-17,   0.00000000e+00])

In [74]:
arr

array([[-0.31298901,  0.03132187, -1.1276805 ],
       [ 0.74084295,  2.53730152, -0.17059277],
       [ 1.10343969, -0.64762104, -1.16490735],
       [ 0.79009719,  0.11517213,  1.04419673]])

In [76]:
row_means = arr.mean(1)
row_means

array([-0.46978255,  1.03585057, -0.2363629 ,  0.64982202])

In [78]:
demeaned = arr - row_means.reshape((4, 1))  # reshape and use broadcasting
demeaned.mean(1)

array([  0.00000000e+00,   1.48029737e-16,   3.70074342e-17,
        -3.70074342e-17])

In [81]:
arr = np.arange(16).reshape((4, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [82]:
# create a new axis by combining full slices with np.newaxis
arr_3d = arr[:, np.newaxis, :]  # insert a length-1 axis in the middle: (4, 4) -> (4, 1, 4)
arr_3d

array([[[ 0,  1,  2,  3]],

       [[ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11]],

       [[12, 13, 14, 15]]])

In [83]:
arr_3d.shape

(4, 1, 4)

In [84]:
arr_1d = np.random.normal(size=3)
arr_1d

array([ 1.34220543,  0.13142501, -2.71556605])

In [85]:
arr_1d[:, np.newaxis]  # increase dimension and set shape = 1 for the one with np.newaxis

array([[ 1.34220543],
       [ 0.13142501],
       [-2.71556605]])

In [87]:
arr_1d[np.newaxis, :]

array([[ 1.34220543,  0.13142501, -2.71556605]])

In [88]:
arr = np.random.randn(3, 4, 5)
arr

array([[[ 0.16833323,  1.37156748,  0.59775907,  0.99197071, -0.36914824],
        [-1.32911668,  0.38815859, -0.30582882,  1.03197952, -2.27486309],
        [ 1.3577924 ,  0.88937303,  0.52889828, -0.64097346, -0.67511506],
        [ 2.51093672,  2.17348171, -0.45880583,  0.99766485,  0.86239239]],

       [[-1.37893746,  2.02598561, -0.9631464 ,  1.7878503 ,  0.64771723],
        [-0.0778759 , -0.54208966,  0.58377967, -1.08579115,  1.94187082],
        [ 0.47159628, -0.70295212, -0.76838466, -2.45142292, -1.97983324],
        [-0.89194804,  1.35877784, -0.42188337,  0.59454443, -0.08396996]],

       [[-1.28677819, -1.33734381, -0.34028226, -1.48639749, -0.53034275],
        [ 1.1192561 , -0.44287554,  0.2451824 , -0.95461755, -0.0461259 ],
        [ 0.2046868 ,  0.75474255, -1.23034735, -0.808334  ,  0.93804462],
        [-0.70299682, -0.29077034, -2.51830217,  0.84657791,  0.61138623]]])

In [90]:
depth_means = arr.mean(2)  # mean over the last axis (axis 2): one value per (i, j) pair
arr.shape, depth_means.shape

((3, 4, 5), (3, 4))

In [91]:
demeaned = arr - depth_means[:, :, np.newaxis]  # make it (3, 4, 1)
demeaned.mean(2)

array([[  6.66133815e-17,   4.44089210e-17,   0.00000000e+00,
          6.66133815e-17],
       [  0.00000000e+00,   0.00000000e+00,  -1.33226763e-16,
         -1.11022302e-17],
       [  1.55431223e-16,  -4.02455846e-17,  -2.22044605e-17,
          0.00000000e+00]])

### setting array values by broadcasting

In [93]:
arr = np.zeros((4, 3))
arr

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [96]:
arr[:] = 5
arr

array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [97]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]  # (4, ) to (4, 1)
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

In [98]:
arr[:2] = [[-1.37], [0.509]]  # (2, 3) = (2, 1)
arr

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

### advanced ufunc usage

### ufunc instance methods

In [100]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [101]:
np.add.reduce(arr)  # same as arr.sum()

45

In [102]:
arr.sum()

45

In [103]:
arr = np.random.randn(5, 5)
arr

array([[-0.70235592, -1.70187961,  0.36980431, -0.1330973 ,  0.59831154],
       [ 0.92536617,  0.05856691,  1.80727909, -0.54883722,  0.90058008],
       [-1.02433843,  1.30654925,  0.32153662,  1.23218145, -0.4157401 ],
       [ 0.66689692, -0.06459162,  0.69110348,  0.21774474, -0.13319059],
       [ 1.06988035, -0.77089931,  0.25727182,  0.65971139,  2.20086081]])

In [105]:
arr[::2].sort(1)  # in-place sort along axis 1 of every other row (0, 2, 4); the slice is a view, so arr itself changes
arr

array([[-1.70187961, -0.70235592, -0.1330973 ,  0.36980431,  0.59831154],
       [ 0.92536617,  0.05856691,  1.80727909, -0.54883722,  0.90058008],
       [-1.02433843, -0.4157401 ,  0.32153662,  1.23218145,  1.30654925],
       [ 0.66689692, -0.06459162,  0.69110348,  0.21774474, -0.13319059],
       [-0.77089931,  0.25727182,  0.65971139,  1.06988035,  2.20086081]])

In [106]:
arr[:, :-1]

array([[-1.70187961, -0.70235592, -0.1330973 ,  0.36980431],
       [ 0.92536617,  0.05856691,  1.80727909, -0.54883722],
       [-1.02433843, -0.4157401 ,  0.32153662,  1.23218145],
       [ 0.66689692, -0.06459162,  0.69110348,  0.21774474],
       [-0.77089931,  0.25727182,  0.65971139,  1.06988035]])

In [107]:
arr[:, 1:]

array([[-0.70235592, -0.1330973 ,  0.36980431,  0.59831154],
       [ 0.05856691,  1.80727909, -0.54883722,  0.90058008],
       [-0.4157401 ,  0.32153662,  1.23218145,  1.30654925],
       [-0.06459162,  0.69110348,  0.21774474, -0.13319059],
       [ 0.25727182,  0.65971139,  1.06988035,  2.20086081]])

In [108]:
arr[:, :-1] < arr[:, 1:]

array([[ True,  True,  True,  True],
       [False,  True, False,  True],
       [ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True]], dtype=bool)

In [109]:
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)  # same as (arr[:, :-1] < arr[:, 1:]).all(1)

array([ True, False,  True, False,  True], dtype=bool)

In [110]:
(arr[:, :-1] < arr[:, 1:]).all(1)

array([ True, False,  True, False,  True], dtype=bool)

In [111]:
arr = np.arange(15).reshape((3, 5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [112]:
np.add.accumulate(arr, axis=1)  # same as arr.cumsum(1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

In [113]:
arr.cumsum(1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

In [115]:
# 0 once, then 1 and 2 twice each
arr = np.repeat(np.arange(3), [1, 2, 2])
arr

array([0, 1, 1, 2, 2])

In [116]:
np.multiply.outer(arr, np.arange(5))  # outer product: every element of arr times every element of arange(5)

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [117]:
result = np.subtract.outer(np.arange(12).reshape(3, 4), np.arange(5))
result, result.shape

(array([[[ 0, -1, -2, -3, -4],
         [ 1,  0, -1, -2, -3],
         [ 2,  1,  0, -1, -2],
         [ 3,  2,  1,  0, -1]],
 
        [[ 4,  3,  2,  1,  0],
         [ 5,  4,  3,  2,  1],
         [ 6,  5,  4,  3,  2],
         [ 7,  6,  5,  4,  3]],
 
        [[ 8,  7,  6,  5,  4],
         [ 9,  8,  7,  6,  5],
         [10,  9,  8,  7,  6],
         [11, 10,  9,  8,  7]]]), (3, 4, 5))

In [118]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [119]:
np.add.reduceat(arr, [0, 5, 8])  # local reduce at [0:5], [5:8], [8:]

array([10, 18, 17])

In [121]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

In [122]:
np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

### custom ufuncs

In [123]:
def add_elements(x, y):
    """Return ``x + y``; a plain Python function used to build custom ufuncs."""
    total = x + y
    return total

In [124]:
add_them = np.frompyfunc(add_elements, 2, 1)  # number of inputs, number of outputs

In [125]:
add_them(np.arange(8), np.arange(8))  # always returns array of python object

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [126]:
# np.vectorize lets you specify the output dtype (otypes), so the result is a
# float64 array instead of an array of Python objects
add_them = np.vectorize(add_elements, otypes=[np.float64])

In [127]:
add_them(np.arange(8), np.arange(8))

array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.])

In [128]:
# these are still python functions, which are slow
arr = np.random.randn(1000)

In [129]:
%timeit add_them(arr, arr)

1000 loops, best of 3: 435 µs per loop


In [130]:
%timeit np.add(arr, arr)

The slowest run took 11.58 times longer than the fastest. This could mean that an intermediate result is being cached 
100000 loops, best of 3: 2.57 µs per loop


### structured array

In [131]:
dtype = [('x', np.float64), ('y', np.int32)]

In [132]:
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5, 6), (3.141592653589793, -2)], 
      dtype=[('x', '<f8'), ('y', '<i4')])

In [133]:
sarr[0]

(1.5, 6)

In [134]:
sarr[0]['y']

6

In [135]:
sarr['x']

array([ 1.5       ,  3.14159265])

### nested dtypes and multidimensional fields

In [136]:
dtype = [('x', np.int64, 3), ('y', np.int32)]  # pass in shape additionally

In [138]:
arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)], 
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [139]:
arr[0]['x']

array([0, 0, 0])

In [140]:
arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [141]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]  # nested dtypes

In [142]:
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)

In [143]:
data

array([((1.0, 2.0), 5), ((3.0, 4.0), 6)], 
      dtype=[('x', [('a', '<f8'), ('b', '<f4')]), ('y', '<i4')])

In [144]:
data['x']['a']

array([ 1.,  3.])

### more about sorting

In [145]:
arr = np.random.randn(6)
arr.sort()  # in-place sort
arr

array([-1.75569384, -1.68940509, -1.20625731, -0.99604187, -0.20695007,
        0.09566824])

In [146]:
arr = np.random.randn(3, 5)
arr

array([[-1.5188331 , -0.25033623, -1.41766613, -0.09258527, -0.47100731],
       [-0.78365921,  0.37962352,  1.8902097 ,  0.46132765,  2.37678614],
       [ 0.02575312,  1.26357755, -1.37790894,  0.86750278,  0.13421977]])

In [147]:
arr[:, 0].sort()  # in-place sort of the first column; it was already in ascending order, so arr looks unchanged below

In [148]:
arr

array([[-1.5188331 , -0.25033623, -1.41766613, -0.09258527, -0.47100731],
       [-0.78365921,  0.37962352,  1.8902097 ,  0.46132765,  2.37678614],
       [ 0.02575312,  1.26357755, -1.37790894,  0.86750278,  0.13421977]])

In [149]:
arr = np.random.randn(5)
arr

array([ 1.55851942,  1.18698664,  0.8078914 , -1.15841676, -1.81808871])

In [150]:
np.sort(arr)  # sort and makes a copy

array([-1.81808871, -1.15841676,  0.8078914 ,  1.18698664,  1.55851942])

In [151]:
arr  # not changed

array([ 1.55851942,  1.18698664,  0.8078914 , -1.15841676, -1.81808871])

In [152]:
arr = np.random.randn(3, 5)

In [153]:
arr

array([[-1.55514514,  2.48382495,  2.01242844,  1.18605425, -0.09348974],
       [ 0.06297153,  0.79100157,  1.76510382, -0.73358095, -0.11421936],
       [-0.21201677,  1.84197   ,  1.13277026,  0.33473578, -1.70317712]])

In [154]:
arr.sort(axis=1)  # accepts axis argument

In [155]:
arr

array([[-1.55514514, -0.09348974,  1.18605425,  2.01242844,  2.48382495],
       [-0.73358095, -0.11421936,  0.06297153,  0.79100157,  1.76510382],
       [-1.70317712, -0.21201677,  0.33473578,  1.13277026,  1.84197   ]])

In [156]:
arr[:, ::-1]  # often used to make orders change between descending and ascending

array([[ 2.48382495,  2.01242844,  1.18605425, -0.09348974, -1.55514514],
       [ 1.76510382,  0.79100157,  0.06297153, -0.11421936, -0.73358095],
       [ 1.84197   ,  1.13277026,  0.33473578, -0.21201677, -1.70317712]])

### indirect sorts: argsort and lexsort

In [157]:
values = np.array([5, 0, 1, 3, 2])
# indices that would put `values` in ascending order
indexer = np.argsort(values)
indexer

array([1, 2, 4, 3, 0])

In [158]:
values[indexer]  # fancy indexing

array([0, 1, 2, 3, 5])

In [159]:
arr = np.random.randn(3, 5)
arr[0] = values

In [160]:
arr

array([[ 5.        ,  0.        ,  1.        ,  3.        ,  2.        ],
       [ 0.22572439,  0.03669126, -0.63918768,  0.09128767, -0.29896849],
       [ 0.11189725, -1.55519615,  0.98399596,  0.73673446,  0.02731404]])

In [161]:
arr[:, arr[0].argsort()]  # fancy indexing at columns

array([[ 0.        ,  1.        ,  2.        ,  3.        ,  5.        ],
       [ 0.03669126, -0.63918768, -0.29896849,  0.09128767,  0.22572439],
       [-1.55519615,  0.98399596,  0.02731404,  0.73673446,  0.11189725]])

In [162]:
first_name = np.array(['bob', 'jane', 'steve', 'bill', 'barbara'])

In [163]:
last_name = np.array(['jones', 'arnold', 'arnold', 'jones', 'walters'])

In [None]:
# lexsort applies the keys from last to first: last_name is the primary key,
# and first_name only breaks ties between equal last names
sorter = np.lexsort((first_name, last_name))
sorter

In [167]:
list(zip(last_name[sorter], first_name[sorter]))

[('arnold', 'jane'),
 ('arnold', 'steve'),
 ('jones', 'bill'),
 ('jones', 'bob'),
 ('walters', 'barbara')]

### alternate sort algorithms

In [168]:
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])

In [169]:
key = np.array([2, 2, 1, 1, 1])

In [170]:
# mergesort is a stable sort: elements with equal keys keep their original
# relative order. NOTE(review): newer NumPy releases also accept
# kind='stable' -- confirm against the installed version before switching.
indexer = key.argsort(kind='mergesort')

In [171]:
indexer

array([2, 3, 4, 0, 1])

In [172]:
values[indexer]

array(['1:first', '1:second', '1:third', '2:first', '2:second'], 
      dtype='<U8')

In [173]:
values.take(indexer)

array(['1:first', '1:second', '1:third', '2:first', '2:second'], 
      dtype='<U8')

### finding elements in a sorted array

In [174]:
arr = np.array([0, 1, 7, 12, 15])
# position where 9 would have to be inserted to keep arr sorted
np.searchsorted(arr, 9)

3

In [175]:
arr.searchsorted([0, 8, 11, 16])  # accepts array of values

array([0, 3, 3, 5])

In [176]:
# three 0s followed by four 1s
arr = np.repeat(np.arange(2), (3, 4))
arr

array([0, 0, 0, 1, 1, 1, 1])

In [177]:
arr.searchsorted([0, 1])  # default to left side

array([0, 3])

In [178]:
arr.searchsorted([0, 1], side='right')

array([3, 7])

In [179]:
# bin the data
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([ 3002.,  2784.,   131.,  6923.,  2978.,  1083.,   630.,  8823.,
         520.,  1952.,  4117.,  5119.,  3799.,  7948.,  2093.,  1151.,
        6259.,  5544.,  2917.,  2679.,  5173.,   627.,  1937.,  5175.,
        9101.,  1668.,  1520.,  2787.,   805.,  5956.,  5960.,  9030.,
        3586.,  7911.,  9643.,  3497.,  1872.,  7883.,  6612.,  7856.,
        7244.,   869.,  9573.,  4638.,  8595.,  6592.,  8737.,  2791.,
        1894.,  4313.])

In [180]:
# for each value, the index of the first bin edge >= it (default side='left'),
# so label i means bins[i-1] < value <= bins[i]
labels = bins.searchsorted(data)
labels

array([3, 3, 2, 4, 3, 3, 2, 4, 2, 3, 3, 4, 3, 4, 3, 3, 4, 4, 3, 3, 4, 2, 3,
       4, 4, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 4, 4, 2, 4, 3, 4, 4,
       4, 3, 3, 3])

In [181]:
import pandas as pd

In [182]:
pd.Series(data).groupby(labels).mean()

2     597.000000
3    2684.454545
4    7348.045455
dtype: float64

In [183]:
# np.digitize defaults to right=False, which matches searchsorted(side='right').
# right=True reproduces bins.searchsorted(data) (side='left') exactly, even
# when a value falls on a bin edge.
np.digitize(data, bins, right=True)

array([3, 3, 2, 4, 3, 3, 2, 4, 2, 3, 3, 4, 3, 4, 3, 3, 4, 4, 3, 3, 4, 2, 3,
       4, 4, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4, 4, 4, 2, 4, 3, 4, 4,
       4, 3, 3, 3])

### memory-mapped files

In [184]:
# mode='w+' creates (or overwrites) the backing file; 10000 * 10000 float64
# values occupy 800 MB on disk
mmap = np.memmap('testmemmap', dtype=np.float64, shape=(10000, 10000), mode='w+')

In [185]:
mmap

memmap([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [187]:
section = mmap[:5]  # same, slice creates VIEWS

In [188]:
section[:] = np.random.randn(5, 10000)

In [189]:
mmap

memmap([[ 0.30035647,  0.50913131,  1.91651451, ...,  1.56097057,
        -2.88615489, -0.57053111],
       [ 1.35476772,  2.20596238,  1.47651762, ..., -0.46520529,
         0.78499977,  1.70854897],
       [-0.98642343, -0.55482546, -0.5652112 , ..., -0.60180777,
        -0.53515667,  0.6168574 ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [190]:
mmap.flush()  # flush to commit the change

In [191]:
del mmap

In [195]:
mmap = np.memmap('testmemmap', dtype=np.float64, shape=(10000, 10000))  # dtype and shape are still needed

In [196]:
mmap

memmap([[ 0.30035647,  0.50913131,  1.91651451, ...,  1.56097057,
        -2.88615489, -0.57053111],
       [ 1.35476772,  2.20596238,  1.47651762, ..., -0.46520529,
         0.78499977,  1.70854897],
       [-0.98642343, -0.55482546, -0.5652112 , ..., -0.60180777,
        -0.53515667,  0.6168574 ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [197]:
mmap[0, 0] = np.pi

In [199]:
del mmap  # automatically flush

In [200]:
mmap = np.memmap('testmemmap', dtype=np.float64, shape=(10000, 10000))

In [201]:
mmap

memmap([[ 3.14159265,  0.50913131,  1.91651451, ...,  1.56097057,
        -2.88615489, -0.57053111],
       [ 1.35476772,  2.20596238,  1.47651762, ..., -0.46520529,
         0.78499977,  1.70854897],
       [-0.98642343, -0.55482546, -0.5652112 , ..., -0.60180777,
        -0.53515667,  0.6168574 ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

### the importance of contiguous memory

In [202]:
# same values in two memory layouts: row-major (C) and column-major (Fortran)
arr_C, arr_F = (np.ones((1000, 1000), order=layout) for layout in ('C', 'F'))

In [203]:
arr_C.flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [204]:
arr_F.flags

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [206]:
arr_F.flags.f_contiguous

True

In [209]:
%timeit arr_C.sum(1)  # row sums on the row-major array; expected to beat arr_F, but the recorded timings are about equal (NOTE(review): cause not investigated)

100 loops, best of 3: 2.53 ms per loop


In [208]:
%timeit arr_F.sum(1)  # sum along the rows for column-major order

100 loops, best of 3: 2.34 ms per loop


In [210]:
arr_F.copy(order='C').flags  # copy column-major order to row-major order

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [211]:
arr_C[:50].flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [212]:
arr_C[:, :50].flags  # slicing NOT always contiguous

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False