# Advanced NumPy

In [1]:
import numpy as np
import pandas as pd
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

## ndarray Object Internals

In [2]:
np.ones((10, 5)).shape

(10, 5)

In [3]:
np.ones((3, 4, 5), dtype=np.float64).strides

(160, 40, 8)

### NumPy dtype Hierarchy

In [4]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)

True

True

In [5]:
np.float64.mro()

[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

In [6]:
np.issubdtype(ints.dtype, np.number)

True

## Advanced Array Manipulation

### Reshaping Arrays

In [7]:
arr = np.arange(8)
arr
arr.reshape((4, 2))

array([0, 1, 2, 3, 4, 5, 6, 7])

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [8]:
arr.reshape((4, 2)).reshape((2, 4))

array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [9]:
arr = np.arange(15)
arr.reshape((5, -1))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [10]:
other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)

(3, 5)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [11]:
arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [12]:
arr.flatten()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

### C Versus Fortran Order

In [13]:
arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

### Concatenating and Splitting Arrays

In [14]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [15]:
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [16]:
arr = np.random.randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
second
third

array([[-0.2047,  0.4789],
       [-0.5194, -0.5557],
       [ 1.9658,  1.3934],
       [ 0.0929,  0.2817],
       [ 0.769 ,  1.2464]])

array([[-0.2047,  0.4789]])

array([[-0.5194, -0.5557],
       [ 1.9658,  1.3934]])

array([[0.0929, 0.2817],
       [0.769 , 1.2464]])

#### Stacking helpers: r_ and c_

In [17]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = np.random.randn(3, 2)
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]

array([[ 0.    ,  1.    ],
       [ 2.    ,  3.    ],
       [ 4.    ,  5.    ],
       [ 1.0072, -1.2962],
       [ 0.275 ,  0.2289],
       [ 1.3529,  0.8864]])

array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [ 1.0072, -1.2962,  3.    ],
       [ 0.275 ,  0.2289,  4.    ],
       [ 1.3529,  0.8864,  5.    ]])

In [18]:
np.c_[1:6, -10:-5]

array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

### Repeating Elements: tile and repeat

In [19]:
arr = np.arange(3)
arr
arr.repeat(3)

array([0, 1, 2])

array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [20]:
arr.repeat([2, 3, 4])

array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [21]:
arr = np.random.randn(2, 2)
arr
arr.repeat(2, axis=0)

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

array([[-2.0016, -0.3718],
       [-2.0016, -0.3718],
       [ 1.669 , -0.4386],
       [ 1.669 , -0.4386]])

In [22]:
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)

array([[-2.0016, -0.3718],
       [-2.0016, -0.3718],
       [ 1.669 , -0.4386],
       [ 1.669 , -0.4386],
       [ 1.669 , -0.4386]])

array([[-2.0016, -2.0016, -0.3718, -0.3718, -0.3718],
       [ 1.669 ,  1.669 , -0.4386, -0.4386, -0.4386]])

In [23]:
arr
np.tile(arr, 2)

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

array([[-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386]])

In [24]:
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

array([[-2.0016, -0.3718],
       [ 1.669 , -0.4386],
       [-2.0016, -0.3718],
       [ 1.669 , -0.4386]])

array([[-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386],
       [-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386],
       [-2.0016, -0.3718, -2.0016, -0.3718],
       [ 1.669 , -0.4386,  1.669 , -0.4386]])

### Fancy Indexing Equivalents: take and put

In [25]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]

array([700, 100, 200, 600])

In [26]:
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr

array([700, 100, 200, 600])

array([  0,  42,  42, 300, 400, 500,  42,  42, 800, 900])

array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [27]:
inds = [2, 0, 2, 1]
arr = np.random.randn(2, 4)
arr
arr.take(inds, axis=1)

array([[-0.5397,  0.477 ,  3.2489, -1.0212],
       [-0.5771,  0.1241,  0.3026,  0.5238]])

array([[ 3.2489, -0.5397,  3.2489,  0.477 ],
       [ 0.3026, -0.5771,  0.3026,  0.1241]])

## Broadcasting

In [28]:
arr = np.arange(5)
arr
arr * 4

array([0, 1, 2, 3, 4])

array([ 0,  4,  8, 12, 16])

In [29]:
arr = np.random.randn(4, 3)
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
demeaned.mean(0)

array([-0.3928, -0.3824, -0.8768])

array([[ 0.3937,  1.7263,  0.1633],
       [-0.4384, -1.9878, -0.9839],
       [-0.468 ,  0.9426, -0.3891],
       [ 0.5126, -0.6811,  1.2097]])

array([-0.,  0., -0.])

In [30]:
arr
row_means = arr.mean(1)
row_means.shape
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)

array([[ 0.0009,  1.3438, -0.7135],
       [-0.8312, -2.3702, -1.8608],
       [-0.8608,  0.5601, -1.2659],
       [ 0.1198, -1.0635,  0.3329]])

(4,)

array([[ 0.2104],
       [-1.6874],
       [-0.5222],
       [-0.2036]])

array([ 0., -0.,  0.,  0.])

### Broadcasting Over Other Axes

In [31]:
arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [32]:
arr - arr.mean(1).reshape((4, 1))

array([[-0.2095,  1.1334, -0.9239],
       [ 0.8562, -0.6828, -0.1734],
       [-0.3386,  1.0823, -0.7438],
       [ 0.3234, -0.8599,  0.5365]])

In [33]:
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]

(4, 1, 4)

array([[-2.3594],
       [-0.1995],
       [-1.542 ]])

array([[-2.3594, -0.1995, -1.542 ]])

In [34]:
arr = np.random.randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means
depth_means.shape
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)

array([[-0.4735,  0.3971, -0.0228,  0.2001],
       [-0.3521, -0.281 , -0.071 , -0.1586],
       [ 0.6245,  0.6047,  0.4396, -0.2846]])

(3, 4)

array([[ 0.,  0., -0., -0.],
       [ 0.,  0., -0.,  0.],
       [ 0.,  0., -0., -0.]])

```python
def demean_axis(arr, axis=0):
    """Subtract the mean taken along ``axis`` from ``arr`` using broadcasting.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to demean.
    axis : int, default 0
        Axis along which the means are computed and removed.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``arr`` whose mean along ``axis`` is
        zero up to floating-point error.
    """
    means = arr.mean(axis)

    # This generalizes things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    # A multidimensional index must be a tuple; a list of slices is treated
    # as a fancy index and rejected by current NumPy releases.
    return arr - means[tuple(indexer)]
```

### Setting Array Values by Broadcasting

In [35]:
arr = np.zeros((4, 3))
arr[:] = 5
arr

array([[5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.],
       [5., 5., 5.]])

In [36]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arr

array([[ 1.28,  1.28,  1.28],
       [-0.42, -0.42, -0.42],
       [ 0.44,  0.44,  0.44],
       [ 1.6 ,  1.6 ,  1.6 ]])

array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

## Advanced ufunc Usage（ufunc的高级使用）

### ufunc Instance Methods（ufunc实例方法）

In [38]:
# reduce takes a single array and aggregates its values (optionally along an
# axis) by applying the binary operation repeatedly.
# np.add.reduce sums the elements of the array.
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()

45

45

In [45]:
# Use np.logical_and.reduce to check whether the values in each row are sorted
np.random.seed(12346)  # for reproducibility
arr = np.random.randn(5, 5)
arr
arr[::2]
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]  # compare the first four columns with the last four
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)

array([[-0.09  ,  0.7594,  0.7483, -0.9815,  0.3658],
       [-0.3154, -0.8661,  0.0279, -0.4556, -1.6019],
       [ 0.2483, -0.3215, -0.8487,  0.0005, -0.5465],
       [ 0.2539,  1.9368, -0.7995, -0.5692,  0.0489],
       [-0.6491, -0.4795, -0.9535,  1.4225,  0.1754]])

array([[-0.09  ,  0.7594,  0.7483, -0.9815,  0.3658],
       [ 0.2483, -0.3215, -0.8487,  0.0005, -0.5465],
       [-0.6491, -0.4795, -0.9535,  1.4225,  0.1754]])

array([[ True,  True,  True,  True],
       [False,  True, False, False],
       [ True,  True,  True,  True],
       [ True, False,  True,  True],
       [ True,  True,  True,  True]])

array([ True, False,  True, False,  True])

In [46]:
# accumulate is to reduce what cumsum is to sum: it produces an array of the
# same size as the input holding the intermediate "accumulated" values
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)

array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

In [48]:
# outer computes the pairwise (outer) product of two arrays; the number of
# dimensions of the result is the sum of the dimensions of the two inputs.
# For comparison, plain multiply:
#   1. combines the values at matching positions (element-wise arithmetic)
#   2. expects both arguments and the result to share one shape
arr = np.arange(3).repeat([1, 2, 2])
arr
# The first argument determines the rows of the result, the second its columns
np.multiply.outer(arr, np.arange(5))

array([0, 1, 1, 2, 2])

array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [49]:
# The number of dimensions of outer's result is the sum of the dimensions of
# its two inputs
x, y = np.random.randn(3, 4), np.random.randn(5)
result = np.subtract.outer(x, y)
result.shape

(3, 4, 5)

In [52]:
# reduceat performs a "local reduce": in essence a groupby operation that
# aggregates slices of the data.
# It takes a sequence of "bin edges" describing how to split and aggregate the values.
arr = np.arange(10)
# Result: the reduction (a sum here) is applied to arr[0:5], arr[5:8] and arr[8:]
np.add.reduceat(arr, [0, 5, 8])

array([10, 18, 17])

In [51]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)

array([[ 0,  0,  0,  0,  0],
       [ 0,  1,  2,  3,  4],
       [ 0,  2,  4,  6,  8],
       [ 0,  3,  6,  9, 12]])

array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

### Writing New ufuncs in Python（自定义 ufunc）

In [55]:
# Simple function implementing element-wise addition
def add_elements(x, y):
    """Return ``x + y`` for a single pair of operands."""
    total = x + y
    return total
# Wrap the Python function as a ufunc taking 2 inputs and producing 1 output
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))  # add_them is a numpy.ufunc

array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [56]:
# numpy.vectorize is less powerful than numpy.frompyfunc, but it is smarter
# about type inference
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))  # add_them is a numpy.lib.function_base.vectorize object

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [58]:
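# Performance check: both wrappers give a way to build ufunc-like functions, but
# they are very slow because computing each element costs one Python function call.
# That is naturally much slower than NumPy's own C-based ufuncs.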
arr = np.random.randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)

857 µs ± 7.29 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.89 µs ± 8.43 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Structured and Record Arrays（结构化和记录式数组）

In [59]:
# The usual way to define a structured dtype is a list of tuples, each of the
# form (field_name, field_data_type).
# The array's elements then become tuple-like objects whose fields can be
# accessed like dictionary entries.
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr

array([(1.5   ,  6), (3.1416, -2)], dtype=[('x', '<f8'), ('y', '<i4')])

In [60]:
sarr[0]
sarr[0]['y']

(1.5, 6)

6

In [64]:
# Field names are stored in the dtype.names attribute; accessing a field of a
# structured array returns a view on the data
sarr['x']

array([1.5   , 3.1416])

### Nested dtypes and Multidimensional Fields（嵌套dtype和多维字段）

In [65]:
# When defining a structured dtype, a field can additionally be given a shape
# (either an integer or a tuple)
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr

array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)],
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [66]:
# In this case the x field of each record is an array of length 3
arr[0]['x']

array([0, 0, 0])

In [68]:
# Accessing arr['x'] therefore yields a two-dimensional array rather than the
# one-dimensional array of the earlier example
arr['x']

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [70]:
# A single array's block of memory can hold complex nested structures
# Nested dtype
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']

array([(1., 2.), (3., 4.)], dtype=[('a', '<f8'), ('b', '<f4')])

array([5, 6], dtype=int32)

array([1., 3.])

### Why Use Structured Arrays?（为什么使用结构化数组）

### 结构化数组操作：numpy.lib.recfunctions

## More About Sorting

In [72]:
# Like Python's built-in list, ndarray's sort instance method sorts in place:
# the contents are rearranged without creating a new array
arr = np.random.randn(6)
arr.sort() # instance method sorts in place, whereas np.sort(arr) leaves the original untouched; neither has a descending option
arr

array([-0.8189, -0.5783,  0.1554,  0.523 ,  0.7867,  1.8125])

In [73]:
arr = np.random.randn(3, 5)
arr
# If the array being sorted is only a view, the original array is modified
arr[:, 0].sort()  # Sort first column values in-place
arr

array([[ 0.3634,  0.9312, -0.9998,  0.4323,  0.5696],
       [-0.282 , -0.2629,  0.7848,  0.5311,  0.45  ],
       [-0.6169,  0.1529,  0.1935,  0.4795,  0.0288]])

array([[-0.6169,  0.9312, -0.9998,  0.4323,  0.5696],
       [-0.282 , -0.2629,  0.7848,  0.5311,  0.45  ],
       [ 0.3634,  0.1529,  0.1935,  0.4795,  0.0288]])

In [None]:
# By contrast, np.sort(arr) creates a sorted copy of the original array
arr = np.random.randn(5)
arr
np.sort(arr)
arr

In [74]:
arr = np.random.randn(3, 5)
arr
# Sort along a given axis
arr.sort(axis=1)
arr

array([[-0.1513, -0.2368,  0.61  , -0.385 , -1.4666],
       [-0.3805, -0.5927, -0.4491, -1.5949,  0.0232],
       [-1.3555, -0.7328,  0.5663,  1.1501,  0.5902]])

array([[-1.4666, -0.385 , -0.2368, -0.1513,  0.61  ],
       [-1.5949, -0.5927, -0.4491, -0.3805,  0.0232],
       [-1.3555, -0.7328,  0.5663,  0.5902,  1.1501]])

In [76]:
# Slicing with [::-1] returns the values in reverse order; applied along
# axis 1 it reverses every row
arr[:, ::-1]

array([[ 0.61  , -0.1513, -0.2368, -0.385 , -1.4666],
       [ 0.0232, -0.3805, -0.4491, -0.5927, -1.5949],
       [ 1.1501,  0.5902,  0.5663, -0.7328, -1.3555]])

### Indirect Sorts: argsort and lexsort（间接排序：argsort和lexsort）

In [81]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort() # index array (the "indexer")
# For each position of the sorted result, argsort gives the index of that
# element in the original array: values[1] = 0 is the smallest, so index 1
# comes first, giving [1, 2, 4, 3, 0]. These are indices into `values`.
indexer 
values[indexer]

array([1, 2, 4, 3, 0])

array([0, 1, 2, 3, 5])

In [83]:
# Reorder a 2-D array by the values of its first row
arr = np.random.randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()] # arr[0].argsort() gives the original column indices in sorted order

array([[ 5.    ,  0.    ,  1.    ,  3.    ,  2.    ],
       [-0.4247, -1.3484, -0.7371, -0.5543, -1.1869],
       [ 1.9229,  0.6173, -0.4207, -0.8286, -1.1959]])

array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [-1.3484, -0.7371, -1.1869, -0.5543, -0.4247],
       [ 0.6173, -0.4207, -1.1959, -0.8286,  1.9229]])

In [91]:
# lexsort performs an indirect (lexicographic) sort on several key arrays at once
# lexsort applies the keys starting from the LAST one passed in
# Example: sort data identified by first and last names (last_name is passed
# last, so it is the primary sort key)
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
sorter
# zip() pairs up the corresponding elements of its iterable arguments into
# tuples; in Python 3 it returns a lazy iterator, so wrap it in list() to see
# the pairs.
zip(last_name[sorter], first_name[sorter])
list(zip(last_name[sorter], first_name[sorter]))

array([1, 2, 3, 0, 4])

<zip at 0x7fb49473e4c8>

[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

### Alternative Sort Algorithms（其他排序算法）

|kind|速度|稳定性|工作空间|最坏的情况|
|- |:---|---:|:---:|:---:|
|'quicksort'|1|否|0|O(n^2)|
|'mergesort'|2|是|n/2|O(n logn)|
|'heapsort'|3|否|0|O(n logn)|


In [92]:
values = np.array(['2:first', '2:second', '1:first', '1:second',
                   '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)

array([2, 3, 4, 0, 1])

array(['1:first', '1:second', '1:third', '2:first', '2:second'],
      dtype='<U8')

### Partially Sorting Arrays

In [99]:
# np.partition(a, kth) rearranges a copy of `a` so that the element at index
# `kth` is the one that would be there in a fully sorted array; all smaller
# elements come before it and all equal-or-larger ones after it. It is only a
# partial sort: the order within each side is unspecified.
# The data is deliberately NOT named `list`: that would shadow the builtin and
# break every later `list(...)` call in the same kernel.
nums = [3, 4, 5, 2, 1]
np.partition(nums, 3)
# kth=3 selects the value at sorted position 3 (0-based), i.e. 4:
# the smaller elements 2, 1 and 3 end up before it and 5 after it.
# np.argpartition returns the indices that produce the same arrangement.
np.argpartition(nums, 3)

array([2, 1, 3, 4, 5])

array([3, 4, 0, 1, 2])

In [97]:
np.random.seed(12345)
arr = np.random.randn(20)
arr
np.partition(arr, 3)

array([-0.2047,  0.4789, -0.5194, -0.5557,  1.9658,  1.3934,  0.0929,
        0.2817,  0.769 ,  1.2464,  1.0072, -1.2962,  0.275 ,  0.2289,
        1.3529,  0.8864, -2.0016, -0.3718,  1.669 , -0.4386])

array([-2.0016, -1.2962, -0.5557, -0.5194, -0.3718, -0.4386, -0.2047,
        0.2817,  0.769 ,  0.4789,  1.0072,  0.0929,  0.275 ,  0.2289,
        1.3529,  0.8864,  1.3934,  1.9658,  1.669 ,  1.2464])

In [98]:
indices = np.argpartition(arr, 3)
indices
arr.take(indices)

array([16, 11,  3,  2, 17, 19,  0,  7,  8,  1, 10,  6, 12, 13, 14, 15,  5,
        4, 18,  9])

array([-2.0016, -1.2962, -0.5557, -0.5194, -0.3718, -0.4386, -0.2047,
        0.2817,  0.769 ,  0.4789,  1.0072,  0.0929,  0.275 ,  0.2289,
        1.3529,  0.8864,  1.3934,  1.9658,  1.669 ,  1.2464])

### numpy.searchsorted: Finding Elements in a Sorted Array（在有序数组中查找元素）

In [101]:
arr = np.array([0, 1, 7, 12, 15]) # a sorted array
arr.searchsorted(9)

3

In [104]:
# By default, the index of the left side of a group of equal values is returned
arr.searchsorted([0, 8, 11, 16])

array([0, 3, 3, 5])

In [105]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')

array([0, 3])

array([3, 7])

In [106]:
# We have an array of data (values between 0 and 10,000) and a separate array
# of "bin edges" that we want to use to split the data into bins
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data

array([9940., 6768., 7908., 1709.,  268., 8003., 9037.,  246., 4917.,
       5262., 5963.,  519., 8950., 7282., 8183., 5002., 8101.,  959.,
       2189., 2587., 4681., 4593., 7095., 1780., 5314., 1677., 7688.,
       9281., 6094., 1501., 4896., 3773., 8486., 9110., 3838., 3154.,
       5683., 1878., 1258., 6875., 7996., 5735., 9732., 6340., 8884.,
       4954., 3516., 7142., 5039., 2256.])

In [108]:
# bins.searchsorted(data): assign each data point to a bin
labels = bins.searchsorted(data) # number of the interval each data point falls in (1 means the bin (0, 100])
labels

array([4, 4, 4, 3, 2, 4, 4, 2, 3, 4, 4, 2, 4, 4, 4, 4, 4, 2, 3, 3, 3, 3,
       4, 3, 4, 3, 4, 4, 4, 3, 3, 3, 4, 4, 3, 3, 4, 3, 3, 4, 4, 4, 4, 4,
       4, 3, 3, 4, 4, 3])

In [112]:
# Combined with pandas' groupby, these labels make it easy to split up the
# original data set
type(pd.Series(data).groupby(labels))
pd.Series(data).groupby(labels).mean()

pandas.core.groupby.groupby.SeriesGroupBy

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

In [115]:
# numpy's digitize function can also compute such bin numbers
labels=np.digitize(data, bins)
pd.Series(data).groupby(labels).mean()

2     498.000000
3    3064.277778
4    7389.035714
dtype: float64

#### 注意：索引问题导致的shape不一致

In [136]:
x = np.arange(16).reshape(4,4)
x[:,0] # one column of data, returned as a 1-D array
x[:,0].shape
y = x[:,:1] # slicing (instead of integer indexing) keeps the result two-dimensional
x[:,:1].shape

array([ 0,  4,  8, 12])

(4,)

(4, 1)

In [137]:
# The product y^T * y
y.shape
np.dot(y.T, y)

(4, 1)

array([[224]])

### NumPy的matrix类

In [138]:
xM = np.matrix(x)
yM = xM[:,0]
xM
yM
yM.shape
yM.T * xM * yM

matrix([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11],
        [12, 13, 14, 15]])

matrix([[ 0],
        [ 4],
        [ 8],
        [12]])

(4, 1)

matrix([[6720]])

In [139]:
# The I attribute returns the inverse of the matrix.
# NOTE: xM is built from np.arange(16).reshape(4, 4), whose rows are linearly
# dependent, so it is singular and has no true inverse; that is why the
# product below does not come out as the identity matrix.
xM.I * xM

matrix([[-2.,  0., -2., -2.],
        [ 2.,  0.,  0.,  2.],
        [ 0.,  4.,  0.,  4.],
        [ 0.,  0.,  0.,  0.]])

## Writing Fast NumPy Functions with Numba

In [140]:
import numpy as np

def mean_distance(x, y):
    """Return the mean of the element-wise differences ``x[i] - y[i]``.

    Only the first ``len(x)`` entries of ``y`` are read. The differences are
    signed, so positive and negative gaps cancel each other out. In pure
    Python an empty ``x`` raises ``ZeroDivisionError``.
    """
    n = len(x)
    total = 0.0
    for idx in range(n):
        total += x[idx] - y[idx]
    # Every index in range(n) is visited exactly once, so n is the element count.
    return total / n

```python
In [209]: x = np.random.randn(10000000)

In [210]: y = np.random.randn(10000000)

In [211]: %timeit mean_distance(x, y)
1 loop, best of 3: 2 s per loop

In [212]: %timeit (x - y).mean()
100 loops, best of 3: 14.7 ms per loop
```

```python
In [213]: import numba as nb

In [214]: numba_mean_distance = nb.jit(mean_distance)
```

```python
@nb.jit
def mean_distance(x, y):
    """Return the mean of the element-wise differences ``x[i] - y[i]``.

    Same loop as the pure-Python version, compiled by Numba through the
    ``nb.jit`` decorator. Only the first ``len(x)`` entries of ``y`` are read.
    """
    n = len(x)
    total = 0.0
    for idx in range(n):
        total += x[idx] - y[idx]
    # Every index in range(n) is visited exactly once, so n is the element count.
    return total / n
```

```python
In [215]: %timeit numba_mean_distance(x, y)
100 loops, best of 3: 10.3 ms per loop
```

```python
from numba import float64, njit

@njit(float64(float64[:], float64[:]))
def mean_distance(x, y):
    """Return the mean of ``x - y``.

    The explicit signature declares two 1-D float64 arrays as inputs and a
    float64 scalar as the result.
    """
    diff = x - y
    return diff.mean()
```

### Creating Custom numpy.ufunc Objects with Numba

```python
from numba import vectorize

@vectorize
def nb_add(x, y):
    """Scalar kernel returning ``x + y``; ``vectorize`` turns it into a ufunc."""
    total = x + y
    return total
```

```python
In [13]: x = np.arange(10)

In [14]: nb_add(x, x)
Out[14]: array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.])

In [15]: nb_add.accumulate(x, 0)
Out[15]: array([  0.,   1.,   3.,   6.,  10.,  15.,  21.,  28.,  36.,  45.])
```

## Advanced Array Input and Output（高级数组的输入输出）

### Memory-Mapped Files

In [141]:
# Create a new memmap by calling np.memmap with a file path, dtype, file mode
# and shape
mmap = np.memmap('mymmap', dtype='float64', mode='w+',
                 shape=(10000, 10000))
mmap

memmap([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [142]:
# Slicing a memmap returns a view on the data on disk
section = mmap[:5]

In [143]:
# Data assigned to such a view is first buffered in memory (much like a Python
# file object); calling flush writes it to disk
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap

memmap([[ 0.7584, -0.6605,  0.8626, ...,  0.6046, -0.6212,  2.0542],
        [-1.2113, -1.0375,  0.7093, ..., -1.4117, -0.1719, -0.8957],
        [-0.1419, -0.3375,  0.4329, ...,  1.2914, -0.752 , -0.44  ],
        ...,
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
        [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [146]:
# When opening an existing memory map you still have to specify the dtype and
# shape, because the file on disk is just a block of binary data
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap

FileNotFoundError: [Errno 2] No such file or directory: 'mymmap'

In [147]:
%xdel mmap
!rm mymmap

NameError: name 'mmap' is not defined
rm: cannot remove 'mymmap': No such file or directory


### HDF5 and Other Array Storage Options

## Performance Tips

### The Importance of Contiguous Memory

In [150]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous
arr_f.flags.c_contiguous

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

True

False

In [152]:
# Summing the rows of a C-ordered array is generally fastest: each row is
# contiguous in memory, so no strided memory access is needed
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)

215 µs ± 437 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
385 µs ± 727 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [154]:
# To improve performance: if an array does not have the memory order you
# need, use copy and pass 'C' or 'F'
arr_f.copy('C').flags

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [158]:
# A view constructed from an array is not guaranteed to be contiguous
arr_c[:50].flags.contiguous
arr_c[:, :50].flags

NameError: name 'arr_c' is not defined

In [156]:
%xdel arr_c
%xdel arr_f

#### 其他加速手段

In [162]:
# Cython function that sums all the elements of a one-dimensional array
from numpy cimport ndarray,float64_t
# NOTE: this is Cython, not Python. The typed signature and the ``cdef``
# declarations need the Cython compiler (for example a ``%%cython`` cell after
# ``%load_ext Cython``); run as a plain Python cell it fails with SyntaxError.
def sum_elements(ndarray[float64_t] arr):
    # Loop index and length are declared as Py_ssize_t, the accumulator as
    # float64_t, so every variable used in the loop has a static C type.
    cdef Py_ssize_t i, n = len(arr)
    cdef float64_t result = 0
    
    for i in range(n):
        result += arr[i]
    
    return result

SyntaxError: invalid syntax (<ipython-input-162-90140f21a2f0>, line 2)

In [163]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS