# Chapter9 程序的高速化

▲清单 9.1 “%timeit”命令的示例①

In [1]:
import numpy as np

# 100,000 samples drawn from the standard normal distribution, used as the benchmark input
x = np.random.randn(100000)

# %timeit runs the statement repeatedly and reports the mean and std. dev. per loop
%timeit np.sum(x)

52.5 µs ± 2.18 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


▲清单 9.2 “%timeit”命令的示例②

In [2]:
# -o makes %timeit return its result object in addition to printing the summary
res = %timeit -o np.sum(x)
# vars() shows the attributes stored on that result object as a dict
vars(res)

53.1 µs ± 3.33 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


{'loops': 10000,
 'repeat': 7,
 'best': 4.943126000000007e-05,
 'worst': 5.855125999999977e-05,
 'all_runs': [0.5005145999999989,
  0.5736223000000003,
  0.535946599999999,
  0.5163469000000003,
  0.5093560000000004,
  0.49431260000000066,
  0.5855125999999977],
 'compile_time': 4.679999999979145e-05,
 '_precision': 3,
 'timings': [5.005145999999989e-05,
  5.736223000000003e-05,
  5.3594659999999906e-05,
  5.163469000000003e-05,
  5.0935600000000035e-05,
  4.943126000000007e-05,
  5.855125999999977e-05]}

▲清单 9.3 “%timeit”命令的示例③

In [3]:
# -n sets the number of loops per run and -r the number of runs
%timeit -n 100 -r 5 np.sum(x)

64.2 µs ± 15.9 µs per loop (mean ± std. dev. of 5 runs, 100 loops each)


▲清单 9.4 “%prun”命令的示例

In [4]:
import numpy as np
import time

def myfun(n):
    """Profiling target: build random operands, pause briefly, then multiply them.

    Creates an ``n`` x ``n`` array and an ``n`` x 1 array with ``np.random.rand``,
    sleeps for 0.1 seconds, and computes their matrix product. The product is
    only bound to a local name; the function has no ``return`` statement.
    """
    A = np.random.rand(n, n)
    b = np.random.rand(n, 1)
    time.sleep(.1)  # fixed 0.1 s pause
    res = A @ b  # the product is not used afterwards
    
%prun myfun(10000)

 

▲清单 9.5 启用“%lprun”命令

In [5]:
%load_ext line_profiler

▲清单 9.6 “%lprun”命令的示例

In [6]:
# -f names the function whose lines are timed; the trailing statement is what gets executed
%lprun -f myfun myfun(10000)

▲清单 9.7 自定义函数与 NumPy 函数的处理时间的对比

In [7]:
import numpy as np

def max_py(x, y):
    """Return the element-wise maximum of ``x`` and ``y`` using a plain Python loop.

    The output is allocated with ``np.empty_like(x)``, so it takes its shape
    and dtype from ``x``. ``y`` is read at every index in ``range(len(x))``.
    """
    out = np.empty_like(x)
    n_items = len(x)

    for idx in range(n_items):
        out[idx] = max(x[idx], y[idx])

    return out

x = np.random.rand(1000000)
y = np.random.rand(1000000)

# Execution time of the custom function versus the NumPy function
%timeit max_py(x, y)
%timeit np.maximum(x, y)

624 ms ± 34.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
11.5 ms ± 428 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


▲清单 9.8 启用“%%cython”命令

In [8]:
%load_ext cython

▲清单 9.9 “%%cython”命令的示例

In [9]:
%%cython
import numpy as np

def max_cy(x, y):
    """Return the element-wise maximum of ``x`` and ``y``.

    The body is ordinary Python with no type declarations. The output is
    allocated with ``np.empty_like(x)`` and ``y`` is read at every index in
    ``range(len(x))``.
    """
    out = np.empty_like(x)
    n_items = len(x)

    for idx in range(n_items):
        out[idx] = max(x[idx], y[idx])

    return out

▲清单 9.10 测算 max_cy 函数的执行时间



In [10]:
%timeit max_cy(x, y)

213 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


▲清单 9.11 基于 -a 选项输出报告

In [11]:
%%cython -a
import numpy as np

def max_cy(x, y):
    """Return the element-wise maximum of ``x`` and ``y``.

    The body is ordinary Python with no type declarations. The output is
    allocated with ``np.empty_like(x)`` and ``y`` is read at every index in
    ``range(len(x))``.
    """
    out = np.empty_like(x)
    n_items = len(x)

    for idx in range(n_items):
        out[idx] = max(x[idx], y[idx])

    return out

▲清单 9.12 创建添加了类型声明的函数

In [12]:
%%cython
from cython import boundscheck, wraparound
import numpy as np

@boundscheck(False)
@wraparound(False)
def max_typed(double[:] x, double[:] y):
    """Return the element-wise maximum of two 1-D ``double`` buffers.

    Parameters
    ----------
    x, y : double[:]
        One-dimensional typed memoryviews. ``y`` must hold at least as many
        elements as ``x``; only its first ``x.shape[0]`` elements are read.

    Returns
    -------
    double[:]
        Typed memoryview over a new array allocated with ``np.empty_like(x)``.

    Raises
    ------
    ValueError
        If ``y`` is shorter than ``x``.
    """
    # Py_ssize_t is the C type for sizes and indices; a plain C int can
    # overflow on very large buffers.
    cdef Py_ssize_t i
    cdef Py_ssize_t n = x.shape[0]
    cdef double[:] res

    # Bounds checking is disabled for this function, so reading y[i] past the
    # end of y would be undefined behaviour. Validate the length once up front.
    if y.shape[0] < n:
        raise ValueError("y must have at least as many elements as x")

    res = np.empty_like(x)

    for i in range(n):
        res[i] = max(x[i], y[i])

    return res

▲清单 9.13 测算 max_typed 函数的执行时间

In [13]:
%timeit max_typed(x, y)

6.57 ms ± 318 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


▲清单 9.14 @jit 效果的确认

In [14]:
import numpy as np
from numba import jit

@jit(nopython=True)
def max_jit(x, y):
    """Return the element-wise maximum of ``x`` and ``y``.

    Same loop as the plain Python version, decorated with ``@jit`` in
    nopython mode. The output is allocated with ``np.empty_like(x)`` and
    ``y`` is read at every index in ``range(len(x))``.
    """
    out = np.empty_like(x)
    n_items = len(x)

    for idx in range(n_items):
        out[idx] = max(x[idx], y[idx])

    return out

x = np.random.rand(1000000)
y = np.random.rand(1000000)

# Compare the original Python function, the NumPy function, and the Numba-compiled function
%timeit max_jit.py_func(x, y)
%timeit np.maximum(x, y)
%timeit max_jit(x, y)

591 ms ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
10.7 ms ± 242 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.3 ms ± 664 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


▲清单 9.15 使用 @jit 实现多核 CPU 的并行处理

In [15]:
from numba import prange

@jit(nopython=True, parallel=True)
def max_jit_parallel(x, y):
    """Return the element-wise maximum of ``x`` and ``y``.

    Same loop as ``max_jit``, but compiled with ``parallel=True`` and
    iterating with ``prange`` instead of ``range``. The output is allocated
    with ``np.empty_like(x)``.
    """
    out = np.empty_like(x)
    n_items = len(x)

    for idx in prange(n_items):
        out[idx] = max(x[idx], y[idx])

    return out

%timeit max_jit_parallel(x, y)

5.73 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


▲清单 9.16 np.vectorize 函数的示例

In [16]:
import math

def myfun(a, b):
    """Scalar piecewise function used as the vectorization example.

    Parameters
    ----------
    a, b : float
        Scalar operands.

    Returns
    -------
    float
        ``math.sin(b - a)`` when ``a < b``, ``math.cos(a - b)`` when ``a > b``,
        and ``0.0`` otherwise.
    """
    if a < b:
        return math.sin(b - a)
    elif a > b:
        return math.cos(a - b)
    else:
        # Return a float, not the int 0: np.vectorize infers the output dtype
        # from the first call, so an int here could turn the whole result into
        # an integer array and truncate every other value.
        return 0.0
    
# Wrap the scalar function so it can be called element-wise on arrays
myfun_np = np.vectorize(myfun)

# Two arrays of 1,000,000 uniform random values in [0, 1), each multiplied by 2*pi
x = 2 * np.pi * np.random.rand(1000000)
y = 2 * np.pi * np.random.rand(1000000)
    
%timeit myfun_np(x, y)

450 ms ± 9.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


▲清单 9.17 确认 @vectorize 装饰器效果

In [17]:
from numba import vectorize

@vectorize(nopython=True)
def myfun(a, b):
    """Scalar kernel compiled by Numba's ``@vectorize``.

    Returns ``math.sin(b - a)`` when ``a < b``, ``math.cos(a - b)`` when
    ``a > b``, and ``0`` otherwise.
    """
    if a < b:
        return math.sin(b - a)
    if a > b:
        return math.cos(a - b)
    return 0
    
%timeit myfun(x, y)

12.2 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


▲清单 9.18 使用 @vectorize 装饰器实现多核 CPU 的并行处理

In [18]:
# Explicit signatures are listed for integer and floating-point inputs;
# target='parallel' selects the parallel target.
@vectorize(
    ['f8(i8,i8)', 'f8(f8,f8)'],
    nopython=True,
    target='parallel',
)
def myfunc_parallel(a, b):
    """Scalar kernel for the parallel ``@vectorize`` example.

    Returns ``math.sin(b - a)`` when ``a < b``, ``math.cos(a - b)`` when
    ``a > b``, and ``0`` otherwise.
    """
    if a < b:
        return math.sin(b - a)
    if a > b:
        return math.cos(a - b)
    return 0
    
%timeit myfunc_parallel(x, y)

6.31 ms ± 701 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
