最初的实现逻辑

In [44]:
# coding = 'utf-8'
import numpy as np
import pandas as pd


def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result


In [None]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [45]:
%%timeit
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 23.6 s per loop


使用字典缓存结果，避免每次循环都使用 groupby 聚合

In [46]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [52]:
%%timeit
target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 258 ms per loop


In [54]:
v1_result = target_mean_v1(data, 'y', 'x')
v2_result = target_mean_v2(data, 'y', 'x')
np.linalg.norm(v2_result - v1_result)

0.0

优化数据处理逻辑，减少数组定位调用的次数。

In [None]:
def target_mean_v3(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    row_num = data.shape[0]
    for i in range(row_num):
        x = data.loc[i, x_name]
        if x not in value_dict.keys():
            value_dict[x] = data.loc[i, y_name]
            count_dict[x] = 1
        else:
            value_dict[x] += data.loc[i, y_name]
            count_dict[x] += 1

    for i in range(row_num):
        x = data.loc[i, x_name]
        result[i] = (value_dict[x] - data.loc[i, y_name]) / (count_dict[x] - 1)
    return result

In [None]:
%%timeit
target_mean_v3(data, 'y', 'x')

[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48080808]
[0.52901786 0.46534653 0.51903808 ... 0.55009107 0.46732673 0.48

In [58]:
v3_result = target_mean_v3(data, 'y', 'x')
np.linalg.norm(v3_result - v1_result)

0.0

使用 Cython 优化

In [None]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [64]:
%%cython -a

import numpy as np
cimport numpy as cnp

cpdef cnp.ndarray target_mean_v4(cnp.ndarray data_x, cnp.ndarray data_y):
    cdef int row_num = data_x.shape[0]
    cdef cnp.ndarray result = np.zeros(row_num)
    cdef cnp.ndarray value_arr = np.zeros(10)
    cdef cnp.ndarray count_arr = np.zeros(10)
    cdef int i
    for i in range(row_num):
        value_arr[data_x[i]] += data_y[i]
        count_arr[data_x[i]] += 1
    
    for i in range(row_num):
        result[i] = (value_arr[data_x[i]] - data_y[i]) / (count_arr[data_x[i]] - 1)

    return result

In [65]:
%%timeit
target_mean_v4(data['x'].values, data['y'].values)

100 loops, best of 3: 11 ms per loop


In [66]:
v4_result = target_mean_v4(data['x'].values, data['y'].values)
np.linalg.norm(v4_result - v1_result)

0.0

指定数据类型，提高效率

In [67]:
%%cython -a

import numpy as np
cimport numpy as cnp

cpdef cnp.ndarray[double] target_mean_v5(cnp.ndarray[long, ndim=1] data_x, cnp.ndarray[long, ndim=1] data_y):
    cdef int row_num = data_x.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(row_num, dtype=np.float64)
    cdef cnp.ndarray[long] value_arr = np.zeros(10, dtype=np.int64)
    cdef cnp.ndarray[long] count_arr = np.zeros(10, dtype=np.int64)
    cdef int i
    for i in range(row_num):
        value_arr[data_x[i]] += data_y[i]
        count_arr[data_x[i]] += 1
    
    for i in range(row_num):
        result[i] = (value_arr[data_x[i]] - data_y[i]) / (count_arr[data_x[i]] - 1)

    return result

In [77]:
%%timeit
target_mean_v5(data['x'].values, data['y'].values)

The slowest run took 4.29 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 40.5 µs per loop


In [76]:
v5_result = target_mean_v5(data['x'].values, data['y'].values)
np.linalg.norm(v5_result - v1_result)

0.0

关闭 boundscheck 和 wraparound，使用 prange 多线程加速

In [78]:
%%cython -a

import numpy as np
cimport numpy as cnp
from cython.parallel import prange
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v6(cnp.ndarray[long, ndim=1] data_x, cnp.ndarray[long, ndim=1] data_y):
    cdef int row_num = data_x.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(row_num, dtype=np.float64)
    cdef cnp.ndarray[long] value_arr = np.zeros(10, dtype=np.int64)
    cdef cnp.ndarray[long] count_arr = np.zeros(10, dtype=np.int64)
    cdef int i
    for i in prange(row_num, nogil=True):
        value_arr[data_x[i]] += data_y[i]
        count_arr[data_x[i]] += 1
    
    for i in prange(row_num, nogil=True):
        result[i] = (value_arr[data_x[i]] - data_y[i]) / (count_arr[data_x[i]] - 1)

    return result

In [79]:
#@title 默认标题文本
%%timeit
target_mean_v6(data['x'].values, data['y'].values)

The slowest run took 69.05 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 39.3 µs per loop


In [75]:
v6_result = target_mean_v6(data['x'].values, data['y'].values)
np.linalg.norm(v6_result - v1_result)

0.0