In [103]:
import numpy as np
import pandas as pd

# 生成数据

In [104]:
# Seed NumPy's global RNG so the synthetic data set (and therefore every
# result compared further down) is identical after Restart Kernel -> Run All.
np.random.seed(0)
# y: binary target drawn from {0, 1}; x: categorical code drawn from 0..9.
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
# Place the two column vectors side by side: column 'y' first, then 'x'.
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

## version_1

In [3]:
def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean, brute-force reference implementation.

    For every row, the mean of column ``y_name`` is taken over all *other*
    rows that share the row's ``x_name`` value. The group-by is recomputed
    from scratch for each row, so this is intentionally the slowest variant;
    it is the baseline the faster versions are compared against.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns. Any index is accepted; rows are
        addressed by position.
    y_name : str
        Name of the target column to average.
    x_name : str
        Name of the categorical column to group by.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(data)``. An entry is NaN when the row is
        the only member of its category (there is nothing left to average).
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    x_values = data[x_name].to_numpy()
    positions = np.arange(n_rows)
    for i in range(n_rows):
        # Drop row i by position (not by index label) so the function does
        # not depend on the frame having a default 0..n-1 index.
        others = data.iloc[positions != i]
        group_means = others.groupby(x_name)[y_name].mean()
        # .get returns a scalar, or NaN when the category vanished after
        # removing row i (i.e. the row was a singleton group).
        result[i] = group_means.get(x_values[i], np.nan)
    return result

### 测试

In [88]:
%%timeit 
target_mean_v1(data, 'y', 'x')

23.6 s ± 25.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## version_2

In [5]:
def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean using two passes over the DataFrame.

    Pass 1 accumulates, per ``x_name`` category, the sum of ``y_name`` and
    the number of rows. Pass 2 derives each row's value as
    ``(category_sum - own_y) / (category_count - 1)``, which avoids the
    per-row group-by of the brute-force version.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns. Any index is accepted; cells are
        read by position.
    y_name : str
        Name of the target column to average.
    x_name : str
        Name of the categorical column to group by.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(data)``. An entry is NaN when the row is
        the only member of its category.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    # Resolve column positions once so cells can be read with .iat, which is
    # positional and therefore independent of the frame's index labels.
    x_col = data.columns.get_loc(x_name)
    y_col = data.columns.get_loc(y_name)
    value_dict = dict()
    count_dict = dict()
    for i in range(n_rows):
        key = data.iat[i, x_col]
        target = data.iat[i, y_col]
        if key not in value_dict:
            value_dict[key] = target
            count_dict[key] = 1
        else:
            value_dict[key] += target
            count_dict[key] += 1
    for i in range(n_rows):
        key = data.iat[i, x_col]
        n_others = count_dict[key] - 1
        if n_others == 0:
            # Singleton category: no other rows to average over.
            result[i] = np.nan
        else:
            result[i] = (value_dict[key] - data.iat[i, y_col]) / n_others
    return result

### 测试

In [92]:
%%timeit 
target_mean_v2(data, 'y', 'x')

302 ms ± 822 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [96]:
# Cross-check v2 against the brute-force v1 reference: the Euclidean norm of
# the element-wise difference prints 0.0 when both outputs agree exactly.
result1 = target_mean_v1(data, 'y', 'x')
result2 = target_mean_v2(data, 'y', 'x')
is_eq = np.linalg.norm(result2 - result1)
print(is_eq)

0.0


## version_4 群内同学的方法

In [9]:
from collections import defaultdict
def target_mean_v4(data, y_name, x_name):
    """Leave-one-out target mean computed on the raw NumPy column values.

    Same two-pass scheme as the dictionary version (per-category sum and
    count, then ``(sum - own_y) / (count - 1)`` per row), but the columns are
    pulled out of the DataFrame once and iterated with ``zip`` instead of
    doing a DataFrame cell lookup per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.
    y_name : str
        Name of the target column to average.
    x_name : str
        Name of the categorical column to group by.

    Returns
    -------
    list
        One float per row, in row order. An entry is NaN when the row is the
        only member of its category.
    """
    X = data[x_name].values
    Y = data[y_name].values
    # defaultdict(int) starts every unseen category at 0.
    value_dict = defaultdict(int)
    count_dict = defaultdict(int)
    for x, y in zip(X, Y):
        value_dict[x] += y
        count_dict[x] += 1
    # Guard the singleton case explicitly instead of dividing by zero.
    return [
        (value_dict[x] - y) / (count_dict[x] - 1) if count_dict[x] > 1 else np.nan
        for x, y in zip(X, Y)
    ]

### 测试

In [10]:
%%timeit 
target_mean_v4(data, 'y', 'x')

7.66 ms ± 9.38 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [98]:
# Cross-check v4 against result1 (the v1 reference computed in an earlier
# cell); a printed norm of 0.0 means the outputs agree exactly.
result4 = target_mean_v4(data, 'y', 'x')
is_eq = np.linalg.norm(result4 - result1)
print(is_eq)

0.0


## version_5 使用cython + unordered_map

In [11]:
%load_ext cython

In [56]:
%%cython --cplus 

import numpy as np
cimport numpy as cnp
from libcpp.unordered_map cimport unordered_map

# 最好每个都加上类型
cpdef cnp.ndarray[double] target_mean_v5(cnp.ndarray[long,ndim=2] data):
    """Leave-one-out target mean using C++ ``unordered_map`` accumulators.

    Parameters
    ----------
    data : 2-D integer ndarray
        Column 0 holds the target ``y``, column 1 the category code ``x``.
        The element type must match the ``long`` buffer declaration (the
        notebook produces it with ``astype(np.int_)``).

    Returns
    -------
    numpy.ndarray of double, length ``data.shape[0]``
        ``(category_sum - own_y) / (category_count - 1)`` per row; NaN when
        the row is the only member of its category.
    """
    # get row numbers
    cdef Py_ssize_t n = data.shape[0]
    # typed loop index so the loops compile to plain C loops
    cdef Py_ssize_t i
    cdef long n_others
    # result to return
    cdef cnp.ndarray[double] result = np.zeros(n, dtype=np.double)
    # get y and x
    cdef cnp.ndarray[long] y = data[:,0]
    cdef cnp.ndarray[long] x = data[:,1]
    # per-category running sum of y and row count
    cdef unordered_map[long, long] value_map
    cdef unordered_map[long, long] count_map

    for i in range(n):
        # operator[] value-initialises a missing key to 0, so no explicit
        # find()/insert branch is required.
        value_map[x[i]] += y[i]
        count_map[x[i]] += 1
    for i in range(n):
        n_others = count_map[x[i]] - 1
        if n_others == 0:
            # Singleton category: nothing left to average.
            result[i] = np.nan
        else:
            # Cast first so the division is floating-point regardless of
            # the language_level the cell is compiled with.
            result[i] = <double>(value_map[x[i]] - y[i]) / n_others
    return result

### 测试

In [57]:
# The Cython versions take a 2-D integer ndarray (column 0 = y, column 1 = x)
# rather than a DataFrame, so convert once up front.
data2 = data.values.astype(np.int_)

In [58]:
%%timeit
target_mean_v5(data2)

527 µs ± 145 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [100]:
# Cross-check v5 against result1 (the v1 reference computed in an earlier
# cell); a printed norm of 0.0 means the outputs agree exactly.
result5 = target_mean_v5(data2)
is_eq = np.linalg.norm(result5 - result1)
print(is_eq)

0.0


## version_6 使用cython + numpy

In [81]:
%%cython

import numpy as np
cimport numpy as cnp

# 最好每个都加上类型
cpdef cnp.ndarray[double] target_mean_v6(cnp.ndarray[long,ndim=2] data):
    """Leave-one-out target mean using NumPy arrays as lookup tables.

    The category code ``x`` is used directly as an index into two dense
    arrays (running sum of ``y`` and row count), so codes must be
    non-negative integers; memory use grows with the largest code.

    Parameters
    ----------
    data : 2-D integer ndarray
        Column 0 holds the target ``y``, column 1 the category code ``x``.
        The element type must match the ``long`` buffer declaration (the
        notebook produces it with ``astype(np.int_)``).

    Returns
    -------
    numpy.ndarray of double, length ``data.shape[0]``
        ``(category_sum - own_y) / (category_count - 1)`` per row; NaN when
        the row is the only member of its category.

    Raises
    ------
    ValueError
        If any category code is negative.
    """
    # get row numbers
    cdef Py_ssize_t n = data.shape[0]
    # typed loop index so the loops compile to plain C loops
    cdef Py_ssize_t i
    cdef Py_ssize_t n_bins = 0
    cdef long n_others
    # result to return
    cdef cnp.ndarray[double] result = np.zeros(n, dtype=np.double)
    # get y and x
    cdef cnp.ndarray[long] y = data[:,0]
    cdef cnp.ndarray[long] x = data[:,1]
    # lookup tables, allocated below once the required size is known
    cdef cnp.ndarray[long] value_map
    cdef cnp.ndarray[long] count_map

    if n > 0:
        # A negative code would wrap around and alias another category.
        if x.min() < 0:
            raise ValueError("x codes must be non-negative to be used as array indices")
        # Size the tables by the largest code, not by the number of rows,
        # so codes >= n are handled as well.
        n_bins = x.max() + 1
    value_map = np.zeros(n_bins, dtype=np.int_)
    count_map = np.zeros(n_bins, dtype=np.int_)

    for i in range(n):
        value_map[x[i]] += y[i]
        count_map[x[i]] += 1
    for i in range(n):
        n_others = count_map[x[i]] - 1
        if n_others == 0:
            # Singleton category: nothing left to average.
            result[i] = np.nan
        else:
            # Cast first so the division is floating-point regardless of
            # the language_level the cell is compiled with.
            result[i] = <double>(value_map[x[i]] - y[i]) / n_others
    return result

### 测试

In [82]:
%%timeit
target_mean_v6(data2)

66.4 µs ± 66.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [101]:
# Cross-check v6 against result1 (the v1 reference computed in an earlier
# cell); a printed norm of 0.0 means the outputs agree exactly.
result6 = target_mean_v6(data2)
is_eq = np.linalg.norm(result6 - result1)
print(is_eq)

0.0


## version_7 使用cython + 并行

In [86]:
%%cython

import numpy as np
cimport numpy as cnp
cimport cython
from cython.parallel import prange

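# Best practice: give every variable (including the loop index) a C type.
# Untyped indexing inside a nogil block fails to compile with:
#   "Indexing Python object not allowed without gil"

# Without the decorators below Cython warns: "Use boundscheck(False) for faster access"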
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef cnp.ndarray[double] target_mean_v7(cnp.ndarray[long,ndim=2] data):
    """Leave-one-out target mean with a ``prange`` output loop.

    The category code ``x`` is used directly as an index into two dense
    lookup arrays (running sum of ``y`` and row count). Bounds checking is
    disabled for speed, so the codes are validated up front.

    Only the second loop uses ``prange``: every iteration writes its own
    ``result[i]``. The accumulation loop stays serial because several rows
    update the same table slot, which would be a data race across threads.
    NOTE(review): prange only runs multi-threaded when the cell is compiled
    with OpenMP flags; confirm the build options if parallelism matters.

    Parameters
    ----------
    data : 2-D integer ndarray
        Column 0 holds the target ``y``, column 1 the category code ``x``.
        The element type must match the ``long`` buffer declaration (the
        notebook produces it with ``astype(np.int_)``).

    Returns
    -------
    numpy.ndarray of double, length ``data.shape[0]``
        ``(category_sum - own_y) / (category_count - 1)`` per row; NaN when
        the row is the only member of its category.

    Raises
    ------
    ValueError
        If any category code is negative.
    """
    # get row numbers
    cdef Py_ssize_t n = data.shape[0]
    cdef Py_ssize_t i
    cdef Py_ssize_t n_bins = 0
    cdef long n_others
    # C-level NaN so it can be stored without touching Python in nogil code
    cdef double nan_value = np.nan
    # result to return
    cdef cnp.ndarray[double] result = np.zeros(n, dtype=np.double)
    # get y and x
    cdef cnp.ndarray[long] y = data[:,0]
    cdef cnp.ndarray[long] x = data[:,1]
    # lookup tables, allocated below once the required size is known
    cdef cnp.ndarray[long] value_map
    cdef cnp.ndarray[long] count_map

    if n > 0:
        # With boundscheck/wraparound off an out-of-range code would write
        # outside the tables, so validate and size them before indexing.
        if x.min() < 0:
            raise ValueError("x codes must be non-negative to be used as array indices")
        n_bins = x.max() + 1
    value_map = np.zeros(n_bins, dtype=np.int_)
    count_map = np.zeros(n_bins, dtype=np.int_)

    # Serial accumulation: rows of the same category hit the same slot.
    for i in range(n):
        value_map[x[i]] += y[i]
        count_map[x[i]] += 1
    # Independent per-row output: safe to distribute across threads.
    for i in prange(n, nogil=True):
        n_others = count_map[x[i]] - 1
        if n_others == 0:
            # Singleton category: nothing left to average.
            result[i] = nan_value
        else:
            result[i] = <double>(value_map[x[i]] - y[i]) / n_others
    return result

### 测试

In [87]:
%%timeit
target_mean_v7(data2)

58.6 µs ± 43 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [102]:
# Cross-check v7 against result1 (the v1 reference computed in an earlier
# cell); a printed norm of 0.0 means the outputs agree exactly.
result7 = target_mean_v7(data2)
is_eq = np.linalg.norm(result7 - result1)
print(is_eq)

0.0
