In [1]:
%load_ext Cython

In [2]:
import numpy as np
import pandas as pd

def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [3]:
y = np.random.randint(2, size=(500, 1))
x = np.random.randint(10, size=(500, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [4]:
%timeit target_mean_v1(data, 'y', 'x')

1.34 s ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%cython
cimport cython
cimport numpy as cnp
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
def target_mean_v2(cnp.ndarray[long] arr_y, cnp.ndarray[long] arr_x):
    cdef int arr_len = 0
    arr_len = len(arr_x)
    cdef cnp.ndarray[double] result = np.zeros(arr_len)
    value_dict = dict()
    count_dict = dict()
    cdef int i
    for i from 0 <= i < arr_len by 1:
        x1 = arr_x[i]
        y1 = arr_y[i]
        if x1 not in value_dict.keys():
            value_dict[x1] = y1
            count_dict[x1] = 1
        else:
            value_dict[x1] += y1
            count_dict[x1] += 1        
    for i from 0 <= i < arr_len by 1:
        x1 = arr_x[i]
        y1 = arr_y[i]        
        result[i] = (value_dict[x1] - y1) / (count_dict[x1] - 1)
    return result

In [11]:
arr_y = np.squeeze(y)
arr_x = np.squeeze(x)
%timeit target_mean_v2(arr_y, arr_x)

114 µs ± 658 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [13]:
result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v2(arr_y, arr_x)

diff = np.linalg.norm(result_1 - result_2)
print(diff)

0.0
