In [1]:
import numpy as np
import pandas as pd

In [3]:
    y = np.random.randint(2, size=(5000, 1))
    x = np.random.randint(10, size=(5000, 1))
    data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [4]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [5]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [8]:
%%timeit
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 23.6 s per loop


In [7]:
%%timeit
target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 263 ms per loop


In [13]:
%load_ext Cython

%%cython --cplus

import cython
cimport cython
import numpy as np
cimport numpy as np
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v4(int[:] x, int [:] y):
    cdef int size = 10
    cdef int n = x.shape[0]
    cdef int[:] count = np.zeros(size).astype(np.intc)
    cdef int[:] sum = np.zeros(size).astype(np.intc)
    cdef double[:] result = np.zeros(n)
    
    cdef int i = 0

    for i in prange(n, nogil = True):
      sum[x[i]] += y[i]
      count[x[i]] += 1
        
        
    for i in prange(n, nogil = True):
      result[i] = (sum[x[i]] - y[i]) / (count[x[i]] - 1)
    
    return result

In [15]:
%%timeit
target_mean_v4(data['x'].values.astype(np.intc), data['y'].values.astype(np.intc))

The slowest run took 13.57 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 50.4 µs per loop
