
**目标：将python语言全部转为Cython语言，不使用python的数据类型，不使用Python的feature**

**一共实现了3个版本，相关说明如下：**

**V7版本说明：**

1、将DataFrame转为memoryview，memoryview的速率比np.ndarray要快很多

2、使用C++的map代替了dict，也可以用unordered_map，经测试map较快

3、将结果保存在memoryview中，比保存在vector中要快一些

4、如果不考虑异常数据，也可以用两个一维数组实现map的功能，用一维数组的话可以省略if判断的过程

**V8版本说明：**

1、在V7版本的基础上，实现了并行

2、将此处的map转为两个一维数据实现，避免if else过程出现race condition的问题，但是这此过程使用并行，会使结果变慢

3、选择合适的位置进行并行，并行可能导致程序变慢，需要注意

**V9版本说明：**

1、在V7版本的基础上，做了多种异常判断

2、对于数据的输入类型进行判断，当输入为的dataframe中数据为int或float时，分别进行处理

3、对于data中的类别(x)，在使用map前，转换为string，避免np.nan等问题，同时考虑了空值的异常处理。此时不能使用两个一组数组代替map，不然索引会出问题

4、由于增加了异常判断，执行时间大大增加




In [1]:
%load_ext Cython

In [2]:
import numpy as np
import pandas as pd

def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [3]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

**V7版本说明：** 

目标：将python语言全部转为Cython语言，不使用python的数据类型，不使用Python函数

举措：

1、将DataFrame转为memoryview，memoryview的速率比np.ndarray要快很多

2、使用C++的map代替了dict，也可以用unordered_map，经测试map较快

3、将结果保存在memoryview中，比保存在vector中要快一些

4、如果不考虑异常数据，也可以用两个一维数组实现map的功能，用一维数组的话可以省略if判断的过程

In [60]:
%%cython  --cplus 
import numpy as np
cimport numpy as np
import pandas as pd
from libcpp.unordered_map cimport unordered_map
cimport cython
from libcpp.map cimport map


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7(data,int y,int x):
  #cdef np.ndarray[double, ndim=2, mode='fortran'] data_c = np.asfortranarray(data, dtype=np.float64)
  cdef long[:,:] data_c=data.values
  cdef int i=0
  cdef int start=0
  cdef int shape=data_c.shape[0]
  cdef int step=1

  cdef map[int, float] m_value
  cdef map[int, int] m_count
  cdef double[:] result_c = np.zeros(shape, dtype=np.float64)

  for i from start <= i < shape by step:
    if m_value.count(data_c[i, x]):
      m_value[data_c[i, x]] += data_c[i, y]
      m_count[data_c[i, x]] += 1
    else:
      m_value[data_c[i, x]] = data_c[i, y]
      m_count[data_c[i, x]] = 1
  for i from start <= i < shape by step:
      result_c[i] = (m_value[data_c[i, x]] - data_c[i, y]) / (m_count[data_c[i, x]] - 1)
  return result_c

**V8版本说明：** 

1、在V7版本的基础上，实现了并行

2、将此处的map转为两个一维数据实现，避免if else过程出现race condition的问题，但是这此过程使用并行，会使结果变慢

3、选择合适的位置进行并行，并行可能导致程序变慢，需要注意

In [59]:
%%cython --cplus  --compile-args=-fopenmp  --link-args=-fopenmp 
import numpy as np
cimport numpy as np
import pandas as pd
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.map cimport map
import cython
cimport cython
from cython.parallel cimport prange,parallel

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v8(data,int y,int x):
  #cdef np.ndarray[double, ndim=2, mode='fortran'] data_c = np.asfortranarray(data, dtype=np.float64)
  cdef long[:,:] data_c=data.values  #memoryview速度超过cnp
  cdef int i=0
  cdef int start=0
  cdef int shape=data_c.shape[0]
  cdef int step=1    # range可以直接用，range的速度稍稍超过 for i from start<=i <shape by step 种写法，不知为什么

  cdef double[:] m_value=np.zeros(10)
  cdef double[:] m_count=np.zeros(10)  
  cdef double[:] result_c = np.zeros(shape, dtype=np.float)   #视图比np.ndarray快一些


  for i from start <= i < shape by step:  #此处并行，会导致结果变慢2us，故不在此处并行
      m_value[data_c[i, x]] += data_c[i, y]
      m_count[data_c[i, x]] += 1
  for i in prange(shape,nogil=True):  
    result_c[i]=(m_value[data_c[i, x]] - data_c[i, y]) / (m_count[data_c[i, x]] - 1)
  return result_c

**V9版本说明：** 

1、在V7版本的基础上，做了多种异常判断

2、对于数据的输入类型进行判断，当输入为的dataframe中数据为int或float时，分别进行处理

3、对于data中的类别(x)，在使用map前，转换为string，避免np.nan等问题，同时考虑了空值的异常处理。此时不能使用两个一组数组代替map，不然索引会出问题

4、由于增加了异常判断，执行时间大大增加

In [58]:
%%cython  --cplus  --compile-args=-fopenmp  --link-args=-fopenmp
import numpy as np
cimport numpy as np
import pandas as pd
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.map cimport map
import cython
cimport cython
from cython.parallel cimport prange,parallel
from libcpp.string cimport string

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v9(data,int y,int x):


  #进行data的类型判断，当data中存在np.nan时，类型为double，当全为Int时，类型为long,需要转换一下，如果是object类型，可能需要单独写一个循环来处理（可以和异常值处理写在一起，暂不实现）
  if data.iloc[:,x].dtypes==np.int:
    data.iloc[:,x]=data.iloc[:,x].astype(np.float32)
  
  cdef float[:] data_x=data.iloc[:,1].values
  cdef long[:] data_y=data.iloc[:,0].values  #memoryview速度超过cnp
  cdef int i=0
  cdef int start=0
  cdef int shape=data_y.shape[0]
  cdef int step=1    # range可以直接用，range的速度稍稍超过 for i from start<=i <shape by step 写法，不知为什么

  cdef map[string, float] m_value
  cdef map[string, int] m_count  #速度比unordered_map快，unordered_map比dict快，dict比网上找的一些字典快
  cdef double[:] result_c = np.zeros(shape, dtype=np.float)   #视图比np.ndarray快一些
  cdef vector[string] value_array     #不如视图快
  cdef string key
  cdef int value_y

  for i from start <= i < shape by step:

    # x 异常值处理，空值:赋值'value_null',np.nan:直接转str，其它的异常情况暂不考虑，需要根据数据进行针对性处理，加了异常处理后，500行数据的速度下降了200us
    if data_x[i]:
      value_array.push_back(str.encode(str(data_x[i])))   #不知道这样转是否合适，但是没查到其它的办法
    else:
      value_array.push_back(b'value_null')

    key=value_array[i]
    value_y=data_y[i]
    if m_value.count(key):
      m_value[key] += value_y
      m_count[key] += 1
    else:
      m_value[key] = value_y
      m_count[key] = 1
  for i from start <= i < shape by step:  
    result_c[i]=(m_value[value_array[i]] - data_y[i]) / (m_count[value_array[i]] - 1)
  return result_c



In [7]:
import time
def main():
  y = np.random.randint(2, size=(500, 1))
  x = np.random.randint(10, size=(500, 1))
  data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
  
  t1=time.time()
  result_1 = target_mean_v2(data, 'y', 'x')
  t2=time.time()
  result_2 = target_mean_v7(data, 0, 1)
  t3=time.time()
  result_3 = target_mean_v8(data, 0, 1)
  t4=time.time()
  result_4 = target_mean_v9(data, 0, 1)
  t5=time.time()

  diff7 = np.linalg.norm(result_1 - result_2)
  diff8 = np.linalg.norm(result_1 - result_3)
  diff9 = np.linalg.norm(result_1 - result_4)
  print('V2：不做异常处理，不并行：the executed time is:{}'.format(round(t2-t1,6)))
  print('V7：不做异常处理，不并行：the executed time is:{}, diff:{}'.format(round((t3-t2),6),diff7))
  print('V8：不做异常处理，  并行：the executed time is:{}, diff:{}'.format(round((t4-t3),6),diff8))
  print('V9：进行异常处理，不并行：the executed time is:{}, diff:{}'.format(round((t5-t4),6),diff9))
  

In [47]:
if __name__ == '__main__':
    main()


V2：不做异常处理，不并行：the executed time is:0.031983
V7：不做异常处理，不并行：the executed time is:0.000133, diff:3.336299622452143e-07
V8：不做异常处理，  并行：the executed time is:9.7e-05, diff:0.0
V9：进行异常处理，不并行：the executed time is:0.001629, diff:3.336299622452143e-07


In [51]:
  y = np.random.randint(2, size=(500, 1))
  x = np.random.randint(10, size=(500, 1))
  data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [21]:
%%timeit
#原始版本
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 2.41 s per loop


In [10]:
%%timeit
#作业中原代码执行时长
target_mean_v2(data, 'y', 'x')

10 loops, best of 3: 26.8 ms per loop


In [11]:
%%timeit
#默认所有值合法，不进行异常值处理，不并行
target_mean_v7(data,0, 1)

The slowest run took 4.52 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 37.7 µs per loop


In [56]:
%%timeit
#默认所有值合法，不进行异常值处理，并行
target_mean_v8(data,0, 1)

The slowest run took 22.24 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 27.7 µs per loop


In [49]:
%%timeit
#进行多项异常检测和转换
target_mean_v9(data,0, 1)

1000 loops, best of 3: 477 µs per loop
