# 分箱--chimerge算法

a. chimerge算法是一种有监督的，将连续值离散化的分箱算法

b. 基于卡方检验，每次合并卡方值最小（即类别分布最相近）的一对相邻区间

In [None]:
from time import ctime
'''读取数据'''
def read(file):
    """Read a comma-separated text file into a list of rows.

    Args:
        file: Path of the file to read.

    Returns:
        A list with one entry per non-empty line; each entry is the list of
        strings obtained by splitting that line on ','. Fields are returned
        as strings, no numeric conversion is done.
    """
    # The context manager closes the handle even if reading raises.
    with open(file, 'r') as fp:
        stripped = (line.strip('\n') for line in fp)
        return [line.split(',') for line in stripped if line != '']

In [336]:
from time import ctime
'''将第i个特征和类标签组合起来，如：[[0.2,'Iris-setosa'],……],最后4个连续值属性数据分别成列加入到数组中，如上所示'''
def split(instances, i, label_index=4):
    """Pair the i-th feature of every instance with its class label.

    Args:
        instances: Iterable of indexable rows (feature values plus a label).
        i: Index of the feature column to extract.
        label_index: Index of the class-label column. Defaults to 4, the
            position that was previously hard-coded.

    Returns:
        A list of two-element lists ``[feature_value, label]``, in the same
        order as ``instances``.
    """
    return [[line[i], line[label_index]] for line in instances]

'''统计每个属性值所具有的实例数量，如[[4.3,'Iris-setosa',1],[4.4,'Iris-setosa',3],……]'''
def count(log):
    """Count how many times each ``[value, label]`` record occurs.

    Args:
        log: List of ``[value, label]`` lists. It is sorted in place,
            ascending by value.

    Returns:
        A list of ``[value, label, n]`` lists, ordered by value and, within
        one value, by the order in which each label is first seen.
    """
    # Ascending by attribute value (column 0); the sort is in place.
    log.sort(key=lambda rec: rec[0])
    # Tally every distinct record explicitly. The list is ordered by value
    # only, so equal records are not guaranteed to be adjacent; skipping ahead
    # by the count would drop or repeat records whose labels interleave.
    tally = {}
    for rec in log:
        key = tuple(rec)
        tally[key] = tally.get(key, 0) + 1
    return [list(key) + [n] for key, n in tally.items()]
        
'''log_cnt是形如：[2.2, 'versicolor'，3]，统计对某个属性值，对于三个类所含有的数量，
    {4.4:[0,1,3],……} 属性为4.4的对三个类实例数分别0，1，3'''
def build(log_cnt, classes=('setosa', 'versicolor', 'virginica')):
    """Collect per-class counts for every attribute value.

    Args:
        log_cnt: Iterable of ``[value, label, n]`` records.
        classes: Ordered class labels; the position of a label in this
            sequence is its position in each count list. Defaults to the
            three iris class names.

    Returns:
        A list of ``(value, counts)`` tuples sorted by value, where
        ``counts[k]`` is the count recorded for ``classes[k]`` (0 if that
        class never occurs with the value).

    Raises:
        TypeError: If a record carries a label that is not in ``classes``.
    """
    slot = {label: idx for idx, label in enumerate(classes)}
    log_dict = {}
    for record in log_cnt:
        value, label, n = record[0], record[1], record[2]
        if label not in slot:
            raise TypeError('Data Exception')
        # Assignment (not accumulation): each (value, label) pair is expected
        # to appear once in log_cnt.
        log_dict.setdefault(value, [0] * len(classes))[slot[label]] = n
    return sorted(log_dict.items())

def collect(instances,i):
    """Run the split -> count -> build pipeline for feature column i.

    Returns whatever build() returns for the counted [value, label] pairs of
    column i.
    """
    pairs = split(instances, i)
    return build(count(pairs))

def combine(a, b):
    """Merge two adjacent intervals into one.

    Example: ``combine((4.4, [3, 1, 0]), (4.5, [1, 0, 2]))`` gives
    ``(4.4, [4, 1, 2])``. The merged interval keeps the representative value
    of ``a``.

    Args:
        a: ``(value, counts)`` record of the first interval.
        b: ``(value, counts)`` record of the second interval; ``b[1]`` must
            have at least as many entries as ``a[1]``.

    Returns:
        A new record of the same container type as ``a`` whose counts are the
        element-wise sums. Neither ``a`` nor ``b`` is modified.
    """
    # Build a fresh count list: slicing ``a`` only copies the outer container,
    # so adding in place would silently change the caller's counts.
    merged_counts = [cnt + b[1][i] for i, cnt in enumerate(a[1])]
    c = list(a)
    c[1] = merged_counts
    return tuple(c) if isinstance(a, tuple) else c

def chi2(A):
    """Compute the chi-square statistic of a contingency table.

    Args:
        A: List of rows of class counts, one row per interval, e.g.
            ``[[3, 1, 0], [1, 0, 2]]`` for two adjacent intervals and three
            classes. All rows are read up to the length of the first row.

    Returns:
        The sum over all cells of ``(observed - expected) ** 2 / expected``,
        where ``expected = row_total * column_total / grand_total``. Cells
        whose expected count is 0 are skipped. Returns 0.0 when the table
        holds no instances at all.
    """
    k = len(A[0])  # number of classes (columns)
    # Row totals: instances per interval.
    row_totals = [sum(row[j] for j in range(k)) for row in A]
    # Column totals: instances per class.
    col_totals = [sum(row[j] for row in A) for j in range(k)]
    total = sum(col_totals)
    if total == 0:
        # Nothing observed, so there is no deviation (and no division by zero).
        return 0.0
    res = 0.0
    for i, row in enumerate(A):
        for j in range(k):
            expected = 1.0 * row_totals[i] * col_totals[j] / total
            if expected != 0:
                res += (row[j] - expected) ** 2 / expected
    return res

def ChiMerge(log_tuple, max_interval):
    """Merge adjacent intervals bottom-up until at most max_interval remain.

    On every pass the chi-square value of each adjacent pair is computed and
    every pair whose value equals the minimum is merged. Because tied pairs
    are all merged in the same pass, the result can contain fewer than
    ``max_interval`` intervals.

    Args:
        log_tuple: List of ``(value, counts)`` records ordered by value.
            The list is modified in place.
        max_interval: Upper bound on the number of intervals to keep.

    Returns:
        The list of representative values (first element) of the remaining
        intervals.
    """
    merged = object()  # placeholder for slots absorbed into their left neighbour
    num_interval = len(log_tuple)
    # A single interval has no neighbour; the second condition keeps min()
    # from being called on an empty list when max_interval < 1.
    while num_interval > max_interval and num_interval > 1:
        num_pair = num_interval - 1
        # Chi-square of every adjacent pair: (0,1), (1,2), ...
        chi_values = [
            chi2([log_tuple[i][1], log_tuple[i + 1][1]])
            for i in range(num_pair)
        ]
        min_chi = min(chi_values)
        # Walk right-to-left so that a run of tied pairs collapses into the
        # leftmost interval: the right neighbour has already absorbed its own
        # right neighbour by the time it is merged.
        for i in range(num_pair - 1, -1, -1):
            if chi_values[i] == min_chi:
                log_tuple[i] = combine(log_tuple[i], log_tuple[i + 1])
                log_tuple[i + 1] = merged
        # Drop the absorbed slots, keeping the same list object.
        log_tuple[:] = [record for record in log_tuple if record is not merged]
        num_interval = len(log_tuple)
    return [record[0] for record in log_tuple]

def discrete(path, max_interval=6, num_log=4):
    """Run ChiMerge on the leading feature columns and print the split points.

    Args:
        path: Either the dataset itself (a sequence of rows, e.g. the array
            built from the iris DataFrame) or a file path string, in which
            case the rows are loaded with read().
            NOTE(review): read() returns every field as a string without
            numeric conversion - confirm before relying on the file branch.
        max_interval: Maximum number of intervals per feature (default 6).
        num_log: Number of feature columns to discretise, starting at
            column 0 (default 4).

    Returns:
        None. One list of split points is printed per feature.
    """
    # Use the argument instead of reaching for a module-level variable.
    instances = read(path) if isinstance(path, str) else path
    for i in range(num_log):
        log_tuple = collect(instances, i)
        split_points = ChiMerge(log_tuple, max_interval)
        print(split_points)

if __name__ == '__main__':
    # Print a timestamp, discretise the dataset `a`, then print a second
    # timestamp. `a` must already exist when this cell runs.
    print('Start:{}'.format(ctime()))
    discrete(a)
    print('End:{}'.format(ctime()))

Start:Fri Aug  7 18:20:32 2020
[4.3, 4.9, 5.0, 5.5, 5.8, 7.1]
[2.0, 2.5, 2.9, 3.0, 3.4, 3.5]
[1.0, 3.0, 4.5, 4.8, 5.0, 5.2]
[0.1, 1.0, 1.4, 1.7, 1.8, 1.9]
End:Fri Aug  7 18:20:32 2020


## 实例case

In [1]:
import numpy as np
from sklearn import datasets
# Load the iris dataset that ships with scikit-learn.
iris = datasets.load_iris()

In [115]:
# Display the class names of the iris targets.
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [117]:
import pandas as pd
# Feature values as a DataFrame, one column per iris feature name.
df1 = pd.DataFrame(data = iris.data, columns = iris.feature_names)
# Target codes in a single 'target' column.
df2 = pd.DataFrame(iris.target, columns = ['target'])
df = pd.concat([df1,df2],axis = 1) # concatenate column-wise

In [119]:
# Replace the integer class codes with their names in one assignment on the
# DataFrame itself. Assigning through `df['target'].loc[...]` is chained
# indexing: pandas may write to a temporary copy and emits
# SettingWithCopyWarning. `replace` leaves values that are not 0/1/2 as they
# are, so re-running the cell is harmless.
df['target'] = df['target'].replace({0: 'setosa', 1: 'versicolor', 2: 'virginica'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [120]:
# Convert the DataFrame to a NumPy array; this is the `a` passed to discrete(a).
a = np.array(df)
a # now an array, for array-style access

array([[5.1, 3.5, 1.4, 0.2, 'setosa'],
       [4.9, 3.0, 1.4, 0.2, 'setosa'],
       [4.7, 3.2, 1.3, 0.2, 'setosa'],
       [4.6, 3.1, 1.5, 0.2, 'setosa'],
       [5.0, 3.6, 1.4, 0.2, 'setosa'],
       [5.4, 3.9, 1.7, 0.4, 'setosa'],
       [4.6, 3.4, 1.4, 0.3, 'setosa'],
       [5.0, 3.4, 1.5, 0.2, 'setosa'],
       [4.4, 2.9, 1.4, 0.2, 'setosa'],
       [4.9, 3.1, 1.5, 0.1, 'setosa'],
       [5.4, 3.7, 1.5, 0.2, 'setosa'],
       [4.8, 3.4, 1.6, 0.2, 'setosa'],
       [4.8, 3.0, 1.4, 0.1, 'setosa'],
       [4.3, 3.0, 1.1, 0.1, 'setosa'],
       [5.8, 4.0, 1.2, 0.2, 'setosa'],
       [5.7, 4.4, 1.5, 0.4, 'setosa'],
       [5.4, 3.9, 1.3, 0.4, 'setosa'],
       [5.1, 3.5, 1.4, 0.3, 'setosa'],
       [5.7, 3.8, 1.7, 0.3, 'setosa'],
       [5.1, 3.8, 1.5, 0.3, 'setosa'],
       [5.4, 3.4, 1.7, 0.2, 'setosa'],
       [5.1, 3.7, 1.5, 0.4, 'setosa'],
       [4.6, 3.6, 1.0, 0.2, 'setosa'],
       [5.1, 3.3, 1.7, 0.5, 'setosa'],
       [4.8, 3.4, 1.9, 0.2, 'setosa'],
       [5.0, 3.0, 1.6, 0.

In [57]:
# Pair column 0 with column 4 for every row of `a`, order the pairs by the
# column-0 value (ascending, stable), then show the first pair.
log = [[line[0], line[4]] for line in a]
log.sort(key=lambda pair: pair[0])
log[0]

[4.3, 'setosa']

In [176]:
# Label the first column with its interval using one vectorised pd.cut call
# instead of writing strings row by row into a float column.
# Intervals are left-closed / right-open; everything below 4.9 goes into the
# first interval and everything from 7.1 upwards into the last one.
# NOTE: a value of exactly +inf would not be labelled by the last bin.
_bin_edges = [float('-inf'), 4.9, 5.0, 5.5, 5.8, 7.1, float('inf')]
_bin_labels = ['[4.3, 4.9)', '[4.9, 5.0)', '[5.0, 5.5)',
               '[5.5, 5.8)', '[5.8, 7.1)', '[7.1, inf)']
_first_col = df.columns[0]
df[_first_col] = pd.cut(df[_first_col], bins=_bin_edges, right=False,
                        labels=_bin_labels).astype(object)

In [111]:
# Equal-frequency (quantile) binning: cut the column into 5 quantile bins,
# dropping duplicate bin edges.
pd.qcut(df['sepal length (cm)'],5,duplicates='drop')

0                     (5.0, 5.6]
1      (4.2989999999999995, 5.0]
2      (4.2989999999999995, 5.0]
3      (4.2989999999999995, 5.0]
4      (4.2989999999999995, 5.0]
                 ...            
145                  (6.52, 7.9]
146                  (6.1, 6.52]
147                  (6.1, 6.52]
148                  (6.1, 6.52]
149                   (5.6, 6.1]
Name: sepal length (cm), Length: 150, dtype: category
Categories (5, interval[float64]): [(4.2989999999999995, 5.0] < (5.0, 5.6] < (5.6, 6.1] < (6.1, 6.52] < (6.52, 7.9]]

# IV值

In [290]:
'''df-数据集 df_Xvar-待分析的变量 Yvar-目标值 bin_cols-需要进行分箱的变量,计算IV值的任何一个分箱不能有0'''

def IV(X, Y, label_0='KA', label_1='LA'):
    """Compute the information value (IV) of a grouped variable.

    For every distinct group g of X, with p0(g) the share of ``label_0`` rows
    falling in g and p1(g) the share of ``label_1`` rows falling in g, the
    result is ``sum((p0 - p1) * ln(p0 / p1))``.

    No smoothing is applied: if any group has zero rows for one of the two
    labels, the result is inf or nan.

    Args:
        X: Group/bin value of every row (assumed to be a pandas Series
            aligned with Y).
        Y: pandas Series of target labels.
        label_0: Target label counted as class 0. Defaults to 'KA'.
        label_1: Target label counted as class 1. Defaults to 'LA'.

    Returns:
        The information value as a float.
    """
    groups = np.unique(X)  # computed once instead of on every iteration
    is_0 = Y == label_0
    is_1 = Y == label_1
    N_0 = np.sum(is_0)
    N_1 = np.sum(is_1)
    N_0_group = np.zeros(groups.shape)
    N_1_group = np.zeros(groups.shape)
    for i, group in enumerate(groups):
        in_group = X == group
        N_0_group[i] = Y[in_group & is_0].count()  # label_0 rows in this group
        N_1_group[i] = Y[in_group & is_1].count()  # label_1 rows in this group
    p0 = 1.0 * N_0_group / N_0
    p1 = 1.0 * N_1_group / N_1
    iv = np.sum((p0 - p1) * np.log(p0 / p1))
    return iv

def caliv_batch(df, df_X, Y, bin_cols,n):
    """Compute the IV of every column of df_X and return them as a table.

    Columns listed in bin_cols are first cut into n equal-frequency bins
    (duplicate edges dropped); all other columns are passed to IV() as they
    are. Column data is read from df, column names from df_X.

    Returns a DataFrame with the columns 'Var' (column name) and 'Iv'.
    """
    names = list(df_X.columns)

    def _grouped(col):
        # Equal-frequency binning only for the columns that ask for it.
        if col in bin_cols:
            return pd.qcut(df[col], n, duplicates='drop')
        return df[col]

    ivlist = [IV(_grouped(col), Y) for col in names]
    return pd.DataFrame({'Var': names, 'Iv': ivlist}, columns=['Var', 'Iv'])

## 测试case

In [269]:
# Test data: the first 100 rows of df.
df1 = df[:100]
# Candidate variables: columns 1 to 3.
df1_X = df1.iloc[:,1:4]
# Target: the last column.
Y = df1.iloc[:,-1]
# Columns that get equal-frequency binning before the IV is computed.
bin_cols2 = ['sepal width (cm)','petal length (cm)','petal width (cm)']

In [287]:
# IV table for the columns of df1_X, each cut into 3 equal-frequency bins.
caliv_batch(df1,df1_X,Y,bin_cols2,3)



Unnamed: 0,Var,Iv
0,sepal width (cm),3.878655
1,petal length (cm),inf
2,petal width (cm),inf


In [259]:
# Load the CSV export (gb18030-encoded) into a DataFrame.
# NOTE: the name `re` is also the name of the standard-library regex module.
re = pd.read_csv('archived-data/aeolus-data/20200707/12/25092081-暗投监控逻辑-cost监控-查询23.csv',encoding = 'gb18030')

In [301]:
# Target: column 3 of the loaded table.
Y = re.iloc[:,3]
# Candidate variables: columns 4, 8 and 9, side by side.
re_X = pd.concat([re.iloc[:,4],re.iloc[:,8],re.iloc[:,9]],axis=1)
# Only 'cost' gets equal-frequency binning before the IV is computed.
bin_cols = ['cost']

In [309]:
# IV table for the columns of re_X; 'cost' is cut into 5 equal-frequency bins.
caliv_batch(re,re_X,Y,bin_cols,5)



Unnamed: 0,Var,Iv
0,industry_nm,inf
1,flag,0.028078
2,cost,0.203964
