## data_generate

In [41]:
import copy
import random

import numpy as np
import pandas as pd
# Synthetic demo data: 500 rows with a uniform float `feature` in [0, 1) and a 0/1 `label`.
# A private, seeded generator makes Restart & Run All reproduce the same rows
# without touching the global `random` state.
rng = random.Random(42)
data = pd.DataFrame(
    [[rng.random(), rng.randint(0, 1)] for _ in range(500)],
    columns=['feature', 'label'],
)

## 分箱

In [42]:
# Work on an independent copy so the generated `data` frame stays untouched.
df = data.copy()

In [43]:
# Inspect the working copy before the feature column is binned.
df

Unnamed: 0,feature,label
0,0.734970,1
1,0.665404,1
2,0.082067,0
3,0.496671,1
4,0.534150,1
...,...,...
495,0.893421,1
496,0.573376,0
497,0.750586,1
498,0.386089,1


In [44]:
# Number of quantile bins requested from pd.qcut in the following cell.
bins=10

In [45]:
# Replace each raw feature value with the quantile interval (bin) it falls into.
df['feature'] = pd.qcut(df['feature'], bins)

In [46]:
# Count rows per (bin, label value); from here on `df` is the bin x label count table.
df = pd.crosstab(df['feature'], df['label'])

In [52]:
# Give the label-value columns readable names: 0 -> 'neg', 1 -> 'pos'.
df = df.rename({0: 'neg', 1: 'pos'}, axis='columns')

## calculate_woe_iv

In [57]:
# Class totals across all bins, taken once before the derived columns are added.
pos_total = df['pos'].sum()
neg_total = df['neg'].sum()
# Share of positives (Y=1) / negatives (Y=0) that fall in each bin. The +1 keeps every
# rate above zero so the log below stays finite for a bin with no rows of one class.
df['pos_rate'] = (df['pos'] + 1) / pos_total
df['neg_rate'] = (df['neg'] + 1) / neg_total
df['woe'] = np.log(df['pos_rate'] / df['neg_rate'])  # per-bin weight of evidence
df['iv'] = (df['pos_rate'] - df['neg_rate']) * df['woe']  # per-bin information value term
  

<IPython.core.display.Javascript object>

In [59]:
# Per-bin WOE/IV table of the feature after binning (for example, the feature could be age).
df

label,neg,pos,pos_rate,neg_rate,woe,iv
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(0.0033499999999999997, 0.109]",24,26,0.116379,0.093284,0.221211,0.005109
"(0.109, 0.204]",29,21,0.094828,0.11194,-0.165905,0.002839
"(0.204, 0.295]",29,21,0.094828,0.11194,-0.165905,0.002839
"(0.295, 0.409]",29,21,0.094828,0.11194,-0.165905,0.002839
"(0.409, 0.524]",23,27,0.12069,0.089552,0.2984,0.009291
"(0.524, 0.601]",24,26,0.116379,0.093284,0.221211,0.005109
"(0.601, 0.698]",25,25,0.112069,0.097015,0.14425,0.002172
"(0.698, 0.765]",31,19,0.086207,0.119403,-0.325754,0.010814
"(0.765, 0.896]",29,21,0.094828,0.11194,-0.165905,0.002839
"(0.896, 0.997]",25,25,0.112069,0.097015,0.14425,0.002172


## 整合版

In [None]:
import numpy as np
import pandas as pd
import copy


def calculate_woe_iv(dataset, smoothing=1):
    """
    Compute the per-bin Weight of Evidence (WOE) and Information Value (IV)
    of an already-binned feature.

    :param dataset: DataFrame with one row per bin and two count columns:
        'pos' (rows with label 1) and 'neg' (rows with label 0). A typical
        input is the result of cut_width() after renaming its 0/1 columns
        to 'neg'/'pos'. The input is not modified.
    :param smoothing: number added to every bin count before it is divided by
        the class total, so that an empty bin does not yield log(0) or a
        division by zero. The default of 1 matches the previously hard-coded
        value. Pass 0 to disable smoothing.
    :return:
        iv: total IV, i.e. the sum of the per-bin 'iv' column
        df: copy of dataset with the columns 'pos_rate', 'neg_rate', 'woe'
            and 'iv' appended
    :raises KeyError: if dataset lacks the 'pos' or 'neg' column

    Note: the smoothing term is added to the numerators only, so with
    smoothing > 0 the 'pos_rate' / 'neg_rate' columns sum to slightly more
    than 1.

    Example
    -----------------------------------------------------------------
    >>> counts = pd.DataFrame({'neg': [1, 3], 'pos': [3, 1]}, index=['low', 'high'])
    >>> iv, woe_iv_df = calculate_woe_iv(dataset=counts)
    >>> round(float(iv), 6)
    0.693147
    >>> woe_iv_df['woe'].round(6).tolist()
    [0.693147, -0.693147]
    """
    missing = [col for col in ('pos', 'neg') if col not in dataset.columns]
    if missing:
        raise KeyError(
            "calculate_woe_iv expects count columns 'pos' and 'neg'; missing: %s" % missing
        )

    df = dataset.copy()
    # Class totals over all bins; both rates are normalised by these.
    pos_total = df['pos'].sum()
    neg_total = df['neg'].sum()
    df['pos_rate'] = (df['pos'] + smoothing) / pos_total  # share of positives in each bin
    df['neg_rate'] = (df['neg'] + smoothing) / neg_total  # share of negatives in each bin
    df['woe'] = np.log(df['pos_rate'] / df['neg_rate'])  # per-bin WOE
    df['iv'] = (df['pos_rate'] - df['neg_rate']) * df['woe']  # per-bin IV contribution
    iv = df['iv'].sum()
    return iv, df
def cut_width(dataset, inputcol, labelcol='label', bins=10):
    """
    Bin one column and count the label values that fall into each bin.

    NOTE: despite the function name, the binning is done with pd.qcut, i.e.
    quantile-based bins holding roughly equal numbers of rows, not bins of
    equal width.

    :param dataset: DataFrame holding the data; it is copied, not modified
    :param inputcol: name of the column to bin
    :param labelcol: name of the target (label) column
    :param bins: passed to pd.qcut as `q` (e.g. a positive int number of bins)
    :return:
        DataFrame from pd.crosstab: one row per bin interval of `inputcol`,
        one column per distinct value of `labelcol`, cells holding row counts

    pd.qcut is called with its default settings, so data with many tied
    values may fail to yield unique bin edges (see the pandas docs).

    Example
    -----------------------------------------------------------------
    >>> data = pd.DataFrame({'feature': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
    ...                      'label': [0, 0, 0, 1, 1, 1, 1, 0]})
    >>> counts = cut_width(data, inputcol='feature', labelcol='label', bins=2)
    >>> list(counts.columns)
    [0, 1]
    >>> counts.values.tolist()
    [[3, 1], [1, 3]]
    """
    binned_frame = dataset.copy()
    # Overwrite the raw values with the interval each one belongs to.
    binned_frame[inputcol] = pd.qcut(binned_frame[inputcol], bins)
    bin_of_row = binned_frame[inputcol]
    label_of_row = binned_frame[labelcol]
    return pd.crosstab(bin_of_row, label_of_row)