In [5]:
#导入包
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_auc_score
import math
import numpy as np
from scipy import stats#包括统计工具和随机过程的概率过程
from sklearn.utils.multiclass import type_of_target#可以检查变量类型，连续或二分类等
from sklearn.model_selection import GridSearchCV#网格搜索(grid search)。
from sklearn.ensemble import RandomForestClassifier#随机森林分类器
import warnings
warnings.filterwarnings("ignore")

#加载数据
data = pd.read_csv('data_processed_2.csv',encoding='unicode_escape')  #先前处理好的数据
X = data.drop(['status'],axis=1)
y = data['status']

数据集划分

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2018)

**特征选择：  **
  
 IV的全称是Information Value，中文意思是信息价值，或者信息量  
 
 计算过程  
 1 计算分组的WOE（他表示的是当前这个组中响应的客户和未响应客户的比值，和所有样本中这个比值的差异。这个差异是用这两个比值的比值，再取对数来表示的。WOE越大，这种差异越大，这个分组里的样本响应的可能性就越大，WOE越小，差异越小，这个分组里的样本响应的可能性就越小）  
   
 2 计算分组的IV  
   IV=p(yi)-p(Ni) * WOE  
    
 3 求和整个变量的IV值  

In [14]:
#处理上述特征时, 遇到了IV的极端情况, 响应数为0或未响应数为0。为简单起见, 我们在代码中对极端值进行平滑处理。
def discrete(x):
    # 使用5等分离散化特征
    res  = np.zeros(x.shape)#返回来一个给定形状(shape)和类型的用0填充的数组
    for i in range(5):
        point1 = stats.scoreatpercentile(x,i*20)#stats.scoreatpercentile(x, 50)  得到x在50%处的数值
        point2 = stats.scoreatpercentile(x,(i+1)*20)
        x1 = x[np.where((x>= point1)&(x<=point2))]
        mask = np.in1d(x,x1)#np.in1d(B,A) # 在序列B中寻找与序列A相同的值，并返回一逻辑值（True,False）
        res[mask] = i+1# 将[i, i+1]块内的值标记成i+1
    return res

def woe_single_x(x,y,feature,event=1):
    # event代表预测正例的标签
    event_total = sum(y == event)#所有y=1的用户
    non_event_total = y.shape[-1] - event_total#所有y=0的用户

    iv = 0
    woe_dict = {}
    for x1 in set(x):  # 遍历各个块
        y1 = y.reindex(np.where(x == x1)[0])
        event_count = sum(y1 == event)#当前分组中y=1的用户
        non_event_count = y1.shape[-1] - event_count#当前分组中y=0的用户
        rate_event = event_count / event_total
        rate_non_event = non_event_count / non_event_total
        #极端值
        if rate_event == 0:
            rate_event = 0.0001
            # woei = -20
        elif rate_non_event == 0:
            rate_non_event = 0.0001
            # woei = 20
        woei = math.log(rate_event / rate_non_event)#第i组的WOE值
        woe_dict[x1] = woei
        iv += (rate_event - rate_non_event) * woei#第i组的IV值
    return woe_dict, iv

def woe(X,y,event=1):
    res_woe = []
    iv_dict = {}
    for feature in X.columns:
        x = X[feature].values
        #1.连续特征离散化
        if type_of_target(x) == 'continuous':
            x = discrete(x)
        #2.计算该特征的woe和iv
        woe_dict,iv = woe_single_x(x,y,feature,event)
        iv_dict[feature] = iv
        res_woe.append(woe_dict)
    return iv_dict

iv_dict = woe(X_train,y_train)#字典
iv = sorted(iv_dict.items(),key=lambda x:x[1],reverse=True)#按 IV 值从高到低筛选  列表类型
print(iv)#iv含有两列,第一列是特征名，第二列是IV指
L = []
for item in iv:
    if item[1] > 0.3:#选出IV值大于0.3的特征
        L.append(item[0])

print("选择出来的iv",L,"一共",len(L),"个变量")#只把第一列读取出来，即把IV值选出来的特征读取出来

[('custid', 2.6909066431469095), ('historical_trans_amount', 2.6609646134512865), ('trans_amount_3_month', 2.5546436077538357), ('repayment_capability', 2.327229251967252), ('pawns_auctions_trusts_consume_last_6_month', 2.220777389641486), ('first_transaction_day', 2.164322996058534), ('first_transaction_time', 2.164322996058532), ('abs', 1.966985825643712), ('consfin_avg_limit', 1.692832469252038), ('loans_avg_limit', 1.4613631156793805), ('max_cumulative_consume_later_1_month', 1.4598660465564153), ('consume_mini_time_last_1_month', 1.378784503719742), ('historical_trans_day', 1.1182950410251806), ('consfin_credit_limit', 0.8711723952424449), ('pawns_auctions_trusts_consume_last_1_month', 0.8530625616084101), ('avg_price_last_12_month', 0.7281431950917352), ('loans_score', 0.6627296771379482), ('loans_latest_day', 0.629468278102047), ('apply_score', 0.6279175775788947), ('history_suc_fee', 0.5114093090124674), ('latest_query_day', 0.49299812238889124), ('trans_days_interval_filter', 

随机森林挑选特征  
  
  用随机森林进行特征重要性评估的思想比较简单，主要是看每个特征在随机森林中的**每棵树上做了多大的贡献，然后取平均值，最后比较不同特征之间的贡献大小**。  

In [15]:
#模型评估
def model_metrics(clf, X_train, X_test, y_train, y_test):
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    if hasattr(clf,'decision_function'):
        y_train_proba = clf.decision_function(X_train)
        y_test_proda = clf.decision_function(X_test)
    else:
        y_train_proba = clf.predict_proba(X_train)[:,1]
        y_test_proda = clf.predict_proba(X_test)[:,1]


    #准确率
    print('<准确率>', end='')
    print('训练集：','%.4f'%accuracy_score(y_train,y_train_pred))
    print('测试集：','%.4f'%accuracy_score(y_test,y_test_pred))

    #auc值：用roc_auc_score或auc
    print('<auc值>',end='')
    print('训练集：','%.4f'%roc_auc_score(y_train,y_train_proba))
    print('测试集：','%.4f'%roc_auc_score(y_test,y_test_proda))
    
rf0 = RandomForestClassifier(oob_score=True,random_state=2018)
rf0.fit(X_train,y_train)

model_metrics(rf0,X_train,X_test,y_train,y_test)
rf0_impc = pd.Series(rf0.feature_importances_, index=X.columns).sort_values(ascending=False)#随机森林评判特征的重要性
fea_gini = rf0_impc[:10].index.tolist()#选出重要性排前10的特征
print(fea_gini)


#特征合并
features = list(set(fea_gini)|set(L))
print("features个数为：",len(features))
X_train = X_train[features]
X_test = X_test[features]

<准确率>训练集： 0.9823
测试集： 0.7715
<auc值>训练集： 0.9993
测试集： 0.7171
['trans_fail_top_count_enum_last_1_month', 'apply_score', 'history_fail_fee', 'loans_overdue_count', 'latest_one_month_fail', 'loans_score', 'abs', 'trans_fail_top_count_enum_last_6_month', 'latest_query_day', 'max_cumulative_consume_later_1_month']
features个数为： 43


  参考资料：IV值和WOE值的理解 https://blog.csdn.net/iModel/article/details/79420437  
  https://blog.csdn.net/qq_30006749/article/details/86025350