scikit-learn随机森林调参小结

In [1]:
%matplotlib inline
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection, metrics

# Standard-library modules
import os, sys

# Bundled data: append the directory four levels above the current working
# directory to sys.path so that the `dataset` module can be imported
# (presumably a project-local helper exposing dataset paths -- confirm).
datalib_path = os.path.join(os.path.abspath('.'), '../../../../')
sys.path.append(datalib_path)
import dataset

In [2]:
# Load the training table (a small-loan dataset) and preview the first rows.
train_csv = os.path.join(dataset.creditcard_path, 'train_modified.csv')
train = pd.read_csv(train_csv)
train.head()

Unnamed: 0,Disbursed,Existing_EMI,ID,Loan_Amount_Applied,Loan_Tenure_Applied,Monthly_Income,Var4,Var5,Age,EMI_Loan_Submitted_Missing,...,Var2_2,Var2_3,Var2_4,Var2_5,Var2_6,Mobile_Verified_0,Mobile_Verified_1,Source_0,Source_1,Source_2
0,0,0.0,ID000002C20,300000,5,20000,1,0,37,1,...,0,0,0,0,1,1,0,1,0,0
1,0,0.0,ID000004E40,200000,2,35000,3,13,30,0,...,0,0,0,0,1,0,1,1,0,0
2,0,0.0,ID000007H20,600000,4,22500,1,0,34,1,...,0,0,0,0,0,0,1,0,0,1
3,0,0.0,ID000008I30,1000000,5,35000,3,10,28,1,...,0,0,0,0,0,0,1,0,0,1
4,0,25000.0,ID000009J40,500000,2,100000,3,17,31,1,...,0,0,0,0,0,0,1,0,0,1


In [3]:
# Name of the label column: the binary classification output, i.e. whether
# the loan was disbursed.
target = 'Disbursed'

# Class distribution of the label; refer to the column through `target` so the
# label name is defined in exactly one place.
train[target].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [4]:
# Name of the identifier (index) column; it is left out of the feature set
# when the model inputs are assembled.
IDcol = 'ID'

In [5]:
# Feature columns: every column except the label and the identifier.
x_columns = [col for col in train.columns if col not in (target, IDcol)]

# Model inputs and label. The label is selected through `target` (rather than
# a repeated 'Disbursed' literal) so it always matches the column excluded above.
X = train[x_columns]
y = train[target]

In [6]:
# Baseline: a random forest with default hyper-parameters (only the seed and
# out-of-bag scoring are set), fitted on the full training table.
#
# Out-of-bag (OOB) samples: in a random forest the m training samples are
# drawn by bootstrap (random sampling with replacement) T times, each draw
# producing a sample set of size m that is fed to one of the T parallel
# decision trees. Because the sampling is with replacement, part of the
# training samples (about 36.8%) never enter a given tree's sample set; those
# unsampled samples are the out-of-bag data.
rf0 = RandomForestClassifier(random_state=10, oob_score=True)
rf0.fit(X, y)
print(rf0.oob_score_)

# AUC measured on the training data itself, using the probability in column 1
# of predict_proba (the second class).
y_predprob = rf0.predict_proba(X)[:, 1]
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

0.98315
AUC Score (Train): 0.999994


In [7]:
# Step 1: grid-search the number of trees, scored by 5-fold cross-validated
# ROC AUC, while the other tree parameters stay at fixed starting values.
param_test1 = {'n_estimators': list(range(10, 71, 10))}  # 10, 20, ..., 70
gsearch1 = GridSearchCV(
    estimator=RandomForestClassifier(
        min_samples_split=100,
        min_samples_leaf=20,
        max_depth=8,
        max_features='sqrt',
        random_state=10,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(X, y)
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 60}, 0.8211334476626015)

In [8]:
# Step 2: with n_estimators fixed at 60 (the best value reported by the
# previous search), jointly search max_depth and min_samples_split.
param_test2 = {
    'max_depth': list(range(3, 14, 2)),             # 3, 5, ..., 13
    'min_samples_split': list(range(50, 191, 20)),  # 50, 70, ..., 190
}
gsearch2 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=60,
        min_samples_leaf=20,
        max_features='sqrt',
        oob_score=True,
        random_state=10,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X, y)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 13, 'min_samples_split': 110}, 0.8242016800050813)

In [9]:
# Refit on the full training table with the values selected so far
# (n_estimators=60, max_depth=13, min_samples_split=110) and report the
# out-of-bag score of the resulting forest.
rf1 = RandomForestClassifier(
    n_estimators=60,
    max_depth=13,
    min_samples_split=110,
    min_samples_leaf=20,
    max_features='sqrt',
    oob_score=True,
    random_state=10,
).fit(X, y)
print(rf1.oob_score_)

0.984


In [10]:
# Step 3: with n_estimators=60 and max_depth=13 fixed, jointly search
# min_samples_split and min_samples_leaf.
param_test3 = {
    'min_samples_split': list(range(80, 141, 20)),  # 80, 100, 120, 140
    'min_samples_leaf': list(range(10, 51, 10)),    # 10, 20, ..., 50
}
gsearch3 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=60,
        max_depth=13,
        max_features='sqrt',
        oob_score=True,
        random_state=10,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    cv=5,
)
gsearch3.fit(X, y)
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Step 4: search max_features with the other parameters fixed.
# NOTE(review): min_samples_split=120 / min_samples_leaf=20 are presumably the
# best values from the min_samples_split/min_samples_leaf search, whose output
# is not recorded in this notebook -- confirm by re-running that cell.
param_test4 = {'max_features': list(range(3, 10, 2))}  # 3, 5, 7, 9
gsearch4 = GridSearchCV(
    estimator=RandomForestClassifier(
        n_estimators=60,
        max_depth=13,
        min_samples_split=120,
        min_samples_leaf=20,
        oob_score=True,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='roc_auc',
    cv=5,
)
gsearch4.fit(X, y)
gsearch4.best_params_, gsearch4.best_score_

({'max_features': 7}, 0.8248650279471545)

In [None]:
# Final model: refit on the full training table with the tuned values
# (including max_features=7 from the last search) and report its
# out-of-bag score.
rf2 = RandomForestClassifier(
    n_estimators=60,
    max_depth=13,
    min_samples_split=120,
    min_samples_leaf=20,
    max_features=7,
    oob_score=True,
    random_state=10,
).fit(X, y)
print(rf2.oob_score_)

0.984
