**K折交叉验证** 
  
K折交叉验证将k个训练子集重复holdout方法k次。  
  
K折交叉验证不重复的随机将训练集划分为k个，k-1个用于训练，剩余一个用于测试，重复该过程k次，得到k个模型对模型性能的评价。  

基于评价结果可以计算平均性能。  
  
与holdput相比，这样得到的结果**对数据划分方法敏感度相对较低。**

参考资料： https://blog.csdn.net/ChenVast/article/details/79257097

调参参考：https://blog.csdn.net/bear507/article/details/86696246#

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import  LGBMClassifier
import warnings

warnings.filterwarnings('ignore')


#加载数据
data = pd.read_csv('data_processed_2.csv',encoding='unicode_escape')  #先前处理好的数据
X = data.drop(['status'],axis=1)
y = data['status']
print('数据的行列',data.shape)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2018)

#归一化
sc = StandardScaler()
sc.fit(X_train)# 估算每个特征的平均值和标准差
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

#定义网格搜索交叉验证函数（5折）
def gridsearch(model,parameters):
    grid = GridSearchCV(model,parameters,scoring='accuracy',cv=5)
    grid = grid.fit(X_train_std,y_train)
    if hasattr(model,'decision_function'):
        y_predict_pro = grid.decision_function(X_test_std)
    else:
        y_predict_pro = grid.predict_proba(X_test_std)[:,1]
    print('best score:',grid.best_score_)
    print(grid.best_params_)
    print('test score:',grid.score(X_test_std,y_test))
    print('AUC:',metrics.roc_auc_score(y_test,y_predict_pro))

#逻辑回归
print('逻辑回归：')
# C浮点型，默认：1.0；其值等于正则化强度的倒数，为正的浮点数。数值越小表示正则化越强。
# penalty 字符串型，’l1’ or ‘l2’，默认：’l2’；正则化类型。
parameters = {'C':[0.1,1,2,3],'penalty':['l1','l2']}
lr = LogisticRegression(random_state=2018)
lr.fit(X_train_std,y_train)
gridsearch(lr, parameters)
print('')

#SVM
print('SVM:')
parameters = {'C':[0.1,1,2,3],'kernel':['linear','poly','rbf']}
svc = SVC(random_state=2018)
svc.fit(X_train_std,y_train)
gridsearch(svc,parameters)
print('')

#决策树
print('决策树:')
parameters = {'criterion': ['gini', 'entropy'], 'max_depth': [1,2,3,4,5,6], 'splitter': ['best', 'random'],
            'max_features': ['log2', 'sqrt', 'auto']}
clf = DecisionTreeClassifier(random_state=2018)
clf.fit(X_train_std,y_train)
gridsearch(clf,parameters)
print('')

#随机森林
print('随机森林:')
parameters = {'n_estimators': range(1,200), 'max_features': ['log2', 'sqrt', 'auto']}
rfc = RandomForestClassifier(random_state=2018)
rfc.fit(X_train_std,y_train)
gridsearch(rfc,parameters)
print('')

#GBDT
print('GBDT:')
parameters = {'n_estimators': range(1,100,10),'learning_rate': np.arange(0.1, 1, 0.1)}
gbdt = GradientBoostingClassifier(random_state=2018)
gbdt.fit(X_train_std,y_train)
gridsearch(gbdt,parameters)
print('')

#XGBoost
print('XGBoost:')
parameters = {'eta': np.arange(0.1, 0.5, 0.1), 'max_depth': range(1,6,1), 'min_child_weight': range(1,6,1)}
xgbs = XGBClassifier(random_state=2018)
xgbs.fit(X_train_std,y_train)
gridsearch(xgbs,parameters)
print('')

#LightGBM
parameters = {'learning_rate': np.arange(0.1,0.5,0.1), 'max_depth': range(1,6,1), 'n_estimators':range(30,50,5)}
lgbm = LGBMClassifier(random_state=2018)
lgbm.fit(X_train_std,y_train)
gridsearch(lgbm,parameters)
print('')


数据的行列 (4754, 93)
逻辑回归：
best score: 0.7908025247971145
{'C': 0.1, 'penalty': 'l1'}
test score: 0.7869656622284513
AUC: 0.7814518064119015

SVM:
best score: 0.7953110910730388
{'C': 0.1, 'kernel': 'linear'}
test score: 0.7771548703573932
AUC: 0.7791983558156761

决策树:
best score: 0.7664562669071235
{'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'splitter': 'best'}
test score: 0.7533286615276804
AUC: 0.584311393487945

随机森林:


KeyboardInterrupt: 