In [1]:
import pandas as pd
import numpy as np
import datetime
import re
import gc
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer  
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
#读取数据
train_data = pd.read_csv('train_public.csv')
test_data = pd.read_csv('test_public.csv')
train_init_data = pd.read_csv('train_internet.csv')

In [3]:
# 填补缺失值f系
col = ['f0', 'f1', 'f2', 'f3', 'f4']
x_col = ['f0', 'f1', 'f2', 'f3', 'f4', 'f5']

imp = SimpleImputer(strategy = 'median')
for c in col:
    train_data[c] = imp.fit_transform(train_data[c].values.reshape(-1,1))
    test_data[c] = imp.fit_transform(test_data[c].values.reshape(-1,1))
for x_ in x_col:
    train_init_data[x_] = imp.fit_transform(train_init_data[x_].values.reshape(-1,1))

In [4]:
# 处理特征work_year、class、work_type

work_year_dict = {'< 1 year': 0,'1 year': 1,'2 years': 2,'3 years': 3,'4 years': 4,'5 years': 5,
                  '6 years': 6,'7 years': 7,'8 years': 8,'9 years': 9,'10+ years': 10}
train_data['work_year'] = train_data['work_year'].map(work_year_dict)
test_data['work_year'] = test_data['work_year'].map(work_year_dict)
train_init_data['work_year'] = train_init_data['work_year'].map(work_year_dict)
train_data['work_year'] = train_data['work_year'].fillna(1)
test_data['work_year'] = test_data['work_year'].fillna(1)
train_init_data['work_year'] = train_init_data['work_year'].fillna(1)

class_dict = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7}
train_data['class'] = train_data['class'].map(class_dict)
test_data['class'] = test_data['class'].map(class_dict)
train_init_data['class'] = train_init_data['class'].map(class_dict)

work_type_dict = {'公务员':0,'其他':1,'工人':2,'工程师':3,'职员':4}
train_init_data['work_type'] = train_init_data['work_type'].map(work_type_dict)

In [5]:
# issue_date提取时间特征

train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
test_data['issue_date'] = pd.to_datetime(test_data['issue_date'])
train_init_data['issue_date'] = pd.to_datetime(train_init_data['issue_date'])

train_data['issue_date_year'] = train_data['issue_date'].dt.year
test_data['issue_date_year'] = test_data['issue_date'].dt.year
train_init_data['issue_date_year'] = train_init_data['issue_date'].dt.year
train_data['issue_date_month'] = train_data['issue_date'].dt.month
test_data['issue_date_month'] = test_data['issue_date'].dt.month
train_init_data['issue_date_month'] = train_init_data['issue_date'].dt.month
train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
test_data['issue_date_dayofweek'] = test_data['issue_date'].dt.dayofweek
train_init_data['issue_date_dayofweek'] = train_init_data['issue_date'].dt.dayofweek

train_data.drop('issue_date', axis = 1, inplace=True)
test_data.drop('issue_date', axis = 1, inplace=True)
train_init_data.drop('issue_date', axis = 1, inplace=True)

In [6]:
# 处理earlies_credit_mon特征

def findDate(val):
    fd = re.search('(\d+-)', val)
    if fd is None:
        return '1-'+val
    return val + '-01'

train_data['earlies_credit_mon'] = pd.to_datetime(train_data['earlies_credit_mon'].map(findDate))
test_data['earlies_credit_mon'] = pd.to_datetime(test_data['earlies_credit_mon'].map(findDate))
train_init_data['earlies_credit_mon'] = pd.to_datetime(train_init_data['earlies_credit_mon'].map(findDate))

train_data['earliesCreditMon'] = train_data['earlies_credit_mon'].dt.month
test_data['earliesCreditMon'] = test_data['earlies_credit_mon'].dt.month
train_data['earliesCreditYear'] = train_data['earlies_credit_mon'].dt.year
test_data['earliesCreditYear'] = test_data['earlies_credit_mon'].dt.year
train_init_data['earliesCreditMon'] = train_init_data['earlies_credit_mon'].dt.month

train_data.drop('earlies_credit_mon', axis = 1, inplace=True)
test_data.drop('earlies_credit_mon', axis = 1, inplace=True)
train_init_data.drop('earlies_credit_mon', axis = 1, inplace=True)

In [7]:
# 处理特征  employer_type、industry

cat_cols = ['employer_type', 'industry']
for col in cat_cols:
    lab = LabelEncoder().fit(train_data[col])
    train_data[col] = lab.transform(train_data[col])
    test_data[col] = lab.transform(test_data[col])
    train_init_data[col] = lab.transform(train_init_data[col])

In [8]:
# 对数据集缺失进行填充
train_data['pub_dero_bankrup'] = train_data['pub_dero_bankrup'].fillna(method = 'ffill')
test_data['pub_dero_bankrup'] = test_data['pub_dero_bankrup'].fillna(method = 'ffill')
train_init_data['pub_dero_bankrup'] = train_init_data['pub_dero_bankrup'].fillna(method = 'ffill')
train_init_data = train_init_data.dropna(subset = ['post_code', 'debt_loan_ratio', 'title'])
train_init_data['recircle_u'] = train_init_data['recircle_u'].fillna(train_init_data['recircle_u'].median())

In [9]:
# 构造policy_code

#构建train_data的特征
train_data.drop('policy_code', axis = 1, inplace=True)
scaler = MinMaxScaler()
scaler_data = scaler.fit_transform(train_data)
clf = KMeans(n_clusters=2, random_state=10)
pre = clf.fit(scaler_data)
train = pre.labels_

#构建test_data的特征
test_data.drop('policy_code', axis = 1, inplace=True)
scaler = MinMaxScaler()
scaler_data = scaler.fit_transform(test_data) 
clf = KMeans(n_clusters=2, random_state=10)
pre = clf.fit(scaler_data)
test = pre.labels_

In [10]:
# 将kmeans预测的policy_code特征保存
train = pd.DataFrame(train)
train.to_csv('train.csv', index=False)

test = pd.DataFrame(test)
test.to_csv('test.csv', index=False)

In [11]:
# 构建sub_class特征
def Kmeans(data, num):
    scaler = MinMaxScaler()
    scaler_data = scaler.fit_transform(data.loc[data['class'] == num])
    clf = KMeans(n_clusters=5, random_state=546789)
    pre = clf.fit(scaler_data)
    test = pre.labels_
    return test

In [12]:
data = Kmeans(train_data, 1)

In [16]:
data = pd.DataFrame(data, columns=['class1'])
data.value_counts()

class1
3         407
1         380
2         337
0         307
4         267
dtype: int64

In [11]:
# 通过class的A~G特征分为七类，对此七类分别预测
##################################### 构建train集的sub_class特征
for i in range(1,8):
    data = Kmeans(train_data, i)
    if i == 1:
        data = pd.Series(data, index= train_data.loc[train_data['class'] == 1].index)
        s1 = data.map({0:'A1', 1:'A2',2:'A3',3:'A4',4:'A5'})
    if i == 2:
        data = pd.Series(data, index= train_data.loc[train_data['class'] == 2].index)
        s2 = data.map({0:'B1',1:'B2',2:'B3',3:'B4',4:'B5'})
    if i == 3:
        data = pd.Series(data, index= train_data.loc[train_data['class'] == 3].index)
        s3 = data.map({0:'C1',1:'C2',2:'C3',3:'C4',4:'C5'})
    if i == 4:
        data = pd.Series(data, index= train_data.loc[train_data['class'] == 4].index)
        s4 = data.map({0:'D1',1:'D2',2:'D3',3:'D4',4:'D5'})
    if i == 5:
        data = pd.Series(data, index= train_data.loc[train_data['class'] == 5].index)
        s5 = data.map({0:'E1',1:'E2',2:'E3',3:'E4',4:'E5'})
    if i == 6:
        data = pd.Series(data, index= train_data.loc[train_data['class'] == 6].index)
        s6 = data.map({0:'F1',1:'F2',2:'F3',3:'F4',4:'F5'})
    if i == 7:
        data = pd.Series(data, index= train_data.loc[train_data['class'] == 7].index)
        s7 = data.map({0:'G1',1:'G2',2:'G3',3:'G4',4:'G5'})
# 合并并保存
data_train = pd.concat([s1, s2, s3, s4, s5, s6, s7]).reset_index(drop=True)
data_train.to_csv('train_sub_class.csv', index=False)

In [12]:
##################################### 构建train集的sub_class特征
for i in range(1,8):
    data = Kmeans(test_data, i)
    if i == 1:
        data = pd.Series(data, index= test_data.loc[test_data['class'] == 1].index)
        s1 = data.map({0:'A1', 1:'A2',2:'A3',3:'A4',4:'A5'})
    if i == 2:
        data = pd.Series(data, index= test_data.loc[test_data['class'] == 2].index)
        s2 = data.map({0:'B1',1:'B2',2:'B3',3:'B4',4:'B5'})
    if i == 3:
        data = pd.Series(data, index= test_data.loc[test_data['class'] == 3].index)
        s3 = data.map({0:'C1',1:'C2',2:'C3',3:'C4',4:'C5'})
    if i == 4:
        data = pd.Series(data, index= test_data.loc[test_data['class'] == 4].index)
        s4 = data.map({0:'D1',1:'D2',2:'D3',3:'D4',4:'D5'})
    if i == 5:
        data = pd.Series(data, index= test_data.loc[test_data['class'] == 5].index)
        s5 = data.map({0:'E1',1:'E2',2:'E3',3:'E4',4:'E5'})
    if i == 6:
        data = pd.Series(data, index= test_data.loc[test_data['class'] == 6].index)
        s6 = data.map({0:'F1',1:'F2',2:'F3',3:'F4',4:'F5'})
    if i == 7:
        data = pd.Series(data, index= test_data.loc[test_data['class'] == 7].index)
        s7 = data.map({0:'G1',1:'G2',2:'G3',3:'G4',4:'G5'})
# 合并并保存
data_test = pd.concat([s1, s2, s3, s4, s5, s6, s7]).reset_index(drop=True)
data_test.to_csv('test_sub_class.csv', index=False)

In [13]:
#重新读取数据
train_data = pd.read_csv('train_public.csv')
test_data = pd.read_csv('test_public.csv')
train_init_data = pd.read_csv('train_internet.csv')

In [14]:
# 修改数据集里的policy_code
tra = pd.read_csv('train.csv')
tes = pd.read_csv('test.csv')

train_data['policy_code'] = tra
test_data['policy_code'] = tes

In [15]:
# 在训练集和测试集里加入sub_class
train_sub = pd.read_csv('train_sub_class.csv')
test_sub = pd.read_csv('test_sub_class.csv')

train_data['sub_class'] = train_sub
test_data['sub_class']  = test_sub

In [16]:
# 采用均值填充
train_data['work_year'] = train_data['work_year'].fillna(-1)
test_data['work_year'] = test_data['work_year'].fillna(-1)
train_init_data['work_year'] = train_init_data['work_year'].fillna(-1)

col_fill = ['f0', 'f1', 'f2', 'f3', 'f4']
imp = SimpleImputer(strategy = 'median')
for c in col_fill:
    train_init_data[c] = imp.fit_transform(train_init_data[c].values.reshape(-1,1))

In [17]:
work_year_dict = {'< 1 year': 0,'1 year': 1,'2 years': 2,'3 years': 3, '4 years': 4,'5 years': 5,
                  '6 years': 6,'7 years': 7, '8 years': 8,'9 years': 9,'10+ years': 10}

train_data['work_year'] = train_data['work_year'].map(work_year_dict)
test_data['work_year'] = test_data['work_year'].map(work_year_dict)
train_init_data['work_year'] = train_init_data['work_year'].map(work_year_dict)

class_dict = {'A':1, 'B':2,'C':3, 'D':4,'E':5, 'F':6,'G':7}

train_data['class'] = train_data['class'].map(class_dict)
test_data['class'] = test_data['class'].map(class_dict)
train_init_data['class'] = train_init_data['class'].map(class_dict)

In [18]:
# 日期转换
train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
test_data['issue_date'] = pd.to_datetime(test_data['issue_date'])
train_init_data['issue_date'] = pd.to_datetime(train_init_data['issue_date'])

train_data['issue_date_year'] = train_data['issue_date'].dt.year
test_data['issue_date_year'] = test_data['issue_date'].dt.year
train_init_data['issue_date_year'] = train_init_data['issue_date'].dt.year

train_data['issue_date_month'] = train_data['issue_date'].dt.month
test_data['issue_date_month'] = test_data['issue_date'].dt.month
train_init_data['issue_date_month'] = train_init_data['issue_date'].dt.month

train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
test_data['issue_date_dayofweek'] = test_data['issue_date'].dt.dayofweek
train_init_data['issue_date_dayofweek'] = train_init_data['issue_date'].dt.dayofweek


train_data.drop('issue_date', axis = 1, inplace=True)
test_data.drop('issue_date', axis = 1, inplace=True)
train_init_data.drop('issue_date', axis = 1, inplace=True)

def findDate(val):
    fd = re.search('(\d+-)', val)
    if fd is None:
        return '1-'+val
    return val + '-01'

train_data['earlies_credit_mon'] = pd.to_datetime(train_data['earlies_credit_mon'].map(findDate))
test_data['earlies_credit_mon'] = pd.to_datetime(test_data['earlies_credit_mon'].map(findDate))
train_init_data['earlies_credit_mon'] = pd.to_datetime(train_init_data['earlies_credit_mon'].map(findDate))

train_data['earliesCreditMon'] = train_data['earlies_credit_mon'].dt.month
test_data['earliesCreditMon'] = test_data['earlies_credit_mon'].dt.month

train_data['earliesCreditYear'] = train_data['earlies_credit_mon'].dt.year
test_data['earliesCreditYear'] = test_data['earlies_credit_mon'].dt.year

train_init_data['earliesCreditMon'] = train_init_data['earlies_credit_mon'].dt.month

train_data.drop('earlies_credit_mon', axis = 1, inplace=True)
test_data.drop('earlies_credit_mon', axis = 1, inplace=True)
train_init_data.drop('earlies_credit_mon', axis = 1, inplace=True)

In [19]:
# 特征编码
cat_cols = ['employer_type', 'industry', 'sub_class']
for col in cat_cols:
    lab = LabelEncoder().fit(train_data[col])
    train_data[col] = lab.transform(train_data[col])
    test_data[col] = lab.transform(test_data[col])
    train_init_data[col] = lab.transform(train_init_data[col])

In [20]:
# 异常值处理
train_init_data = train_init_data[train_init_data['total_loan'] <= 38000] 
train_init_data = train_init_data[train_init_data['debt_loan_ratio'] <= 43.34] 
train_init_data = train_init_data[train_init_data['house_exist'] <= 2] 

train_init_data.reset_index()

train_data.drop('user_id', axis = 1, inplace=True)
test_data.drop('user_id', axis = 1, inplace=True)
train_init_data.drop('user_id', axis = 1, inplace=True)

In [21]:
for df in [train_data, test_data, train_init_data]:
    for item in ['f0','f1','f2','f3','f4']:
        df['industry_to_mean_' + item] = df.groupby(['industry'])[item].transform('mean')

In [22]:
print(train_data.shape)
print(test_data.shape)
print(train_init_data.shape)

(10000, 47)
(5000, 46)
(742813, 48)


In [23]:
def gen_target_encoding_feats(train, test, train_init, features, target_feature, target_feature1, n_fold=10):
    # Target编码
    # -----------------------------------train
    tg_feats = np.zeros((train.shape[0], len(features)))
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    for _, (train_index, val_index) in enumerate(kfold.split(train[features], train[target_feature])):
        df_train, df_val = train.iloc[train_index], train.iloc[val_index]
        for idx, feat in enumerate(features):
            target_mean_dict = df_train.groupby(feat)[target_feature].mean()
            df_val[f'{feat}_mean_target'] = df_val[feat].map(target_mean_dict)
            tg_feats[val_index, idx] = df_val[f'{feat}_mean_target'].values

    for idx, feature in enumerate(features):
        train[f'{feature}_mean_target'] = tg_feats[:, idx]
        
    # ---------------------------------train_init
    tg_feats = np.zeros((train_init.shape[0], len(features)))
    for _, (train_index, val_index) in enumerate(kfold.split(train_init[features], train_init[target_feature1])):
        df_train, df_val = train_init.iloc[train_index], train_init.iloc[val_index]
        for idx, feat in enumerate(features):
            target_mean_dict = df_train.groupby(feat)[target_feature1].mean()
            df_val[f'{feat}_mean_target'] = df_val[feat].map(target_mean_dict)
            tg_feats[val_index, idx] = df_val[f'{feat}_mean_target'].values

    for idx, feature in enumerate(features):
        train_init[f'{feature}_mean_target'] = tg_feats[:, idx]
        
    # -------------------------------------test
    for feat in features:
        target_mean_dict = train.groupby(feat)[target_feature].mean()
        test[f'{feat}_mean_target'] = test[feat].map(target_mean_dict)

    return train, test, train_init

features = ['house_exist', 'debt_loan_ratio', 'industry', 'title']
train_data, test_data, train_init_data = \
    gen_target_encoding_feats(train_data, test_data, train_init_data, features, 'isDefault', 'is_default', n_fold=10)

In [24]:
# 构造交叉特征
train_data['post_code_to_mean_interst'] = train_data.groupby(['post_code'])['interest'].transform('mean')
test_data['post_code_to_mean_interst'] = test_data.groupby(['post_code'])['interest'].transform('mean')
train_init_data['post_code_to_mean_interst'] = train_init_data.groupby(['post_code'])['interest'].transform('mean')

train_data['industry_mean_interest'] = train_data.groupby(['industry'])['interest'].transform('mean')
test_data['industry_mean_interest'] = test_data.groupby(['industry'])['interest'].transform('mean')
train_init_data['industry_mean_interest'] = train_init_data.groupby(['industry'])['interest'].transform('mean')

train_data['employer_type_mean_interest'] = train_data.groupby(['employer_type'])['interest'].transform('mean')
test_data['employer_type_mean_interest'] = test_data.groupby(['employer_type'])['interest'].transform('mean')
train_init_data['employer_type_mean_interest'] = train_init_data.groupby(['employer_type'])['interest'].transform('mean')

train_data['recircle_u_std_recircle_b'] = train_data.groupby(['recircle_u'])['recircle_b'].transform('std')
test_data['recircle_u_std_recircle_b'] = test_data.groupby(['recircle_u'])['recircle_b'].transform('std')
train_init_data['recircle_u_std_recircle_b'] = train_init_data.groupby(['recircle_u'])['recircle_b'].transform('std')

train_data['early_return_remove_early_return_amount'] = train_data['early_return_amount'] / train_data['early_return']
test_data['early_return_remove_early_return_amount'] = test_data['early_return_amount'] / test_data['early_return']
train_init_data['early_return_remove_early_return_amount'] = \
    train_init_data['early_return_amount'] / train_init_data['early_return']

# 将比值后存在inf的值设为 0
inf1 = np.isinf(train_data['early_return_remove_early_return_amount'])
train_data['early_return_remove_early_return_amount'][inf1] = 0
inf2 = np.isinf(test_data['early_return_remove_early_return_amount'])
test_data['early_return_remove_early_return_amount'][inf2] = 0
inf3 = np.isinf(train_init_data['early_return_remove_early_return_amount'])
train_init_data['early_return_remove_early_return_amount'][inf3] = 0

In [25]:
print(train_data.shape)
print(test_data.shape)
print(train_init_data.shape)

(10000, 56)
(5000, 55)
(742813, 57)


In [26]:
# 样本扩充
column1 = set(train_data.columns)
column2 = set(test_data.columns)
column3 = set(train_init_data.columns)
same_col = list(column1.intersection(column3))
nosasme_col = list(column1.difference(column3))

train_init_name_data = train_init_data[same_col].copy()
for col in nosasme_col:
    train_init_name_data[col] = np.nan

In [27]:
#过滤模型
y= train_data['isDefault']
oof_preds = np.zeros(train_data.shape[0])
sub_preds = np.zeros(train_init_name_data.shape[0])
feats = [f for f in train_data.columns if f not in ['loan_id','isDefault']]
fold = KFold(n_splits=10, shuffle=True, random_state=546789)
for n_fold, (trn_idx, val_idx) in enumerate(fold.split(train_data)):
    trn_x, trn_y = train_data[feats].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train_data[feats].iloc[val_idx], y.iloc[val_idx]
    clf = CatBoostClassifier(class_weights = [1,1.15],depth = 6,learning_rate = 0.08,iterations = 4000,
                            bootstrap_type = 'Bernoulli',subsample = 0.9,random_seed = 546789,verbose = 0)
    clf.fit(trn_x, trn_y, eval_set= [(trn_x, trn_y), (val_x, val_y)], verbose=100, early_stopping_rounds=40)
    oof_preds[val_idx] = clf.predict_proba(val_x)[:, 1]
    sub_preds += clf.predict_proba(train_init_name_data[feats])[:, 1] / fold.n_splits
    print('第%d次过滤auc分数：%.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()
train_init_name_data['isDefault'] = sub_preds
IntePre = train_init_name_data[['loan_id', 'isDefault']]
InteId = IntePre.loc[IntePre.isDefault<0.07, 'loan_id'].tolist()
train = train_data
test = test_data
train_init = train_init_name_data
train_init['isDefault'] = train_init_data['is_default']
use_te = train_init[train_init.loan_id.isin(InteId)].copy()
data = pd.concat([train,test,use_te]).reset_index(drop=True)
print('='*30+'\n过滤分数:%.6f\n过滤后的数据量:' % (roc_auc_score(y, oof_preds)) ,data.shape)

0:	learn: 0.6131516	test: 0.6132079	test1: 0.6125669	best: 0.6125669 (0)	total: 144ms	remaining: 9m 34s
100:	learn: 0.2558248	test: 0.2558565	test1: 0.2838707	best: 0.2838558 (99)	total: 816ms	remaining: 31.5s
200:	learn: 0.2025371	test: 0.2025632	test1: 0.2804053	best: 0.2789965 (171)	total: 1.41s	remaining: 26.6s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.2789965295
bestIteration = 171

Shrink model to first 172 iterations.
第1次过滤auc分数：0.915497
0:	learn: 0.6128461	test: 0.6129046	test1: 0.6137123	best: 0.6137123 (0)	total: 8.22ms	remaining: 32.9s
100:	learn: 0.2555057	test: 0.2555374	test1: 0.2977512	best: 0.2972134 (91)	total: 576ms	remaining: 22.3s
200:	learn: 0.1987487	test: 0.1987746	test1: 0.2951716	best: 0.2946017 (162)	total: 1.12s	remaining: 21.2s
Stopped by overfitting detector  (40 iterations wait)

bestTest = 0.2946016588
bestIteration = 162

Shrink model to first 163 iterations.
第2次过滤auc分数：0.900992
0:	learn: 0.6133939	test: 0.6134516	test1: 0.60906

In [28]:
data.shape

(75586, 56)

In [29]:
#重新定义新的数据集
train = data[data['isDefault'].notna()]
test  = data[data['isDefault'].isna()]
y = train['isDefault']
#xgboost单模
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])
feats = [f for f in train.columns if f not in ['loan_id','isDefault'] ]
fold = KFold(n_splits=10, shuffle=True, random_state=1122)
for n_fold, (trn_idx, val_idx) in enumerate(fold.split(train,y)):
    trn_x, trn_y = train[feats].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = train[feats].iloc[val_idx], y.iloc[val_idx]
    clf = XGBClassifier(eval_metric='auc',max_depth=5, alpha=0.3, reg_lambda=0.3, subsample=0.8,
                        colsample_bylevel = 0.867, objective='binary:logistic', use_label_encoder=False,
                        learning_rate=0.08, n_estimators=4000, min_child_weight = 2, tree_method='hist',
                        n_jobs=-1)
    clf.fit(trn_x, trn_y, eval_set= [(trn_x, trn_y), (val_x, val_y)], verbose=100, early_stopping_rounds=40)
    oof_preds[val_idx] = clf.predict_proba(val_x, ntree_limit=clf.best_ntree_limit)[:, 1]
    sub_preds += clf.predict_proba(test[feats], ntree_limit=clf.best_ntree_limit)[:, 1]/fold.n_splits
    print('第%d次auc分数：%.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect() 
test['isDefault'] = sub_preds
print('='*30+'\nXGB单模线下分:%.6f' % roc_auc_score(y, oof_preds)) 

[0]	validation_0-auc:0.97367	validation_1-auc:0.97842
[88]	validation_0-auc:0.99316	validation_1-auc:0.98783
第1次auc分数：0.987867
[0]	validation_0-auc:0.98137	validation_1-auc:0.97098
[100]	validation_0-auc:0.99413	validation_1-auc:0.98518
[116]	validation_0-auc:0.99496	validation_1-auc:0.98551
第2次auc分数：0.985654
[0]	validation_0-auc:0.98303	validation_1-auc:0.98533
[100]	validation_0-auc:0.99398	validation_1-auc:0.98805
[118]	validation_0-auc:0.99488	validation_1-auc:0.98803
第3次auc分数：0.988116
[0]	validation_0-auc:0.97774	validation_1-auc:0.98252
[80]	validation_0-auc:0.99286	validation_1-auc:0.98859
第4次auc分数：0.989050
[0]	validation_0-auc:0.98263	validation_1-auc:0.98357
[44]	validation_0-auc:0.99107	validation_1-auc:0.98564
第5次auc分数：0.986039
[0]	validation_0-auc:0.97377	validation_1-auc:0.97935
[57]	validation_0-auc:0.99184	validation_1-auc:0.98685
第6次auc分数：0.987011
[0]	validation_0-auc:0.97498	validation_1-auc:0.96006
[44]	validation_0-auc:0.99168	validation_1-auc:0.98445
第7次auc分数：0.9845

In [30]:
#导出结果
pre = test[['loan_id', 'isDefault']]
pre.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('result.csv', index=False)

In [32]:
import pandas as pd
data = pd.read_csv('result.csv')
data = data[data.isDefault > 0.5]
print(data.shape)

(471, 2)


AttributeError: module 'scipy' has no attribute 'norm'

<Figure size 432x432 with 0 Axes>