In [1]:
import pandas as pd 
import gc
import os 
import warnings
import numpy as np 
import time
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
warnings.filterwarnings('ignore')

In [2]:
# from pyspark import SparkService
# # 创建local模式的SparkSession
# spark = SparkService.get_local_spark(executor_instances=3, driver_mem='10g')

In [37]:
from pyspark import SparkService
# 创建spark会话，自定义资源
# spark = SparkService.get_spark(executor_instances=1, per_executor_mem='1g', driver_mem='1g')
# 创建spark会话，使用默认资源
spark = SparkService.get_spark(executor_instances=2,per_executor_mem='4g',driver_mem='2g')


In [4]:
# # the spark short for SparkSession and sc short for SparkContext have already declared
# spark.sql("show databases").show()
# spark.sql("use qx_testing").show()
# spark.sql("show tables").toPandas()



## 重新刷一遍表

In [5]:
# paths='/opt/notebook/xuyinghao'

# def createtTable(paths):
#     for file in os.listdir():
#         col_list =[]
#         if 'csv' in file:
#             print('table %s 正在导入'%(file.replace('.csv','')))
#             start = time.time()
#             dir_ = os.path.join(paths,file)
#             df = pd.read_csv(dir_)
#             df.replace(np.NAN,'',inplace=True)
#             for col in df.columns:
#                 col_list.append(col.lower())
#             df.columns = col_list
#             df[col_list]=df[col_list].astype(str)
#             spark_table = spark.createDataFrame(df,verifySchema=False)
#             del df,col_list
#             gc.collect()
#             spark_table.write.saveAsTable(name='qx_testing.home_credit_%s'%(file.replace('.csv','')),mode='overwrite',partitionBy=None)
#             end = time.time()
#             print('table %s 导入成功!'%(file.replace('.csv','')))
#             print('导入开销%f s'%(end-start))
 


In [6]:
# createtTable(paths)

### 定义一些辅助函数

In [7]:


def change_age_tobin(days_birth):
    x = -days_birth / 365
    if x < 20: return 1
    elif x < 30: return 2
    elif x < 40: return 3
    elif x < 50: return 4
    elif x < 60: return 5
    else: return 0
    
def cal_mean(df, group_cols, col, agg_name):
    """
    计算均值
    """
    gp = df[group_cols + [col]].groupby(group_cols)[col].mean().reset_index().rename(
        columns={col: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    gc.collect()
    return df


def cal_median(df, group_cols, col, agg_name):
    """
    计算中位数
    """
    gp = df[group_cols + [col]].groupby(group_cols)[col].median().reset_index().rename(
        columns={col: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    gc.collect()
    return df


def cal_std(df, group_cols, col, agg_name):
    """
    计算标准差
    """
    gp = df[group_cols + [col]].groupby(group_cols)[col].std().reset_index().rename(
        columns={col: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    gc.collect()
    return df

def onehot_label_encoder(df,categorical_columns=None):
    """
    ont hot 
    """
    original_columns = list(df.columns)
    if not categorical_columns:
        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns)
    categorical_columns = [c for c in df.columns if c not in original_columns]
    return df,categorical_columns
    
def read_table(file_or_path):
    """
    将列名转换为小写
    """
    df = pd.read_csv(file_or_path)
    col_list = []
    for col in df.columns:
        col_list.append(col.lower())
    df.columns = col_list
    return df 

def group(df_to_agg, prefix, aggregations, aggregate_by= 'sk_id_curr'):
    """
     对每个表按照主键groupby
    """
    agg_df = df_to_agg.groupby(aggregate_by).agg(aggregations)
    agg_df.columns = pd.Index(['{}{}_{}'.format(prefix, e[0], e[1].lower())
                               for e in agg_df.columns.tolist()])
    return agg_df.reset_index()

def group_and_merge(df_to_agg, df_to_merge, prefix, aggregations, aggregate_by= 'sk_id_curr'):
    """
    合并groupby后的数据集
    """
    agg_df = group(df_to_agg, prefix, aggregations, aggregate_by= aggregate_by)
    return df_to_merge.merge(agg_df, how='left', on= aggregate_by)


@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(name, time.time() - t0))


### 处理主表

In [8]:
def process_train_test():
    df_train = read_table('application_train.csv')
    df_test = read_table('application_test.csv')
    df = df_train.append(df_test)
    del df_train,df_test
    gc.collect()
    
    #过滤掉异常样本以及用NAN替换掉异常值
    df= df[df['code_gender']!='XNA']
    df = df[df['amt_income_total']<20000000]
    
    df['days_employed'].replace(365243, np.nan, inplace=True)
    df['days_last_phone_change'].replace(0, np.nan, inplace=True)

    docs = [x for x in df.columns if 'flag_doc' in x]
    df['document_count'] = df[docs].sum(axis=1)
    df.drop(columns=docs,inplace=True)
    df['age_bin'] = df['days_birth'].map(change_age_tobin)
    df['ext_sources_prod'] = df['ext_source_1'] * df['ext_source_2'] * df['ext_source_3']
    df['ext_source_weighted'] = df['ext_source_1'] * 2 + df['ext_source_1'] * 1 + df['ext_source_1'] * 3
    
    
    #构建一些比例特征
    df['credit_to_annuity_ratio'] = df['amt_credit'] / df['amt_annuity']
    df['credit_to_goods_ratio'] = df['amt_credit'] / df['amt_goods_price']
    # 收入类型比列
    
    df['own_car_age'] = pd.to_numeric(df['own_car_age'],errors='ignore')
    df['annuity_to_income_ratio'] = df['amt_annuity'] / df['amt_income_total']
    df['credit_to_income_ratio'] = df['amt_credit'] / df['amt_income_total']
    df['income_to_employed_ratio'] = df['amt_income_total'] / df['days_employed']
    df['income_to_birth_ratio'] = df['amt_income_total'] / df['days_birth']
    # 时间序列形式比列特征
    df['employed_to_birth_ratio'] = df['days_employed'] / df['days_birth']
    df['car_to_birth_ratio'] = df['own_car_age'] / df['days_birth']
    df['car_to_employed_ratio'] = df['own_car_age'] / df['days_employed']
    
    #统计特征
    #重点关注EXT_SOURCE_1，EXT_SOURCE_2，EXT_SOURCE_3三个字段
    for function_name in ['min','max','mean']:
        feature_name = 'ext_sources_{}'.format(function_name)
        df[feature_name] = eval('np.{}'.format(function_name))(df[['ext_source_1','ext_source_2','ext_source_3']],axis=1)
    
    group_col = ['organization_type', 'name_education_type', 'occupation_type', 'age_bin', 'code_gender']
  
    #根据group_col来计算分组后的ext_source_median
    df=cal_median(df,group_col,'ext_sources_mean','group_ext_sources_median')
    df =cal_std(df,group_col,'ext_sources_mean','group_ext_sources_std')
    #计算分组后收入的平均值
    df = cal_mean(df,group_col,'amt_income_total','group_income_mean')
    df = cal_std(df,group_col,'amt_income_total','group_income_std')
    #计算分组申请贷款的金额
    df = cal_mean(df,group_col,'amt_credit','group_credit_mean')
    df = cal_std(df,group_col,'amt_credit','group_credit_std')
    df = cal_mean(df,group_col,'amt_annuity','group_annuity_mean')
    df = cal_std(df,group_col,'amt_annuity','group_annuity_std')
    
    #变量编码
    df,categorical_col= onehot_label_encoder(df)
    
    return df

    

## 衍生bureau_balance表数据

In [9]:
def process_bureau_balance():
    df = read_table('bureau_balance.csv')
    df, categorical_cols = onehot_label_encoder(df,categorical_columns=None)
    # 计算各个类别所占的比例
    bb_processed = df.groupby('sk_id_bureau')[categorical_cols].mean().reset_index()
    agg = {'months_balance': ['min', 'max', 'mean']}
    bb_processed = group_and_merge(df, bb_processed, '', agg, 'sk_id_bureau')
    del df; gc.collect()
    return bb_processed

    

### 衍生bureau以及合并bureau_balance表特征

In [10]:
def process_bureau():
    df = read_table('bureau.csv')
    ##衍生一些比例特征
    #逾期金额/当前额度
    df['credit_sum_overdue_ratio']=df['amt_credit_sum_overdue']/df['amt_credit_sum']
    #负债金额/当前额度
    df['debt_percentage']=df['amt_credit_sum_debt']/df['amt_credit_sum']
    #当前金额/年金
    df['credit_to_annuity_ratio']=df['amt_credit_sum']/df['amt_annuity']
    
    #onehot 
    df, categorical_cols = onehot_label_encoder(df)
    
    df = df.merge(process_bureau_balance(),on='sk_id_bureau',how='left')
    #衍生一些统计特征
    group_col=['sk_id_bureau']
    df = cal_mean(df,group_col,'amt_credit_sum_debt','group_sum_debt_mean')
    df= cal_std(df,group_col,'amt_credit_sum_debt','group_sum_debt_std')
    df = cal_mean(df,group_col,'amt_credit_sum_overdue','group_sum_overdue_mean')
    df = cal_std(df,group_col,'amt_credit_sum_overdue','group_sum_overdue_std')
    
    bureau_agg ={
    'sk_id_bureau': ['nunique'],
    'days_credit': ['min', 'max', 'mean'],
    'days_credit_enddate': ['min', 'max'],
    'amt_credit_max_overdue': ['max', 'mean'],
    'amt_credit_sum': ['max', 'mean', 'sum'],
    'amt_credit_sum_debt': ['max', 'mean', 'sum'],
    'amt_credit_sum_overdue': ['max', 'mean', 'sum'],
    'amt_annuity': ['mean'],
    'amt_credit_sum_debt':['mean'],
    'amt_credit_sum_overdue':['mean'],
    # 类别型特征
    'status_0': ['mean'],
    'status_1': ['mean'],
    'status_2': ['mean'],
    'status_C': ['mean'],
    'status_X': ['mean'],
    'credit_active_Active': ['mean'],
    'credit_active_Closed': ['mean'],
    'credit_active_Sold': ['mean'],
    'credit_type_Mortgage': ['mean'],
    'credit_type_Microloan': ['mean']
}
    #聚合特征
    df = group(df,prefix='bureau_',aggregations=bureau_agg)
    
    return df
   
    

### 衍生previous_applicaton表特征

In [11]:
def process_previous_application():
    df = read_table('previous_application.csv')
   
    #衍生一些比例特征以及差值特征
    df['application_credit_diff']= df['amt_application'] - df['amt_credit']
    df['application_to_credit_ratio']= df['amt_application']/df['amt_credit']
    df['credit_to_annuity_ratio']=df['amt_credit']/df['amt_annuity']
    
    group_col = ['name_client_type','name_contract_status','name_contract_type','name_cash_loan_purpose','code_reject_reason']
    
    df = cal_mean(df,group_col,'amt_annuity','group_annuity_mean')
    df = cal_median(df,group_col,'amt_annuity','group_annuity_std')
    df = cal_mean(df,group_col,'amt_credit','group_amt_credity_mean')
    df = cal_median(df,group_col,'amt_credit','group_amt_credit_std')
    df = cal_mean(df,group_col,'amt_application','group_amt_application_mean')
    df = cal_median(df,group_col,'amt_application','group_amt_application_std')
    
    
    #onehot编码
    df,categorical_cols = onehot_label_encoder(df)
    
    
    #将365243替换为nan
    df['days_first_drawing'].replace(365243, np.nan, inplace= True)
    df['days_first_due'].replace(365243, np.nan, inplace= True)
    df['days_last_due_1st_version'].replace(365243, np.nan, inplace= True)
    df['days_last_due'].replace(365243, np.nan, inplace= True)
    df['days_termination'].replace(365243, np.nan, inplace= True)
    
    prev_agg ={
    'sk_id_prev': ['nunique'],
    'amt_annuity': ['min', 'max', 'mean'],
    'amt_down_payment': ['max', 'mean'],
    'rate_down_payment': ['max', 'mean'],
    'days_decision': ['min', 'max', 'mean'],
    'cnt_payment': ['max', 'mean'],
    'days_termination': ['max'],
    'amt_application':['max','mean'],
    'amt_goods_price':['min','max'],
    'days_first_drawing':['max','mean'],
    'days_first_due':['min','mean'],
    'days_last_due':['max','mean'],
    'days_last_due_1st_version':['min','max','mean'],
    # 衍生的特征
    'credit_to_annuity_ratio': ['mean', 'max'],
    'application_credit_diff': ['min', 'max', 'mean'],
    'application_to_credit_ratio': ['min', 'max', 'mean'],
    'group_annuity_mean':['mean'],
    'group_annuity_std':['std'],
    'group_amt_credity_mean':['mean'],
    'group_amt_credit_std':['std'],
    'group_amt_application_mean':['mean'],
    'group_amt_application_std':['std'],
     #类别型特征
    'name_contract_type_Cash loans':['mean'],
    'name_contract_type_Consumer loans':['mean'],
    'name_contract_type_Revolving loans':['mean']
      
}
    
    df = group(df,prefix='prev_',aggregations=prev_agg)
    
    return df
    
    
    

### 衍生 pos_cash表特征

In [36]:
def process_pos_cash_balance():
    df = read_table('POS_CASH_balance.csv')
    df['late_payment']=df['sk_dpd'].map(lambda x:1 if x>0 else 0)
    df['sk_dpd_diff']=df['sk_dpd']-df['sk_dpd_def']
    df['instalment_diff']=df['cnt_instalment'] - df['cnt_instalment_future']
    
    df = cal_mean(df,['sk_id_curr'],'cnt_instalment','group_cnt_instalment_mean')
    df = cal_std(df,['sk_id_curr'],'cnt_instalment','group_cnt_instalment_std')
    df = cal_mean(df,['sk_id_curr'],'cnt_instalment_future','group_cnt_instalment_future_mean')
    df = cal_std(df,['sk_id_curr'],'cnt_instalment_future','group_cnt_instalment_future_std')
    
    
    pos_card_agg={
    'sk_id_prev': ['nunique'],
    'months_balance': ['min', 'max'],
    'sk_dpd': ['max', 'mean', 'sum', 'var'],
    'sk_dpd_def': ['max', 'mean', 'sum'],
    'late_payment': ['mean'],
    'instalment_diff':['mean','max','min'],
    'group_cnt_instalment_mean':['mean'],
    'group_cnt_instalment_std':['mean'],
    'group_cnt_instalment_future_mean':['mean'],
    'group_cnt_instalment_future_std':['mean']
} 
    
    df = group(df,prefix='pos_cash_',aggregations=pos_card_agg)
    return df
  

### 衍生 installments_payment表特征


In [13]:
def process_payment():
#     df = spark.sql('select * from qx_testing.home_credit_installments_payments').toPandas()
    df = read_table('installments_payments.csv')
    
    df['days_payment_diff'] = df['days_instalment']-df['days_entry_payment']
    
#     df['flag_pay_more'] = df.apply(lambda x:1 if (x['amt_payment']-x['amt_instalment'])>0 else 0 ,axis=1)

    df = cal_mean(df,['sk_id_curr'],'days_instalment','group_instalment_mean')
    df = cal_std(df,['sk_id_curr'],'days_instalment','group_instalment_std')
    df = cal_mean(df,['sk_id_curr'],'amt_instalment','group_amt_instalment_mean')
    df = cal_std(df,['sk_id_curr'],'amt_instalment','group_amt_instalment_std')
    
    paymet_agg={
    'sk_id_prev': ['nunique'],
    'days_payment_diff':['mean','max','min'],
    'flag_pay_more':['mean','min','max'],
    'group_instalment_mean':['mean'],
    'group_instalment_std':['mean'],
    'group_amt_instalment_mean':['mean'],
    'group_amt_instalment_std':['mean'],
    'num_instalment_version':['mean','max','min'],
    'num_instalment_number':['mean','max','min'],
    'amt_instalment':['mean','max','min'],
    'amt_payment':['mean','max','min'],
    'days_instalment':['mean','max','min'],
    'days_entry_payment':['mean','max','min']
    }
    df = group(df,'instalment_',aggregations=paymet_agg)
    
    return df


    

### 衍生 credit_card表特征

In [14]:
def process_credit_card():
#     df = spark.sql('select * from qx_testing.home_credit_credit_card_balance').toPandas()
    df = read_table('credit_card_balance.csv')
    df['limit_use'] = df['amt_balance']/df['amt_credit_limit_actual']
    df['payment_div_min'] =  df['amt_payment_current'] - df['amt_inst_min_regularity']
    df['late_payment'] = df['sk_dpd'].map(lambda x:1 if x>0 else 0)
    
    df['drawing_limit_ratio']= df['amt_drawings_atm_current']/df['amt_credit_limit_actual']
    
    credit_card_agg = {
    'months_balance': ['min'],
    'amt_balance': ['max'],
    'amt_credit_limit_actual': ['max'],
    'amt_drawings_atm_current': ['max', 'sum'],
    'amt_drawings_current': ['max', 'sum'],
    'amt_drawings_pos_current': ['max', 'sum'],
    'amt_inst_min_regularity': ['max', 'mean'],
    'amt_payment_total_current': ['max', 'mean', 'sum', 'var'],
    'amt_total_receivable': ['max', 'mean'],
    'cnt_drawings_atm_current': ['max', 'mean', 'sum'],
    'cnt_drawings_current': ['max', 'mean', 'sum'],
    'cnt_drawings_pos_current': ['mean'],
    'sk_dpd': ['mean', 'max', 'sum'],
    'sk_dpd_def': ['max', 'sum'],
    'limit_use': ['max', 'mean'],
    'payment_div_min': ['min', 'mean'],
    'late_payment': ['max', 'sum'],
    'drawing_limit_ratio':['mean','max','min']
}

    df = group(df,prefix='credit_card_',aggregations=credit_card_agg)
    return df


    

In [23]:
def main():
    with timer("application_train and application_test"):
        df = process_train_test()
        print("Application dataframe shape: ", df.shape)
    with timer("Bureau and bureau_balance data"):
        bureau_df = process_bureau()
        df = pd.merge(df, bureau_df, on='sk_id_curr', how='left')
        print("Bureau dataframe shape: ", bureau_df.shape)
        del bureau_df; gc.collect()
    with timer("previous_application"):
        prev_df = process_previous_application()
        df = pd.merge(df, prev_df, on='sk_id_curr', how='left')
        print("Previous dataframe shape: ", prev_df.shape)
        del prev_df; gc.collect()
    with timer("pos_cash_balance"):
        pos_cash = process_pos_cash_balance()
        df = pd.merge(df,pos_cash,on='sk_id_curr',how='left')
        print("POS dataframe shape:",pos_cash.shape)
        del pos_cash;gc.collect()
    with timer("installments_payment"):
        install_pay = process_payment()
        df = pd.merge(df,install_pay,on='sk_id_curr',how='left')
        print("Installment_payment shape :",install_pay.shape)
        del install_pay;gc.collect()
    with timer("credit_card "):
        credit_card = process_credit_card()
        df = pd.merge(df,credit_card,on='sk_id_curr',how='left')
        print("Credit_card shape:",credit_card.shape)
        del credit_card;gc.collect()
        
    with timer("Run LightGBM"):
        feat_importance = kfold_lightgbm_sklearn(df)
        print(feat_importance)
 
 

In [24]:
NUM_THREADS = 4
DATA_DIRECTORY = "../input/"
SUBMISSION_SUFIX = "_model_3_29"



# 模型参数以及超参
GENERATE_SUBMISSION_FILES = True
STRATIFIED_KFOLD = False
RANDOM_SEED = 737851
NUM_FOLDS = 10
EARLY_STOPPING = 100

LIGHTGBM_PARAMS = {
    'boosting_type': 'goss',
    'n_estimators': 10000,
        'learning_rate': 0.005134,
    'num_leaves': 54,
    'max_depth': 10,
    'subsample_for_bin': 240000,
    'reg_alpha': 0.436193,
    'reg_lambda': 0.479169,
    'colsample_bytree': 0.508716,
    'min_split_gain': 0.024766,
    'subsample': 1,
    'is_unbalance': False,
    'silent':-1,
    'verbose':-1
}

In [35]:
def kfold_lightgbm_sklearn(data, categorical_feature = None):
    df = data[data['target'].notnull()]
    test = data[data['target'].isnull()]
    print("Train/valid shape: {}, test shape: {}".format(df.shape, test.shape))
    del_features = ['target', 'sk_id_curr', 'sk_id_bureau', 'sk_id_prev']
    predictors = list(filter(lambda v: v not in del_features, df.columns))

    if not STRATIFIED_KFOLD:
        folds = KFold(n_splits= NUM_FOLDS, shuffle=True, random_state= RANDOM_SEED)
    else:
        folds = StratifiedKFold(n_splits= NUM_FOLDS, shuffle=True, random_state= RANDOM_SEED)

    oof_preds = np.zeros(df.shape[0])
    sub_preds = np.zeros(test.shape[0])
    importance_df = pd.DataFrame()
    eval_results = dict()

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df[predictors], df['target'])):
        train_x, train_y = df[predictors].iloc[train_idx], df['target'].iloc[train_idx]
        valid_x, valid_y = df[predictors].iloc[valid_idx], df['target'].iloc[valid_idx]

        params = {'random_state': RANDOM_SEED, 'nthread': NUM_THREADS}
        clf = LGBMClassifier(**{**params, **LIGHTGBM_PARAMS})
        if not categorical_feature:
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                    eval_metric='auc', verbose=400, early_stopping_rounds= EARLY_STOPPING)
        else:
            clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
                    eval_metric='auc', verbose=400, early_stopping_rounds=EARLY_STOPPING,
                    feature_name= list(df[predictors].columns), categorical_feature= categorical_feature)

        oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(test[predictors], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance = pd.DataFrame()
        fold_importance["feature"] = predictors
        fold_importance["gain"] = clf.booster_.feature_importance(importance_type='gain')
        fold_importance["split"] = clf.booster_.feature_importance(importance_type='split')
        importance_df = pd.concat([importance_df, fold_importance], axis=0)
        eval_results['train_{}'.format(n_fold+1)]  = clf.evals_result_['training']['auc']
        eval_results['valid_{}'.format(n_fold+1)] = clf.evals_result_['valid_1']['auc']

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(df['target'], oof_preds))
    test['target'] = sub_preds.copy()

    mean_importance = importance_df.groupby('feature').mean().reset_index()
    mean_importance.sort_values(by= 'gain', ascending=False, inplace=True)
    if GENERATE_SUBMISSION_FILES:

        oof = pd.DataFrame()
        oof['sk_id_curr'] = df['sk_id_curr'].copy()
        df['predictions'] = oof_preds.copy()
        df['target'] = df['target'].copy()
        df.to_csv('oof{}.csv'.format(SUBMISSION_SUFIX), index=False)
        test[['sk_id_curr', 'target']].to_csv('submission{}.csv'.format(SUBMISSION_SUFIX), index=False)
        mean_importance.to_csv('feature_importance{}.csv'.format(SUBMISSION_SUFIX), index=False)
    return mean_importance


In [34]:
kfold_lightgbm_sklearn(df)

Train/valid shape: (307506, 403), test shape: (48744, 403)
Training until validation scores don't improve for 100 rounds.
[400]	training's binary_logloss: 0.24001	training's auc: 0.79058	valid_1's binary_logloss: 0.246077	valid_1's auc: 0.762525
[800]	training's binary_logloss: 0.228868	training's auc: 0.816134	valid_1's binary_logloss: 0.240461	valid_1's auc: 0.775536
[1200]	training's binary_logloss: 0.221397	training's auc: 0.834419	valid_1's binary_logloss: 0.238095	valid_1's auc: 0.781605
[1600]	training's binary_logloss: 0.215374	training's auc: 0.849187	valid_1's binary_logloss: 0.236912	valid_1's auc: 0.784649
[2000]	training's binary_logloss: 0.210017	training's auc: 0.862378	valid_1's binary_logloss: 0.236273	valid_1's auc: 0.786405
[2400]	training's binary_logloss: 0.205163	training's auc: 0.87398	valid_1's binary_logloss: 0.235865	valid_1's auc: 0.787495
[2800]	training's binary_logloss: 0.200638	training's auc: 0.884489	valid_1's binary_logloss: 0.235604	valid_1's auc: 0.7

[2800]	training's binary_logloss: 0.20053	training's auc: 0.884434	valid_1's binary_logloss: 0.236005	valid_1's auc: 0.795783
[3200]	training's binary_logloss: 0.196255	training's auc: 0.893917	valid_1's binary_logloss: 0.235703	valid_1's auc: 0.796537
[3600]	training's binary_logloss: 0.192221	training's auc: 0.902636	valid_1's binary_logloss: 0.235532	valid_1's auc: 0.797011
[4000]	training's binary_logloss: 0.188388	training's auc: 0.910428	valid_1's binary_logloss: 0.235398	valid_1's auc: 0.797301
[4400]	training's binary_logloss: 0.18467	training's auc: 0.91775	valid_1's binary_logloss: 0.235293	valid_1's auc: 0.797477
Early stopping, best iteration is:
[4347]	training's binary_logloss: 0.185153	training's auc: 0.916826	valid_1's binary_logloss: 0.23528	valid_1's auc: 0.797539
Fold  6 AUC : 0.797539
Training until validation scores don't improve for 100 rounds.
[400]	training's binary_logloss: 0.240132	training's auc: 0.789863	valid_1's binary_logloss: 0.245636	valid_1's auc: 0.76

Unnamed: 0,feature,gain,split
114,ext_sources_mean,771262.361903,2858.8
115,ext_sources_min,276381.822786,2314.1
113,ext_sources_max,179075.527504,2006.4
111,ext_source_3,142918.488955,3011.6
89,credit_to_annuity_ratio,128053.027334,4033.4
90,credit_to_goods_ratio,90546.955924,2013.5
110,ext_source_2,73748.915337,2056.1
92,days_birth,64032.486006,2681.5
1,amt_annuity,60778.280872,2371.5
165,instalment_days_payment_diff_min,60723.477559,2004.2


In [41]:

def write_to_hive(df):
    train = df[df['target'].notnull()].astype(str)
    test = df[df['target'].isnull()].astype(str)
    print("-----正在创建train DataFrame----")
    t0 = time.time()
    hive_train = spark.createDataFrame(train)
    hive_train.write.saveAsTable(name='qx_testing.feature_train',mode='overwrite',partitionBy=None)
    print("---train 写入hive success！耗时 %.5f s"%(time.time()-t0))
    print('\n')
    print("-----正在创建train DataFrame----")
    t1 = time.time()
    hive_test = spark.createDataFrame(test)
    hive_test.write.saveAsTable(name='qx_testing.feature_test',mode='overwrite',partitionBy=None)
    print("---test 写入hive success！耗时 %.5f s"%(time.time()-t1))
    del train,test,hive_train,hive_test;gc.collect()