In [1]:
import pandas as pd
import os
import gc
import math
import numpy as np
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn import datasets
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions

In [2]:
train = pd.read_csv('../train.csv')
testA = pd.read_csv('../testA.csv')
data = pd.concat([train, testA], axis=0, ignore_index=True)

数据预处理

In [3]:
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [4]:
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

In [5]:
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

In [6]:
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]

训练/测试集准备

In [7]:
features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]

train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]

y_train = train['isDefault']

建模

In [8]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [9]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat") 
    return cat_train, cat_test

In [10]:
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.742898	valid_1's auc: 0.730406
[400]	training's auc: 0.755553	valid_1's auc: 0.731185
[600]	training's auc: 0.766567	valid_1's auc: 0.731421
[800]	training's auc: 0.77656	valid_1's auc: 0.731297
Early stopping, best iteration is:
[658]	training's auc: 0.769561	valid_1's auc: 0.731571
[0.7315707699391983]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.743889	valid_1's auc: 0.726598
[400]	training's auc: 0.756346	valid_1's auc: 0.727829
[600]	training's auc: 0.767237	valid_1's auc: 0.728122
[800]	training's auc: 0.777257	valid_1's auc: 0.728164
Early stopping, best iteration is:
[700]	training's auc: 0.772432	valid_1's auc: 0.728318
[0.7315707699391983, 0.7283181812019169]
************************************ 3 ****

In [11]:
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69097	eval-auc:0.69212
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.72765	eval-auc:0.72402
[400]	train-auc:0.73530	eval-auc:0.72799
[600]	train-auc:0.74057	eval-auc:0.72988
[800]	train-auc:0.74497	eval-auc:0.73100
[1000]	train-auc:0.74882	eval-auc:0.73170
[1200]	train-auc:0.75236	eval-auc:0.73223
[1400]	train-auc:0.75562	eval-auc:0.73264
[1600]	train-auc:0.75882	eval-auc:0.73296
[1800]	train-auc:0.76187	eval-auc:0.73318
[2000]	train-auc:0.76487	eval-auc:0.73338
[2200]	train-auc:0.76766	eval-auc:0.73348
[24

In [12]:
cat_train, cat_test = cat_model(x_train, y_train, x_test)

************************************ 1 ************************************
0:	learn: 0.3985252	test: 0.3966187	best: 0.3966187 (0)	total: 273ms	remaining: 1h 30m 51s
500:	learn: 0.3771946	test: 0.3759285	best: 0.3759285 (500)	total: 24.5s	remaining: 15m 54s
1000:	learn: 0.3756449	test: 0.3751634	best: 0.3751634 (1000)	total: 48.2s	remaining: 15m 14s
1500:	learn: 0.3745709	test: 0.3748276	best: 0.3748276 (1500)	total: 1m 11s	remaining: 14m 43s
2000:	learn: 0.3736588	test: 0.3746263	best: 0.3746258 (1998)	total: 1m 36s	remaining: 14m 25s
2500:	learn: 0.3728292	test: 0.3744849	best: 0.3744849 (2500)	total: 2m	remaining: 13m 59s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3744018679
bestIteration = 2905

Shrink model to first 2906 iterations.
[0.7327200609336475]
************************************ 2 ************************************
0:	learn: 0.3979537	test: 0.3988945	best: 0.3988945 (0)	total: 54.4ms	remaining: 18m 7s
500:	learn: 0.3764995	test: 0.3787237	bes

In [None]:
lgb_score_mean: 0.7311197790922761
lgb_score_std: 0.001507802995682687
xgb_score_mean: 0.7328694823569253
xgb_score_std: 0.00159958725807308
cat_score_mean: 0.7318428632974363
cat_score_std: 0.0018918585561348224

In [13]:
rh_test = 0.3*lgb_test+0.4*xgb_test+0.3*cat_test

In [14]:
testA['isDefault'] = rh_test

In [15]:
testA[['id','isDefault']].to_csv('test_sub.csv', index=False)

stacking

In [14]:
clf1 = lgb.LGBMClassifier()
clf2 = xgb.XGBClassifier()
clf3 = CatBoostRegressor
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], 
                          meta_classifier=lr)

In [15]:
label = ['LGBM', 'XGBoost', 'CatBoost', 'Stacking Classifier']
clf_list = [clf1, clf2, clf3, sclf]

In [18]:
clf_cv_mean = []
clf_cv_std = []
for clf, label in zip(clf_list, label):
    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))
    clf_cv_mean.append(scores.mean())
    clf_cv_std.append(scores.std())
        
    clf.fit(x_train, y_train)

Accuracy: 0.81 (+/- 0.00) [L]
Accuracy: 0.74 (+/- 0.11) [G]


TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.

blending

In [59]:
clfs = [lgb.LGBMClassifier(),
        xgb.XGBClassifier(),
        GradientBoostingClassifier()]

In [60]:
x_train_split, x_val, y_train_split, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=914)

In [61]:
x_d1, x_d2, y_d1, y_d2 = train_test_split(x_train_split, y_train_split, test_size=0.5, random_state=914)
dataset_d1 = np.zeros((x_d2.shape[0], len(clfs)))
dataset_d2 = np.zeros((x_val.shape[0], len(clfs)))

In [62]:
for j, clf in enumerate(clfs):
    #依次训练各个单模型
    clf.fit(x_d1, y_d1)
    y_submission = clf.predict_proba(x_d2)[:, 1]
    dataset_d1[:, j] = y_submission
    #对于测试集，直接用这k个模型的预测值作为新的特征。
    dataset_d2[:, j] = clf.predict_proba(x_val)[:, 1]
    print("val auc Score: %f" % roc_auc_score(y_val, dataset_d2[:, j]))

val auc Score: 0.724985
val auc Score: 0.723947


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [63]:
data.isnull()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,employmentLength,annualIncome,issueDate,isDefault,dti,...,regionCode_47,regionCode_48,regionCode_49,regionCode_50,employmentTitle_cnts,employmentTitle_rank,postCode_cnts,postCode_rank,title_cnts,title_rank
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
999996,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
999997,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
999998,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
