> 
## 导入包<a class="anchor" id="1"></a>

In [None]:
import numpy as np 
from sklearn.model_selection import train_test_split
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2, so this
# import fails on newer releases — confirm the pinned scikit-learn version.
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import math
import pandas as pd 
import matplotlib.pyplot as plt
import lightgbm as lgb
import seaborn as sns
import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import f1_score
import os
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import KFold,StratifiedKFold
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Shared seed handed to the CV splitter, the train/test split and the models.
random_seed=1024

In [None]:
def get_threshold(y,cv_pred):
    """Grid-search the score cut-off that maximises macro-averaged F1.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A sample is labelled 1
    when its score is strictly greater than the threshold and 0 otherwise.

    Parameters
    ----------
    y : array-like
        True labels; passed to ``f1_score`` unchanged.
    cv_pred : array-like of float
        Predicted scores, one per entry of ``y``.

    Returns
    -------
    tuple
        ``(best_threshold, best_score)``: the first (lowest) threshold that
        attains the highest macro F1, and that F1 value.
    """
    scores = np.asarray(cv_pred)
    # Start below any attainable F1 so the first candidate is always accepted;
    # this keeps best_threshold bound even when every candidate scores 0.
    best_threshold, best_score = 0.0, -1.0
    for step in range(0, 1000, 10):
        threshold = step / 1000
        # A strict comparison can only yield labels 0 and 1. Rounding
        # `score - threshold + 0.5` instead produces the label 2 whenever
        # score - threshold == 1.0 (e.g. a score of 1.0 at threshold 0).
        pred = (scores > threshold).astype(int)
        score = f1_score(y, pred, average='macro')
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold, best_score

In [None]:
def cv_lgb(X_train,y_train,X_test,y_test,is_unbalance=False,n_splits=5):
    """Cross-validate a LightGBM classifier and print test-set metrics.

    For each stratified fold a fresh ``LGBMClassifier`` is fitted on the
    training part with the held-out part as ``eval_set``. Out-of-fold
    positive-class probabilities are collected for the training rows, and the
    per-fold test probabilities are averaged. The function then prints the
    test AUC and KS (``max(tpr - fpr)``), the best macro-F1 threshold found on
    the out-of-fold predictions, and the test macro F1 at that threshold.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; indexed positionally with ``.iloc``,
        so pandas objects are expected.
    X_test, y_test
        Hold-out features and labels used only for the final metrics.
    is_unbalance : bool, default False
        Forwarded to ``LGBMClassifier(is_unbalance=...)``.
    n_splits : int, default 5
        Number of stratified folds; also the divisor used when averaging the
        per-fold test predictions.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    test_pred = np.zeros((X_test.shape[0],))
    cv_pred = np.zeros((X_train.shape[0],))
    skf = StratifiedKFold(n_splits=n_splits, random_state=random_seed, shuffle=True)
    for train_index, val_index in skf.split(X_train, y_train):
        train_x, val_x = X_train.iloc[train_index], X_train.iloc[val_index]
        train_y, val_y = y_train.iloc[train_index], y_train.iloc[val_index]
        model = lgb.LGBMClassifier(
            n_estimators=5000,
            max_depth=4,
            boosting_type="gbdt",
            #subsample=0.7,
            colsample_bytree=0.7,
            learning_rate=0.01,
            random_state=random_seed,
            bagging_seed=random_seed,
            feature_fraction_seed=random_seed,
            early_stopping_rounds=100,
            metric='auc',
            is_unbalance=is_unbalance,
        )
        # NOTE(review): LightGBM 4.0 removed the `verbose` argument of fit();
        # confirm the installed version still accepts it.
        clf = model.fit(train_x, train_y, eval_set=(val_x, val_y), verbose=0)
        cv_pred[val_index] = clf.predict_proba(val_x)[:,1]
        # Divide by the actual fold count so the average stays correct when
        # n_splits is changed.
        test_pred += clf.predict_proba(X_test)[:,1] / n_splits
    auc=roc_auc_score(y_test,test_pred)
    print('auc : ',round(auc,4))
    fpr,tpr,thresholds= roc_curve(y_test,test_pred)
    print('ks : ',round(max(tpr-fpr),4))
    threshold,best_score=get_threshold(y_train,cv_pred)
    print('best cv f1',round(best_score,4))
    print('best threshold',threshold)
    # Strict comparison keeps labels in {0, 1}; rounding
    # `test_pred - threshold + 0.5` would emit the label 2 when the
    # difference is exactly 1.0.
    test_pred_label=(test_pred > threshold).astype(int)
    score=f1_score(y_test,test_pred_label,average='macro')
    print('best test f1',round(score,4))
    return 

In [None]:
def load_kaggle_data():
    """Load the Give Me Some Credit training CSV and split it 80/20.

    The file is read with its first column as the index, missing values are
    replaced with 0, the rows are shuffled, and the frame is split into
    train and test parts with ``SeriousDlqin2yrs`` as the target. The shape,
    overall target mean and per-split target means are printed.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Feature frames and target series as returned by ``train_test_split``.
    """
    path='../input/GiveMeSomeCredit/'
    df_train = pd.read_csv(path+"/cs-training.csv",index_col=0)
    df_train = df_train.fillna(0)
    # Seed the shuffle: train_test_split below is seeded, but it only gives a
    # reproducible split if the row order it receives is reproducible too.
    df_train = df_train.sample(frac=1, random_state=random_seed)
    print(df_train.shape,df_train['SeriousDlqin2yrs'].mean())
    #train test split
    X_train, X_test, y_train, y_test = train_test_split(
        df_train.drop(['SeriousDlqin2yrs'],axis=1),
        df_train['SeriousDlqin2yrs'],
        test_size=0.2,
        random_state=random_seed,
    )
    print('train bad rate : ',y_train.mean(),'test bad rate : ',y_test.mean())
    return X_train, X_test, y_train, y_test

In [None]:
# Load the data once; every experiment cell below reuses this train/test split.
X_train, X_test, y_train, y_test=load_kaggle_data()

In [None]:
# Base model: cv_lgb on the original (non-resampled) training split with
# is_unbalance left at its default of False.
print('base model')
cv_lgb(X_train,y_train,X_test,y_test)

In [None]:
# Base model with is_unbalance=True: same data as the base model, but the
# flag is forwarded to LGBMClassifier inside cv_lgb.
print('base model is_unbalance')
cv_lgb(X_train,y_train,X_test,y_test,is_unbalance=True)

In [None]:
# Over sampling: resample the training split with RandomOverSampler, print the
# resulting class counts, then run cv_lgb on the resampled data. The test
# split is passed through unchanged.
# NOTE(review): the CV folds inside cv_lgb are drawn from the already
# resampled data, so copies of the same minority row can presumably land in
# both the fitting and validation parts of a fold — confirm whether the
# printed CV F1 / threshold are optimistic because of this.
print('over sampling')
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_train_balanced, y_train_balanced = ros.fit_resample(X_train, y_train)
print(y_train_balanced.value_counts())
cv_lgb(X_train_balanced, y_train_balanced,X_test,y_test)

In [None]:
# Under sampling: resample the training split with RandomUnderSampler, print
# the resulting class counts, then run cv_lgb on the resampled data. The test
# split is passed through unchanged.
# NOTE(review): the threshold chosen inside cv_lgb is tuned on the resampled
# class balance but applied to the original-balance test split — confirm this
# is intended.
print('under sampling')
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_train_balanced, y_train_balanced = rus.fit_resample(X_train, y_train)
print(y_train_balanced.value_counts())
cv_lgb(X_train_balanced, y_train_balanced,X_test,y_test)

In [None]:
# EasyEnsemble: fit an EasyEnsembleClassifier with 20 estimators, each wrapping
# an LGBMClassifier that only has random_state set, on the original training
# split. Then print AUC and KS (max of tpr - fpr) on the test split.
# This cell does not go through cv_lgb, so no F1 / threshold is reported.
# NOTE(review): newer imbalanced-learn releases renamed the `base_estimator`
# keyword to `estimator` — confirm against the installed version.
print('EasyEnsemble')
model5 = EasyEnsembleClassifier(n_estimators=20, random_state=random_seed, base_estimator=lgb.LGBMClassifier(random_state=random_seed))
model5.fit(X_train,y_train)
# Positive-class probability for each test row.
y_hat = model5.predict_proba(X_test)[:,1]
auc=roc_auc_score(y_test,y_hat)
print('auc : ',round(auc,4))
fpr,tpr,thresholds= roc_curve(y_test,y_hat)
print('ks : ',round(max(tpr-fpr),4))

In [None]:
# SMOTE: resample the training split with SMOTE (seeded with 42 rather than
# random_seed), then run cv_lgb on the resampled data. The test split is
# passed through unchanged.
print('SMOTE')
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = sm.fit_resample(X_train, y_train)
# NOTE(review): model6 is constructed but not used in this cell; cv_lgb builds
# its own classifiers, and is called here with is_unbalance at its default
# of False.
model6 = lgb.LGBMClassifier(random_state=random_seed,is_unbalance=True)
cv_lgb(X_train_balanced, y_train_balanced,X_test,y_test)