> 
## 导入包<a class="anchor" id="1"></a>

In [None]:
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import math
import pandas as pd 
import matplotlib.pyplot as plt

import seaborn as sns
import sklearn.metrics as metrics
%matplotlib inline
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

## 导入数据<a class="anchor" id="2"></a>

In [None]:
# Read both CSV files, using the first column of each as the row index.
df_train = pd.read_csv(
    "/kaggle/input/GiveMeSomeCredit/cs-training.csv",
    index_col=0,
)
df_test = pd.read_csv(
    "/kaggle/input/GiveMeSomeCredit/cs-test.csv",
    index_col=0,
)
# Preview the first rows of the training frame.
df_train.head()

## EDA <a class="anchor" id="3"></a>

详见 https://www.kaggle.com/orange90/credit-scorecard-example

## 一、原数据标签1比例

In [None]:
# Proportion of label 1: the mean of the SeriousDlqin2yrs column
# (equals the share of positives assuming the label is coded 0/1).
df_train.loc[:, 'SeriousDlqin2yrs'].mean()

In [None]:
# Total count of label 1: the sum of the SeriousDlqin2yrs column
# (equals the number of positives assuming the label is coded 0/1).
df_train.loc[:, 'SeriousDlqin2yrs'].sum()

In [None]:
import xgboost as xgb

留20%作为模型的验证集

In [None]:
# Hold out 20% of the rows (test_size=0.2) for validation; the seed is fixed
# so the split is reproducible. Features are every column except the label.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['SeriousDlqin2yrs']),
    df_train['SeriousDlqin2yrs'],
    test_size=0.2,
    random_state=42,
)

因为比赛以 AUC 作为评价标准，所以这里也看一下模型的 AUC。先写一个绘图函数，后面会用到。

In [None]:
def plot_AUC(model, X_test, y_test):
    """Draw the ROC curve of a fitted classifier and annotate it with its AUC.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba``.
    X_test : array-like
        Features passed to ``model.predict_proba``.
    y_test : array-like
        True labels passed to ``metrics.roc_curve``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Column 1 of predict_proba is used as the ranking score
    # (presumably the probability of class 1 - verify against model.classes_).
    scores = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(y_test, scores)
    area = metrics.auc(false_pos_rate, true_pos_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')
    # Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

### 建模 <a class="anchor" id="6"></a>

#### model 1: use default parameters

In [None]:

# Model 1: AdaBoost with default hyper-parameters (only the seed is set).
# Missing feature values are replaced by 0 before fitting.
model1 = AdaBoostClassifier(random_state=42)
model1.fit(X=X_train.fillna(0), y=y_train)

In [None]:
# ROC curve and AUC of model 1 on the hold-out split (fpr/tpr over all thresholds).
plot_AUC(model=model1, X_test=X_test.fillna(0), y_test=y_test)

In [None]:
# Confusion matrix of model 1 on the hold-out split.
# NOTE(review): plot_confusion_matrix is deprecated in newer scikit-learn
# releases in favour of ConfusionMatrixDisplay.from_estimator - confirm against
# the installed version before upgrading.
disp = plot_confusion_matrix(
    model1,
    X_test.fillna(0),
    y_test,
    values_format='',
    cmap=plt.cm.Blues,
    display_labels=[0, 1],
)
disp.ax_.set_title('confusion matrix')

#### model 2: use SMOTE

In [None]:

from imblearn.over_sampling import SMOTE

# Resample the training split only (the hold-out split is left untouched),
# then print the mean of the resampled labels.
sm = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = sm.fit_resample(
    X_train.fillna(0),
    y_train,
)
print('bad rate is: ', y_train_balanced.mean())

In [None]:
# Model 2: the same AdaBoost configuration, fitted on the SMOTE-resampled data.
model2 = AdaBoostClassifier(random_state=42)
model2.fit(X=X_train_balanced, y=y_train_balanced)

In [None]:
# ROC curve and AUC of model 2 on the original (non-resampled) hold-out split.
plot_AUC(model=model2, X_test=X_test.fillna(0), y_test=y_test)

In [None]:
# Confusion matrix of model 2 on the hold-out split
y_pred = model2.predict(X_test.fillna(0))
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

#### model 3: easy ensemble

In [None]:
from imblearn.ensemble import EasyEnsembleClassifier

In [None]:
# Model 3: EasyEnsemble with 20 estimators, each built from an AdaBoost learner.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator` - confirm against the installed version.
model3 = EasyEnsembleClassifier(
    base_estimator=AdaBoostClassifier(random_state=42),
    n_estimators=20,
    random_state=42,
)
model3.fit(X=X_train.fillna(0), y=y_train)

In [None]:
# ROC curve and AUC of model 3 on the hold-out split.
plot_AUC(model=model3, X_test=X_test.fillna(0), y_test=y_test)

In [None]:
# Confusion matrix of model 3 on the hold-out split.
disp = plot_confusion_matrix(
    model3,
    X_test.fillna(0),
    y_test,
    cmap=plt.cm.Blues,
    values_format='',
    display_labels=[0, 1],
)
disp.ax_.set_title('confusion matrix')

## 二、模拟一个标签极度不平衡的情况

In [None]:
# Simulate an extremely imbalanced setting: roughly 0.35% of the samples are labelled 1, the rest 0

In [None]:
# Keep a seeded random sample of 5% of the label-1 rows plus every label-0 row.
total_1 = int(df_train['SeriousDlqin2yrs'].sum() * 0.05)
df_train_extreme = pd.concat([
    df_train.loc[df_train['SeriousDlqin2yrs'] == 1].sample(n=total_1, random_state=42),
    df_train.loc[df_train['SeriousDlqin2yrs'] == 0],
])

In [None]:
# Same 80/20 seeded split as before, applied to the extremely imbalanced frame.
X_train_ex, X_test_ex, y_train_ex, y_test_ex = train_test_split(
    df_train_extreme.drop(columns=['SeriousDlqin2yrs']),
    df_train_extreme['SeriousDlqin2yrs'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Check that the label means of the training and hold-out splits are close
print(y_train_ex.mean(), y_test_ex.mean())

In [None]:
# Model A: default AdaBoost fitted on the extremely imbalanced training split.
model_a = AdaBoostClassifier(random_state=42)
model_a.fit(X=X_train_ex.fillna(0), y=y_train_ex)

In [None]:
# ROC curve and AUC of model A on the extremely imbalanced hold-out split.
plot_AUC(model=model_a, X_test=X_test_ex.fillna(0), y_test=y_test_ex)

In [None]:
# Confusion-matrix plot of model A on the extremely imbalanced hold-out split.
disp = plot_confusion_matrix(
    model_a,
    X_test_ex.fillna(0),
    y_test_ex,
    cmap=plt.cm.Blues,
    display_labels=[0, 1],
)
disp.ax_.set_title('confusion matrix')

In [None]:
# Raw confusion-matrix counts for model A (rebinds y_pred from the earlier cell).
y_pred = model_a.predict(X_test_ex.fillna(0))
metrics.confusion_matrix(y_true=y_test_ex, y_pred=y_pred)

In [None]:
# Model B: EasyEnsemble (20 estimators built from AdaBoost learners) fitted on
# the extremely imbalanced training split.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator` - confirm against the installed version.
model_b = EasyEnsembleClassifier(
    base_estimator=AdaBoostClassifier(random_state=42),
    n_estimators=20,
    random_state=42,
)
model_b.fit(X=X_train_ex.fillna(0), y=y_train_ex)

In [None]:
# ROC curve and AUC of model B on the extremely imbalanced hold-out split.
plot_AUC(model=model_b, X_test=X_test_ex.fillna(0), y_test=y_test_ex)

In [None]:
# Confusion-matrix plot of model B on the extremely imbalanced hold-out split.
disp = plot_confusion_matrix(
    model_b,
    X_test_ex.fillna(0),
    y_test_ex,
    cmap=plt.cm.Blues,
    display_labels=[0, 1],
)
disp.ax_.set_title('confusion matrix')

In [None]:
# Raw confusion-matrix counts for model B (rebinds y_pred from the earlier cell).
y_pred = model_b.predict(X_test_ex.fillna(0))
metrics.confusion_matrix(y_true=y_test_ex, y_pred=y_pred)

Reference:
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.309.1465&rep=rep1&type=pdf