In [15]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
# Silence every warning for the rest of the session, but only when the
# interpreter was started without explicit warning options (sys.warnoptions is empty).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

## 获取数据

In [8]:
from pathlib import Path

# Every CSV is read from ./input relative to the current working directory.
INPUT_DIR = Path.cwd() / 'input'

train = pd.read_csv(INPUT_DIR / 'application_train.csv')
# Fixed: this frame used to be read from application_train.csv, which made
# `test` a second copy of the training data instead of the test set.
test = pd.read_csv(INPUT_DIR / 'application_test.csv')
bureau = pd.read_csv(INPUT_DIR / 'bureau.csv')
bureau_balance = pd.read_csv(INPUT_DIR / 'bureau_balance.csv')


### 构造训练数据

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows (fixed seed) and give both parts a fresh
# 0..n-1 index so later positional assignments line up.
train_set, test_set = (
    part.reset_index(drop=True)
    for part in train_test_split(train, test_size=0.1, random_state=42)
)

## 数据预处理

### 构建预处理pipeline

**1. 离散特征**
- DataFrameSelector
- 处理缺失值
    - NAME_TYPE_SUITE: 用众数填充
    - OCCUPATION_TYPE, WALLSMATERIAL_MODE: 按比例分配
    - FONDKAPREMONT_MODE, HOUSETYPE_MODE, EMERGENCYSTATE_MODE: 删掉
- 编码
    - 存在高基数（取值种类很多）的特征
    - 先用直接编码（将每个取值映射为整数）的方式
    - 可以考虑更复杂的编码方式：例如基数较低的特征使用One-Hot，基数较高的特征使用LabelEncoder，或者先把低频取值归并为一类（如other），然后再做进一步处理

**2. 数值特征**
- DataFrameSelector
- 处理缺失值
    - 缺失率大于30%的删除
    - 其他选择合适的填充方式: median, mean, most_frequent
- 标准化（Optional）

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder, Imputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps a dtype-based subset of a DataFrame's columns.

    Parameters
    ----------
    include : list or None
        Dtypes to keep; forwarded to ``DataFrame.select_dtypes(include=...)``.
        When truthy it takes precedence over ``exclude``.
    exclude : list or None
        Dtypes to discard; forwarded to ``DataFrame.select_dtypes(exclude=...)``.
        Only consulted when ``include`` is falsy.
    """

    def __init__(self, include=None, exclude=None):
        TransformerMixin.__init__(self)
        self.exclude = exclude
        self.include = include

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the dtype-filtered frame, or ``X`` itself when no filter is set."""
        if not self.include and not self.exclude:
            return X
        if self.include:
            return X.select_dtypes(include=self.include)
        return X.select_dtypes(exclude=self.exclude)


class MyLabelEncoder(TransformerMixin):
    """Adapter exposing a ``LabelEncoder`` through ``fit(X, y)`` / ``transform(X, y)``.

    All constructor arguments are forwarded to ``LabelEncoder``; the wrapped
    instance is available as ``self.encoder``.
    """

    def __init__(self, *args, **kwargs):
        TransformerMixin.__init__(self)
        self.encoder = LabelEncoder(*args, **kwargs)

    def fit(self, X, y=None):
        """Fit the wrapped encoder on ``X`` and mirror its ``classes_`` here."""
        fitted = self.encoder.fit(X)  # LabelEncoder.fit returns the encoder itself
        self.classes_ = fitted.classes_
        return self

    def transform(self, X, y=None):
        """Encode ``X`` with the wrapped encoder; ``y`` is ignored."""
        encoded = self.encoder.transform(X)
        return encoded


class CatLabelEncoder(TransformerMixin):
    """Integer-encode every column of a DataFrame, one mapping per column.

    After ``fit``, ``classes_`` maps each column name to a dict
    ``{value: code}`` where codes follow the sorted order of the values seen
    during fitting.

    Notes
    -----
    * Values not seen during ``fit`` become NaN in ``transform`` because
      ``Series.map`` yields NaN for keys missing from the mapping.
    * A column absent during ``fit`` raises ``KeyError`` in ``transform``.
    """

    def __init__(self, *args, **kwargs):
        TransformerMixin.__init__(self)
        # Kept so the attribute exists before the first fit, as before.
        self.classes_ = {}

    def fit(self, X, y=None):
        """Learn the value -> code mapping of every column in ``X``."""
        # Start from a clean mapping so a refit does not keep columns that
        # belonged to a previously fitted frame.
        self.classes_ = {}
        for col in list(X):
            feat_vals = np.sort(X[col].unique())
            self.classes_[col] = {
                val: idx for idx, val in enumerate(feat_vals)
            }
        return self

    def transform(self, X, y=None):
        """Return an encoded copy of ``X``; the caller's frame is left untouched."""
        encoded = X.copy()
        for col in encoded:
            encoded[col] = encoded[col].map(self.classes_[col])
        return encoded


def get_by_ratio(series, size, random_state=1):
    """Draw ``size`` labels from ``series.index``, weighted by the series values.

    Each label is picked with probability proportional to its value, e.g.
    the counts returned by ``Series.value_counts()``.

    Parameters
    ----------
    series : pd.Series
        Non-negative integer weights indexed by the candidate labels.
    size : int
        Number of labels to draw. ``0`` returns an empty array.
    random_state : int, default 1
        Seed of the private generator. The default reproduces the draws the
        function produced when it seeded the global NumPy generator with 1.

    Returns
    -------
    np.ndarray
        The drawn labels. A plain array is returned so that assigning it into
        a Series or DataFrame is not affected by index alignment.

    Raises
    ------
    ValueError
        If ``size`` is positive but the weights do not sum to a positive value.
    """
    labels = np.asarray(series.index)
    if size == 0:
        # The previous implementation crashed here because apply_along_axis
        # refuses to iterate over an empty axis.
        return labels[:0]

    total = series.sum()
    if total <= 0:
        raise ValueError("cannot sample labels: the weights sum to %r" % (total,))

    # A private generator keeps the global NumPy random state untouched.
    rng = np.random.RandomState(random_state)
    dice = rng.randint(0, total, size=size)

    # A draw d selects the first label whose cumulative weight exceeds d,
    # i.e. the number of cumulative sums that are <= d.
    positions = np.searchsorted(series.cumsum().values, dice, side='right')
    return labels[positions]


class CatImputer(BaseEstimator, TransformerMixin):
    """Fill or drop missing values in categorical (object) columns.

    Parameters
    ----------
    allocate_by_mode : list of str
        Columns whose missing values are replaced by the most frequent value.
    allocate_by_ratio : list of str
        Columns whose missing values are drawn at random in proportion to
        the frequencies of the observed values (see ``get_by_ratio``).
    todrop : list of str
        Columns removed from the output.

    Notes
    -----
    ``fit`` learns nothing: every statistic is computed from the frame passed
    to ``transform``. A configured column that is absent from that frame is
    reported with ``print`` and skipped.
    """

    def __init__(self, allocate_by_mode, allocate_by_ratio, todrop):
        TransformerMixin.__init__(self)
        self.allocate_by_mode = allocate_by_mode
        self.allocate_by_ratio = allocate_by_ratio
        self.todrop = todrop

    def fit(self, X, y=None):
        """No-op; present for pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return an imputed copy of ``X``; the caller's frame is not modified."""
        X = X.copy()

        if 'NAME_FAMILY_STATUS' in X.columns:
            # Treat the literal category 'Unknown' as missing and replace it
            # with the most frequent family status.
            col = 'NAME_FAMILY_STATUS'
            max_name_family = X[col].value_counts().sort_values().index[-1]
            X.loc[X[col] == 'Unknown', col] = max_name_family

        for col in self.allocate_by_mode:
            if col in X.columns:
                X[col] = X[col].fillna(X[col].value_counts().sort_values().index[-1])
            else:
                # Fixed: the column name is now actually interpolated.
                print("%s not in X's columns, impute nothing." % col)

        for col in self.allocate_by_ratio:
            if col in X.columns:
                missing = X[col].isnull()
                n_missing = missing.sum()
                # Skip complete columns: there is nothing to draw for them.
                if n_missing > 0:
                    series = X[col].value_counts()
                    X.loc[missing, col] = get_by_ratio(series, n_missing)
            else:
                print("%s not in X's columns, impute nothing." % col)

        for col in self.todrop:
            if col in X.columns:
                X = X.drop(col, axis=1)
            else:
                print("%s not in X's columns, drop nothing." % col)
        return X


# Numeric branch: keep every non-object column, then apply median imputation.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(exclude=['object'])),
    ('imputer', Imputer(strategy='median')),
])

# Categorical branch: keep object columns, impute/drop them, then integer-encode.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(include=['object'])),
    (
        'imputer',
        CatImputer(
            allocate_by_mode=['NAME_TYPE_SUITE'],
            allocate_by_ratio=['OCCUPATION_TYPE', 'WALLSMATERIAL_MODE'],
            todrop=['FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'EMERGENCYSTATE_MODE'],
        ),
    ),
    ('label_encoder', CatLabelEncoder()),
])

# Combine both branches: their outputs are concatenated side by side.
pre_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
])

## 建模

In [3]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [10]:
from sklearn.linear_model import LogisticRegression

# Full model: the shared preprocessing union followed by a logistic
# regression with default settings.
pipe = Pipeline(steps=[
    ('preprocess', pre_pipeline),
    ('classifier', LogisticRegression()),
])

In [11]:
# Preview the first rows of the training split.
train_set.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,252724,0,Cash loans,F,N,Y,2,135000.0,1078200.0,31653.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,372834,0,Cash loans,M,N,Y,2,337500.0,1288350.0,37800.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
2,326336,1,Cash loans,F,Y,N,0,135000.0,521280.0,28408.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
3,382390,0,Cash loans,F,N,Y,0,144000.0,808650.0,31333.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
4,330511,0,Cash loans,F,N,Y,1,112500.0,942300.0,27679.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0


In [12]:
from sklearn.model_selection import train_test_split

# Separate the label from the features (the ID column is dropped too), then
# hold out 10% of the rows with a fixed seed.
target = train_set['TARGET']
X = train_set.drop(labels=['SK_ID_CURR', 'TARGET'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.1,
    random_state=2,
)

In [16]:
# Fit preprocessing + classifier on the training part and score the held-out part.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
# accuracy_score expects (y_true, y_pred); the arguments were swapped before.
# NOTE(review): accuracy can look high when the target is imbalanced -
# confirm the class balance and read this number together with the AUC.
accuracy_score(y_test, y_pred)

0.9194247723659489

In [17]:
# ROC AUC from the predicted probability of the positive class (column 1),
# on the training part and on the held-out part.
train_auc = roc_auc_score(y_train, pipe.predict_proba(X_train)[:, 1])
valid_auc = roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1])
print("训练auc", train_auc)
print("验证auc", valid_auc)

训练auc 0.623786436442069
验证auc 0.6176571048723949
