# 二、特征工程

In [253]:
## 1. 导入数据

import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/testA.csv')

print(train_data.shape)
print(test_data.shape)

(800000, 47)
(200000, 46)


## 2. 特征值预处理

### 2.1 缺失值处理

首先查看缺失值特征值有哪些。

In [254]:
# 区分出数值特征和对象特征
numeric_features = list(train_data.select_dtypes(exclude='object').columns)
object_features = list(filter(lambda x: x not in numeric_features, list(train_data.columns)))
numeric_features.remove('id')
numeric_features.remove('isDefault')
print(numeric_features)
print(object_features)

['loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']


In [255]:
print("train_data 缺失值统计：\r\n")
print(train_data.isnull().sum())
print("test_data 缺失值统计：\r\n")
print(test_data.isnull().sum())

train_data 缺失值统计：

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n

In [256]:
# employmentTitle, postCode, title只有一条空数据，相对于80w来说，可以忽略不计。我们直接删除这三行数据即可。
train_data = train_data.drop(index=(train_data.loc[train_data['employmentTitle'].isnull()].index))
print("train_data employmentTitle is null:", train_data['employmentTitle'].isnull().sum())
train_data = train_data.drop(index=(train_data.loc[train_data['postCode'].isnull()].index))
print("train_data postCode is null:", train_data['postCode'].isnull().sum())
train_data = train_data.drop(index=(train_data.loc[train_data['title'].isnull()].index))
print("train_data title is null:", train_data['title'].isnull().sum())

train_data employmentTitle is null: 0
train_data postCode is null: 0
train_data title is null: 0


In [257]:
# 数值类型填充中位数
train_data[numeric_features] = train_data[numeric_features].fillna(train_data[numeric_features].median())
test_data[numeric_features] = test_data[numeric_features].fillna(test_data[numeric_features].median())
# 对象类型填充众数
train_data[object_features] = train_data[object_features].fillna(train_data[object_features].mode())
test_data[object_features] = test_data[object_features].fillna(test_data[object_features].mode())
# 重新检查缺失值
print("train_data 缺失值统计：\r\n")
print(train_data.isnull().sum())
print("test_data 缺失值统计：\r\n")
print(test_data.isnull().sum())

train_data 缺失值统计：

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           0
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  0
regionCode                0
dti                       0
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies        0
revolBal                  0
revolUtil                 0
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     0
policyCode                0
n0                        0
n1                        0
n2                        0
n

## 2.2 对象特征值转换为数值


In [258]:
# employmentLength 通过众数填充失败了，我们通过暂时保留Nan值，因为是分类属性，所以可以假设这个Nan是一个新的类型
print(train_data['employmentLength'].value_counts(dropna=False).sort_index())
print(test_data['employmentLength'].value_counts(dropna=False).sort_index())

1 year        52489
10+ years    262751
2 years       72358
3 years       64152
4 years       47985
5 years       50101
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64
1 year       13182
10+ years    65772
2 years      18207
3 years      16011
4 years      11833
5 years      12543
6 years       9328
7 years       8823
8 years       8976
9 years       7594
< 1 year     15989
NaN          11742
Name: employmentLength, dtype: int64


In [259]:
# 处理employmentLength
def employmentLengthToInt(data):
    data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
    data['employmentLength'].replace(to_replace='< 1 year', value='0 years', inplace=True)
    data['employmentLength'].fillna('-1 years', inplace=True)
    return data['employmentLength'].apply(lambda x: np.int8(str(x).split()[0]))
train_data['employmentLength'] = employmentLengthToInt(train_data)
test_data['employmentLength'] = employmentLengthToInt(test_data)

# 检查结果
print(train_data['employmentLength'].value_counts(dropna=False).sort_index())
print(test_data['employmentLength'].value_counts(dropna=False).sort_index())

-1      46799
 0      64237
 1      52489
 2      72358
 3      64152
 4      47985
 5      50101
 6      37254
 7      35407
 8      36192
 9      30272
 10    262751
Name: employmentLength, dtype: int64
-1     11742
 0     15989
 1     13182
 2     18207
 3     16011
 4     11833
 5     12543
 6      9328
 7      8823
 8      8976
 9      7594
 10    65772
Name: employmentLength, dtype: int64


In [260]:
print(object_features)

['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']


In [261]:
# grade 可以自映射
print(train_data['grade'].value_counts())

B    233690
C    227116
A    139661
D    119452
E     55661
F     19053
G      5364
Name: grade, dtype: int64


In [262]:
grade_map = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7}
train_data['grade'] = train_data['grade'].map(grade_map)
test_data['grade'] = test_data['grade'].map(grade_map)

# 检查结果
print(train_data['grade'].value_counts(dropna=False).sort_index())
print(test_data['grade'].value_counts(dropna=False).sort_index())

1    139661
2    233690
3    227116
4    119452
5     55661
6     19053
7      5364
Name: grade, dtype: int64
1    34927
2    58365
3    56701
4    29924
5    14010
6     4698
7     1375
Name: grade, dtype: int64


In [263]:
# subGrade
print(train_data['subGrade'].value_counts(dropna=False))

C1    50763
B4    49516
B5    48965
B3    48600
C2    47068
C3    44751
C4    44271
B2    44227
B1    42382
C5    40263
A5    38045
A4    30928
D1    30538
D2    26528
A1    25909
D3    23410
A3    22655
A2    22124
D4    21138
D5    17838
E1    14064
E2    12746
E3    10925
E4     9273
E5     8653
F1     5925
F2     4340
F3     3577
F4     2859
F5     2352
G1     1759
G2     1231
G3      978
G4      751
G5      645
Name: subGrade, dtype: int64


In [264]:
# subGrade进行一下替换，子级别和级别
def subGradeReplace(data):
    data['subGrade'] = data['subGrade'].str[:1].apply(lambda x: str(grade_map[x])) + data['subGrade'].str[1:]
    data['subGrade'] = data['subGrade'].astype('int8')
    return data

train_data = subGradeReplace(train_data)
test_data = subGradeReplace(test_data)

# 检查结果
print(train_data['subGrade'].value_counts(dropna=False).sort_index())
print(test_data['subGrade'].value_counts(dropna=False).sort_index())

11    25909
12    22124
13    22655
14    30928
15    38045
21    42382
22    44227
23    48600
24    49516
25    48965
31    50763
32    47068
33    44751
34    44271
35    40263
41    30538
42    26528
43    23410
44    21138
45    17838
51    14064
52    12746
53    10925
54     9273
55     8653
61     5925
62     4340
63     3577
64     2859
65     2352
71     1759
72     1231
73      978
74      751
75      645
Name: subGrade, dtype: int64
11     6398
12     5503
13     5644
14     7753
15     9629
21    10544
22    10898
23    12100
24    12423
25    12400
31    12857
32    11791
33    11018
34    11110
35     9925
41     7667
42     6713
43     5821
44     5236
45     4487
51     3527
52     3175
53     2780
54     2414
55     2114
61     1462
62     1073
63      906
64      714
65      543
71      488
72      325
73      232
74      166
75      164
Name: subGrade, dtype: int64


In [265]:
import datetime

# issueDate 贷款发放的月份
def createDayFeature(data, feature, new_feature, date):
    data[feature] = pd.to_datetime(data[feature], format='%Y-%m-%d')
    start_date = datetime.datetime.strptime(date, '%Y-%m-%d')
    data[new_feature] = data[feature].apply(lambda x: x-start_date).dt.days
    return data

# 添加一个天数日期
train_data = createDayFeature(train_data, 'issueDate', 'issueDateDt', '2001-01-01')
test_data = createDayFeature(test_data, 'issueDate', 'issueDateDt', '2001-01-01')

# 检查结果
print(train_data['issueDate'].value_counts(dropna=False).sort_index())
print(test_data['issueDate'].value_counts(dropna=False).sort_index())
print(train_data['issueDateDt'].value_counts(dropna=False).sort_index())
print(test_data['issueDateDt'].value_counts(dropna=False).sort_index())

2007-06-01        1
2007-07-01       21
2007-08-01       23
2007-09-01        7
2007-10-01       26
2007-11-01       24
2007-12-01       55
2008-01-01       91
2008-02-01      105
2008-03-01      130
2008-04-01       92
2008-05-01       38
2008-06-01       33
2008-07-01       52
2008-08-01       38
2008-09-01       19
2008-10-01       62
2008-11-01      113
2008-12-01      134
2009-01-01      145
2009-02-01      160
2009-03-01      162
2009-04-01      166
2009-05-01      190
2009-06-01      191
2009-07-01      223
2009-08-01      231
2009-09-01      270
2009-10-01      305
2009-11-01      376
              ...  
2016-07-01    12835
2016-08-01    13301
2016-09-01    10165
2016-10-01    11243
2016-11-01    11172
2016-12-01    11562
2017-01-01     9757
2017-02-01     8057
2017-03-01    10068
2017-04-01     7746
2017-05-01     9620
2017-06-01     9005
2017-07-01     8861
2017-08-01     9172
2017-09-01     8100
2017-10-01     7129
2017-11-01     7306
2017-12-01     5915
2018-01-01     5176


In [266]:
# earliesCreditLine
print(train_data['earliesCreditLine'].value_counts().sort_index())

Apr-1955       2
Apr-1958       1
Apr-1960       2
Apr-1961       4
Apr-1962       4
Apr-1963      12
Apr-1964      18
Apr-1965      21
Apr-1966      28
Apr-1967      29
Apr-1968      37
Apr-1969      46
Apr-1970      51
Apr-1971      57
Apr-1972      80
Apr-1973      98
Apr-1974     109
Apr-1975     130
Apr-1976     150
Apr-1977     183
Apr-1978     244
Apr-1979     263
Apr-1980     224
Apr-1981     277
Apr-1982     393
Apr-1983     485
Apr-1984     632
Apr-1985     664
Apr-1986     800
Apr-1987     827
            ... 
Sep-1986     887
Sep-1987     936
Sep-1988    1007
Sep-1989    1074
Sep-1990    1347
Sep-1991    1263
Sep-1992    1499
Sep-1993    2086
Sep-1994    2673
Sep-1995    3217
Sep-1996    3061
Sep-1997    3365
Sep-1998    4064
Sep-1999    4441
Sep-2000    4780
Sep-2001    4787
Sep-2002    5170
Sep-2003    5403
Sep-2004    5219
Sep-2005    4608
Sep-2006    3646
Sep-2007    2656
Sep-2008    1733
Sep-2009    1295
Sep-2010    1347
Sep-2011    1008
Sep-2012     575
Sep-2013     2

In [267]:
# earliesCreditLine 变成年即可
def getYear(data, col):
    print(data[col].apply(lambda x: int(x.strip()[-4:])))
    data[col] = data[col].apply(lambda x: int(x.strip()[-4:]))
    return data

train_data = getYear(train_data, 'earliesCreditLine')
test_data = getYear(test_data, 'earliesCreditLine')

# 检查结果
print(train_data['earliesCreditLine'].value_counts(dropna=False).sort_index())
print(test_data['earliesCreditLine'].value_counts(dropna=False).sort_index())

0         2001
1         2002
2         2006
3         1999
4         1977
5         1998
6         2006
7         1994
8         1994
9         1993
10        1996
11        2007
12        2014
13        1983
14        2011
15        1996
16        2002
17        1995
18        2001
19        1999
20        2004
21        1995
22        1976
23        2000
24        1999
25        1995
26        1996
27        2008
28        2000
29        2000
          ... 
799970    1998
799971    1979
799972    1984
799973    2002
799974    1964
799975    1993
799976    2003
799977    2002
799978    1992
799979    1990
799980    2001
799981    1992
799982    1996
799983    2004
799984    1998
799985    1996
799986    2005
799987    2008
799988    2004
799989    2006
799990    2002
799991    2007
799992    2001
799993    2001
799994    2007
799995    2011
799996    1989
799997    2002
799998    1994
799999    2002
Name: earliesCreditLine, Length: 799997, dtype: int64
0         1974
1         2001
2

In [268]:
# homeOwnership
print(train_data['homeOwnership'].value_counts())

0    395729
1    317660
2     86309
3       185
5        81
4        33
Name: homeOwnership, dtype: int64


In [269]:
# verificationStatus
print(train_data['verificationStatus'].value_counts())

1    309809
2    248968
0    241220
Name: verificationStatus, dtype: int64


In [270]:
# purpose
print(train_data['purpose'].value_counts())

0     464094
4     175432
2      52129
5      46276
3      17579
9       9238
1       9106
8       8657
10      5652
7       5373
6       4354
12      1363
11       554
13       190
Name: purpose, dtype: int64


In [271]:
# regionCode
print(train_data['regionCode'].value_counts())

8     116921
14     65767
13     65041
21     56671
2      30513
30     28634
0      27180
19     26197
3      25766
9      22902
7      22600
23     20919
10     19604
12     18543
26     18432
22     17722
18     17286
4      14176
11     12929
24     12776
32     12065
38     11982
36     11644
27     10521
17      9863
35      9669
5       9581
20      9124
43      7701
42      7267
15      6690
37      5998
45      5932
16      4325
28      4036
44      3963
33      3817
6       3496
39      2943
40      2287
31      2261
34      2136
25      2102
48      1880
41      1778
1       1624
29      1560
47      1213
49      1001
46       953
50         6
Name: regionCode, dtype: int64


In [272]:
# applicationType
print(train_data['applicationType'].value_counts())

0    784583
1     15414
Name: applicationType, dtype: int64


In [273]:
# initialListStatus
print(train_data['initialListStatus'].value_counts())

0    466437
1    333560
Name: initialListStatus, dtype: int64


In [274]:
# policyCode
print(train_data['policyCode'].value_counts())
print(test_data['policyCode'].value_counts())

1.0    799997
Name: policyCode, dtype: int64
1.0    200000
Name: policyCode, dtype: int64


In [275]:
# 对分类变量做虚拟变量
train_data = pd.get_dummies(train_data, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

print(train_data.columns)
print(test_data.columns)

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'annualIncome',
       ...
       'regionCode_41', 'regionCode_42', 'regionCode_43', 'regionCode_44',
       'regionCode_45', 'regionCode_46', 'regionCode_47', 'regionCode_48',
       'regionCode_49', 'regionCode_50'],
      dtype='object', length=114)
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'annualIncome',
       ...
       'regionCode_41', 'regionCode_42', 'regionCode_43', 'regionCode_44',
       'regionCode_45', 'regionCode_46', 'regionCode_47', 'regionCode_48',
       'regionCode_49', 'regionCode_50'],
      dtype='object', length=113)


In [276]:
# 删除id, policyCode
train_data.drop('id', axis=1, inplace=True)
train_data.drop('policyCode', axis=1, inplace=True)
test_data.drop('id', axis=1, inplace=True)
test_data.drop('policyCode', axis=1, inplace=True)

## 2.3 异常值处理

数据中检测异常值，是比较好检测的。需不需要处理异常值，是个比较复杂的问题。可以想象一下，这个异常值是因为失误录入的，还是认为写入的。通常如果是因为失误造成的异常值，那么异常值的分布应该比较均衡，这部分异常值就需要处理。如果是人为主动制造的异常值，通常对结果有很大影响，这类异常值应该保留。

In [277]:
# 首先，添加异常值列
def findOutLiersBy3Segama(data, feature):
    # 标准差
    data_std = np.std(data[feature])
    data_mean = np.mean(data[feature])
    outliners_cut_off = data_std * 3
    # 最小标准
    lower_limit = data_mean - outliners_cut_off
    # 最大标准
    upper_limit = data_mean + outliners_cut_off
    # 标注异常值
    data[feature+'_outliers'] = data[feature].apply(lambda x: str('异常值') if x < lower_limit or x > upper_limit else '正常值')
    return data

train_data_copy = train_data.copy()

exception_features = ['loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'annualIncome', 'postCode', 'dti', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'title', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

for col in exception_features:
    train_data_copy = findOutLiersBy3Segama(train_data_copy, col)
    print(train_data_copy[col+'_outliers'].value_counts())
    print(train_data_copy.groupby(col+'_outliers')['isDefault'].sum())
    print('*'*10)


正常值    799997
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    799997
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794256
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792043
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    799997
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    793970
异常值      6027
Name: annualIncome_outliers, dtype: int64
annualIncome_outliers
异常值       756
正常值    158854
Name: isDefault, dtype: int64
**********
正常值    798928
异常值      1069
Name: postCode_outliers, dtype: int64
postCode_outliers
异常值       221
正常值    159389
Name: isDefault, dtype: in

In [278]:
label_result = train_data['isDefault'].value_counts()
print(label_result[1]/(label_result[0] + label_result[1]))

0.19951324817468066


In [279]:
for col in exception_features:
    d1 = train_data_copy[col+'_outliers'].value_counts()
    d2 = train_data_copy.groupby(col+'_outliers')['isDefault'].sum()
    print(col)
    print(d2[0]/d1[1] if len(d1) > 1 else 0)
    print('*'*10)

loanAmnt
0
**********
term
0
**********
interestRate
0.507925448528131
**********
installment
0.27055569524767414
**********
employmentTitle
0
**********
annualIncome
0.1254355400696864
**********
postCode
0.20673526660430308
**********
dti
0.2987179487179487
**********
ficoRangeLow
0.06627481046085697
**********
ficoRangeHigh
0.06627481046085697
**********
openAcc
0.2409175721655142
**********
pubRec
0.22592641785097622
**********
pubRecBankruptcies
0.24200680272108843
**********
revolBal
0.13592718543708743
**********
revolUtil
0.4423076923076923
**********
totalAcc
0.20007196833393306
**********
title
0.15684066596959703
**********
n0
0.2022987171300865
**********
n1
0.26221052631578945
**********
n2
0.293149181377481
**********
n3
0.293149181377481
**********
n4
0.218342151675485
**********
n5
0.19263867288750647
**********
n6
0.22738316421323423
**********
n7
0.23733794295592048
**********
n8
0.20539759036144578
**********
n9
0.2903202115158637
**********
n10
0.239451955357953
***

In [280]:
# 需要处理的异常值
exception_proc_features = ['installment', 'postCode', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'totalAcc', 'title', 'n0', 'n1', 'n4', 'n5', 'n6', 'n7', 'n8', 'n10', 'n11', 'n12', 'n13']

for col in exception_proc_features:
    train_data = train_data_copy[train_data_copy[col+'_outliers']=='正常值']
    train_data = train_data.reset_index(drop=True)

## 2.4 数据分桶

连续性数值或者多个分类的特征值可以进行分桶处理。

In [281]:
# 需要进行分桶的特征值
binning_features = ['loanAmnt', 'annualIncome', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'totalAcc']

# 以1000为单位，进行分箱
train_data['loanAmnt_bin'] = np.floor_divide(train_data['loanAmnt'], 1000)
train_data['annualIncome_bin'] = np.floor_divide(train_data['annualIncome'], 1000)
train_data['totalAcc_bin'] = np.floor_divide(train_data['totalAcc'], 1000)
test_data['loanAmnt_bin'] = np.floor_divide(test_data['loanAmnt'], 1000)
test_data['annualIncome_bin'] = np.floor_divide(test_data['annualIncome'], 1000)
test_data['totalAcc_bin'] = np.floor_divide(test_data['totalAcc'], 1000)

In [282]:
# 中位数分箱
train_data['openAcc_bin'] = pd.qcut(train_data['openAcc'], 10, labels=False, duplicates='drop')
train_data['pubRec_bin'] = pd.qcut(train_data['pubRec'], 10, labels=False, duplicates='drop')
train_data['pubRecBankruptcies_bin'] = pd.qcut(train_data['pubRecBankruptcies'], 10, labels=False, duplicates='drop')
test_data['openAcc_bin'] = pd.qcut(test_data['openAcc'], 10, labels=False, duplicates='drop')
test_data['pubRec_bin'] = pd.qcut(test_data['pubRec'], 10, labels=False, duplicates='drop')
test_data['pubRecBankruptcies_bin'] = pd.qcut(test_data['pubRecBankruptcies'], 10, labels=False, duplicates='drop')

In [283]:
## 2.5 特征交互

EDA中分析的，pubRec,pubRecBankruptcies和delinquency_2years做交叉应该会对模型有帮助。

SyntaxError: invalid character in identifier (<ipython-input-283-e2c6c4af514e>, line 3)

In [284]:
import seaborn as sns
import matplotlib.pyplot as plt

train_data['pubRec_num'] = train_data['pubRec'] - train_data['pubRecBankruptcies']
test_data['pubRec_num'] = test_data['pubRec'] - test_data['pubRecBankruptcies']

In [285]:
for df in [train_data, test_data]: 
    for item in ['n0','n1','n2','n3','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']: 
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item]) ['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item]) ['grade'].transform('std')

## 2.6 数据归一化



## 2.7 特征选择

In [286]:
## 2.8 简单预测

features = [f for f in train_data.columns if f not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in f] 
x_train = train_data[features] 
x_test = test_data[features] 
y_train = train_data['isDefault']

print(features)

['loanAmnt', 'term', 'interestRate', 'installment', 'grade', 'subGrade', 'employmentTitle', 'employmentLength', 'annualIncome', 'postCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'earliesCreditLine', 'title', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14', 'issueDateDt', 'homeOwnership_1', 'homeOwnership_2', 'homeOwnership_3', 'homeOwnership_4', 'homeOwnership_5', 'verificationStatus_1', 'verificationStatus_2', 'purpose_1', 'purpose_2', 'purpose_3', 'purpose_4', 'purpose_5', 'purpose_6', 'purpose_7', 'purpose_8', 'purpose_9', 'purpose_10', 'purpose_11', 'purpose_12', 'purpose_13', 'regionCode_1', 'regionCode_2', 'regionCode_3', 'regionCode_4', 'regionCode_5', 'regionCode_6', 'regionCode_7', 'regionCode_8', 'regionCode_9', 'regionCode_10', 'regionCode_11', 'regionCode_12', 'regionCode_13', 'regionCod

In [294]:
# import xgboost as xgb 
import lightgbm as lgb 
# from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold 
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

def cv_model(clf, train_x, train_y, test_x, clf_name): 
    folds = 5 
    seed = 2020 
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed) 
    train = np.zeros(train_x.shape[0]) 
    test = np.zeros(test_x.shape[0]) 
    cv_scores = [] 
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)): 
        print('************************************ {} ************************************'.format(str(i+1))) 
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index] 
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y) 
            valid_matrix = clf.Dataset(val_x, label=val_y) 
            params = { 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'min_child_weight': 5, 'num_leaves': 2 ** 5, 'lambda_l2': 10, 'feature_fraction': 0.8, 'bagging_fraction': 0.8, 'bagging_freq': 4, 'learning_rate': 0.1, 'seed': 2020, 'nthread': 28, 'n_jobs':24, 'silent': True, 'verbose': -1, }
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200) 
            val_pred = model.predict(val_x, num_iteration=model.best_iteration) 
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        if clf_name == "xgb": 
            train_matrix = clf.DMatrix(trn_x , label=trn_y) 
            valid_matrix = clf.DMatrix(val_x , label=val_y) 
            params = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc', 'gamma': 1, 'min_child_weight': 1.5, 'max_depth': 5, 'lambda': 10, 'subsample': 0.7, 'colsample_bytree': 0.7, 'colsample_bylevel': 0.7, 'eta': 0.04, 'tree_method': 'exact', 'seed': 2020, 'nthread': 36, "silent": True, }
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')] 
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200) 
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit) 
            test_pred = model.predict(test_x , ntree_limit=model.best_ntree_limit)

        if clf_name == "cat": 
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli', 'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            model = clf(iterations=20000, **params) 
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y), cat_features=[], use_best_model=True, verbose=500) 
            val_pred = model.predict(val_x) 
            test_pred = model.predict(test_x) 

        train[valid_index] = val_pred 
        test = test_pred / kf.n_splits 
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    print("%s_scotrainre_list:" % clf_name, cv_scores) 
    print("%s_score_mean:" % clf_name, np.mean(cv_scores)) 
    print("%s_score_std:" % clf_name, np.std(cv_scores)) 
    return train, test

def lgb_model(x_train, y_train, x_test): 
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb") 
    return lgb_train, lgb_test 
def xgb_model(x_train, y_train, x_test): 
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb") 
    return xgb_train, xgb_test 
def cat_model(x_train, y_train, x_test): 
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_train, cat_test

In [295]:
lgb_train, lgb_test = lgb_model(x_train.iloc[:10000, :], y_train.iloc[:10000], x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.990421	valid_1's auc: 0.70711
Early stopping, best iteration is:
[36]	training's auc: 0.838305	valid_1's auc: 0.726466
[0.7264657004655278]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.989481	valid_1's auc: 0.680084
Early stopping, best iteration is:
[33]	training's auc: 0.834603	valid_1's auc: 0.700719
[0.7264657004655278, 0.7007189764492754]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.99023	valid_1's auc: 0.686777
Early stopping, best iteration is:
[74]	training's auc: 0.907537	valid_1's auc: 0.696864
[0.7264657004655278, 0.7007189764492754, 0.6968643762872526]
*********************************

In [None]:
# 时间关系，最后部分代码只来得及测试，没有详细看，抱歉！！！