In [1]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

In [2]:
#データの読み込みとデータ処理
data = pd.read_csv("./data/credit_card.csv").drop('ID',axis=1).sample(frac=1)
data['SEX'][data['SEX'] == 1] = 0
data['SEX'][data['SEX'] == 2] = 1
print(data.shape)
data['SEX']
data.head()

(30000, 24)


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
14663,300000,1,2,2,39,-1,-1,-1,-1,-1,...,990,990,990,5490,990,990,990,990,990,0
13054,80000,1,1,2,27,-1,-1,-1,-2,-1,...,0,333,183,7900,0,0,333,0,1500,0
23063,30000,1,1,2,46,0,0,0,0,2,...,24116,23430,25044,2000,2000,2535,0,2000,2300,0
29674,110000,0,3,1,40,0,0,0,0,0,...,79368,74778,75996,4500,4100,2612,2666,2700,2659,0
21735,50000,1,1,2,23,-2,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0


In [3]:
sum(data['default payment next month'] ==1 ) / len(data)
data.shape

(30000, 24)

In [4]:
# データを訓練データと評価データに分割
train_data = data[:int(len(data)*0.7)]
test_data = data[int(len(data)*0.7):]

train_data.shape

(21000, 24)

In [5]:
# 訓練データを少数クラスのデータと多数クラスに分割
train_data_1 = train_data[train_data['default payment next month'] == 1]
train_data_0 = train_data[train_data['default payment next month'] == 0]

In [7]:
# 弱識別器の定義
from sklearn.linear_model import LogisticRegression
from sklearn import svm
clfs = {"GB1":GradientBoostingClassifier(),
        "GB2":GradientBoostingClassifier(),
        "GB3":GradientBoostingClassifier(),
        "GB4":GradientBoostingClassifier(),
        "GB5":GradientBoostingClassifier(),
        "LR1":LogisticRegression(),
        "LR2":LogisticRegression(),
        "SVC":svm.SVC()
       }

In [8]:
# 訓練データを説明変数、目的変数に分割
train_data_ = np.vstack((train_data_0.sample(n=int(len(train_data_1))),train_data_1))
train_data_feature = train_data_[:,:-1]
train_data_target = train_data_[:,-1]
print(train_data_feature.shape)
print(train_data_target.shape)

(9324, 23)
(9324,)


In [9]:
# 各Estimatorのパラメータ探索用dict
gb_parameters = {
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__max_depth":[3,5],
    "clf__subsample":[0.5, 1.0],
    "clf__n_estimators":[10]
}
rf_parameters = {
    "clf__n_estimators" : [100],
    "clf__max_features" : [1, 'auto', None],
    "clf__max_depth" : [1, 5, 10, None],
    "clf__min_samples_leaf": [1, 2, 4,]
}
lr_parameters = {
    "clf__C": [0.01,0.1,1]
}
svc_parameters = {
    "clf__C": [0.01,0.1,1,10,100],
    "clf__gamma": [0.01,0.1,1,10,100]
}

In [10]:
train_data_feature.dtype

dtype('int64')

In [11]:
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
i=0

# 識別器格納用リスト
# 8種類の識別器が格納される
gb_clfs = []

# 各識別器に対して学習実行
for clf in clfs.items():
    i += 1
    print(i) 
    
    # 多数データを少数データの数だけUnderSamplingし、少数データとマージ
    train_data_ = np.vstack((train_data_0.sample(n=int(len(train_data_1))),train_data_1))
    # マージ後の訓練データをシャッフル
    np.random.shuffle(train_data_)
    # 訓練データを説明変数と目的変数に分割
    train_data_feature = train_data_[:,:-1].astype(float)
    train_data_target = train_data_[:,-1].astype(float)
    
    # 説明変数(特徴量)抽出モデルと標準化モデルを定義
    select = SelectFromModel(RandomForestClassifier(n_estimators=100, n_jobs=-1))
    scaler = StandardScaler()
    
    # 一連の処理(Pipeline)を定義
    # 特徴量抽出→標準化→識別器訓練
    estimator = [
        #('select',select),
        ('scaler',scaler),
        ('clf',clf[1])
    ]

    pipe = Pipeline(estimator)
    
    # グリッドサーチ用モデルの定義
    # 識別器の種類によってパラメータが異なるので条件分岐
    if clf[0] == 'SVC':
        gb_clf = GridSearchCV(pipe, svc_parameters, cv=3, n_jobs=-1)
    elif clf[0] == 'RF':
        gb_clf = GridSearchCV(pipe, rf_parameters, cv=3, n_jobs=-1)
    elif clf[0].find('LR') > -1:
        gb_clf = GridSearchCV(pipe, lr_parameters, cv=3, n_jobs=-1)
    else:
        gb_clf = GridSearchCV(pipe, gb_parameters, cv=3, n_jobs=-1)

    # グリッドサーチ実行
    gb_clf.fit(train_data_feature,train_data_target)
    
    # 最適なパラメータで訓練された識別器をリストに追加
    gb_clfs.append(gb_clf)

1
2
3
4
5
6
7
8


In [12]:
len(gb_clfs)

8

In [13]:
# 評価データを説明変数と目的変数に分割
test_data_feature = test_data.iloc[:,:-1]
test_data_target = test_data.iloc[:,-1]

# 投票箱作成
predict_vote=np.zeros(int(len(test_data_feature)))

# 各識別器の予測結果を投票箱に入れる
for gb_clf in gb_clfs:
    predict_vote += (gb_clf.predict(test_data_feature))

# 投票結果は多数決    
predict_vote = (predict_vote/8 > 0.5).astype(int)
predict_vote

array([0, 1, 1, ..., 0, 0, 0])

In [14]:
# F値確認
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

print(f1_score(test_data_target,predict_vote))
confusion_matrix(test_data_target, predict_vote)

0.53659652333


array([[5801, 1225],
       [ 801, 1173]])

<h1>単純に勾配ブースティング</h1>

In [151]:
mono_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,max_depth=1)

In [103]:
sum(mono_predict == test_data.iloc[:,-1])

7393

In [152]:
select = SelectFromModel(RandomForestClassifier(n_estimators=100, n_jobs=-1))
scaler = StandardScaler()
estimator = [
    #('select',select),
    ('scaler',scaler),
    ('clf',clf)
]

pipe = Pipeline(estimator)

mono_gb_clf = GridSearchCV(pipe, gb_parameters, cv=3, n_jobs=-1)
mono_gb_clf.fit(train_data.iloc[:,:-1],train_data.iloc[:,-1])

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decreas...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__learning_rate': [0.01, 0.05, 0.1], 'clf__max_depth': [3, 5], 'clf__subsample': [0.5, 1.0], 'clf__n_estimators': [10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [153]:
mono_predict = mono_gb_clf.predict(test_data.iloc[:,:-1])

In [154]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

print(f1_score(test_data.iloc[:,-1],mono_predict))
confusion_matrix(test_data.iloc[:,-1], mono_predict)

0.431358637815


array([[6789,  236],
       [1367,  608]])