<h1>UnderSampling + Baggingでimbalancedなデータを学習</h1>


In [31]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader , TensorDataset

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

In [32]:
#データの読み込みとデータ処理
data = pd.read_csv("./data/credit_card.csv").drop('ID',axis=1).sample(frac=1)
data['SEX'][data['SEX'] == 1] = 0
data['SEX'][data['SEX'] == 2] = 1
print(data.shape)
data['SEX']
data.head()

(30000, 24)


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
2402,80000,1,2,2,29,1,2,2,2,2,...,55201,69270,70582,0,5000,0,15000,2600,0,0
1238,120000,0,2,1,57,0,0,0,0,0,...,88557,91124,88094,5000,5007,4000,4003,4003,4000,1
20437,30000,0,2,2,22,3,2,2,2,2,...,27080,30525,29764,0,3200,0,3900,0,0,1
21490,200000,1,2,2,29,-1,0,0,0,0,...,10519,6980,8702,1500,3000,3000,1000,2000,2000,0
27499,20000,0,2,1,59,3,2,3,3,2,...,8208,7930,8650,2000,1000,0,0,1000,0,1


In [33]:
sum(data['default payment next month'] ==1 ) / len(data)
data.shape

(30000, 24)

In [34]:
# データを訓練データと評価データに分割
train_data = data[:int(len(data)*0.7)]
test_data = data[int(len(data)*0.7):]

train_data.shape

(21000, 24)

In [35]:
# 訓練データを少数クラスのデータと多数クラスに分割
train_data_1 = train_data[train_data['default payment next month'] == 1]
train_data_0 = train_data[train_data['default payment next month'] == 0]

In [100]:
# default payment next monthが 1 なのは全体の1/5くらい
len(train_data_1)/len(train_data)

0.21866666666666668

<h2>ニューラルネットワークモデル定義</h2>
<h4>GridSearchCVとかPipelineが便利なので、sklearnのBaseEstimatorを継承しました</h4>

In [97]:
from sklearn.base import BaseEstimator
from sklearn.base import BaseEstimator, ClassifierMixin

INPUT_COLUMNS = len(train_data_feature[0])
hidden_layer = INPUT_COLUMNS
OUTPUT_SIZE=2

class skNN(BaseEstimator,nn.Module,ClassifierMixin):
    def __init__(self, input_layer=len(train_data_feature[0]),hidden_layer=20,lr=0.01):
        self.input_layer = input_layer
        self.hidden_layer = hidden_layer
        self.lr = lr
        
        super(skNN, self).__init__()
        self.fc1 = nn.Linear(self.input_layer,self.hidden_layer)
        self.fc2 = nn.Linear(self.hidden_layer,self.hidden_layer)
        self.fc3 = nn.Linear(self.hidden_layer,OUTPUT_SIZE)
        self.outact = nn.Softmax()
        
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.parameters(),lr=self.lr)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return self.outact(x)
        
    def fit(self, X, y=None, **params):
        self.input_layer = len(X[0])
        self.fc1 = nn.Linear(self.input_layer,self.hidden_layer)
        
        tensorX = torch.from_numpy(X).float()
        tensorY = torch.from_numpy(y).long()
        train = TensorDataset(tensorX,tensorY)
        train_loader = DataLoader(train, batch_size=32, shuffle=True)

        for epoch in range(int(len(X)/32)):
            total_loss = 0    
            for train_x, train_y in train_loader:
                train_x, train_y = Variable(train_x), Variable(train_y)
                self.optimizer.zero_grad()
                output = self(train_x)
                loss = self.criterion(output, train_y)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.data[0]
            
            if (epoch+1) % 50 == 0:
                print(epoch+1, total_loss)
        return self
    
    def predict(self, X):
        tensorX = torch.from_numpy(X).float()
        test_X = Variable(tensorX)
        return (self(test_X).data[:,1].numpy() > 0.5).astype(int)
    
    def predict_proba(self, X):
        tensorX = torch.from_numpy(X).float()
        test_X = Variable(tensorX)
        return self(test_X).data[:,1].numpy()
    
    def score(self,X,y):
        tensorX = torch.from_numpy(X).float()
        test_X = Variable(tensorX)
        return sum( (self(test_X).data[:,1].numpy() > 0.5).astype(int) == y )/ len(y)
    
    def get_params(self,deep=True):
        return {'hidden_layer': self.hidden_layer, 'lr': self.lr}
    
    def set_params(self,**params):
        for parameter, value in params.items():
            setattr(self,parameter, value)
        self.fc1 = nn.Linear(self.input_layer,self.hidden_layer)
        self.fc2 = nn.Linear(self.hidden_layer,self.hidden_layer)
        self.fc3 = nn.Linear(self.hidden_layer,OUTPUT_SIZE)
        self.optimizer = optim.Adam(self.parameters(),lr=self.lr)
        return self
        

<h2>学習実行</h2>
<h4>折角なので色んな種類の識別器を使ってみます。それぞれでグリッドサーチし、最適なパラメータを探索しています</h4>

In [89]:
# 識別器の定義
from sklearn.linear_model import LogisticRegression
from sklearn import svm
clfs = {"GB1":GradientBoostingClassifier(),
        "GB2":GradientBoostingClassifier(),
        "GB3":GradientBoostingClassifier(),
        "GB4":GradientBoostingClassifier(),
        "NN":skNN(),
        "LR1":LogisticRegression(),
        "LR2":LogisticRegression(),
        "SVC":svm.SVC()
       }

In [90]:
# 訓練データを説明変数、目的変数に分割
train_data_ = np.vstack((train_data_0.sample(n=int(len(train_data_1))),train_data_1))
train_data_feature = train_data_[:,:-1]
train_data_target = train_data_[:,-1]
print(train_data_feature.shape)
print(train_data_target.shape)

(9184, 23)
(9184,)


In [91]:
# 各Estimatorのパラメータ探索用dict
gb_parameters = {
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__max_depth":[3,5],
    "clf__subsample":[0.5, 1.0],
    "clf__n_estimators":[10]
}
rf_parameters = {
    "clf__n_estimators" : [100],
    "clf__max_features" : [1, 'auto', None],
    "clf__max_depth" : [1, 5, 10, None],
    "clf__min_samples_leaf": [1, 2, 4,]
}
lr_parameters = {
    "clf__C": [0.01,0.1,1]
}
svc_parameters = {
    "clf__C": [0.01,0.1,1,10,100],
    "clf__gamma": [0.01,0.1,1,10,100]
}
nn_parameters = {
    'clf__hidden_layer': [30,40,50,60],
    'clf__lr': [0.01,0.05],
}

In [92]:
train_data_feature.dtype

dtype('int64')

In [93]:
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
i=0

# 識別器格納用リスト
# 8種類の識別器が格納される
gb_clfs = []

# 各識別器に対して学習実行
for clf in clfs.items():
    i += 1
    print(i) 
    
    # 多数データを少数データの数だけUnderSamplingし、少数データとマージ
    train_data_ = np.vstack((train_data_0.sample(n=int(len(train_data_1))),train_data_1))
    # マージ後の訓練データをシャッフル
    np.random.shuffle(train_data_)
    # 訓練データを説明変数と目的変数に分割
    train_data_feature = train_data_[:,:-1].astype(float)
    train_data_target = train_data_[:,-1].astype(float)
    
    # 説明変数(特徴量)抽出モデルと標準化モデルを定義
    select = SelectFromModel(RandomForestClassifier(n_estimators=100, n_jobs=-1))
    scaler = StandardScaler()
    
    # 一連の処理(Pipeline)を定義
    # 特徴量抽出→標準化→識別器訓練
    estimator = [
        #('select',select),
        ('scaler',scaler),
        ('clf',clf[1])
    ]

    pipe = Pipeline(estimator)
    
    # グリッドサーチ用モデルの定義
    # 識別器の種類によってパラメータが異なるので条件分岐
    if clf[0] == 'SVC':
        gb_clf = GridSearchCV(pipe, svc_parameters, scoring='f1', cv=3, n_jobs=-1)
    elif clf[0] == 'RF':
        gb_clf = GridSearchCV(pipe, rf_parameters, scoring='f1', cv=3, n_jobs=-1)
    elif clf[0] =='NN':
        gb_clf = GridSearchCV(pipe, nn_parameters, scoring='f1', cv=3, n_jobs=-1)
    elif clf[0].find('LR') > -1:
        gb_clf = GridSearchCV(pipe, lr_parameters, scoring='f1', cv=3, n_jobs=-1)
    else:
        gb_clf = GridSearchCV(pipe, gb_parameters, scoring='f1', cv=3, n_jobs=-1)

    # グリッドサーチ実行
    gb_clf.fit(train_data_feature,train_data_target)
    
    # 最適なパラメータで訓練された識別器をリストに追加
    gb_clfs.append(gb_clf)

1
2
3
4
5




50 tensor(106.8680)
50 tensor(106.9900)
50 tensor(104.9529)
50 tensor(114.5744)
100 tensor(103.5048)
100 tensor(102.4545)
100 tensor(101.0104)
100 tensor(114.0480)
150 tensor(100.7757)
150 tensor(101.3939)
150 tensor(98.4460)
150 tensor(114.3346)




50 tensor(112.4436)
50 tensor(112.5505)
50 tensor(104.2488)
50 tensor(102.1906)
100 tensor(113.5997)
100 tensor(111.3532)
100 tensor(99.6503)
100 tensor(97.8627)
150 tensor(111.0628)
150 tensor(98.0733)
150 tensor(110.8978)
150 tensor(95.0773)




50 tensor(105.6742)
50 tensor(112.9785)
50 tensor(112.0818)
50 tensor(114.8684)
100 tensor(102.8164)
100 tensor(113.1351)
100 tensor(110.1609)
100 tensor(114.5350)
150 tensor(100.4826)
150 tensor(111.8961)
150 tensor(110.5884)
150 tensor(114.6511)




50 tensor(103.9020)
50 tensor(103.9841)
50 tensor(102.8355)
50 tensor(114.7812)
100 tensor(100.1103)
100 tensor(98.7452)
100 tensor(98.3209)
100 tensor(112.6042)
150 tensor(97.3260)
150 tensor(96.0988)
150 tensor(96.3484)
150 tensor(112.0269)




50 tensor(113.1787)
50 tensor(112.9205)
50 tensor(102.2000)
50 tensor(101.0792)
100 tensor(111.1780)
100 tensor(114.4659)
100 tensor(97.1933)
100 tensor(97.0783)
150 tensor(110.0193)
150 tensor(112.3461)
150 tensor(94.9104)




150 tensor(96.0246)




50 tensor(101.8299)
50 tensor(113.3773)
50 tensor(111.2746)
50 tensor(121.6938)
100 tensor(97.7106)
100 tensor(113.7258)
100 tensor(109.4721)
100 tensor(118.7504)
150 tensor(93.6314)
150 tensor(113.4941)
150 tensor(110.6812)
150 tensor(121.7080)




50 tensor(157.1936)
100 tensor(152.2244)
150 tensor(148.2690)
200 tensor(148.6673)
250 tensor(147.2703)
6
7
8


<h2>評価</h2>

In [95]:
# 評価データを説明変数と目的変数に分割
test_data_feature = test_data.iloc[:,:-1]
test_data_target = test_data.iloc[:,-1]

# 投票箱作成
predict_vote=np.zeros(int(len(test_data_feature)))

# 各識別器の予測結果を投票箱に入れる
for gb_clf in gb_clfs:
    predict_vote += (gb_clf.predict(test_data_feature))

# 投票結果は多数決    
predict_vote = (predict_vote/8 > 0.5).astype(int)
predict_vote



array([1, 1, 0, ..., 0, 0, 1])

In [96]:
# F値確認
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

print(f1_score(test_data_target,predict_vote))
confusion_matrix(test_data_target, predict_vote)

0.543754175017


array([[5730, 1226],
       [ 823, 1221]])

<h2>単純に勾配ブースティング</h2>
<h4>以下、比較のため単体での学習・評価を実行</h4>

In [114]:
train_data_ = np.vstack((train_data_0.sample(n=int(len(train_data_1))),train_data_1))
# マージ後の訓練データをシャッフル
np.random.shuffle(train_data_)
# 訓練データを説明変数と目的変数に分割
train_data_feature = train_data_[:,:-1].astype(float)
train_data_target = train_data_[:,-1].astype(float)

In [115]:
mono_gb = GradientBoostingClassifier()

In [116]:
select = SelectFromModel(RandomForestClassifier(n_estimators=100, n_jobs=-1))
scaler = StandardScaler()
estimator = [
    #('select',select),
    ('scaler',scaler),
    ('clf',mono_gb)
]

pipe = Pipeline(estimator)

mono_gb_clf = GridSearchCV(pipe, gb_parameters, cv=3, n_jobs=-1)
mono_gb_clf.fit(train_data_feature,train_data_target)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decreas...      presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'clf__learning_rate': [0.01, 0.05, 0.1], 'clf__max_depth': [3, 5], 'clf__subsample': [0.5, 1.0], 'clf__n_estimators': [10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [117]:
mono_gb_predict = mono_gb_clf.predict(test_data_feature)

In [118]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

print(f1_score(test_data.iloc[:,-1],mono_gb_predict))
confusion_matrix(test_data.iloc[:,-1], mono_gb_predict)

0.536616702355


array([[5583, 1373],
       [ 791, 1253]])

<h2>単純にニューラルネットワーク</h2>

In [87]:
model=skNN(hidden_layer=30,lr=0.01)

In [71]:
model=skNN(hidden_layer=30,lr=0.1)
select = SelectFromModel(RandomForestClassifier(n_estimators=100, n_jobs=-1))
scaler = StandardScaler()
estimator = [
    #('select',select),
    ('scaler',scaler),
    ('clf',model)
]

pipe = Pipeline(estimator)

In [78]:
nn_params = {
        'clf__hidden_layer': [30,40,50,60],
        'clf__lr': [0.01,0.05],
    }
grid_search = GridSearchCV(pipe,param_grid=nn_params,cv=3)
grid_search.fit(train_data_feature,train_data_target)



50 tensor(214.7204)
100 tensor(207.6572)
150 tensor(205.6797)
200 tensor(205.0415)
250 tensor(201.6186)
300 tensor(201.2094)




50 tensor(210.7596)
100 tensor(204.3947)
150 tensor(202.8853)
200 tensor(199.7377)
250 tensor(198.0850)
300 tensor(196.3199)




50 tensor(213.8788)
100 tensor(208.5111)
150 tensor(204.2100)
200 tensor(203.3073)
250 tensor(201.8567)
300 tensor(201.0586)




50 tensor(228.5086)
100 tensor(233.0519)
150 tensor(227.6787)
200 tensor(226.7003)
250 tensor(231.2693)
300 tensor(229.6535)




50 tensor(230.4341)
100 tensor(231.1818)
150 tensor(232.9640)
200 tensor(231.6747)
250 tensor(230.2160)
300 tensor(227.3287)




50 tensor(230.8564)
100 tensor(230.4863)
150 tensor(233.3822)
200 tensor(231.5146)
250 tensor(228.5508)
300 tensor(232.3887)




50 tensor(212.2606)
100 tensor(204.1678)
150 tensor(201.8092)
200 tensor(199.2731)
250 tensor(200.1341)
300 tensor(199.9729)




50 tensor(208.8728)
100 tensor(203.2689)
150 tensor(201.6723)
200 tensor(198.6338)
250 tensor(199.8845)
300 tensor(198.3681)




50 tensor(210.1789)
100 tensor(204.5004)
150 tensor(200.4116)
200 tensor(198.1672)
250 tensor(199.1358)
300 tensor(198.2924)




50 tensor(231.5375)
100 tensor(226.5365)
150 tensor(229.4781)
200 tensor(231.4583)
250 tensor(226.5414)
300 tensor(226.2895)




50 tensor(232.1916)
100 tensor(228.8966)
150 tensor(234.3944)
200 tensor(235.0987)
250 tensor(229.9450)
300 tensor(228.2445)




50 tensor(229.3380)
100 tensor(227.6495)
150 tensor(230.7379)
200 tensor(232.0704)
250 tensor(233.3507)
300 tensor(235.1933)




50 tensor(205.8096)
100 tensor(198.9461)
150 tensor(193.7176)
200 tensor(191.4719)
250 tensor(190.4464)
300 tensor(187.7679)




50 tensor(204.5936)
100 tensor(198.8827)
150 tensor(192.1316)
200 tensor(191.7708)
250 tensor(187.2204)
300 tensor(186.7061)




50 tensor(208.2058)
100 tensor(202.7624)
150 tensor(200.1342)
200 tensor(195.1940)
250 tensor(196.5866)
300 tensor(192.4830)




50 tensor(280.8603)
100 tensor(244.1492)
150 tensor(245.0082)
200 tensor(253.2157)
250 tensor(245.3171)
300 tensor(250.1101)




50 tensor(231.0909)
100 tensor(229.7380)
150 tensor(228.7110)
200 tensor(230.4012)
250 tensor(229.4269)
300 tensor(230.5448)




50 tensor(238.3144)
100 tensor(229.9802)
150 tensor(236.1840)
200 tensor(231.3784)
250 tensor(228.3245)
300 tensor(229.6743)




50 tensor(208.3472)
100 tensor(197.1072)
150 tensor(200.0561)
200 tensor(193.3410)
250 tensor(192.0423)
300 tensor(188.2139)




50 tensor(207.6814)
100 tensor(201.6565)
150 tensor(198.8749)
200 tensor(201.6703)
250 tensor(196.8871)
300 tensor(197.3461)




50 tensor(210.2308)
100 tensor(202.4248)
150 tensor(198.4446)
200 tensor(196.7221)
250 tensor(198.2836)
300 tensor(197.0681)




50 tensor(250.4229)
100 tensor(249.6544)
150 tensor(264.2796)
200 tensor(249.1526)
250 tensor(248.4046)
300 tensor(248.3296)




50 tensor(239.5768)
100 tensor(236.3243)
150 tensor(238.9701)
200 tensor(228.9630)
250 tensor(228.4881)
300 tensor(232.0840)




50 tensor(242.7798)
100 tensor(243.6463)
150 tensor(243.0724)
200 tensor(241.6046)
250 tensor(241.5630)
300 tensor(241.5630)




50 tensor(317.5160)
100 tensor(310.3251)
150 tensor(305.5786)
200 tensor(304.0794)
250 tensor(300.2977)
300 tensor(297.9107)


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', skNN(hidden_layer=30, lr=0.1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__hidden_layer': [30, 40, 50, 60], 'clf__lr': [0.01, 0.05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [79]:
grid_search.best_params_

{'clf__hidden_layer': 60, 'clf__lr': 0.01}

In [80]:
nn_predicted = grid_search.predict(test_data_feature)
sum(nn_predicted == test_data_target) / len(test_data_target)



0.72422222222222221

In [81]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

print(f1_score(test_data_target,nn_predicted))
confusion_matrix(test_data_target, nn_predicted)

0.538661710037


array([[5069, 1957],
       [ 525, 1449]])