In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.model_selection import KFold

In [3]:
df = pd.read_csv('bank.csv')

# Xとyに分ける

In [4]:
X = df.drop(['deposit'], axis=1)

In [5]:
X.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown


In [6]:
y = df['deposit']

In [7]:
y.head(3)

0    yes
1    yes
2    yes
Name: deposit, dtype: object

# 不要なものを削除

In [8]:
# durationとcampaignは未来のデータなので削除する
X = X.drop('duration',axis=1).drop('campaign',axis=1)

In [9]:
X.head(3)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,pdays,previous,poutcome
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,-1,0,unknown
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,-1,0,unknown
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,-1,0,unknown


# one-hotエンコーディング

In [10]:
# one-hotエンコーディング(X)
ohe_columns = ['job',
               'marital',
               'education',
               'default',
               'housing',
               'loan',
               'contact',
               'month',
               'poutcome']

X_ohe = pd.get_dummies(X,
                       dummy_na=True,
                       columns=ohe_columns)

print('x_ohe shape:(%i,%i)' % X_ohe.shape)
X_ohe.head()

x_ohe shape:(11162,58)


Unnamed: 0,age,balance,day,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,month_may,month_nov,month_oct,month_sep,month_nan,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,poutcome_nan
0,59,2343,5,-1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,56,45,5,-1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,41,1270,5,-1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,55,2476,5,-1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,54,184,5,-1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [11]:
# one-hotエンコーディング(y)
y_ohe = pd.get_dummies(y, dummy_na=True, columns=['deposit'])
print('train_y_ohe shape:(%i,%i)' % y_ohe.shape)
y_ohe.head()

train_y_ohe shape:(11162,3)


Unnamed: 0,no,yes,nan
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0


# 多重共線性の削除

In [12]:
X_ohe = X_ohe.drop(['job_nan', 'marital_nan', 'education_nan', 'default_nan', 'housing_nan', 'loan_nan', 'contact_nan', 'month_nan', 'poutcome_nan'],axis=1)

# RandomForestの重要度から、不要な特徴量を削除

In [13]:
X_ohe = X_ohe.drop(['education_primary',
                'education_secondary',
                'marital_divorced',
                'job_management',
                'job_technician',
                'job_admin.',
                'job_housemaid',
                'job_services',
                'education_unknown',
                'job_unemployed',
                'job_entrepreneur',
                'job_self-employed',
                'default_yes',
                'default_no',
                'job_unknown'], axis=1)

In [14]:
display(X_ohe.shape)
display(X_ohe.head(3))

(11162, 34)

Unnamed: 0,age,balance,day,pdays,previous,job_blue-collar,job_retired,job_student,marital_married,marital_single,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,59,2343,5,-1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
1,56,45,5,-1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
2,41,1270,5,-1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


# アンサンブル & スタッキング(積み重ね)モデル

## Pythonクラスによるヘルパー

In [15]:
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(n_splits=NFOLDS,
           shuffle=False,
           random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)

# Class to extend XGboost classifier

## Out-of-Fold Predictions

In [16]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
#     for i, (train_index, test_index) in enumerate(kf):

#     df_y_train = pd.DataFrame(y_train)
#     for i, (train_index, test_index) in enumerate(kf.split(df_y_train)):
    for i, (train_index, test_index) in enumerate(kf.split(train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [17]:
# df_y_train = pd.DataFrame(y_train)
# display(df_y_train)

# 基本となる最初のレベルのモデルたち

In [18]:
# ランダムフォレストのパラメータ
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'warm_start': True,
    #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}

# Extra Treesのパラメータ
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoostのパラメータ
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}

# Gradient Boostingのパラメータ
gb_params = {
    'n_estimators': 500,
    #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# SVCのパラメータ
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}

In [19]:
# 4つのモデルを表す5つのオブジェクトを生成する
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=ada_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

# TrainデータとTestデータの分離

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train_ohe, X_test_ohe, y_train_ohe, y_test_ohe = train_test_split(X_ohe, y_ohe, test_size=0.3, random_state=1)

# モデル作成

In [22]:
rf_model = rf.fit(X_train_ohe, y_train_ohe)

# 予測＆評価

In [23]:
y_pred = rf_model.predict(X_test_ohe)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
accuracy_score(y_test_ohe, y_pred)

0.7028963869811884