In [174]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC # SVCは非線形SVM（Support Vector Classifier）
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# ファイル入出力ライブラリ
import pickle

# データの読み込み

In [2]:
df_input = pd.read_csv('../bank_marketing_train.csv')

# 特徴量エンジニアリング

### 最終的なペルソナに従い、特徴量を抽出する
- age:60以上
- job:retired
- marital：結婚経験あり
- default(クレジットの支払い遅延)：なし
- education(最終学歴)：basic.4y
- contact(連絡デバイス)：cellular
- pdays（前回の接触からの経過日数）：少ない
- poutcome（以前のキャンペーン結果）：あり（初めての客でない）

### ペルソナに用いた説明変数以外で重要なものも含める
- emp.var.rate
- cons.price.idx
- cons.conf.idx

In [245]:
df_extract = df_input.loc[:, ['age','job','marital','default','education','contact','pdays','poutcome',
                            'emp.var.rate','cons.price.idx','cons.conf.idx','y']]
# df_extract = df_input.loc[:, ['age','job','marital','default','education','housing','loan','contact','pdays','poutcome',
#                             'previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']]

In [246]:
df_extract.head(2)

Unnamed: 0,age,job,marital,default,education,housing,loan,contact,pdays,poutcome,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,no,basic.4y,no,no,telephone,999,nonexistent,0,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,unknown,high.school,no,no,telephone,999,nonexistent,0,1.1,93.994,-36.4,4.857,5191.0,no


### 特徴量と目的変数に分ける

In [247]:
df_X = df_extract.drop('y', axis=1)

In [248]:
df_X.head(2)

Unnamed: 0,age,job,marital,default,education,housing,loan,contact,pdays,poutcome,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,housemaid,married,no,basic.4y,no,no,telephone,999,nonexistent,0,1.1,93.994,-36.4,4.857,5191.0
1,57,services,married,unknown,high.school,no,no,telephone,999,nonexistent,0,1.1,93.994,-36.4,4.857,5191.0


In [249]:
df_y = df_extract['y']

In [250]:
df_y.head(2)

0    no
1    no
Name: y, dtype: object

### 目的変数のyes/noを1/0に変換

In [251]:
df_y_val = df_y.apply(lambda x: 1 if x == 'yes' else 0)

In [252]:
df_y_val.head(2)

0    0
1    0
Name: y, dtype: int64

### Dummyエンコーディング

In [253]:
# カテゴリカル変数
categorical_columns = ['job','marital','default','education','housing','loan','contact','poutcome']

In [254]:
df_X_dm = pd.get_dummies(data=df_X, dummy_na=True, drop_first=True, columns=categorical_columns)

In [255]:
df_X_dm.columns

Index(['age', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'job_nan', 'marital_married',
       'marital_single', 'marital_unknown', 'marital_nan', 'default_unknown',
       'default_yes', 'default_nan', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'education_nan', 'housing_unknown', 'housing_yes',
       'housing_nan', 'loan_unknown', 'loan_yes', 'loan_nan',
       'contact_telephone', 'contact_nan', 'poutcome_nonexistent',
       'poutcome_success', 'poutcome_nan'],
      dtype='object')

### 機械学習モデルに入力するためNumpy配列に変換

In [256]:
y = np.array(df_y_val)
X = np.array(df_X_dm)

In [257]:
print(y)
print(X)

[0 0 0 ..., 1 1 0]
[[  56.  999.    0. ...,    1.    0.    0.]
 [  57.  999.    0. ...,    1.    0.    0.]
 [  56.  999.    0. ...,    1.    0.    0.]
 ..., 
 [  73.  999.    0. ...,    1.    0.    0.]
 [  44.  999.    0. ...,    1.    0.    0.]
 [  74.  999.    1. ...,    0.    0.    0.]]


### HoldOut

In [258]:
# X_train, X_test, y_train, y_test = train_test_split(df_X_dm, df_y_val, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Modelの生成

In [259]:
logistic = Pipeline([('scl',StandardScaler()), ('est',LogisticRegression(random_state=1, class_weight="balanced"))])
# random_forest = Pipeline([('scl',StandardScaler()), ('est',RandomForestClassifier(random_state=1, class_weight="balanced"))])
# xgb = Pipeline([('scl',StandardScaler()), ('est',XGBClassifier(random_state=1, class_weight="balanced"))])
# lgbm = Pipeline([('scl',StandardScaler()), ('est',LGBMClassifier(random_state=1, class_weight="balanced"))])

# 学習

In [260]:
logistic.fit(X_train, y_train)
# random_forest.fit(X_train, y_train)
# xgb.fit(X_train, y_train)
# lgbm.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=1,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

# 予測

In [261]:
y_pred_proba = logistic.predict_proba(X_test)
# y_pred_proba = random_forest.predict_proba(X_test)
# y_pred_proba = xgb.predict_proba(X_test)
# y_pred_proba = lgbm.predict_proba(X_test)

# ベストな閾値を求める

### ROIの計算処理

In [262]:
def calc_roi(y_list: np.ndarray, attack_list: np.ndarray) -> int:
    roi = 0
    for (yes, attack) in zip(y_list, attack_list):
    #     print(yes, attack)
        if attack == 1:
            roi += 5000*yes - 500*attack

    return roi

### 閾値を変えてROIを計算

In [263]:
best_roi = 0
best_threshold = 0
best_attack_list = np.ndarray([])

for threshold in np.arange(0, 1.01, 0.01):
    attack_list = (y_pred_proba[:, 1] > threshold).astype(int)
    roi = calc_roi(y_test, attack_list)
#     print('threshold:', threshold, ', roi:', roi)
    if roi > best_roi:
        best_roi = roi
        best_threshold = threshold
        best_attack_list = attack_list

In [264]:
print(best_roi)
print(best_threshold)
print(best_attack_list)

2353500
0.55
[0 0 1 ..., 0 0 1]


### ベストな閾値は上の通り

# 複数モデルの中からベストモデルを選ぶ

### 複数モデルを構築

In [212]:
pipelines = {
            'knn':
                Pipeline([('scl',StandardScaler()), ('est',KNeighborsClassifier())]),
            'logistic':
                Pipeline([('scl',StandardScaler()), ('est',LogisticRegression(random_state=1, class_weight="balanced"))]),
            'rsvc':
                Pipeline([('scl',StandardScaler()), ('est',SVC(C=1.0,kernel='rbf',class_weight='balanced',random_state=1, probability=True))]),
#             'lsvc':
#                 Pipeline([('scl',StandardScaler()), ('est',LinearSVC(C=1.0,random_state=1,class_weight='balanced'))]),
            'rf':
                Pipeline([('scl',StandardScaler()), ('est',RandomForestClassifier(random_state=1, class_weight="balanced"))]),
            'gb':
                Pipeline([('scl',StandardScaler()), ('est',GradientBoostingClassifier(random_state=1))]),
            'mlp':
                Pipeline([('scl',StandardScaler()), ('est',MLPClassifier(hidden_layer_sizes=(5,3), max_iter=500, random_state=1))]),
            'xgb':
                Pipeline([('scl',StandardScaler()), ('est',XGBClassifier(random_state=1, class_weight="balanced"))]),
            'lgbm':
                Pipeline([('scl',StandardScaler()), ('est',LGBMClassifier(random_state=1, class_weight="balanced"))]),
    }

### 各モデルのROIを求める

In [217]:
model_info = {}
for pipe_name, pipeline in pipelines.items():
    # 学習
    pipeline.fit(X_train, y_train)
    # 予測
    y_pred_proba = pipeline.predict_proba(X_test)
    
    best_roi = 0
    best_threshold = 0
    best_attack_list = np.ndarray([])

    for threshold in np.arange(0, 1.01, 0.01):
        attack_list = (y_pred_proba[:, 1] > threshold).astype(int)
        roi = calc_roi(y_test, attack_list)
    #     print('threshold:', threshold, ', roi:', roi)
        if roi > best_roi:
            best_roi = roi
            best_threshold = threshold
            best_attack_list = attack_list
    
    model_info[pipe_name] = {'model': pipeline, 'roi': best_roi, 'threshold': best_threshold, 'attack_list': best_attack_list}



### # 各モデルのベストROIと閾値を確認

In [220]:
for model_name, info in model_info.items():
    print(model_name)
    print(' roi       :', info['roi'])
    print(' threshold :', info['threshold'])

knn
 roi       : 1752500
 threshold : 0.2
logistic
 roi       : 2328000
 threshold : 0.53
rsvc
 roi       : 2335500
 threshold : 0.14
rf
 roi       : 1773500
 threshold : 0.09
gb
 roi       : 2478000
 threshold : 0.13
mlp
 roi       : 2375000
 threshold : 0.12
xgb
 roi       : 2478000
 threshold : 0.13
lgbm
 roi       : 2471500
 threshold : 0.54


# アタックリストの作成

In [221]:
def make_attack_list(dataset: pd.DataFrame, model: Pipeline, threshold: float) -> np.ndarray:

    # ペルソナに用いた説明変数、重要な説明変数を含める(yは含めないように注意)
    df_X = dataset.loc[:, ['age','job','marital','default','education','contact','pdays','poutcome',
                            'emp.var.rate','cons.price.idx','cons.conf.idx']]
    
    # Dummyエンコーディング
    categorical_columns = ['job','marital','default','education','contact','poutcome']
    df_X_dm = pd.get_dummies(data=df_X, dummy_na=True, drop_first=True, columns=categorical_columns)
    
    # 機械学習モデルに入力するためNumpy配列に変換
    X = np.array(df_X_dm)
    
    # Modelの生成
    y_pred_proba = model.predict_proba(X)
    
    # アタックリストの作成
    attack_list = (y_pred_proba[:, 1] > threshold).astype(int)
    
    return attack_list

In [222]:
# 当日はテストデータを読み込む
df_test = pd.read_csv('../bank_marketing_train.csv')
final_attack_list = make_attack_list(df_test, model_info['xgb']['model'], model_info['xgb']['threshold'])

In [223]:
final_attack_list

array([0, 0, 0, ..., 1, 1, 1])

### アタックリストをcsv出力

In [224]:
np.savetxt('attack.csv', final_attack_list,delimiter=',', fmt='%d')

# 期待される収益

In [225]:
# yのリストを求める
y = df_test['y']
y_val = y.apply(lambda x: 1 if x == 'yes' else 0)
y_list = np.array(y_val)

In [226]:
# ROIの計算
calc_roi(y_list=y_list, attack_list=final_attack_list)

8587500

### （おまけ）yをすべて当てた時の理想収益

In [227]:
y_val[y_val == 1].size * (5000-500)

17082000

In [228]:
(17082000-8587500)/500

16989.0

# Modelの予測精度

In [154]:
print('accuracy  :', accuracy_score(y_true=y_list, y_pred=final_attack_list))
print('recall    :', recall_score(y_true=y_list, y_pred=final_attack_list))
print('precision :', precision_score(y_true=y_list, y_pred=final_attack_list))
print('f1        :', f1_score(y_true=y_list, y_pred=final_attack_list))

accuracy  : 0.800349691797
recall    : 0.639357218124
precision : 0.311353431687
f1        : 0.418773186093


In [156]:
y_val.size

33744