In [174]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC, LinearSVC # SVCは非線形SVM（Support Vector Classifier）
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# ファイル入出力ライブラリ
import pickle

In [33]:
class AutoML():
    def __init__(self, categorical_columns, eval_kind):
        self.categorical_columns = categorical_columns
        self.eval_kind = eval_kind

        # 2値分類ができるモデルをパイプラインで作成
        self.pipelines = {
#             'knn':
#                 Pipeline([('scl',StandardScaler()), ('est',KNeighborsClassifier())]),
            'logistic':
                Pipeline([('scl',StandardScaler()), ('est',LogisticRegression(random_state=1, class_weight="balanced"))]),
#             'rsvc':
#                 Pipeline([('scl',StandardScaler()), ('est',SVC(C=1.0,kernel='rbf',class_weight='balanced',random_state=1, probability=True))]),
#             'lsvc':
#                 Pipeline([('scl',StandardScaler()), ('est',LinearSVC(C=1.0,class_weight='balanced',random_state=1))]),
#             'rf':
#                 Pipeline([('scl',StandardScaler()), ('est',RandomForestClassifier(random_state=1))]),
#             'gb':
#                 Pipeline([('scl',StandardScaler()), ('est',GradientBoostingClassifier(random_state=1))]),
#             'mlp':
#                 Pipeline([('scl',StandardScaler()), ('est',MLPClassifier(hidden_layer_sizes=(5,3), max_iter=500, random_state=1))])
        }
        
    def read_data_file(self, file_path):
        # objectで読み込むのがポイント！ 'Dependents_2'が'Dependents_2.0'となることを避けられる
        dtype = {column: object for column in self.categorical_columns}
        df = pd.read_csv(file_path, header=0, dtype=dtype)

        X  = df.iloc[:,2:]            # 3列目以降を特徴量
        ID = df.iloc[:,[0]]             # 第0列はPK（Loan_ID）なのでIDとしてセット
        y  = df.iloc[:,1]              # 2列目をクラス変数
        
        self.ID_name = ID.columns[0]
        self.y_name = y.name
        
        return X, y, ID
    
    def fit(self, X, y):
        # データ前処理
        X_pre = self.__preprocess_X(X)
        y_pre = self.__preprocess_y(y)
        
        # one-hotエンコーディングヘッダをテストデータにも適用するため保持
        self.X_columns = X_pre.columns.values
#         
#         # デバッグ文
#         print('---X_columns start------------')
#         display(self.X_columns)
#         print('---X_columns end------------')
        
        # 全てのモデルでfitを行う（cross-validationで検証する）
        # fit & evaluation
        scores = {}
        for pipe_name, pipeline in self.pipelines.items():
            pipeline.fit(X_pre, y_pre)
            cv_results = cross_val_score(pipeline,
                             X_pre,
                             y_pre.values,
                             cv=3,
                             scoring=self.eval_kind)

            # cross-validationの平均値-標準偏差をスコアとする
            scores[pipe_name] = cv_results.mean() - cv_results.std()
            sorted_scores = self.__sort_dictionary(scores)

        # アルゴリズムランキングと性能指標評価を出力
        display(pd.Series(sorted_scores))

        # eval_kindを用いて評価する
        # 一番評価の良かったモデルを取っておく
        best_algorithm = [*sorted_scores][0]
#         print('best:{}'.format(best_algorithm))
        self.best_model = self.pipelines[best_algorithm]
    
    # X前処理
    def __preprocess_X(self, X):
        # Xのone-hotエンコーディング
        X_ohe = pd.get_dummies(X, dummy_na=True, columns=self.categorical_columns)

        # 欠損値を平均で置き換える
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(X_ohe)
        X_ohe_columns = X_ohe.columns.values
        X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
        
        # デバッグ文
        print('preprocess result-----')
        display(X_ohe)
        print('----------------------')
        
        return X_ohe

    # y前処理
    def __preprocess_y(self, y):
        # yは欠損値を最頻値で置き換える
        y_pre = y.fillna(y.mode()[0])
#         # デバッグ文
#         print('preprocess result-----')
#         display(y_pre)
#         print('----------------------')
        return y_pre
    
    def __sort_dictionary(self, dict):
        sorted_dict = {}
        for k, v in sorted(dict.items(), key=lambda x: -x[1]):
            sorted_dict[k] = v
        
        return sorted_dict
    
    def __get_score(self, y, y_predict):
        if self.eval_kind == 'accuracy':
            return accuracy_score(y, y_predict)
        if self.eval_kind == 'precision':
            return precision_score(y, y_predict)
        if self.eval_kind == 'recall':
            return recall_score(y, y_predict)
        if self.eval_kind == 'f1':
            return f1_score(y, y_predict)

    def __preprocess_test(self, X_test):
        # データ前処理
        X_test_pre = self.__preprocess_X(X_test)

        # 訓練データのヘッダから空データフレーム作成
        df_cols_m = pd.DataFrame(None,
                         columns=self.X_columns,
                         dtype=float)
#         # デバッグ文
#         print('---df_cols_m start--------------------')
#         display(df_cols_m)
#         print('---df_cols_m end----------------------')
        
        # テストデータの列を訓練データに合わせる
        X_test_concat = pd.concat([df_cols_m, X_test_pre])

#         # デバッグ文
#         display(X_test_concat)
        
        # 訓練データにない列を削除
        X_test_drop = X_test_concat.drop(list(set(X_test_concat.columns.values)-set(self.X_columns)),axis=1)
        

        # テストデータに登場しなかったデータ項目をゼロ埋め
        X_test_drop.loc[:,list(set(self.X_columns)-set(X_test_pre.columns.values))] = \
            X_test_drop.loc[:,list(set(self.X_columns)-set(X_test_pre.columns.values))].fillna(0, axis=1)

#         # デバッグ文
#         print('---X_test_drop start--------------------')
#         display(X_test_drop)
#         print('---X_test_drop end----------------------')

        # 訓練データと合わせて並び替え
        X_test_drop_reindex = X_test_drop.reindex(self.X_columns, axis=1)

        # デバッグ文
        print('---X_test_drop_reindex start--------------------')
        display(X_test_drop_reindex)
        print('---X_test_drop_reindex end----------------------')

        return X_test_drop_reindex
        
    
    def predict(self, X_test):
        # データ前処理
        X_test_pre_complete = self.__preprocess_test(X_test)
        
        return self.best_model.predict(X_test_pre_complete)
                
    def predict_proba(self, X_test):
        # データ前処理
        X_test_pre_complete = self.__preprocess_test(X_test)

        return self.best_model.predict_proba(X_test_pre_complete)

    def predict_proba_with_id(self, id, X_test):
        proba = self.predict_proba(X_test)
        
        # 2列目が'1'の予測確率
        proba_1 = proba[:, 1]
        
        # IDと予測確率の結合
        ID_array = ID_test.values
        ID_array_1dim = ID_array[:,0]
        result = np.vstack((ID_array_1dim, proba_1))
        result_df = pd.DataFrame(result).T
        
        # ヘッダをつける
        result_df.columns = [self.ID_name, self.y_name]
        
        return result_df

    # file_nameにAutoMLごとベストモデルを保存する
    def save(self, file_name):
        with open(file_name, mode='wb') as f:
            pickle.dump(self, f)

    # file_nameから学習済みモデルを保持したAutoMLを呼び出す
    def load(self, file_name):
        with open(file_name, mode='rb') as f:
            return pickle.load(f)

# データの読み込み

In [2]:
df_input = pd.read_csv('../bank_marketing_train.csv')

# 特徴量エンジニアリング

### 最終的なペルソナに従い、特徴量を抽出する
- age:60以上
- job:retired
- marital：結婚経験あり
- default(クレジットの支払い遅延)：なし
- education(最終学歴)：basic.4y
- contact(連絡デバイス)：cellular
- pdays（前回の接触からの経過日数）：少ない
- poutcome（以前のキャンペーン結果）：あり（初めての客でない）

### ペルソナに用いた説明変数以外で重要なものも含める
- emp.var.rate
- cons.price.idx
- cons.conf.idx

In [3]:
df_extract = df_input.loc[:, ['age','job','marital','default','education','contact','pdays','poutcome',
                            'emp.var.rate','cons.price.idx','cons.conf.idx','y']]

In [4]:
df_extract.head(2)

Unnamed: 0,age,job,marital,default,education,contact,pdays,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,y
0,56,housemaid,married,no,basic.4y,telephone,999,nonexistent,1.1,93.994,-36.4,no
1,57,services,married,unknown,high.school,telephone,999,nonexistent,1.1,93.994,-36.4,no


### 特徴量と目的変数に分ける

In [5]:
df_X = df_extract.drop('y', axis=1)

In [6]:
df_X.head(2)

Unnamed: 0,age,job,marital,default,education,contact,pdays,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx
0,56,housemaid,married,no,basic.4y,telephone,999,nonexistent,1.1,93.994,-36.4
1,57,services,married,unknown,high.school,telephone,999,nonexistent,1.1,93.994,-36.4


In [7]:
df_y = df_extract['y']

In [8]:
df_y.head(2)

0    no
1    no
Name: y, dtype: object

### 目的変数のyes/noを1/0に変換

In [9]:
df_y_val = df_y.apply(lambda x: 1 if x == 'yes' else 0)

In [10]:
df_y_val.head(2)

0    0
1    0
Name: y, dtype: int64

### Dummyエンコーディング

In [16]:
# カテゴリカル変数
categorical_columns = ['job','marital','default','education','contact','poutcome']

In [19]:
df_X_dm = pd.get_dummies(data=df_X, dummy_na=True, drop_first=True, columns=categorical_columns)

In [21]:
df_X_dm.columns

Index(['age', 'pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'job_nan', 'marital_married', 'marital_single', 'marital_unknown',
       'marital_nan', 'default_unknown', 'default_yes', 'default_nan',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'education_nan',
       'contact_telephone', 'contact_nan', 'poutcome_nonexistent',
       'poutcome_success', 'poutcome_nan'],
      dtype='object')

### 機械学習モデルに入力するためNumpy配列に変換

In [31]:
y = np.array(df_y_val)
X = np.array(df_X_dm)

In [33]:
print(y)
print(X)

[0 0 0 ..., 1 1 0]
[[  56.   999.     1.1 ...,    1.     0.     0. ]
 [  57.   999.     1.1 ...,    1.     0.     0. ]
 [  56.   999.     1.1 ...,    1.     0.     0. ]
 ..., 
 [  73.   999.    -1.1 ...,    1.     0.     0. ]
 [  44.   999.    -1.1 ...,    1.     0.     0. ]
 [  74.   999.    -1.1 ...,    0.     0.     0. ]]


### HoldOut

In [34]:
# X_train, X_test, y_train, y_test = train_test_split(df_X_dm, df_y_val, test_size=0.3, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Modelの生成

In [176]:
# logistic = Pipeline([('scl',StandardScaler()), ('est',LogisticRegression(random_state=1, class_weight="balanced"))])
random_forest = Pipeline([('scl',StandardScaler()), ('est',RandomForestClassifier(random_state=1, class_weight="balanced"))])
xgb = Pipeline([('scl',StandardScaler()), ('est',XGBClassifier(random_state=1, class_weight="balanced"))])
lgbm = Pipeline([('scl',StandardScaler()), ('est',LGBMClassifier(random_state=1, class_weight="balanced"))])

In [177]:
logistic.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
xgb.fit(X_train, y_train)
lgbm.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
        max_depth=-1, min_child_samples=20, min_child_weight=0.001,
        min_sp...ambda=0.0,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0))])

# 予測

In [190]:
# y_pred_proba = logistic.predict_proba(X_test)
# y_pred_proba = random_forest.predict_proba(X_test)
# y_pred_proba = xgb.predict_proba(X_test)
y_pred_proba = lgbm.predict_proba(X_test)

# ベストな閾値を求める

### ROIの計算処理

In [105]:
def calc_roi(y_list: np.ndarray, attack_list: np.ndarray) -> int:
    roi = 0
    for (yes, attack) in zip(y_list, attack_list):
    #     print(yes, attack)
        if attack == 1:
            roi += 5000*yes - 500*attack

    return roi

### 閾値を変えてROIを計算

In [191]:
best_roi = 0
best_threshold = 0
best_attack_list = np.ndarray([])

for threshold in np.arange(0, 1.01, 0.01):
    attack_list = (y_pred_proba[:, 1] > threshold).astype(int)
    roi = calc_roi(y_test, attack_list)
    print('threshold:', threshold, ', roi:', roi)
    if roi > best_roi:
        best_roi = roi
        best_threshold = threshold
        best_attack_list = attack_list

threshold: 0.0 , roi: 308000
threshold: 0.01 , roi: 308000
threshold: 0.02 , roi: 308000
threshold: 0.03 , roi: 308000
threshold: 0.04 , roi: 310000
threshold: 0.05 , roi: 313000
threshold: 0.06 , roi: 322500
threshold: 0.07 , roi: 327000
threshold: 0.08 , roi: 340500
threshold: 0.09 , roi: 347000
threshold: 0.1 , roi: 352500
threshold: 0.11 , roi: 368000
threshold: 0.12 , roi: 381500
threshold: 0.13 , roi: 416000
threshold: 0.14 , roi: 439000
threshold: 0.15 , roi: 503000
threshold: 0.16 , roi: 581000
threshold: 0.17 , roi: 667500
threshold: 0.18 , roi: 774000
threshold: 0.19 , roi: 847500
threshold: 0.2 , roi: 907000
threshold: 0.21 , roi: 1033500
threshold: 0.22 , roi: 1149000
threshold: 0.23 , roi: 1254000
threshold: 0.24 , roi: 1353500
threshold: 0.25 , roi: 1415000
threshold: 0.26 , roi: 1490500
threshold: 0.27 , roi: 1566500
threshold: 0.28 , roi: 1614000
threshold: 0.29 , roi: 1699000
threshold: 0.3 , roi: 1755000
threshold: 0.31 , roi: 1800000
threshold: 0.32 , roi: 1911000
th

In [192]:
print(best_roi)
print(best_threshold)
print(best_attack_list)

2471500
0.54
[0 0 1 ..., 0 0 0]


### ベストな閾値は上の通り

# アタックリストの作成

In [182]:
def make_attack_list(dataset: pd.DataFrame, model: Pipeline, threshold: float) -> np.ndarray:

    # ペルソナに用いた説明変数、重要な説明変数を含める(yは含めないように注意)
    df_X = dataset.loc[:, ['age','job','marital','default','education','contact','pdays','poutcome',
                            'emp.var.rate','cons.price.idx','cons.conf.idx']]
    
    # Dummyエンコーディング
    categorical_columns = ['job','marital','default','education','contact','poutcome']
    df_X_dm = pd.get_dummies(data=df_X, dummy_na=True, drop_first=True, columns=categorical_columns)
    
    # 機械学習モデルに入力するためNumpy配列に変換
    X = np.array(df_X_dm)
    
    # Modelの生成
    y_pred_proba = logistic.predict_proba(X)
    
    # アタックリストの作成
    attack_list = (y_pred_proba[:, 1] > threshold).astype(int)
    
    return attack_list

In [193]:
# 当日はテストデータを読み込む
df_test = pd.read_csv('../bank_marketing_train.csv')
final_attack_list = make_attack_list(df_test, logistic, best_threshold)

In [194]:
final_attack_list

array([0, 0, 0, ..., 1, 1, 1])

### アタックリストをcsv出力

In [134]:
np.savetxt('attack.csv', final_attack_list,delimiter=',', fmt='%d')

# 期待される収益

In [195]:
# yのリストを求める
y = df_test['y']
y_val = y.apply(lambda x: 1 if x == 'yes' else 0)
y_list = np.array(y_val)

In [196]:
# ROIの計算
calc_roi(y_list=y_list, attack_list=final_attack_list)

8169000

### （おまけ）yをすべて当てた時の理想収益

In [149]:
y_val[y_val == 1].size * (5000-500)

17082000

# Modelの予測精度

In [154]:
print('accuracy  :', accuracy_score(y_true=y_list, y_pred=final_attack_list))
print('recall    :', recall_score(y_true=y_list, y_pred=final_attack_list))
print('precision :', precision_score(y_true=y_list, y_pred=final_attack_list))
print('f1        :', f1_score(y_true=y_list, y_pred=final_attack_list))

accuracy  : 0.800349691797
recall    : 0.639357218124
precision : 0.311353431687
f1        : 0.418773186093


In [156]:
y_val.size

33744