# Automated Machine Learning

## 機能要件
* 2値の分類（Classification）タスクを扱える
* カテゴリカル変数を指定するとone-hotエンコードを実行する
* モデル用データマートに施したのと同一のデータ前処理が、スコア用データマートに対しても適用される
* モデル選択の評価指標を選択できる
* 複数アルゴリズムから指定の評価指標に従いベストモデルを選択できる
* 学習済みモデルを保存できる
* アルゴリズムランキングと性能評価指標が出力される
* 学習済みモデル（保存したモデル）を呼び出しスコア用データに対し予測確率を付与できる

In [1]:
# Enter the categorical variables separated by commas (",").
# (e.g. Dependents,Gender,Married,Education,Self_Employed,Property_Area)
# Whitespace around each name is stripped and empty entries are dropped, so
# "sales, salary" or a trailing comma does not yield a non-existent column name.
ohe_columns = [name.strip() for name in input().split(",") if name.strip()]

sales,salary


In [2]:
# Enter the evaluation metric (choices: accuracy, precision, recall, f1, roc_auc).
# The value is later passed as the `scoring` argument of cross_val_score, so
# surrounding whitespace is stripped to avoid an unknown scorer name.
chosen_metrics = input().strip()

roc_auc


In [6]:
# 関連ライブラリのインポート
import numpy as np
import pandas as pd
from statistics import mean

from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.externals import joblib

# Import data
# Read every user-specified categorical column as `object` so that pandas
# does not infer a numeric dtype for it before one-hot encoding.
ohe_dict = dict.fromkeys(ohe_columns, object)

df = pd.read_csv('./data/final_hr_analysis_train.csv', header=0, dtype=ohe_dict)
ID = df.iloc[:, [0]]    # first column: ID information
y  = df.iloc[:, [1]]    # second column: target (ground truth)
X  = df.iloc[:, 2:]     # third column onward: features X

# If date columns need to be processed:
# X.loc[:, ['<date column>']] = CDATE (reference-date variable) - X.loc[:, ['<date column>']]

# Pipeline definitions: every candidate estimator is preceded by a StandardScaler.
def _scaled_pipeline(estimator):
    """Return a Pipeline with a 'scl' StandardScaler step followed by the 'est' estimator."""
    return Pipeline([('scl', StandardScaler()), ('est', estimator)])


pipe_knn = _scaled_pipeline(KNeighborsClassifier())
pipe_logistic = _scaled_pipeline(LogisticRegression(random_state=1))
pipe_rfc = _scaled_pipeline(RandomForestClassifier(random_state=1))
pipe_gbc = _scaled_pipeline(GradientBoostingClassifier(random_state=1))
pipe_gbc200 = _scaled_pipeline(
    GradientBoostingClassifier(n_estimators=200, random_state=1))
pipe_gbc500 = _scaled_pipeline(
    GradientBoostingClassifier(n_estimators=500, random_state=1))

# One-hot encoding of the user-specified categorical columns.
# dummy_na=True also emits a "<column>_nan" indicator column per encoded variable.
X_ohe = pd.get_dummies(X, columns=ohe_columns, dummy_na=True)

# Missing-value imputation: remember the encoded column layout, then fit a
# column-wise mean imputer and replace X_ohe with the imputed values.
# `imp` and `X_ohe_columns` are reused later when scoring new data.
# NOTE(review): Imputer is the pre-0.20 scikit-learn API; newer releases
# provide sklearn.impute.SimpleImputer instead - confirm before upgrading.
X_ohe_columns = X_ohe.columns.values
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_ohe = pd.DataFrame(imp.fit_transform(X_ohe), columns=X_ohe_columns)

# RFE (recursive feature elimination; use when feature selection is needed).
# Keeps 20 features, ranking them with a gradient boosting classifier and
# removing 5% of the features per iteration (step=0.05).
selector = RFE(
    estimator=GradientBoostingClassifier(random_state=0),
    n_features_to_select=20,
    step=0.05,
)
# `.values` replaces DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0; the result is the same ndarray.
selector.fit(X_ohe, y.values.ravel())
X_ohe_selected = selector.transform(X_ohe)
# Restore the column names of the features that RFE kept.
X_ohe_selected = pd.DataFrame(X_ohe_selected, columns=X_ohe_columns[selector.support_])

# Holdout: set 20% of the data aside; the grid searches are fitted on the
# remaining training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X_ohe_selected, y, test_size=0.20, random_state=1)

# Grid 1: tune the number of boosting stages with the other settings fixed.
# NOTE(review): scoring is fixed to 'roc_auc' here, independent of the
# user-selected `chosen_metrics` - confirm that this is intended.
learning_rate_start = 0.2
param_test1 = {'n_estimators': range(20, 101, 10)}
gsearch1 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=learning_rate_start,
        min_samples_split=500,
        min_samples_leaf=50,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5)
# `.values` replaces DataFrame.as_matrix() (deprecated in pandas 0.23,
# removed in pandas 1.0).
gsearch1.fit(X_train, y_train.values.ravel())

# Grid 2: tune max_depth and min_samples_split, reusing the n_estimators
# value found by Grid 1.
param_test2 = {
    'max_depth': range(5, 16, 2),
    'min_samples_split': range(200, 1001, 200)}
gsearch2 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=learning_rate_start,
        n_estimators=gsearch1.best_params_["n_estimators"],
        max_features='sqrt',
        subsample=0.8,
        random_state=10),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5)
# `.values` replaces DataFrame.as_matrix() (deprecated in pandas 0.23,
# removed in pandas 1.0).
gsearch2.fit(X_train, y_train.values.ravel())

# Grid 3: tune min_samples_leaf and max_features, reusing the values found
# by Grid 1 (n_estimators) and Grid 2 (min_samples_split, max_depth).
param_test3 = {
    'min_samples_leaf': range(30, 71, 10),
    'max_features': range(3, 11, 1)}
gsearch3 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=learning_rate_start,
        n_estimators=gsearch1.best_params_["n_estimators"],
        min_samples_split=gsearch2.best_params_["min_samples_split"],
        max_depth=gsearch2.best_params_["max_depth"],
        subsample=0.8,
        random_state=10),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5)
# `.values` replaces DataFrame.as_matrix() (deprecated in pandas 0.23,
# removed in pandas 1.0).
gsearch3.fit(X_train, y_train.values.ravel())

# Grid 4: tune subsample, reusing every value found by Grids 1-3.
param_test4 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=learning_rate_start,
        n_estimators=gsearch1.best_params_["n_estimators"],
        min_samples_split=gsearch2.best_params_["min_samples_split"],
        max_depth=gsearch2.best_params_["max_depth"],
        min_samples_leaf=gsearch3.best_params_["min_samples_leaf"],
        random_state=10,
        max_features=gsearch3.best_params_["max_features"]),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5)
# `.values` replaces DataFrame.as_matrix() (deprecated in pandas 0.23,
# removed in pandas 1.0).
gsearch4.fit(X_train, y_train.values.ravel())

def _tuned_gbc_pipeline(factor):
    """Build a scaler + gradient boosting pipeline from the grid-search results.

    The learning rate is `learning_rate_start / factor` and the number of
    estimators is the Grid 1 optimum multiplied by `factor`; every other
    hyper-parameter is taken unchanged from Grids 2-4.
    """
    return Pipeline([
        ('scl', StandardScaler()),
        ('est', GradientBoostingClassifier(
            learning_rate=learning_rate_start / factor,
            n_estimators=gsearch1.best_params_["n_estimators"] * factor,
            min_samples_split=gsearch2.best_params_["min_samples_split"],
            max_depth=gsearch2.best_params_["max_depth"],
            min_samples_leaf=gsearch3.best_params_["min_samples_leaf"],
            max_features=gsearch3.best_params_["max_features"],
            subsample=gsearch4.best_params_["subsample"],
            random_state=10,
        )),
    ])


# Four variants: the tuned setting as-is, and 2x / 10x / 20x more estimators
# with a proportionally smaller learning rate.
pipe_gbc_tuning1 = _tuned_gbc_pipeline(1)
pipe_gbc_tuning2 = _tuned_gbc_pipeline(2)
pipe_gbc_tuning10 = _tuned_gbc_pipeline(10)
pipe_gbc_tuning20 = _tuned_gbc_pipeline(20)

# Training and evaluation.
# Maps a display name to each candidate pipeline. The keys are also used for
# the printed ranking and as the file name of the saved model, so their
# spelling (including "Crassifier") is kept unchanged.
pipe_names = {
    'KNN': pipe_knn,
    'Logistic': pipe_logistic,
    'RandomForestClassifier': pipe_rfc,
    'GradientBoostingCrassifier': pipe_gbc,
    'GradientBoostingCrassifierN200': pipe_gbc200,
    'GradientBoostingCrassifierN500': pipe_gbc500,
    'TunedGBC1': pipe_gbc_tuning1,
    'TunedGBC2': pipe_gbc_tuning2,
    'TunedGBC10': pipe_gbc_tuning10,
    'TunedGBC20': pipe_gbc_tuning20,
}

# `.values` replaces DataFrame.as_matrix() (deprecated in pandas 0.23,
# removed in pandas 1.0); the array is built once instead of per pipeline.
y_all = y.values.ravel()

# Mean 2-fold cross-validation score of every pipeline for the chosen metric.
result = {}
for key, value in pipe_names.items():
    cv_results = cross_val_score(
        value,
        X_ohe_selected,
        y_all,
        cv=2,
        scoring=chosen_metrics,
        n_jobs=4,
    )
    result[key] = mean(cv_results)

# Order the (name, score) pairs from the highest score to the lowest.
sorted_result = sorted(result.items(), key=lambda entry: -entry[1])
print(sorted_result[0][0])
print("評価指標「%s_score」のアルゴリズムランキング"%chosen_metrics)

# Print the ranking line by line and collect it as {rank: {"score", "name"}}.
algorithm_ranking = {}
for rank, (name, score) in enumerate(sorted_result, start=1):
    print("%d位: %.6f %s"%(rank, score, name))
    algorithm_ranking[rank] = {"score": score, "name": name}

print(algorithm_ranking)
    
# Save the top-ranked algorithm.
# cross_val_score fits clones of each pipeline, so the objects stored in
# pipe_names are still unfitted here. Fit the winner on the full modeling
# data first so that the saved file contains a trained model.
best_name = sorted_result[0][0]
best_model = pipe_names[best_name]
best_model.fit(X_ohe_selected, y.values.ravel())
joblib.dump(best_model, best_name + '.pkl')


TunedGBC20
評価指標「roc_auc_score」のアルゴリズムランキング
1位: 0.990778 TunedGBC20
2位: 0.990706 TunedGBC10
3位: 0.990030 TunedGBC2
4位: 0.989545 TunedGBC1
5位: 0.989127 GradientBoostingCrassifierN200
6位: 0.989094 GradientBoostingCrassifierN500
7位: 0.987077 GradientBoostingCrassifier
8位: 0.983574 RandomForestClassifier
9位: 0.954937 KNN
10位: 0.817361 Logistic
{1: {'score': 0.99077785170828359, 'name': 'TunedGBC20'}, 2: {'score': 0.99070620341849036, 'name': 'TunedGBC10'}, 3: {'score': 0.9900303676220894, 'name': 'TunedGBC2'}, 4: {'score': 0.98954539180787326, 'name': 'TunedGBC1'}, 5: {'score': 0.98912748675556361, 'name': 'GradientBoostingCrassifierN200'}, 6: {'score': 0.98909379985992418, 'name': 'GradientBoostingCrassifierN500'}, 7: {'score': 0.98707735715154454, 'name': 'GradientBoostingCrassifier'}, 8: {'score': 0.98357351124409409, 'name': 'RandomForestClassifier'}, 9: {'score': 0.95493664818328061, 'name': 'KNN'}, 10: {'score': 0.81736113127198196, 'name': 'Logistic'}}


['TunedGBC20.pkl']

In [37]:
# Apply the model to the validation (scoring) data.

# Import data
# Reuses `ohe_dict` so the categorical columns are read as object dtype here as well.
df_s = pd.read_csv('./data/final_hr_analysis_test.csv', header=0, dtype=ohe_dict)
ID_s = df_s.iloc[:, [0]]    # the first column is the primary key, so keep it as the ID information
X_s  = df_s.iloc[:, 2:]      # third column onward is read as features X (assumes the same column layout as the training CSV - TODO confirm)
# df_y = pd.read_csv('./data/turnover_answer.csv', header=0)
# y_s = df_y.iloc[:, [1]]

# One-hot encoding with the same arguments as at modeling time.
X_ohe_s = pd.get_dummies(X_s, dummy_na=True, columns=ohe_columns)


# Align the scoring data with the column layout used at modeling time.

# Empty data frame that carries only the post-one-hot column layout from
# modeling time (no rows).
df_cols_m = pd.DataFrame(None, columns=X_ohe_columns, dtype=float)
print(df_cols_m)

# Stack the one-hot encoded scoring data under that frame; columns that are
# missing on either side are created and filled with NaN.
X_ohe_s2 = pd.concat([df_cols_m, X_ohe_s])
print(X_ohe_s2[:5])

# Column differences, kept as ordered lists so the result is deterministic.
model_cols = set(X_ohe_columns)
score_cols = set(X_ohe_s.columns.values)
extra_cols = [col for col in X_ohe_s.columns.values if col not in model_cols]
missing_cols = [col for col in X_ohe_columns if col not in score_cols]

# Drop the columns that only appear in the scoring data.
X_ohe_s2 = X_ohe_s2.drop(extra_cols, axis=1)
print(X_ohe_s2[:5])

# Zero-fill the columns that did not appear in the scoring data.
# Skipped when there are none, to avoid assigning through an empty selection.
if missing_cols:
    X_ohe_s2.loc[:, missing_cols] = X_ohe_s2.loc[:, missing_cols].fillna(0)
print(X_ohe_s2[:5])

# Put the columns in the same order as after one-hot encoding at modeling time.
# reindex(columns=...) replaces reindex_axis, which was deprecated in
# pandas 0.21 and removed in pandas 1.0.
X_ohe_s2 = X_ohe_s2.reindex(columns=X_ohe_columns)
print(X_ohe_s2[:5])

# Apply the imputer that learned the per-column means at modeling time.
imputed_scoring_values = imp.transform(X_ohe_s2)
X_ohe_s3 = pd.DataFrame(imputed_scoring_values, columns=X_ohe_columns)
print(X_ohe_s3[:5])

# Keep only the variables that RFE selected at modeling time.
rfe_selected_columns = X_ohe_columns[selector.support_]
X_fin_s = X_ohe_s3.loc[:, rfe_selected_columns]
print(X_fin_s[:5])


# # モデルの読み込み
# est = joblib.load(sorted_result[0][0]+'.pkl')
# print(pipe_names[sorted_result[0][0]])

# # モデルへの入力
# est.fit(X_ohe_selected,y.as_matrix().ravel())
# df_o = ID_s.copy()
# df_o['predict_proba'] = est.predict_proba(X_fin_s).T[1]

# # ファイル出力
# df_o.to_csv("predict_porba.csv", index=False)

# # 各種スコアの表示
# print("accuracy_score: %.3f"%accuracy_score(y_s.as_matrix().ravel(), est.predict(X_fin_s)))
# print("precision_score: %.3f"%precision_score(y_s.as_matrix().ravel(), est.predict(X_fin_s)))
# print("recall_score: %.3f"%recall_score(y_s.as_matrix().ravel(), est.predict(X_fin_s)))
# print("f1_score: %.3f"%f1_score(y_s.as_matrix().ravel(), est.predict(X_fin_s)))
# print("roc_auc_socore: %.3f"%roc_auc_score(y_s.as_matrix().ravel(), est.predict_proba(X_fin_s).T[1]))


Empty DataFrame
Columns: [satisfaction_level, last_evaluation, number_project, average_montly_hours, time_spend_company, Work_accident, promotion_last_5years, sales_IT, sales_RandD, sales_accounting, sales_hr, sales_management, sales_marketing, sales_product_mng, sales_sales, sales_support, sales_technical, sales_nan, salary_high, salary_low, salary_medium, salary_nan]
Index: []

[0 rows x 22 columns]
   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.44             0.57             2.0                 141.0   
1                0.55             0.96             3.0                 194.0   
2                0.72             0.67             5.0                 210.0   
3                0.96             0.75             4.0                 177.0   
4                0.96             0.54             3.0                 198.0   

   time_spend_company  Work_accident  promotion_last_5years  sales_IT  \
0                 3.0            0.0     

In [None]:
# Use when over-/under-sampling is needed (imbalanced classes).
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)

# `.values` replaces DataFrame.as_matrix() (deprecated in pandas 0.23,
# removed in pandas 1.0); the flattened target is built once and shared.
y_train_flat = y_train.values.ravel()

# NOTE(review): newer imbalanced-learn releases name this method
# fit_resample instead of fit_sample - confirm against the installed version.
X_train_under, y_train_under = rus.fit_sample(X_train, y_train_flat)
X_train_over, y_train_over = ros.fit_sample(X_train, y_train_flat)
X_train_smote, y_train_smote = smt.fit_sample(X_train, y_train_flat)
