In [1]:
# 必要ライブラリのインポート
import numpy as np
import pandas as pd
import os
import pickle
import gc

# 可視化
import matplotlib.pyplot as plt

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# 評価指標
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# モデリング: lightgbm
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

# matplotlibで日本語表示したい場合はこれをinstallしてインポートする
!pip install japanize-matplotlib
import japanize_matplotlib

# 2022/06/02追加: Kaggle notebook環境変更のため
!pip install -U pandas_profiling

# 分布確認
import pandas_profiling as pdp


Collecting japanize-matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... [?25l- \ done
[?25h  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120275 sha256=31d5102df97bcf8d6920a83c79529f0c006314b7f0671e3807df7c9bbb996f6b
  Stored in directory: /root/.cache/pip/wheels/d3/7d/c5/d3e02382561888f86edabf3256c09b3298f8e24456f8fc4da3
Successfully built japanize-matplotlib
Installing collected packages: japanize-matplotlib
Successfully installed japanize-matplotlib-1.1.3
Collecting pandas_profiling
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32

In [2]:
# Load the Titanic training data and preview the first rows
df_train = pd.read_csv("../input/titanic/train.csv")
df_train.head()

# Other loaders, for reference:
#   Excel file:            df = pd.read_excel("file_name.xlsx")
#   Tab-separated file:    df = pd.read_csv("sample.tsv", sep="\t")
#   Shift-JIS encoded CSV: df = pd.read_csv("sample.csv", encoding="shift_jis")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Build the dataset: feature columns, target column and passenger id,
# each kept as a one- or two-column DataFrame.
x_train = df_train[["Pclass", "Fare"]]
y_train = df_train[["Survived"]]
id_train = df_train[["PassengerId"]]
print(x_train.shape, y_train.shape, id_train.shape)

(891, 2) (891, 1) (891, 1)


In [4]:
#ベイズ最適化
import optuna

In [5]:
# Objective-function setup
# Hyperparameters that are held fixed (not searched)
params_base = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    learning_rate=0.02,
    n_estimators=100000,
    bagging_freq=1,
    seed=123,
)

def objective(trial):
    """Optuna objective: mean 5-fold validation accuracy of a LightGBM classifier.

    Reads the module-level ``x_train``, ``y_train`` and ``params_base``.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters that are being searched.

    Returns
    -------
    float
        Mean accuracy over the 5 validation folds, with predicted
        probabilities thresholded at 0.5.
    """
    # Hyperparameters to search
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e2, log=True),
    }
    # Fixed settings are applied last, so they win on any key collision.
    params_tuning.update(params_base)

    # Model training and evaluation
    list_metrics = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    for idx_tr, idx_va in cv.split(x_train, y_train):
        # split() yields positional row numbers, so select with .iloc;
        # .loc would only be correct while the index is the default 0..n-1.
        x_tr, y_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr, :]
        x_va, y_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va, :]
        model = lgb.LGBMClassifier(**params_tuning)
        model.fit(
            x_tr,
            y_tr,
            eval_set=[(x_tr, y_tr), (x_va, y_va)],
            # Early stopping and log silencing are passed as callbacks: the
            # early_stopping_rounds= / verbose= fit arguments were removed
            # in LightGBM 4.
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )
        y_va_pred = model.predict_proba(x_va)[:, 1]
        metric_va = accuracy_score(y_va, np.where(y_va_pred >= 0.5, 1, 0))
        list_metrics.append(metric_va)

    # Aggregate the fold scores into the value Optuna maximizes
    metrics = np.mean(list_metrics)

    return metrics

In [6]:
# Run the optimization (hyperparameter search):
# TPE sampler with a fixed seed, maximizing the objective over 30 trials
sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=30)

[32m[I 2023-04-05 14:40:17,506][0m A new study created in memory with name: no-name-fb1f8b2b-8634-4942-bc39-152406d7818a[0m




[32m[I 2023-04-05 14:40:20,589][0m Trial 0 finished with value: 0.664478061640826 and parameters: {'num_leaves': 181, 'min_data_in_leaf': 61, 'min_sum_hessian_in_leaf': 4.792414358623587e-05, 'feature_fraction': 0.7756573845414456, 'bagging_fraction': 0.8597344848927815, 'lambda_l1': 0.492522233779106, 'lambda_l2': 83.76388146302445}. Best is trial 0 with value: 0.664478061640826.[0m




[32m[I 2023-04-05 14:40:22,441][0m Trial 1 finished with value: 0.6712196346745339 and parameters: {'num_leaves': 178, 'min_data_in_leaf': 99, 'min_sum_hessian_in_leaf': 0.00015009027543233888, 'feature_fraction': 0.6715890080754348, 'bagging_fraction': 0.8645248536920208, 'lambda_l1': 0.567922374174008, 'lambda_l2': 0.01732652966363563}. Best is trial 1 with value: 0.6712196346745339.[0m




[32m[I 2023-04-05 14:40:24,026][0m Trial 2 finished with value: 0.65762350134957 and parameters: {'num_leaves': 107, 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 3.52756635172055e-05, 'feature_fraction': 0.5877258780737462, 'bagging_fraction': 0.7657756869209191, 'lambda_l1': 1.3406343673102123, 'lambda_l2': 3.4482904089131434}. Best is trial 1 with value: 0.6712196346745339.[0m




[32m[I 2023-04-05 14:40:25,270][0m Trial 3 finished with value: 0.6722302429226037 and parameters: {'num_leaves': 219, 'min_data_in_leaf': 146, 'min_sum_hessian_in_leaf': 0.0006808799287054756, 'feature_fraction': 0.8612216912851107, 'bagging_fraction': 0.6614794569265892, 'lambda_l1': 0.2799978022399009, 'lambda_l2': 0.08185645330667264}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-05 14:40:26,670][0m Trial 4 finished with value: 0.668972443663298 and parameters: {'num_leaves': 81, 'min_data_in_leaf': 128, 'min_sum_hessian_in_leaf': 1.889360449174926e-05, 'feature_fraction': 0.7168505863397641, 'bagging_fraction': 0.7154313816648219, 'lambda_l1': 0.9434967110751797, 'lambda_l2': 0.5050346330980694}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-05 14:40:28,283][0m Trial 5 finished with value: 0.6587847592743706 and parameters: {'num_leaves': 85, 'min_data_in_leaf': 88, 'min_sum_hessian_in_leaf': 0.004788147156768277, 'feature_fraction': 0.9720800091019398, 'bagging_fraction': 0.7509183379421682, 'lambda_l1': 3.1319282717196035, 'lambda_l2': 0.029005047452739414}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-05 14:40:28,655][0m Trial 6 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 87, 'min_data_in_leaf': 86, 'min_sum_hessian_in_leaf': 0.003971252247766701, 'feature_fraction': 0.6252276826982534, 'bagging_fraction': 0.7415171321313522, 'lambda_l1': 87.54657140659076, 'lambda_l2': 1.1965765212602313}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-05 14:40:33,503][0m Trial 7 finished with value: 0.6992530286862093 and parameters: {'num_leaves': 160, 'min_data_in_leaf': 28, 'min_sum_hessian_in_leaf': 0.0030131614432849746, 'feature_fraction': 0.8015300642054637, 'bagging_fraction': 0.7725340032332324, 'lambda_l1': 0.23499322154972468, 'lambda_l2': 0.1646202117975735}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:35,624][0m Trial 8 finished with value: 0.6823363254033017 and parameters: {'num_leaves': 111, 'min_data_in_leaf': 138, 'min_sum_hessian_in_leaf': 0.00423029374725911, 'feature_fraction': 0.7552111687390055, 'bagging_fraction': 0.8346568914811361, 'lambda_l1': 2.206714812711709, 'lambda_l2': 3.1594683442464033}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:36,636][0m Trial 9 finished with value: 0.6362751867428285 and parameters: {'num_leaves': 175, 'min_data_in_leaf': 170, 'min_sum_hessian_in_leaf': 1.7765808030254076e-05, 'feature_fraction': 0.8818414207216692, 'bagging_fraction': 0.6218331872684371, 'lambda_l1': 0.05982625838323253, 'lambda_l2': 1.9490717640641542}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:39,886][0m Trial 10 finished with value: 0.673435440336451 and parameters: {'num_leaves': 32, 'min_data_in_leaf': 6, 'min_sum_hessian_in_leaf': 0.0010167214653943027, 'feature_fraction': 0.5040305717020102, 'bagging_fraction': 0.9940542446575642, 'lambda_l1': 0.010612397212799423, 'lambda_l2': 0.1661409929489422}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:40,888][0m Trial 11 finished with value: 0.6509949155734104 and parameters: {'num_leaves': 141, 'min_data_in_leaf': 198, 'min_sum_hessian_in_leaf': 0.009951069387483545, 'feature_fraction': 0.7633477525641262, 'bagging_fraction': 0.575475056267361, 'lambda_l1': 6.343590915843685, 'lambda_l2': 8.23255529096855}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:48,069][0m Trial 12 finished with value: 0.6756512459983679 and parameters: {'num_leaves': 255, 'min_data_in_leaf': 18, 'min_sum_hessian_in_leaf': 0.001634914743632515, 'feature_fraction': 0.8001581267589792, 'bagging_fraction': 0.8385815299651418, 'lambda_l1': 0.12924644318960654, 'lambda_l2': 0.2669531355707319}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:49,360][0m Trial 13 finished with value: 0.673404054987132 and parameters: {'num_leaves': 140, 'min_data_in_leaf': 43, 'min_sum_hessian_in_leaf': 0.0021756690901938718, 'feature_fraction': 0.6920119879687722, 'bagging_fraction': 0.5285693047324076, 'lambda_l1': 7.775399875398035, 'lambda_l2': 0.0699652461442427}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:52,378][0m Trial 14 finished with value: 0.676743456154667 and parameters: {'num_leaves': 31, 'min_data_in_leaf': 60, 'min_sum_hessian_in_leaf': 0.00043809038647614484, 'feature_fraction': 0.8259582777932722, 'bagging_fraction': 0.8148189817022143, 'lambda_l1': 0.0620290262968281, 'lambda_l2': 0.5343541416013556}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:54,277][0m Trial 15 finished with value: 0.6700207143305504 and parameters: {'num_leaves': 118, 'min_data_in_leaf': 121, 'min_sum_hessian_in_leaf': 0.009795048869631716, 'feature_fraction': 0.7363755369144015, 'bagging_fraction': 0.9342289692301791, 'lambda_l1': 2.3717491322043136, 'lambda_l2': 7.214148908060242}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:57,148][0m Trial 16 finished with value: 0.665601657146444 and parameters: {'num_leaves': 58, 'min_data_in_leaf': 36, 'min_sum_hessian_in_leaf': 0.0002540361858194928, 'feature_fraction': 0.9130458751266737, 'bagging_fraction': 0.684561345587239, 'lambda_l1': 0.24824579193576923, 'lambda_l2': 0.01033528363848504}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:57,949][0m Trial 17 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 157, 'min_data_in_leaf': 168, 'min_sum_hessian_in_leaf': 0.0023269108543753883, 'feature_fraction': 0.8159485991955707, 'bagging_fraction': 0.8036823783112217, 'lambda_l1': 13.036514166467544, 'lambda_l2': 0.8807586136754733}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:40:59,468][0m Trial 18 finished with value: 0.6487477245621744 and parameters: {'num_leaves': 218, 'min_data_in_leaf': 113, 'min_sum_hessian_in_leaf': 0.0009561825756709678, 'feature_fraction': 0.745392277347916, 'bagging_fraction': 0.7896979370510073, 'lambda_l1': 1.8859038983275196, 'lambda_l2': 0.2497145849301042}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:02,536][0m Trial 19 finished with value: 0.6767622873642584 and parameters: {'num_leaves': 214, 'min_data_in_leaf': 73, 'min_sum_hessian_in_leaf': 0.00470022827464237, 'feature_fraction': 0.8343848232650426, 'bagging_fraction': 0.8962774263442048, 'lambda_l1': 0.6420140723743011, 'lambda_l2': 0.08240013163620977}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:04,428][0m Trial 20 finished with value: 0.6632477559475237 and parameters: {'num_leaves': 8, 'min_data_in_leaf': 141, 'min_sum_hessian_in_leaf': 0.0013845801360137025, 'feature_fraction': 0.9182676696103418, 'bagging_fraction': 0.7128489974232202, 'lambda_l1': 0.15470937382525465, 'lambda_l2': 2.2609218276420697}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:07,195][0m Trial 21 finished with value: 0.6767622873642584 and parameters: {'num_leaves': 202, 'min_data_in_leaf': 69, 'min_sum_hessian_in_leaf': 0.004346880252436188, 'feature_fraction': 0.8567769598292199, 'bagging_fraction': 0.8950870417609795, 'lambda_l1': 0.7703796368033595, 'lambda_l2': 0.04354863183028053}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:12,599][0m Trial 22 finished with value: 0.6835791852363318 and parameters: {'num_leaves': 242, 'min_data_in_leaf': 39, 'min_sum_hessian_in_leaf': 0.0029312159809115365, 'feature_fraction': 0.797120322493883, 'bagging_fraction': 0.9188562305409576, 'lambda_l1': 0.39491924434752995, 'lambda_l2': 0.11699060869869958}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:16,948][0m Trial 23 finished with value: 0.6858263762475676 and parameters: {'num_leaves': 238, 'min_data_in_leaf': 35, 'min_sum_hessian_in_leaf': 0.0025767028972243115, 'feature_fraction': 0.7805825622868416, 'bagging_fraction': 0.8148059316157261, 'lambda_l1': 0.35966599755660583, 'lambda_l2': 0.10619688398329478}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:21,508][0m Trial 24 finished with value: 0.6903207582700396 and parameters: {'num_leaves': 249, 'min_data_in_leaf': 35, 'min_sum_hessian_in_leaf': 0.0022135465987980587, 'feature_fraction': 0.7910081527344837, 'bagging_fraction': 0.777147743387992, 'lambda_l1': 0.2892838767901096, 'lambda_l2': 0.15562933607555834}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:28,096][0m Trial 25 finished with value: 0.6981545414600465 and parameters: {'num_leaves': 239, 'min_data_in_leaf': 24, 'min_sum_hessian_in_leaf': 0.0017613624830553707, 'feature_fraction': 0.7879963202697107, 'bagging_fraction': 0.7888189187157192, 'lambda_l1': 0.09570534249544699, 'lambda_l2': 0.039817678953394185}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:32,567][0m Trial 26 finished with value: 0.6520431862406628 and parameters: {'num_leaves': 196, 'min_data_in_leaf': 18, 'min_sum_hessian_in_leaf': 0.0006918172538679483, 'feature_fraction': 0.7156691855196431, 'bagging_fraction': 0.7759869529031096, 'lambda_l1': 0.06361154197572774, 'lambda_l2': 0.029506192029373737}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:45,135][0m Trial 27 finished with value: 0.6824555897307136 and parameters: {'num_leaves': 253, 'min_data_in_leaf': 7, 'min_sum_hessian_in_leaf': 0.0015110604476009741, 'feature_fraction': 0.7981086348840757, 'bagging_fraction': 0.7775152269204683, 'lambda_l1': 0.13213697905758195, 'lambda_l2': 0.03981834857329031}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-05 14:41:48,798][0m Trial 28 finished with value: 0.7048207896553889 and parameters: {'num_leaves': 233, 'min_data_in_leaf': 49, 'min_sum_hessian_in_leaf': 0.0068112646256021986, 'feature_fraction': 0.8426315000433712, 'bagging_fraction': 0.7317397959755858, 'lambda_l1': 0.05156762599756539, 'lambda_l2': 0.20885772437641695}. Best is trial 28 with value: 0.7048207896553889.[0m




[32m[I 2023-04-05 14:41:51,064][0m Trial 29 finished with value: 0.6733915008474044 and parameters: {'num_leaves': 161, 'min_data_in_leaf': 57, 'min_sum_hessian_in_leaf': 0.006296757603803257, 'feature_fraction': 0.8381244831959944, 'bagging_fraction': 0.7326320350377127, 'lambda_l1': 0.0223517282592696, 'lambda_l2': 0.265177341262348}. Best is trial 28 with value: 0.7048207896553889.[0m


In [7]:
# Inspect the search result: best objective value and the parameters that produced it
trial = study.best_trial
print("acc(best)={:.4f}".format(trial.value))
display(trial.params)

acc(best)=0.7048


{'num_leaves': 233,
 'min_data_in_leaf': 49,
 'min_sum_hessian_in_leaf': 0.0068112646256021986,
 'feature_fraction': 0.8426315000433712,
 'bagging_fraction': 0.7317397959755858,
 'lambda_l1': 0.05156762599756539,
 'lambda_l2': 0.20885772437641695}

In [8]:
# Build the final parameter set: best tuned values overlaid with the fixed base settings.
# A new dict is created instead of updating trial.params in place, so the trial's own
# parameter dict is left untouched and this cell gives the same result when re-run.
params_best = {**trial.params, **params_base}
display(params_best)

{'num_leaves': 233,
 'min_data_in_leaf': 49,
 'min_sum_hessian_in_leaf': 0.0068112646256021986,
 'feature_fraction': 0.8426315000433712,
 'bagging_fraction': 0.7317397959755858,
 'lambda_l1': 0.05156762599756539,
 'lambda_l2': 0.20885772437641695,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.02,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'seed': 123}

# LightGBM以外のモデル利用

In [9]:
# Titanic example: logistic regression
# Load the file
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset.
# .copy() makes x_train an independent frame, so the column assignments in the
# preprocessing cells that follow do not operate on a slice of df_train
# (the pattern pandas reports as SettingWithCopyWarning).
x_train = df_train[["Pclass", "Age", "Embarked"]].copy()
y_train = df_train[["Survived"]]

In [10]:
# Count missing values per column
x_train.isnull().sum()

Pclass        0
Age         177
Embarked      2
dtype: int64

In [11]:
# Missing-value imputation, numeric column: fill with the column mean
age_mean = x_train["Age"].mean()
x_train["Age"] = x_train["Age"].fillna(age_mean)

# Missing-value imputation, categorical column: fill with the most frequent value
embarked_mode = x_train["Embarked"].mode()[0]
x_train["Embarked"] = x_train["Embarked"].fillna(embarked_mode)

In [12]:
# Convert the categorical column to numbers (one-hot encoding)
ohe = OneHotEncoder()
embarked_onehot = ohe.fit_transform(x_train[["Embarked"]]).toarray()
df_embarked = pd.DataFrame(
    embarked_onehot,
    columns=["Embarked_{}".format(col) for col in ohe.categories_[0]],
    # Reuse x_train's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows if x_train's index were ever not the default.
    index=x_train.index,
)

# Append the indicator columns and drop the original text column
x_train = pd.concat([x_train, df_embarked], axis=1)
x_train = x_train.drop(columns=["Embarked"])

In [13]:
# Min-max normalization of the numeric columns: (value - min) / (max - min)
for num_col in ["Pclass", "Age"]:
    col_min, col_max = x_train[num_col].min(), x_train[num_col].max()
    x_train[num_col] = (x_train[num_col] - col_min) / (col_max - col_min)

In [14]:
# Split into training and validation data (hold-out method):
# 20% held out for validation, stratified on the target, fixed random seed
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=123)
print(x_tr.shape, x_va.shape, y_tr.shape, y_va.shape)

(712, 5) (179, 5) (712, 1) (179, 1)


In [15]:
# LogisticRegression (logistic regression)
# Model definition
from sklearn.linear_model import LogisticRegression
model_logis = LogisticRegression()

# Training. y_tr is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn estimators expect instead of relying on an implicit conversion.
model_logis.fit(x_tr, np.ravel(y_tr))

# Prediction: class labels for the validation set, then accuracy
y_va_pred = model_logis.predict(x_va)
print("accuracy:{:.4f}".format(accuracy_score(y_va, y_va_pred)))
print(y_va_pred[:5])

accuracy:0.7263
[0 1 0 1 0]


In [16]:
# Get predicted probabilities (one column per class) and show the first five rows
y_va_pred_proba = model_logis.predict_proba(x_va)
print(y_va_pred_proba[:5, :])

[[0.83621285 0.16378715]
 [0.23058311 0.76941689]
 [0.83244141 0.16755859]
 [0.32227072 0.67772928]
 [0.62569522 0.37430478]]


In [17]:
# Titanic example: SVM
# Model definition (probability=True enables predict_proba)
from sklearn.svm import SVC
model_svm = SVC(C=1.0, random_state=123, probability=True)

# Training. y_tr is a one-column DataFrame; flatten it to the 1-D target that
# scikit-learn estimators expect instead of relying on an implicit conversion.
model_svm.fit(x_tr, np.ravel(y_tr))

# Prediction: class labels for the validation set, then accuracy
y_va_pred = model_svm.predict(x_va)
print("accuracy:{:.4f}".format(accuracy_score(y_va, y_va_pred)))
print(y_va_pred[:5])

# Predicted probabilities
y_va_pred_proba = model_svm.predict_proba(x_va)
print(y_va_pred_proba[:5, :])

accuracy:0.7151
[0 1 0 1 0]
[[0.73985924 0.26014076]
 [0.28242534 0.71757466]
 [0.73986177 0.26013823]
 [0.26828214 0.73171786]
 [0.58950192 0.41049808]]


# ニューラルネットワーク
### ニューラルネットワークの適用例：①全結合層のみのネットワークモデル

In [18]:
#Tensorflowインポート
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Embedding, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD

In [19]:
# tensorflowの再現性のためのシード指定
def seed_everything(seed):
    """Seed Python, NumPy and TensorFlow and install a single-threaded TF session.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed``, ``np.random.seed`` and
        ``tf.random.set_seed``; its string form is stored in ``PYTHONHASHSEED``.
    """
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Limit both TensorFlow thread pools to one thread and register a
    # compat.v1 session with that configuration as the Keras backend session.
    single_thread_conf = tf.compat.v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    tf.compat.v1.keras.backend.set_session(
        tf.compat.v1.Session(
            graph=tf.compat.v1.get_default_graph(),
            config=single_thread_conf,
        )
    )

In [20]:
# Load the file
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset.
# .copy() makes x_train an independent frame, so the column assignments in the
# preprocessing cells that follow do not operate on a slice of df_train
# (the pattern pandas reports as SettingWithCopyWarning).
x_train = df_train[["Pclass", "Age", "Embarked"]].copy()
y_train = df_train[["Survived"]]

In [21]:
# Missing-value imputation: fill missing ages with the mean age
x_train["Age"] = x_train["Age"].fillna(x_train["Age"].mean())

# Normalization: after imputation, min-max scale the numeric columns into the 0-1 range
num_cols = ["Pclass", "Age"]
col_min = x_train[num_cols].min()
col_max = x_train[num_cols].max()
x_train[num_cols] = (x_train[num_cols] - col_min) / (col_max - col_min)

In [22]:
# Missing-value imputation: fill with the most frequent value
x_train["Embarked"] = x_train["Embarked"].fillna(x_train["Embarked"].mode()[0])

# Convert with one-hot encoding
ohe = OneHotEncoder()
embarked_onehot = ohe.fit_transform(x_train[["Embarked"]]).toarray()
df_embarked = pd.DataFrame(
    embarked_onehot,
    columns=["Embarked_{}".format(col) for col in ohe.categories_[0]],
    # Reuse x_train's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows if x_train's index were ever not the default.
    index=x_train.index,
)
x_train = pd.concat([x_train.drop(columns=["Embarked"]),
                     df_embarked], axis=1)

In [23]:
# Split into training and validation data:
# 20% held out for validation, stratified on the target, fixed random seed
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=123)
print(x_tr.shape, x_va.shape, y_tr.shape, y_va.shape)

(712, 5) (179, 5) (712, 1) (179, 1)


In [24]:
#モデル定義　カラム数5:入力ノード5 隠れ層=3 10/10/5
def create_model(input_dim=5):
    """Build and compile a fully connected binary classifier.

    The network has three hidden blocks of Dense(relu) -> BatchNormalization ->
    Dropout with (units, dropout rate) of (10, 0.3), (10, 0.2) and (5, 0.1),
    followed by a single sigmoid output unit.

    Parameters
    ----------
    input_dim : int, default 5
        Number of input features. The default keeps the original
        hard-coded input width, so ``create_model()`` behaves as before.

    Returns
    -------
    Model
        Keras model compiled with the Adam optimizer and binary
        cross-entropy as both loss and reported metric.
    """
    input_num = Input(shape=(input_dim,))

    # Hidden blocks are created in the same order as the original unrolled code.
    x_num = input_num
    for units, drop_rate in [(10, 0.3), (10, 0.2), (5, 0.1)]:
        x_num = Dense(units, activation="relu")(x_num)
        x_num = BatchNormalization()(x_num)
        x_num = Dropout(drop_rate)(x_num)
    out = Dense(1, activation="sigmoid")(x_num)

    model = Model(inputs=input_num,
                  outputs=out,
                 )

    model.compile(
        optimizer="Adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy"],
    )

    return model

# Instantiate the network once to print its layer summary
model = create_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 5)]               0         
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 batch_normalization (BatchN  (None, 10)               40        
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 10)                110       
                                                                 
 batch_normalization_1 (Batc  (None, 10)               40        
 hNormalization)                                             

In [25]:
# Train the model.
# Mini-batch size is 8. If val_loss does not improve for 5 consecutive epochs the
# learning rate is multiplied by 0.1; after 10 consecutive epochs without
# improvement training is stopped.
seed_everything(seed=123)
model = create_model()

fit_callbacks = [
    # Save the weights each time val_loss reaches a new minimum
    ModelCheckpoint(filepath="model_keras.h5", monitor="val_loss", mode="min", verbose=1, save_best_only=True, save_weights_only=True),
    # Stop after 10 epochs without improvement and restore the best weights
    EarlyStopping(monitor="val_loss", mode="min", min_delta=0, patience=10, verbose=1, restore_best_weights=True),
    # Reduce the learning rate by a factor of 0.1 after 5 epochs without improvement
    ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.1, patience=5, verbose=1),
]
model.fit(
    x=x_tr,
    y=y_tr,
    validation_data=(x_va, y_va),
    batch_size=8,
    epochs=10000,
    callbacks=fit_callbacks,
    verbose=1,
)

Epoch 1/10000
Epoch 1: val_loss improved from inf to 0.68177, saving model to model_keras.h5
Epoch 2/10000
Epoch 2: val_loss improved from 0.68177 to 0.66743, saving model to model_keras.h5
Epoch 3/10000
Epoch 3: val_loss improved from 0.66743 to 0.65250, saving model to model_keras.h5
Epoch 4/10000
Epoch 4: val_loss improved from 0.65250 to 0.63568, saving model to model_keras.h5
Epoch 5/10000
Epoch 5: val_loss improved from 0.63568 to 0.61948, saving model to model_keras.h5
Epoch 6/10000
Epoch 6: val_loss improved from 0.61948 to 0.61322, saving model to model_keras.h5
Epoch 7/10000
Epoch 7: val_loss did not improve from 0.61322
Epoch 8/10000
Epoch 8: val_loss improved from 0.61322 to 0.61166, saving model to model_keras.h5
Epoch 9/10000
Epoch 9: val_loss improved from 0.61166 to 0.60525, saving model to model_keras.h5
Epoch 10/10000
Epoch 10: val_loss did not improve from 0.60525
Epoch 11/10000
Epoch 11: val_loss improved from 0.60525 to 0.60281, saving model to model_keras.h5
Epoch

<keras.callbacks.History at 0x7349ec705450>

In [26]:
# Evaluate the model: threshold the predicted probabilities at 0.5 and report accuracy
y_va_pred = model.predict(x_va, batch_size=8, verbose=1)
y_va_label = np.where(y_va_pred >= 0.5, 1, 0)
print("accuracy: {:.4f}".format(accuracy_score(y_va, y_va_label)))

accuracy: 0.6872


### ニューラルネットワークの適用例：②埋め込み層ありのネットワークモデル

In [27]:
# Load the file
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset.
# .copy() makes x_train an independent frame, so the column assignments in the
# preprocessing cells that follow do not operate on a slice of df_train
# (the pattern pandas reports as SettingWithCopyWarning).
x_train = df_train[["Pclass", "Age", "Cabin"]].copy()
y_train = df_train[["Survived"]]

In [28]:
# Preprocessing of the numeric data
# Missing-value imputation: fill missing ages with the mean age
x_train["Age"] = x_train["Age"].fillna(x_train["Age"].mean())

# Normalization: min-max scale the numeric columns
num_cols = ["Pclass", "Age"]
col_min = x_train[num_cols].min()
col_max = x_train[num_cols].max()
x_train[num_cols] = (x_train[num_cols] - col_min) / (col_max - col_min)

In [29]:
# Preprocessing of the categorical variable: replace missing values with the
# string "None", then label-encode
# Missing-value imputation
x_train["Cabin"] = x_train["Cabin"].fillna("None")

# Label encoding. fit() is given the same 1-D Series as transform():
# LabelEncoder works on 1-D labels, and a one-column DataFrame is only
# accepted through an implicit (warning-raising) conversion.
le = LabelEncoder()
le.fit(x_train["Cabin"])
x_train["Cabin"] = le.transform(x_train["Cabin"])

print(le.classes_)
print("count:", len(le.classes_))

['A10' 'A14' 'A16' 'A19' 'A20' 'A23' 'A24' 'A26' 'A31' 'A32' 'A34' 'A36'
 'A5' 'A6' 'A7' 'B101' 'B102' 'B18' 'B19' 'B20' 'B22' 'B28' 'B3' 'B30'
 'B35' 'B37' 'B38' 'B39' 'B4' 'B41' 'B42' 'B49' 'B5' 'B50' 'B51 B53 B55'
 'B57 B59 B63 B66' 'B58 B60' 'B69' 'B71' 'B73' 'B77' 'B78' 'B79' 'B80'
 'B82 B84' 'B86' 'B94' 'B96 B98' 'C101' 'C103' 'C104' 'C106' 'C110' 'C111'
 'C118' 'C123' 'C124' 'C125' 'C126' 'C128' 'C148' 'C2' 'C22 C26'
 'C23 C25 C27' 'C30' 'C32' 'C45' 'C46' 'C47' 'C49' 'C50' 'C52' 'C54'
 'C62 C64' 'C65' 'C68' 'C7' 'C70' 'C78' 'C82' 'C83' 'C85' 'C86' 'C87'
 'C90' 'C91' 'C92' 'C93' 'C95' 'C99' 'D' 'D10 D12' 'D11' 'D15' 'D17' 'D19'
 'D20' 'D21' 'D26' 'D28' 'D30' 'D33' 'D35' 'D36' 'D37' 'D45' 'D46' 'D47'
 'D48' 'D49' 'D50' 'D56' 'D6' 'D7' 'D9' 'E10' 'E101' 'E12' 'E121' 'E17'
 'E24' 'E25' 'E31' 'E33' 'E34' 'E36' 'E38' 'E40' 'E44' 'E46' 'E49' 'E50'
 'E58' 'E63' 'E67' 'E68' 'E77' 'E8' 'F E69' 'F G63' 'F G73' 'F2' 'F33'
 'F38' 'F4' 'G6' 'None' 'T']
count: 148


In [30]:
# Split into training and validation data.
# Numeric and categorical features are kept as separate frames and split together
# with the target in a single call, so all three share the same row partition.
x_train_num, x_train_cat = x_train[["Pclass", "Age"]], x_train[["Cabin"]]

# train_test_split returns a (train, validation) pair per input, in input order
x_num_tr, x_num_va, x_cat_tr, x_cat_va, y_tr, y_va = \
   train_test_split(x_train_num, x_train_cat, y_train, test_size=0.2, stratify=y_train, random_state=123)
print(x_num_tr.shape, x_num_va.shape, x_cat_tr.shape, x_cat_va.shape, y_tr.shape, y_va.shape)

(712, 2) (179, 2) (712, 1) (179, 1) (712, 1) (179, 1)


In [31]:
#埋め込み層ありのモデル定義
def create_model_embedding(n_categories=148, embedding_dim=74):
    """Build and compile a two-input binary classifier with an embedding layer.

    The numeric branch takes 2 features through Dense(10) -> BatchNormalization
    -> Dropout(0.2) -> Dense(10). The categorical branch takes one integer code
    per row through Embedding -> Dropout(0.2) -> Flatten. Both branches are
    concatenated and passed through Dense(50) and Dense(20) blocks (each with
    BatchNormalization and Dropout(0.1)) to a single sigmoid output unit.

    Parameters
    ----------
    n_categories : int, default 148
        ``input_dim`` of the Embedding layer, i.e. the number of distinct
        integer codes. Every code fed to the model must be smaller than this.
    embedding_dim : int, default 74
        ``output_dim`` of the Embedding layer.

    The defaults keep the original hard-coded sizes, so calling
    ``create_model_embedding()`` behaves as before.

    Returns
    -------
    Model
        Keras model taking ``[numeric_input, categorical_input]``, compiled with
        the Adam optimizer and binary cross-entropy as loss and metric.
    """
    ################# num
    input_num = Input(shape=(2,))
    layer_num = Dense(10, activation="relu")(input_num)
    layer_num = BatchNormalization()(layer_num)
    layer_num = Dropout(0.2)(layer_num)
    layer_num = Dense(10, activation="relu")(layer_num)

    ################# cat
    input_cat = Input(shape=(1,))
    # Drop the trailing axis so the embedding receives one code per row
    layer_cat = input_cat[:, 0]
    layer_cat = Embedding(input_dim=n_categories, output_dim=embedding_dim)(layer_cat)
    layer_cat = Dropout(0.2)(layer_cat)
    layer_cat = Flatten()(layer_cat)

    ################# concat
    hidden_layer = Concatenate()([layer_num, layer_cat])
    hidden_layer = Dense(50, activation="relu")(hidden_layer)
    hidden_layer = BatchNormalization()(hidden_layer)
    hidden_layer = Dropout(0.1)(hidden_layer)
    hidden_layer = Dense(20, activation="relu")(hidden_layer)
    hidden_layer = BatchNormalization()(hidden_layer)
    hidden_layer = Dropout(0.1)(hidden_layer)
    output_layer = Dense(1, activation="sigmoid")(hidden_layer)

    model = Model(inputs=[input_num, input_cat],
                  outputs=output_layer,
                 )

    model.compile(
        optimizer="Adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy"],
    )

    return model

# Instantiate the network once to print its layer summary
model = create_model_embedding()
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 2)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 dense_8 (Dense)                (None, 10)           30          ['input_3[0][0]']                
                                                                                                  
 tf.__operators__.getitem (Slic  (None,)             0           ['input_4[0][0]']                
 ingOpLambda)                                                                               

In [32]:
# Train the model (numeric and categorical inputs are passed as a two-element list)
seed_everything(seed=123)
model = create_model_embedding()

fit_callbacks = [
    # Save the weights each time val_loss reaches a new minimum
    ModelCheckpoint(filepath="model_keras_embedding.h5", monitor="val_loss", mode="min", verbose=1, save_best_only=True, save_weights_only=True),
    # Stop after 10 epochs without improvement and restore the best weights
    EarlyStopping(monitor="val_loss", mode="min", min_delta=0, patience=10, verbose=1, restore_best_weights=True),
    # Reduce the learning rate by a factor of 0.1 after 5 epochs without improvement
    ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.1, patience=5, verbose=1),
]
model.fit(
    x=[x_num_tr, x_cat_tr],
    y=y_tr,
    validation_data=([x_num_va, x_cat_va], y_va),
    batch_size=8,
    epochs=10000,
    callbacks=fit_callbacks,
    verbose=1,
)

Epoch 1/10000
Epoch 1: val_loss improved from inf to 0.65931, saving model to model_keras_embedding.h5
Epoch 2/10000
Epoch 2: val_loss improved from 0.65931 to 0.65164, saving model to model_keras_embedding.h5
Epoch 3/10000
Epoch 3: val_loss improved from 0.65164 to 0.64750, saving model to model_keras_embedding.h5
Epoch 4/10000
Epoch 4: val_loss improved from 0.64750 to 0.63007, saving model to model_keras_embedding.h5
Epoch 5/10000
Epoch 5: val_loss improved from 0.63007 to 0.60511, saving model to model_keras_embedding.h5
Epoch 6/10000
Epoch 6: val_loss improved from 0.60511 to 0.59476, saving model to model_keras_embedding.h5
Epoch 7/10000
Epoch 7: val_loss did not improve from 0.59476
Epoch 8/10000
Epoch 8: val_loss did not improve from 0.59476
Epoch 9/10000
Epoch 9: val_loss did not improve from 0.59476
Epoch 10/10000
Epoch 10: val_loss did not improve from 0.59476
Epoch 11/10000
Epoch 11: val_loss did not improve from 0.59476

Epoch 11: ReduceLROnPlateau reducing learning rate t

<keras.callbacks.History at 0x7349e6fef910>

In [33]:
# Evaluate the trained model on the validation split
y_va_pred = model.predict([x_num_va, x_cat_va], batch_size=8, verbose=1)
# Turn predicted scores into hard 0/1 labels with a 0.5 threshold.
y_va_label = np.where(y_va_pred >= 0.5, 1, 0)
va_accuracy = accuracy_score(y_va, y_va_label)
print("accuracy: {:.4f}".format(va_accuracy))

accuracy: 0.6983


# アンサンブル

## 単純平均

In [34]:
# Build a dataframe of synthetic model predictions from random numbers
np.random.seed(123)

n_negative, n_positive = 700, 300
n_rows = n_negative + n_positive
# Larger noise scale -> weaker relationship between row position and score.
noise_scale_by_col = {"pred1": 1200, "pred2": 1000, "pred3": 800}

dummy_columns = {"true": [0] * n_negative + [1] * n_positive}
# The random draws must happen in pred1 -> pred2 -> pred3 order to keep the values reproducible.
for pred_col, noise_scale in noise_scale_by_col.items():
    dummy_columns[pred_col] = np.arange(n_rows) + np.random.rand(n_rows) * noise_scale
df = pd.DataFrame(dummy_columns)

# Rescale every prediction column by its maximum and clip to [0, 1].
for pred_col in noise_scale_by_col:
    df[pred_col] = np.clip(df[pred_col] / df[pred_col].max(), 0, 1)

# Stratified split: 20% of the rows go to train, 80% to test; both get a fresh 0..n-1 index.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.8, stratify=df["true"], random_state=123)
)
df_train.head()

Unnamed: 0,true,pred1,pred2,pred3
0,1,0.683821,0.874443,0.859939
1,0,0.540691,0.113419,0.197144
2,0,0.310541,0.334798,0.599304
3,0,0.043486,0.170622,0.378528
4,0,0.550847,0.354703,0.59886


In [35]:
# Ensemble by simple (unweighted) average of the three model predictions
pred_sum_train = df_train["pred1"] + df_train["pred2"] + df_train["pred3"]
df_train["pred_ensemble1"] = pred_sum_train / 3
df_train.head()

Unnamed: 0,true,pred1,pred2,pred3,pred_ensemble1
0,1,0.683821,0.874443,0.859939,0.806068
1,0,0.540691,0.113419,0.197144,0.283752
2,0,0.310541,0.334798,0.599304,0.414881
3,0,0.043486,0.170622,0.378528,0.197545
4,0,0.550847,0.354703,0.59886,0.50147


In [36]:
# AUC evaluation helper for the ensembles, followed by the evaluation itself
def evaluate_ensemble(input_df, col_pred):
    """Print the ROC AUC of the three base models and of one ensemble column.

    Args:
        input_df: DataFrame holding the label column "true", the base
            prediction columns "pred1", "pred2", "pred3", and `col_pred`.
        col_pred: Name of the ensemble prediction column to score.

    Returns:
        None. The four AUC values are printed on a single line.
    """
    labels = input_df["true"]
    scored_columns = ["pred1", "pred2", "pred3", col_pred]
    auc_values = [roc_auc_score(labels, input_df[column]) for column in scored_columns]
    print("[auc] model1:{:.4f}, model2:{:.4f}, model3:{:.4f} -> ensemble:{:.4f}".format(*auc_values))

evaluate_ensemble(input_df=df_train, col_pred="pred_ensemble1")

[auc] model1:0.8342, model2:0.8671, model3:0.9050 -> ensemble:0.9585


In [37]:
# Apply the simple-average ensemble at inference time and evaluate it
pred_sum_test = df_test["pred1"] + df_test["pred2"] + df_test["pred3"]
df_test["pred_ensemble1"] = pred_sum_test / 3
evaluate_ensemble(input_df=df_test, col_pred="pred_ensemble1")

[auc] model1:0.8086, model2:0.8398, model3:0.8973 -> ensemble:0.9396


## 重み付き平均

In [38]:
# Ensemble by weighted average
weight = np.asarray([0.3, 0.3, 0.4])
# Normalise so the weights sum to 1.
weight = weight / weight.sum()
print(weight)

base_pred_cols = ["pred1", "pred2", "pred3"]
weighted_train = [df_train[col] * w for col, w in zip(base_pred_cols, weight)]
df_train["pred_ensemble2"] = weighted_train[0] + weighted_train[1] + weighted_train[2]
df_train[["true", *base_pred_cols, "pred_ensemble2"]].head()

[0.3 0.3 0.4]


Unnamed: 0,true,pred1,pred2,pred3,pred_ensemble2
0,1,0.683821,0.874443,0.859939,0.811455
1,0,0.540691,0.113419,0.197144,0.275091
2,0,0.310541,0.334798,0.599304,0.433324
3,0,0.043486,0.170622,0.378528,0.215643
4,0,0.550847,0.354703,0.59886,0.511209


In [39]:
# Evaluate the weighted-average ensemble
evaluate_ensemble(input_df=df_train, col_pred="pred_ensemble2")

[auc] model1:0.8342, model2:0.8671, model3:0.9050 -> ensemble:0.9614


In [40]:
# Apply the weighted-average ensemble at inference time and evaluate it
weighted_test = [
    df_test[col] * w
    for col, w in zip(["pred1", "pred2", "pred3"], weight)
]
df_test["pred_ensemble2"] = weighted_test[0] + weighted_test[1] + weighted_test[2]
evaluate_ensemble(input_df=df_test, col_pred="pred_ensemble2")

[auc] model1:0.8086, model2:0.8398, model3:0.8973 -> ensemble:0.9420


## スタッキング

In [41]:
# Stacking ensemble: train a meta-model that predicts the label from the base models' predictions
from sklearn.linear_model import Lasso

x, y = df_train[["pred1", "pred2", "pred3"]], df_train[["true"]]
oof = np.zeros(len(x))  # out-of-fold predictions, filled in one validation fold at a time
models = []

# StratifiedKFold.split yields *positional* row indices for each (train, validation) pair.
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x, y))
for nfold, (idx_tr, idx_va) in enumerate(cv):
    # Split into training and validation rows.
    # .iloc is used because the fold indices are positions, not index labels;
    # label-based .loc would only be correct while the index is exactly 0..n-1.
    x_tr, y_tr = x.iloc[idx_tr], y.iloc[idx_tr]
    x_va, y_va = x.iloc[idx_va], y.iloc[idx_va]

    # Fit the meta-model for this fold
    model = Lasso(alpha=0.01)
    model.fit(x_tr, y_tr)
    models.append(model)

    # Store this fold's validation predictions at their row positions
    y_va_pred = model.predict(x_va)
    oof[idx_va] = y_va_pred

df_train["pred_ensemble3"] = oof
# Lasso is a regressor, so its output is unbounded; clip it into [0, 1].
df_train["pred_ensemble3"] = df_train["pred_ensemble3"].clip(lower=0, upper=1)
df_train[["true","pred1","pred2","pred3","pred_ensemble3"]].head()

Unnamed: 0,true,pred1,pred2,pred3,pred_ensemble3
0,1,0.683821,0.874443,0.859939,0.74502
1,0,0.540691,0.113419,0.197144,0.0
2,0,0.310541,0.334798,0.599304,0.206734
3,0,0.043486,0.170622,0.378528,0.0
4,0,0.550847,0.354703,0.59886,0.303498


In [42]:
# Evaluate the stacking ensemble (out-of-fold predictions)
evaluate_ensemble(input_df=df_train, col_pred="pred_ensemble3")

[auc] model1:0.8342, model2:0.8671, model3:0.9050 -> ensemble:0.9577


In [43]:
# Apply the stacking ensemble at inference time and evaluate it
stack_inputs = df_test[["pred1", "pred2", "pred3"]]
n_fold_models = len(models)

# Average the predictions of the per-fold meta-models.
stacked_pred = 0
for model in models:
    stacked_pred = stacked_pred + model.predict(stack_inputs) / n_fold_models

df_test["pred_ensemble3"] = stacked_pred
df_test["pred_ensemble3"] = df_test["pred_ensemble3"].clip(lower=0, upper=1)
evaluate_ensemble(input_df=df_test, col_pred="pred_ensemble3")

[auc] model1:0.8086, model2:0.8398, model3:0.8973 -> ensemble:0.9437
