In [1]:
# 必要ライブラリのインポート
import numpy as np
import pandas as pd
import os
import pickle
import gc

# 可視化
import matplotlib.pyplot as plt

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# 評価指標
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# モデリング: lightgbm
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

# matplotlibで日本語表示したい場合はこれをinstallしてインポートする
!pip install japanize-matplotlib
import japanize_matplotlib

# 2022/06/02追加: Kaggle notebook環境変更のため
!pip install -U pandas_profiling

# 分布確認
import pandas_profiling as pdp


Collecting japanize-matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... [?25l- \ done
[?25h  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120275 sha256=55c148785d175c4dd2ee3835a181576cd6be838b8a9249101bff4870cef3b5f9
  Stored in directory: /root/.cache/pip/wheels/d3/7d/c5/d3e02382561888f86edabf3256c09b3298f8e24456f8fc4da3
Successfully built japanize-matplotlib
Installing collected packages: japanize-matplotlib
Successfully installed japanize-matplotlib-1.1.3
Collecting pandas_profiling
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3

In [2]:
# Load the Titanic training data and preview the first rows.
df_train = pd.read_csv("../input/titanic/train.csv")
df_train.head()

# Reading other input formats, for reference:
#   Excel file:          df = pd.read_excel("filename")
#   Tab-separated file:  df = pd.read_csv("sample.tsv", sep="\t")
#   Shift-JIS CSV file:  df = pd.read_csv("sample.csv", encoding="shift_jis")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Build the dataset: features, target and passenger IDs, each kept as a
# single- or two-column DataFrame selected from df_train.
x_train = df_train[["Pclass", "Fare"]]
y_train = df_train[["Survived"]]
id_train = df_train[["PassengerId"]]
print(x_train.shape, y_train.shape, id_train.shape)

(891, 2) (891, 1) (891, 1)


In [4]:
#ベイズ最適化
import optuna

In [5]:
# Objective-function setup.
# Hyperparameters that are held fixed (not searched) during tuning.
params_base = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    learning_rate=0.02,
    n_estimators=100000,
    bagging_freq=1,
    seed=123,
)

def objective(trial):
    """Optuna objective: mean validation accuracy over a 5-fold stratified CV.

    For each trial a set of LightGBM hyperparameters is sampled, merged with
    the fixed ``params_base`` settings, and an ``LGBMClassifier`` is trained
    on every fold of the module-level ``x_train`` / ``y_train`` with early
    stopping (100 rounds without improvement on the evaluation data).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean accuracy across the folds, with predicted probabilities
        thresholded at 0.5.
    """
    # Hyperparameters explored by the search.
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e2, log=True),
    }
    params_tuning.update(params_base)

    # Train and evaluate one model per fold.
    list_metrics = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    for idx_tr, idx_va in cv.split(x_train, y_train):
        # split() yields positional indices, so rows are selected with .iloc
        # (label-based .loc only matches when the index is a default RangeIndex).
        x_tr, y_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr, :]
        x_va, y_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va, :]
        model = lgb.LGBMClassifier(**params_tuning)
        # Early stopping and log silencing are passed as callbacks: the
        # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
        # deprecated in LightGBM 3.3 and removed in 4.0.
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=100, verbose=False),
                      lgb.log_evaluation(period=0),
                  ],
                 )
        y_va_pred = model.predict_proba(x_va)[:, 1]
        metric_va = accuracy_score(y_va, np.where(y_va_pred >= 0.5, 1, 0))
        list_metrics.append(metric_va)

    # Score of the trial: mean accuracy over the folds.
    metrics = np.mean(list_metrics)

    return metrics

In [6]:
# Run the search: a TPE sampler with a fixed seed, maximizing the value
# returned by `objective` over 30 trials.
sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=30)

[32m[I 2023-04-03 15:00:19,381][0m A new study created in memory with name: no-name-d7af92f8-fb75-41a2-b50d-004fd4f44100[0m




[32m[I 2023-04-03 15:00:22,056][0m Trial 0 finished with value: 0.664478061640826 and parameters: {'num_leaves': 181, 'min_data_in_leaf': 61, 'min_sum_hessian_in_leaf': 4.792414358623587e-05, 'feature_fraction': 0.7756573845414456, 'bagging_fraction': 0.8597344848927815, 'lambda_l1': 0.492522233779106, 'lambda_l2': 83.76388146302445}. Best is trial 0 with value: 0.664478061640826.[0m




[32m[I 2023-04-03 15:00:23,465][0m Trial 1 finished with value: 0.6712196346745339 and parameters: {'num_leaves': 178, 'min_data_in_leaf': 99, 'min_sum_hessian_in_leaf': 0.00015009027543233888, 'feature_fraction': 0.6715890080754348, 'bagging_fraction': 0.8645248536920208, 'lambda_l1': 0.567922374174008, 'lambda_l2': 0.01732652966363563}. Best is trial 1 with value: 0.6712196346745339.[0m




[32m[I 2023-04-03 15:00:24,708][0m Trial 2 finished with value: 0.65762350134957 and parameters: {'num_leaves': 107, 'min_data_in_leaf': 149, 'min_sum_hessian_in_leaf': 3.52756635172055e-05, 'feature_fraction': 0.5877258780737462, 'bagging_fraction': 0.7657756869209191, 'lambda_l1': 1.3406343673102123, 'lambda_l2': 3.4482904089131434}. Best is trial 1 with value: 0.6712196346745339.[0m




[32m[I 2023-04-03 15:00:25,719][0m Trial 3 finished with value: 0.6722302429226037 and parameters: {'num_leaves': 219, 'min_data_in_leaf': 146, 'min_sum_hessian_in_leaf': 0.0006808799287054756, 'feature_fraction': 0.8612216912851107, 'bagging_fraction': 0.6614794569265892, 'lambda_l1': 0.2799978022399009, 'lambda_l2': 0.08185645330667264}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-03 15:00:26,845][0m Trial 4 finished with value: 0.668972443663298 and parameters: {'num_leaves': 81, 'min_data_in_leaf': 128, 'min_sum_hessian_in_leaf': 1.889360449174926e-05, 'feature_fraction': 0.7168505863397641, 'bagging_fraction': 0.7154313816648219, 'lambda_l1': 0.9434967110751797, 'lambda_l2': 0.5050346330980694}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-03 15:00:28,129][0m Trial 5 finished with value: 0.6587847592743706 and parameters: {'num_leaves': 85, 'min_data_in_leaf': 88, 'min_sum_hessian_in_leaf': 0.004788147156768277, 'feature_fraction': 0.9720800091019398, 'bagging_fraction': 0.7509183379421682, 'lambda_l1': 3.1319282717196035, 'lambda_l2': 0.029005047452739414}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-03 15:00:28,445][0m Trial 6 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 87, 'min_data_in_leaf': 86, 'min_sum_hessian_in_leaf': 0.003971252247766701, 'feature_fraction': 0.6252276826982534, 'bagging_fraction': 0.7415171321313522, 'lambda_l1': 87.54657140659076, 'lambda_l2': 1.1965765212602313}. Best is trial 3 with value: 0.6722302429226037.[0m




[32m[I 2023-04-03 15:00:31,861][0m Trial 7 finished with value: 0.6992530286862093 and parameters: {'num_leaves': 160, 'min_data_in_leaf': 28, 'min_sum_hessian_in_leaf': 0.0030131614432849746, 'feature_fraction': 0.8015300642054637, 'bagging_fraction': 0.7725340032332324, 'lambda_l1': 0.23499322154972468, 'lambda_l2': 0.1646202117975735}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:33,560][0m Trial 8 finished with value: 0.6823363254033017 and parameters: {'num_leaves': 111, 'min_data_in_leaf': 138, 'min_sum_hessian_in_leaf': 0.00423029374725911, 'feature_fraction': 0.7552111687390055, 'bagging_fraction': 0.8346568914811361, 'lambda_l1': 2.206714812711709, 'lambda_l2': 3.1594683442464033}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:34,326][0m Trial 9 finished with value: 0.6362751867428285 and parameters: {'num_leaves': 175, 'min_data_in_leaf': 170, 'min_sum_hessian_in_leaf': 1.7765808030254076e-05, 'feature_fraction': 0.8818414207216692, 'bagging_fraction': 0.6218331872684371, 'lambda_l1': 0.05982625838323253, 'lambda_l2': 1.9490717640641542}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:36,741][0m Trial 10 finished with value: 0.673435440336451 and parameters: {'num_leaves': 32, 'min_data_in_leaf': 6, 'min_sum_hessian_in_leaf': 0.0010167214653943027, 'feature_fraction': 0.5040305717020102, 'bagging_fraction': 0.9940542446575642, 'lambda_l1': 0.010612397212799423, 'lambda_l2': 0.1661409929489422}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:37,616][0m Trial 11 finished with value: 0.6509949155734104 and parameters: {'num_leaves': 141, 'min_data_in_leaf': 198, 'min_sum_hessian_in_leaf': 0.009951069387483545, 'feature_fraction': 0.7633477525641262, 'bagging_fraction': 0.575475056267361, 'lambda_l1': 6.343590915843685, 'lambda_l2': 8.23255529096855}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:42,996][0m Trial 12 finished with value: 0.6756512459983679 and parameters: {'num_leaves': 255, 'min_data_in_leaf': 18, 'min_sum_hessian_in_leaf': 0.001634914743632515, 'feature_fraction': 0.8001581267589792, 'bagging_fraction': 0.8385815299651418, 'lambda_l1': 0.12924644318960654, 'lambda_l2': 0.2669531355707319}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:44,407][0m Trial 13 finished with value: 0.673404054987132 and parameters: {'num_leaves': 140, 'min_data_in_leaf': 43, 'min_sum_hessian_in_leaf': 0.0021756690901938718, 'feature_fraction': 0.6920119879687722, 'bagging_fraction': 0.5285693047324076, 'lambda_l1': 7.775399875398035, 'lambda_l2': 0.0699652461442427}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:46,884][0m Trial 14 finished with value: 0.676743456154667 and parameters: {'num_leaves': 31, 'min_data_in_leaf': 60, 'min_sum_hessian_in_leaf': 0.00043809038647614484, 'feature_fraction': 0.8259582777932722, 'bagging_fraction': 0.8148189817022143, 'lambda_l1': 0.0620290262968281, 'lambda_l2': 0.5343541416013556}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:48,290][0m Trial 15 finished with value: 0.6700207143305504 and parameters: {'num_leaves': 118, 'min_data_in_leaf': 121, 'min_sum_hessian_in_leaf': 0.009795048869631716, 'feature_fraction': 0.7363755369144015, 'bagging_fraction': 0.9342289692301791, 'lambda_l1': 2.3717491322043136, 'lambda_l2': 7.214148908060242}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:50,618][0m Trial 16 finished with value: 0.665601657146444 and parameters: {'num_leaves': 58, 'min_data_in_leaf': 36, 'min_sum_hessian_in_leaf': 0.0002540361858194928, 'feature_fraction': 0.9130458751266737, 'bagging_fraction': 0.684561345587239, 'lambda_l1': 0.24824579193576923, 'lambda_l2': 0.01033528363848504}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:51,340][0m Trial 17 finished with value: 0.6161634548992531 and parameters: {'num_leaves': 157, 'min_data_in_leaf': 168, 'min_sum_hessian_in_leaf': 0.0023269108543753883, 'feature_fraction': 0.8159485991955707, 'bagging_fraction': 0.8036823783112217, 'lambda_l1': 13.036514166467544, 'lambda_l2': 0.8807586136754733}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:52,580][0m Trial 18 finished with value: 0.6487477245621744 and parameters: {'num_leaves': 218, 'min_data_in_leaf': 113, 'min_sum_hessian_in_leaf': 0.0009561825756709678, 'feature_fraction': 0.745392277347916, 'bagging_fraction': 0.7896979370510073, 'lambda_l1': 1.8859038983275196, 'lambda_l2': 0.2497145849301042}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:54,847][0m Trial 19 finished with value: 0.6767622873642584 and parameters: {'num_leaves': 214, 'min_data_in_leaf': 73, 'min_sum_hessian_in_leaf': 0.00470022827464237, 'feature_fraction': 0.8343848232650426, 'bagging_fraction': 0.8962774263442048, 'lambda_l1': 0.6420140723743011, 'lambda_l2': 0.08240013163620977}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:56,416][0m Trial 20 finished with value: 0.6632477559475237 and parameters: {'num_leaves': 8, 'min_data_in_leaf': 141, 'min_sum_hessian_in_leaf': 0.0013845801360137025, 'feature_fraction': 0.9182676696103418, 'bagging_fraction': 0.7128489974232202, 'lambda_l1': 0.15470937382525465, 'lambda_l2': 2.2609218276420697}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:00:58,532][0m Trial 21 finished with value: 0.6767622873642584 and parameters: {'num_leaves': 202, 'min_data_in_leaf': 69, 'min_sum_hessian_in_leaf': 0.004346880252436188, 'feature_fraction': 0.8567769598292199, 'bagging_fraction': 0.8950870417609795, 'lambda_l1': 0.7703796368033595, 'lambda_l2': 0.04354863183028053}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:01:02,711][0m Trial 22 finished with value: 0.6835791852363318 and parameters: {'num_leaves': 242, 'min_data_in_leaf': 39, 'min_sum_hessian_in_leaf': 0.0029312159809115365, 'feature_fraction': 0.797120322493883, 'bagging_fraction': 0.9188562305409576, 'lambda_l1': 0.39491924434752995, 'lambda_l2': 0.11699060869869958}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:01:06,013][0m Trial 23 finished with value: 0.6858263762475676 and parameters: {'num_leaves': 238, 'min_data_in_leaf': 35, 'min_sum_hessian_in_leaf': 0.0025767028972243115, 'feature_fraction': 0.7805825622868416, 'bagging_fraction': 0.8148059316157261, 'lambda_l1': 0.35966599755660583, 'lambda_l2': 0.10619688398329478}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:01:09,386][0m Trial 24 finished with value: 0.6903207582700396 and parameters: {'num_leaves': 249, 'min_data_in_leaf': 35, 'min_sum_hessian_in_leaf': 0.0022135465987980587, 'feature_fraction': 0.7910081527344837, 'bagging_fraction': 0.777147743387992, 'lambda_l1': 0.2892838767901096, 'lambda_l2': 0.15562933607555834}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:01:14,639][0m Trial 25 finished with value: 0.6981545414600465 and parameters: {'num_leaves': 239, 'min_data_in_leaf': 24, 'min_sum_hessian_in_leaf': 0.0017613624830553707, 'feature_fraction': 0.7879963202697107, 'bagging_fraction': 0.7888189187157192, 'lambda_l1': 0.09570534249544699, 'lambda_l2': 0.039817678953394185}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:01:18,300][0m Trial 26 finished with value: 0.6520431862406628 and parameters: {'num_leaves': 196, 'min_data_in_leaf': 18, 'min_sum_hessian_in_leaf': 0.0006918172538679483, 'feature_fraction': 0.7156691855196431, 'bagging_fraction': 0.7759869529031096, 'lambda_l1': 0.06361154197572774, 'lambda_l2': 0.029506192029373737}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:01:27,561][0m Trial 27 finished with value: 0.6824555897307136 and parameters: {'num_leaves': 253, 'min_data_in_leaf': 7, 'min_sum_hessian_in_leaf': 0.0015110604476009741, 'feature_fraction': 0.7981086348840757, 'bagging_fraction': 0.7775152269204683, 'lambda_l1': 0.13213697905758195, 'lambda_l2': 0.03981834857329031}. Best is trial 7 with value: 0.6992530286862093.[0m




[32m[I 2023-04-03 15:01:30,377][0m Trial 28 finished with value: 0.7048207896553889 and parameters: {'num_leaves': 233, 'min_data_in_leaf': 49, 'min_sum_hessian_in_leaf': 0.0068112646256021986, 'feature_fraction': 0.8426315000433712, 'bagging_fraction': 0.7317397959755858, 'lambda_l1': 0.05156762599756539, 'lambda_l2': 0.20885772437641695}. Best is trial 28 with value: 0.7048207896553889.[0m




[32m[I 2023-04-03 15:01:32,165][0m Trial 29 finished with value: 0.6733915008474044 and parameters: {'num_leaves': 161, 'min_data_in_leaf': 57, 'min_sum_hessian_in_leaf': 0.006296757603803257, 'feature_fraction': 0.8381244831959944, 'bagging_fraction': 0.7326320350377127, 'lambda_l1': 0.0223517282592696, 'lambda_l2': 0.265177341262348}. Best is trial 28 with value: 0.7048207896553889.[0m


In [7]:
# Inspect the search result: best objective value (mean CV accuracy) and
# the hyperparameters of the trial that achieved it.
trial = study.best_trial
print("acc(best)={:.4f}".format(trial.value))
display(trial.params)

acc(best)=0.7048


{'num_leaves': 233,
 'min_data_in_leaf': 49,
 'min_sum_hessian_in_leaf': 0.0068112646256021986,
 'feature_fraction': 0.8426315000433712,
 'bagging_fraction': 0.7317397959755858,
 'lambda_l1': 0.05156762599756539,
 'lambda_l2': 0.20885772437641695}

In [8]:
# Assemble the best hyperparameters: tuned values from the best trial plus
# the fixed base settings. A new dict is built instead of calling
# `trial.params.update(...)`, which would modify the dict object returned by
# `trial.params` in place; this way the cell can be re-run safely and
# `trial.params` keeps showing only the tuned values.
params_best = {**trial.params, **params_base}
display(params_best)

{'num_leaves': 233,
 'min_data_in_leaf': 49,
 'min_sum_hessian_in_leaf': 0.0068112646256021986,
 'feature_fraction': 0.8426315000433712,
 'bagging_fraction': 0.7317397959755858,
 'lambda_l1': 0.05156762599756539,
 'lambda_l2': 0.20885772437641695,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.02,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'seed': 123}

# LightGBM以外のモデル利用

In [9]:
# Titanic example: logistic regression.
# Load the file.
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset. x_train is modified column-by-column in later cells, so
# take an explicit copy: writing into a selection of df_train is what raises
# pandas' SettingWithCopyWarning.
x_train = df_train[["Pclass", "Age", "Embarked"]].copy()
y_train = df_train[["Survived"]]

In [10]:
# Count the missing values in each feature column.
x_train.isnull().sum()

Pclass        0
Age         177
Embarked      2
dtype: int64

In [11]:
# Impute missing values:
#   - Age (numeric): filled with the column mean
#   - Embarked (categorical): filled with the most frequent value
x_train = x_train.fillna({
    "Age": x_train["Age"].mean(),
    "Embarked": x_train["Embarked"].mode()[0],
})

In [12]:
# Convert the categorical variable to numbers (one-hot encoding).
ohe = OneHotEncoder()
ohe.fit(x_train[["Embarked"]])
df_embarked = pd.DataFrame(
    ohe.transform(x_train[["Embarked"]]).toarray(),
    columns=["Embarked_{}".format(col) for col in ohe.categories_[0]],
    # Reuse x_train's index: pd.concat(axis=1) aligns rows by index label, so
    # a fresh RangeIndex would misalign rows (and introduce NaNs) whenever
    # x_train's index is not the default 0..n-1.
    index=x_train.index,
)

# Replace the original Embarked column with its one-hot columns.
x_train = pd.concat([x_train.drop(columns=["Embarked"]), df_embarked], axis=1)

In [13]:
# Min-max normalisation of the numeric columns: (x - min) / (max - min),
# computed for Pclass and Age in one vectorised step.
x_train[["Pclass", "Age"]] = (
    (x_train[["Pclass", "Age"]] - x_train[["Pclass", "Age"]].min())
    / (x_train[["Pclass", "Age"]].max() - x_train[["Pclass", "Age"]].min())
)

In [14]:
# Split into training and validation data (hold-out): 20% validation,
# stratified on the target, with a fixed random_state.
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=123)
print(x_tr.shape, x_va.shape, y_tr.shape, y_va.shape)

(712, 5) (179, 5) (712, 1) (179, 1)


In [15]:
# Logistic regression.
# Model definition
from sklearn.linear_model import LogisticRegression
model_logis = LogisticRegression()

# Training. y_tr is a single-column DataFrame; scikit-learn expects a 1-D
# target and otherwise emits a DataConversionWarning, so flatten it first.
model_logis.fit(x_tr, y_tr.values.ravel())

# Prediction (class labels) and validation accuracy
y_va_pred = model_logis.predict(x_va)
print("accuracy:{:.4f}".format(accuracy_score(y_va, y_va_pred)))
print(y_va_pred[:5])

accuracy:0.7263
[0 1 0 1 0]


In [16]:
# Get the predicted class probabilities and show the first five rows.
y_va_pred_proba = model_logis.predict_proba(x_va)
print(y_va_pred_proba[:5, :])

[[0.83621285 0.16378715]
 [0.23058311 0.76941689]
 [0.83244141 0.16755859]
 [0.32227072 0.67772928]
 [0.62569522 0.37430478]]


In [17]:
# Titanic example: SVM.
# Model definition
from sklearn.svm import SVC
model_svm = SVC(C=1.0, random_state=123, probability=True)  # probability=True enables predict_proba

# Training. y_tr is a single-column DataFrame; scikit-learn expects a 1-D
# target and otherwise emits a DataConversionWarning, so flatten it first.
model_svm.fit(x_tr, y_tr.values.ravel())

# Prediction (class labels) and validation accuracy
y_va_pred = model_svm.predict(x_va)
print("accuracy:{:.4f}".format(accuracy_score(y_va, y_va_pred)))
print(y_va_pred[:5])

# Predicted class probabilities
y_va_pred_proba = model_svm.predict_proba(x_va)
print(y_va_pred_proba[:5, :])

accuracy:0.7151
[0 1 0 1 0]
[[0.73985924 0.26014076]
 [0.28242534 0.71757466]
 [0.73986177 0.26013823]
 [0.26828214 0.73171786]
 [0.58950192 0.41049808]]


# ニューラルネットワーク
### ニューラルネットワークの適用例：①全結合層のみのネットワークモデル

In [18]:
#Tensorflowインポート
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.layers import Embedding, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD

In [19]:
# Seed setup for TensorFlow reproducibility.
def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow RNGs and install a single-threaded TF1-style session.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed``, ``np.random.seed`` and
        ``tf.random.set_seed``; its string form is written to the
        ``PYTHONHASHSEED`` environment variable.

    Notes
    -----
    NOTE(review): setting ``PYTHONHASHSEED`` here presumably only affects
    child processes, not the already-running interpreter -- confirm.
    """
    import random

    # Seed every random number generator in use.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Register a compat.v1 session limited to one intra-op and one inter-op
    # thread as the Keras backend session.
    tf_v1 = tf.compat.v1
    single_thread_conf = tf_v1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    tf_v1.keras.backend.set_session(
        tf_v1.Session(graph=tf_v1.get_default_graph(), config=single_thread_conf)
    )

In [20]:
# Load the file.
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset. x_train is modified column-by-column in later cells, so
# take an explicit copy: writing into a selection of df_train is what raises
# pandas' SettingWithCopyWarning.
x_train = df_train[["Pclass", "Age", "Embarked"]].copy()
y_train = df_train[["Survived"]]

In [21]:
# Impute missing Age values with the column mean.
x_train["Age"] = x_train["Age"].fillna(x_train["Age"].mean())

# Min-max normalisation, applied after imputation so that each column
# ends up in the 0-1 range.
for col in ("Pclass", "Age"):
    value_min, value_max = x_train[col].min(), x_train[col].max()
    x_train[col] = (x_train[col] - value_min) / (value_max - value_min)

In [22]:
# Impute missing Embarked values with the most frequent value.
x_train["Embarked"] = x_train["Embarked"].fillna(x_train["Embarked"].mode()[0])

# Convert Embarked with one-hot encoding.
ohe = OneHotEncoder()
ohe.fit(x_train[["Embarked"]])
df_embarked = pd.DataFrame(ohe.transform(x_train[["Embarked"]]).toarray(),
                           columns=["Embarked_{}".format(col) for col in ohe.categories_[0]],
                           # Reuse x_train's index: pd.concat(axis=1) aligns rows by
                           # index label, so a fresh RangeIndex would misalign rows
                           # whenever x_train's index is not the default 0..n-1.
                           index=x_train.index)
x_train = pd.concat([x_train.drop(columns=["Embarked"]),
                     df_embarked], axis=1)

In [23]:
# Split into training and validation data: 20% validation, stratified on
# the target, with a fixed random_state.
x_tr, x_va, y_tr, y_va = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=123)
print(x_tr.shape, x_va.shape, y_tr.shape, y_va.shape)

(712, 5) (179, 5) (712, 1) (179, 1)


In [24]:
# Model definition: by default 5 input nodes (one per feature column) and
# three hidden layers of 10 / 10 / 5 units.
def create_model(input_dim=5):
    """Build and compile the fully connected binary classifier.

    The network stacks three hidden stages, each made of
    Dense(relu) -> BatchNormalization -> Dropout, with 10, 10 and 5 units and
    dropout rates 0.3, 0.2 and 0.1, followed by a single sigmoid output unit.

    Parameters
    ----------
    input_dim : int, default 5
        Number of input features. The default matches the five columns of
        the dataset prepared in this notebook, so ``create_model()`` behaves
        as before.

    Returns
    -------
    tensorflow.keras.Model
        Model compiled with the Adam optimizer and binary cross-entropy as
        both loss and reported metric.
    """
    input_num = Input(shape=(input_dim,))

    # (units, dropout rate) for each hidden stage, in order.
    x_num = input_num
    for units, dropout_rate in ((10, 0.3), (10, 0.2), (5, 0.1)):
        x_num = Dense(units, activation="relu")(x_num)
        x_num = BatchNormalization()(x_num)
        x_num = Dropout(dropout_rate)(x_num)
    out = Dense(1, activation="sigmoid")(x_num)

    model = Model(inputs=input_num,
                  outputs=out,
                 )

    model.compile(
        optimizer="Adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy"],
    )

    return model

# Instantiate the network and print its layer-by-layer summary.
model = create_model()
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 5)]               0         
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 batch_normalization (BatchN  (None, 10)               40        
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 10)                110       
                                                                 
 batch_normalization_1 (Batc  (None, 10)               40        
 hNormalization)                                             

In [25]:
# Model training.
# Mini-batch size is 8. If val_loss does not improve for 5 consecutive
# epochs the learning rate is multiplied by 0.1; if it does not improve for
# 10 consecutive epochs training is stopped.
seed_everything(seed=123)
model = create_model()  # rebuild the model after seeding so training starts from a fresh network
model.fit(x=x_tr,
          y=y_tr,
          validation_data=(x_va, y_va),
          batch_size=8,
          epochs=10000,  # upper bound only; EarlyStopping is expected to end training much earlier
          callbacks=[
              # Save the weights to model_keras.h5 whenever val_loss reaches a new minimum.
              ModelCheckpoint(filepath="model_keras.h5", monitor="val_loss", mode="min", verbose=1, save_best_only=True, save_weights_only=True),
              # Stop after 10 epochs without val_loss improvement and restore the best weights.
              EarlyStopping(monitor="val_loss", mode="min", min_delta=0, patience=10, verbose=1, restore_best_weights=True),
              # Reduce the learning rate by a factor of 0.1 after 5 epochs without val_loss improvement.
              ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.1, patience=5, verbose=1),
          ],
          verbose=1,
         )

Epoch 1/10000
Epoch 1: val_loss improved from inf to 0.68175, saving model to model_keras.h5
Epoch 2/10000
Epoch 2: val_loss improved from 0.68175 to 0.66809, saving model to model_keras.h5
Epoch 3/10000
Epoch 3: val_loss improved from 0.66809 to 0.65506, saving model to model_keras.h5
Epoch 4/10000
Epoch 4: val_loss improved from 0.65506 to 0.63972, saving model to model_keras.h5
Epoch 5/10000
Epoch 5: val_loss improved from 0.63972 to 0.62351, saving model to model_keras.h5
Epoch 6/10000
Epoch 6: val_loss improved from 0.62351 to 0.61782, saving model to model_keras.h5
Epoch 7/10000
Epoch 7: val_loss improved from 0.61782 to 0.61302, saving model to model_keras.h5
Epoch 8/10000
Epoch 8: val_loss did not improve from 0.61302
Epoch 9/10000
Epoch 9: val_loss did not improve from 0.61302
Epoch 10/10000
Epoch 10: val_loss improved from 0.61302 to 0.60942, saving model to model_keras.h5
Epoch 11/10000
Epoch 11: val_loss improved from 0.60942 to 0.60898, saving model to model_keras.h5
Epoch

<keras.callbacks.History at 0x7f2c36cbeed0>

In [26]:
# Model evaluation: predict on the validation data and compute accuracy
# with the sigmoid output thresholded at 0.5.
y_va_pred = model.predict(x_va, batch_size=8, verbose=1)
print("accuracy: {:.4f}".format(accuracy_score(y_va, np.where(y_va_pred>=0.5,1,0))))

accuracy: 0.7039


### ニューラルネットワークの適用例：②埋め込み層ありのネットワークモデル

In [27]:
# Load the file.
df_train = pd.read_csv("../input/titanic/train.csv")

# Build the dataset. Take an explicit copy of the selected columns so that
# any later column assignment on x_train does not write into a selection of
# df_train (the pattern that raises pandas' SettingWithCopyWarning).
x_train = df_train[["Pclass", "Age", "Cabin"]].copy()
y_train = df_train[["Survived"]]