# タイタニックのデータで試す

## 5.1 前処理
### 5.1.1 カテゴリ変数の変換

In [None]:
import pandas as pd
import numpy as np

inputフォルダのファイルを表示

In [None]:
ls ../input

In [None]:
# Load the training data, the test data and the sample submission file.
csv_paths = (
    '../input/train.csv',
    '../input/test.csv',
    '../input/gender_submission.csv',
)
train, test, sample_submission = (pd.read_csv(path) for path in csv_paths)

In [None]:
# Preview the first few rows of the training data.
train.head()

In [None]:
# Preprocessing: one-hot encode the categorical columns, then drop the
# columns that are not used as features.
# Note: Sex is binary, so one-hot encoding it is not required (it is allowed,
# but usually not done); it is encoded here anyway.
categorical_cols = ['Sex', 'Embarked']
unused_cols = ['PassengerId', 'Name', 'Cabin', 'Ticket']

# NOTE(review): train and test are encoded independently, so this presumably
# relies on both frames containing the same set of categories - confirm.
train = pd.get_dummies(train, columns=categorical_cols).drop(columns=unused_cols)
test = pd.get_dummies(test, columns=categorical_cols).drop(columns=unused_cols)

# Show the preprocessed training data.
display(train.head())

## 5.2 ホールドアウト法での学習・推論

![](https://cdn.discordapp.com/attachments/507208726864855060/577789809330880532/20190514_.012.jpeg)

In [None]:
# Separate the target column from the features.
target_col = 'Survived'
Y_train = train[target_col]  # Y_train is the Survived column of train
X_train = train.drop(columns=[target_col])  # X_train is every other column

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold-out split of X_train / Y_train: 33% of the rows become the validation
# set; random_state is fixed so the split is reproducible.
train_x, valid_x, train_y, valid_y = train_test_split(
    X_train, Y_train, test_size=0.33, random_state=0
)

# Instantiate the LightGBM classifier with the binary objective.
gbm = lgb.LGBMClassifier(objective='binary')

# Train on the training part while evaluating on the validation part.
# Early stopping and logging are configured through callbacks because the
# `early_stopping_rounds` / `verbose` keyword arguments of fit() were
# deprecated in LightGBM 3.3 and removed in 4.0.
gbm.fit(
    train_x, train_y,
    eval_set=[(valid_x, valid_y)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),  # stop after 20 rounds without improvement
        lgb.log_evaluation(period=10),  # print the validation loss every 10 rounds
    ],
);  # trailing semicolon keeps the notebook from echoing the fitted model

In [None]:
# Predict the validation rows. "oof" stands for out-of-fold: predictions for
# data that was not used to train the model.
oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)

# Report the accuracy as a percentage rounded to two decimals.
accuracy_pct = round(accuracy_score(valid_y, oof) * 100, 2)
print('score', accuracy_pct, '%')

In [None]:
# Predict the test set with the hold-out model and write the submission file.
test_pred = gbm.predict(test, num_iteration=gbm.best_iteration_)

# Replace the Survived column of the sample submission with the predictions.
sample_submission = sample_submission.assign(Survived=test_pred)
sample_submission.to_csv('train_test_split.csv', index=False)

## 5.3 3分割交差検証での学習

![](https://cdn.discordapp.com/attachments/507208726864855060/577789829518065675/20190514_.014.jpeg)

In [None]:
# 3-fold cross-validation: train one model per fold and keep each model
# together with its validation accuracy.
from sklearn.model_selection import KFold

kf = KFold(n_splits=3)  # splitter for 3-fold cross-validation

# Lists that collect the per-fold scores and the fitted models.
score_list = []
models = []

for fold_, (train_index, valid_index) in enumerate(kf.split(X_train, Y_train)):
    # KFold yields positional indices, so both the features and the target are
    # selected with .iloc (label-based Y_train[...] is only correct when the
    # index happens to be the default 0..n-1 range).
    train_x = X_train.iloc[train_index]
    valid_x = X_train.iloc[valid_index]
    train_y = Y_train.iloc[train_index]
    valid_y = Y_train.iloc[valid_index]

    print(f'fold{fold_ + 1} start')

    gbm = lgb.LGBMClassifier(objective='binary')
    # Early stopping is passed as a callback: the `early_stopping_rounds` /
    # `verbose` keyword arguments of fit() were removed in LightGBM 4.0.
    # No log_evaluation callback is added so that per-round metrics are not
    # printed.
    gbm.fit(
        train_x, train_y,
        eval_set=[(valid_x, valid_y)],
        callbacks=[lgb.early_stopping(stopping_rounds=20)],
    )

    # Score the held-out fold (accuracy in percent, two decimals).
    oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)
    score_list.append(round(accuracy_score(valid_y, oof)*100,2))
    models.append(gbm)  # keep the fitted model for predicting the test set later
    print(f'fold{fold_ + 1} end\n' )
print(score_list, '平均score', np.mean(score_list), "%")

In [None]:
# Predict the test set with every fold model and combine the predictions.
# One column per fitted model; the width follows `models` instead of a
# hard-coded 3, so it stays correct if the number of folds changes.
test_pred = np.zeros((len(test), len(models)))

for fold_, gbm in enumerate(models):  # iterate over the already-trained models
    # 1st model -> column 0, 2nd model -> column 1, 3rd model -> column 2.
    test_pred[:, fold_] = gbm.predict(test, num_iteration=gbm.best_iteration_)

# Average the per-model predictions row-wise and threshold at 0.5 to get 0/1.
pred = (np.mean(test_pred, axis=1) > 0.5).astype(int)
sample_submission['Survived'] = pred
sample_submission.to_csv('3-fold_cross-validation.csv',index = False)

## 5.4 パラメータの調整

In [None]:
# Show the current parameter settings of the gbm classifier.
gbm.get_params()

In [None]:
# Exhaustive parameter search with GridSearchCV.
from sklearn.model_selection import GridSearchCV

# Candidate values to try for each parameter.
params = {
    'max_depth': [2, 3, 4, 5],
    'reg_alpha': [0, 1, 10, 100],
    'reg_lambda': [0, 1, 10, 100],
}

gbm = lgb.LGBMClassifier(objective='binary')

# Pass the classifier and the candidate parameters, score each combination
# with 3-fold cross-validation, and fit on the full training data.
grid_search = GridSearchCV(estimator=gbm, param_grid=params, cv=3).fit(X_train, Y_train)

print(grid_search.best_score_)  # best score
print(grid_search.best_params_)  # parameters that achieved the best score

In [None]:
# Cross-validation with the tuned parameters: each fold trains a model,
# scores its held-out part and predicts the test set.
score_list = []
# One column of test predictions per fold; the width follows the splitter
# instead of a hard-coded 3.
test_pred = np.zeros((len(test), kf.get_n_splits()))


for fold_, (train_index, valid_index) in enumerate(kf.split(X_train, Y_train)):
    # KFold yields positional indices, so both the features and the target are
    # selected with .iloc (label-based Y_train[...] is only correct when the
    # index happens to be the default 0..n-1 range).
    train_x = X_train.iloc[train_index]
    valid_x = X_train.iloc[valid_index]
    train_y = Y_train.iloc[train_index]
    valid_y = Y_train.iloc[valid_index]

    print(f'fold{fold_ + 1} start')

    gbm = lgb.LGBMClassifier(objective='binary', max_depth=3, reg_alpha=1,
                             reg_lambda=0)
    # Early stopping is passed as a callback: the `early_stopping_rounds` /
    # `verbose` keyword arguments of fit() were removed in LightGBM 4.0.
    # No log_evaluation callback is added so that per-round metrics are not
    # printed.
    gbm.fit(
        train_x, train_y,
        eval_set=[(valid_x, valid_y)],
        callbacks=[lgb.early_stopping(stopping_rounds=20)],
    )

    # Score the held-out fold and store this fold's test predictions.
    oof = gbm.predict(valid_x, num_iteration=gbm.best_iteration_)
    score_list.append(round(accuracy_score(valid_y, oof)*100,2))
    test_pred[:, fold_] = gbm.predict(test, num_iteration=gbm.best_iteration_)
    print(f'fold{fold_ + 1} end\n' )
print(score_list, '平均score', np.mean(score_list))

# Average the per-fold predictions row-wise and threshold at 0.5 to get 0/1.
pred = (np.mean(test_pred, axis=1) > 0.5).astype(int)
sample_submission['Survived'] = pred
sample_submission.to_csv('glid_search.csv', index=False)  # original note: score was 0.77511

# 次のNotebook
https://www.kaggle.com/currypurin/titanic-lightgbm-ex

# 参考資料

- [Parameters Tuning](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html)
  - パラメータチューニング
- [LightGBM’s documentation!](https://lightgbm.readthedocs.io/en/latest/)
  - LightGBMのドキュメント
- [tutorial of kaggle ver3 ch5 optuna\_ex](https://www.kaggle.com/)
  - optunaでのチューニングの例