In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

### データ読み込み, train/test分割, 不要列削除

In [2]:
train = pd.read_csv('../../input/ch01-titanic/train.csv')
test = pd.read_csv('../../input/ch01-titanic/test.csv')

train_x = train.drop(['Survived'], axis=1)
train_y = train['Survived']

test_x = test.copy()

train_x = train_x.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_x.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [3]:
train_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


### カテゴリー変数 => Label Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
for c in ['Sex', 'Embarked']:
    le = LabelEncoder()
    le.fit(train_x[c].fillna('NA'))
    
    train_x[c] = le.transform(train_x[c].fillna('NA'))
    test_x[c] = le.transform(test_x[c].fillna('NA'))

In [6]:
train_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.0,1,0,7.25,3
1,1,0,38.0,1,0,71.2833,0
2,3,0,26.0,0,0,7.925,3
3,1,0,35.0,1,0,53.1,3
4,3,1,35.0,0,0,8.05,3


### 学習・予測

In [7]:
from xgboost import XGBClassifier

In [8]:
model = XGBClassifier(n_estimators=20, random_state=71)
model.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=20,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=71, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [9]:
pred = model.predict_proba(test_x)[:, 1]
pred_label = np.where(pred > 0.5, 1, 0)

submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred_label})
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


### モデルの評価

In [10]:
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import KFold

In [11]:
scores_accuracy = []
scores_logloss = []

kf = KFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
    
    # trで学習, vaで評価
    model = XGBClassifier(n_estimators=20, random_state=71)
    model.fit(tr_x, tr_y)
    
    va_pred = model.predict_proba(va_x)[:, 1]
    
    logloss = log_loss(va_y, va_pred)
    accuracy = accuracy_score(va_y, va_pred > 0.5)
    
    # そのfoldにおけるscore保存
    scores_accuracy.append(accuracy)
    scores_logloss.append(logloss)

accuracy = np.mean(scores_accuracy)
logloss = np.mean(scores_logloss)

print(f'accuracy: {accuracy:.4f}, logloss: {logloss:.4f}')

accuracy: 0.8148, logloss: 0.4270


### ハイパーパラメータチューニング

In [12]:
from itertools import product

In [13]:
param_space = {
    'max_depth': [3, 5, 7],
    'min_child_weight': [1.0, 2.0, 4.0]
}

param_combinations = product(*param_space.values())

params = []
scores = []

for max_depth, min_child_weight in param_combinations:
    score_folds = []
    for tr_idx, va_idx in kf.split(train_x):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        
        model = XGBClassifier(n_estimators=20, max_depth=max_depth, min_child_weight=min_child_weight, random_state=71)
        model.fit(tr_x, tr_y)
        
        va_pred = model.predict_proba(va_x)[:, 1]
        logloss = log_loss(va_y, va_pred)
        
        score_folds.append(logloss)
    params.append([max_depth, min_child_weight])
    scores.append(np.mean(logloss))

best_params = params[np.argmax(scores)]
best_params

[3, 4.0]

### 予測値の出力

In [14]:
max_depth, min_child_weight = best_params
model = XGBClassifier(n_estimators=20, max_depth=max_depth, min_child_weight=min_child_weight, random_state=71)
model.fit(train_x, train_y)

pred = model.predict_proba(test_x)[:, 1]

pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred}).to_csv('pred1.csv', index=False)

### 別モデル作成

In [15]:
train = pd.read_csv('../../input/ch01-titanic/train.csv')
test = pd.read_csv('../../input/ch01-titanic/test.csv')

train_x = train.drop(['Survived'], axis=1)
train_y = train['Survived']

test_x = test.copy()

train_x = train_x.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_x = test_x.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [16]:
from sklearn.preprocessing import OneHotEncoder
# より、pd_get_dummiesのほうが楽だが、trainとtestをconcatした状態で実施する必要がある。
# cat_cols = ['Sex', 'Embarked', 'Pclass']
# pd.get_dummies(train_x[cat_cols].fillna('NA'), columns=cat_cols)

In [17]:
# カテゴリー変数 => Onehot Encoding

cat_cols = ['Sex', 'Embarked', 'Pclass']

ohe = OneHotEncoder(categories='auto', sparse=False)
ohe.fit(train_x[cat_cols].fillna('NA'))

ohe_columns = []
for i, c in enumerate(cat_cols):
    ohe_columns += [f'{c}_{v}' for v in ohe.categories_[i]]

ohe_train_x = pd.DataFrame(ohe.transform(train_x[cat_cols].fillna('NA')), columns=ohe_columns)
ohe_test_x = pd.DataFrame(ohe.transform(test_x[cat_cols].fillna('NA')), columns=ohe_columns)

train_x = pd.concat([train_x.drop(cat_cols, axis=1), ohe_train_x], axis=1)
test_x = pd.concat([test_x.drop(cat_cols, axis=1), ohe_test_x], axis=1)

In [18]:
# 数値変数 => Mean Encoding

num_cols = ['Age', 'SibSp', 'Parch', 'Fare']

for col in num_cols:
    train_x[col].fillna(train_x[col].mean(), inplace=True)
    test_x[col].fillna(train_x[col].mean(), inplace=True)

# Fare のみ対数変換 ( np.log1p : log(1 + x))
train_x['Fare'] = np.log1p(train_x['Fare'])
test_x['Fare'] = np.log1p(test_x['Fare'])

In [19]:
# 学習
from sklearn.linear_model import LogisticRegression

In [20]:
model = LogisticRegression(solver='lbfgs', max_iter=300)
model.fit(train_x, train_y)
pred = model.predict_proba(test_x)[:, 1]
pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred}).to_csv('pred2.csv', index=False)

### アンサンブル

In [21]:
pred1 = pd.read_csv('pred1.csv')
pred2 = pd.read_csv('pred2.csv')

# 加重平均（weighted averaging）
pred = np.where(pred1['Survived'] * 0.8 + pred2['Survived'] * 0.2 > 0.5, 1, 0)
pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,