In [22]:
# kaggle titanic
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [23]:
# dataset
DATA_DIR = './data/'
TRAIN_DATA = 'train.csv'
TEST_DATA = 'test.csv'
SUBMIT_DATA = 'gender_submission.csv'

In [24]:
# load train data
RAW_DATA = pd.read_csv(os.path.join(DATA_DIR, TRAIN_DATA))

data_test = pd.read_csv(os.path.join(DATA_DIR, TEST_DATA)) 

data = RAW_DATA

In [25]:
# fillna
data['Fare'] = data['Fare'].fillna(np.mean(data['Fare']), inplace=True)

data.isnull().sum()

# Objective variable and Explanatory variable
y_1 = data['Survived']
X_1 = data.drop(columns={'Survived', 'Name', 'Ticket', 'Embarked', 'Cabin'})
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_1_le = X_1
Sex_le = le.fit_transform(X_1_le['Sex'])
X_1['Sex'] = Sex_le
X_1 = X_1.fillna(0)

In [26]:
# fillna
data_test['Fare'] = data_test['Fare'].fillna(np.mean(data_test['Fare']), inplace=True)

# Objective variable and Explanatory variable
# y_1 = data['Survived']
data_test = data_test.drop(columns={'Name', 'Ticket', 'Embarked', 'Cabin'})
data_test_le = data_test
data_test_sex_le = le.fit_transform(data_test_le['Sex'])
data_test_le['Sex'] = data_test_sex_le
data_test_le = data_test_le.fillna(0)
data_predict = data_test_le

In [27]:
X_train1, X_test1, y_train, y_test = train_test_split(X_1, y_1, random_state=0, test_size=0.3)

In [28]:
# GridSearchCV
# パラメータを dict 型で指定
param_grid = {"eta": [0.3, 0.5, 0.7], "colsample_bytree":[0.5, 0.8], "subsample": [0.2, 0.7], "lambda": [0, 1],
             "learning_rate":[0.05, 0.1, 0.3, 0.5]}

# validation set は GridSearchCV が自動で作成してくれるため，
# training set と test set の分割のみを実行すればよい
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train1, y_train, random_state=0)

grid_search = GridSearchCV(xgb.XGBRFClassifier(), param_grid, cv=5)

# fit 関数を呼ぶことで交差検証とグリッドサーチがどちらも実行される
grid_search.fit(X_train2, y_train2)

print('Test set score: {}'.format(grid_search.score(X_test2, y_test2)))
print('Best parameters: {}'.format(grid_search.best_params_))
print('Best cross-validation: {}'.format(grid_search.best_score_))

Test set score: 0.8012820512820513
Best parameters: {'colsample_bytree': 0.8, 'eta': 0.3, 'lambda': 0, 'learning_rate': 0.05, 'subsample': 0.7}
Best cross-validation: 0.8159917638984215


In [29]:
# GridSearchCVの結果をもとにXGBClassifierのパラーメータ指定
xgb_bst = xgb.XGBClassifier(objective="binary:logistic",
                            colsample_bytree=0.8,
                            eta=0.3,
                            # lambda= 0,
                            learning_rate=0.05,
                            subsample=0.7,
                           eval_metric= 'logloss')

In [30]:
# 学習
# GridSearchCVで分割したデータを使う
# X_train2, X_test2, y_train2, y_test2 = train_test_split(X_train1, y_train, random_state=0)
xgb_bst.fit(X_train2, y_train2)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.3,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.7, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [31]:
# xgb_bst.predict(data_predict)
xgb_bst_pred = xgb_bst.predict(data_predict)

In [32]:
# xgbでモデリング
xgb3 = xgb.XGBClassifier(
    objective= "binary:logistic",
    colsample_bytree= 0.8,
    eta= 0.3,
    eval_metric= 'logloss',
    # lambda= 0,
    learning_rate= 0.05,
    max_depth=4,
    n_estimators=20,
    subsample= 0.7)

In [33]:
# 学習
xgb3.fit(X_train1, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.3,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.7, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [34]:
# 予測
xgb3_pred = xgb3.predict(data_predict)

In [35]:
xgb3_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [36]:
# 交差検証
from sklearn.model_selection import cross_val_score

# xgbの分析結果比較
scores_01 = cross_val_score(xgb_bst ,X_train2, y_train2)
print('Cross-Validation scores: {}'.format(scores_01))

scores_02 = cross_val_score(xgb3 ,X_train1, y_train)
print('Cross-Validation scores: {}'.format(scores_02))

# スコアの平均値
import numpy as np
print('Average score: {}'.format(np.mean(scores_01)))
print('Average score: {}'.format(np.mean(scores_02)))

Cross-Validation scores: [0.76595745 0.82978723 0.8172043  0.8172043  0.84946237]
Cross-Validation scores: [0.792      0.8        0.784      0.79032258 0.80645161]
Average score: 0.8159231297185998
Average score: 0.7945548387096775


In [16]:
data_submit = pd.read_csv(os.path.join(DATA_DIR, SUBMIT_DATA))

In [17]:
# 提出用ファイルに予測結果を連結
# GrideSearchの結果を使った、モデリング結果を提出用ファイルに連結
# data_submit['Survived'] = xgb_bst_pred

In [18]:
# data_submit.shape

In [19]:
# submit data
# data_submit['Survived'] = lr_pred01

In [20]:
# data_submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [21]:
# write csv
# data_submit.to_csv(os.path.join(DATA_DIR, 'submission_xgb05.csv'), index=False)