In [140]:
# titanic
# KFoldを適用
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
# from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [141]:
# データ・セット
DATA_DIR = './data/'
TRAIN_DATA = 'train.csv'
DATA = pd.read_csv(os.path.join(DATA_DIR, TRAIN_DATA))

In [142]:
DATA.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [143]:
# 性別をダミー変数化
DATA['Sex'] = DATA['Sex'].replace(['male', 'female'], [0, 1])

In [144]:
DATA['Sex'].unique()

array([0, 1])

In [145]:
DATA.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [146]:
le = LabelEncoder()

In [148]:
# カテゴリカルデータをダミー変数化
embarked_le = le.fit_transform(DATA['Embarked'].values)

name_le = le.fit_transform(DATA['Name'].values)

ticket_le = le.fit_transform(DATA['Ticket'].values)

cabin_le = le.fit_transform(DATA['Cabin'].values)

# ダミー変数化したデータを元データに適用
data = DATA
data['Embarked'] = embarked_le
data['Name'] = name_le
data['Ticket'] = ticket_le
data['Cabin'] = cabin_le

In [149]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,0,22.0,1,0,523,7.25,147,2
1,2,1,1,190,1,38.0,1,0,596,71.2833,81,0
2,3,1,3,353,1,26.0,0,0,669,7.925,147,2
3,4,1,1,272,1,35.0,1,0,49,53.1,55,2
4,5,0,3,15,0,35.0,0,0,472,8.05,147,2


In [150]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [151]:
# 年齢の欠損処理
# 平均で埋めて、データフレームに適用
data['Age'] = data['Age'].fillna(data['Age'].mean())

In [152]:
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [153]:
# 目的変数と説明変数に分割
y = data['Survived']

# y = np.array([0, 0, 1, 1])
X = data.drop(columns={'Survived'})

# 欠損値確認
# X.isnull().sum()

# Ageの欠損値を平均で穴埋め
X['Age'] = X['Age'].fillna(X['Age'].mean())

In [154]:
# KFoldで交差検証
kf = KFold(n_splits=3)
kf.get_n_splits(X, y)

print(kf)

for train_index, test_index in kf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train1, X_test1 = X.iloc[train_index], X.iloc[test_index]
    y_train1, y_test1 = y.iloc[train_index], y.iloc[test_index]

KFold(n_splits=3, random_state=None, shuffle=False)
TRAIN: [297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332
 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350
 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458
 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512
 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 

In [155]:
X_train1.shape, y_train1.shape, X_test1.shape, y_test1.shape

((594, 11), (594,), (297, 11), (297,))

In [156]:
# LogisticRegressionCV?

[0;31mInit signature:[0m
[0mLogisticRegressionCV[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mCs[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfit_intercept[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcv[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdual[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpenalty[0m[0;34m=[0m[0;34m'l2'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscoring[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msolver[0m[0;34m=[0m[0;34m'lbfgs'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclass_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m

In [159]:
# model1
# ロジスティック回帰
lr = LogisticRegression(random_state=0, max_iter=300, C=0.05)
lr.fit(X_train1, y_train1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=0.05, max_iter=300, random_state=0)

In [160]:
# モデルの精度
lr.score(X_train1, y_train1)

0.7878787878787878

In [161]:
# model2
# xgboost
# xgbでモデリング
xgb01 = xgb.XGBClassifier(
    objective= "binary:logistic",
    colsample_bytree= 0.8,
    eta= 0.3,
    eval_metric= 'logloss',
    # lambda= 0,
    learning_rate= 0.05,
    max_depth=4,
    n_estimators=20,
    subsample= 0.7)

In [162]:
xgb01.fit(X_train1, y_train1)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, eta=0.3,
              eval_metric='logloss', gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=4, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=20, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.7, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [163]:
# 精度検証
xgb01.score(X_train1, y_train1)

0.8720538720538721

In [164]:
# model3
# randomforest
rf01 = RandomForestClassifier(random_state=0)

rf01.fit(X_train1, y_train1)

# 精度検証
rf01.score(X_train1, y_train1)

1.0

In [165]:
# model4
# randomforest
rf02 = RandomForestClassifier(random_state=0, n_estimators=20, class_weight='balanced')

rf02.fit(X_train1, y_train1)

# 精度検証
rf02.score(X_train1, y_train1)

0.9932659932659933

In [197]:
LogisticRegressionCV?

[0;31mInit signature:[0m
[0mLogisticRegressionCV[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mCs[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfit_intercept[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcv[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdual[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpenalty[0m[0;34m=[0m[0;34m'l2'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscoring[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msolver[0m[0;34m=[0m[0;34m'lbfgs'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclass_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m

In [198]:
# model5
# randomforest
lrcv01 = LogisticRegressionCV(random_state=0, max_iter=300, cv=3)

lrcv01.fit(X_train1, y_train1)

# 精度検証
lrcv01.score(X_train1, y_train1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8080808080808081

In [166]:
# 交差検証
from sklearn.model_selection import cross_val_score

# 分析結果比較
scores_01 = cross_val_score(lr ,X_train1, y_train1)
# print('Cross-Validation scores: {}'.format(scores_01))

scores_02 = cross_val_score(xgb01 ,X_train1, y_train1)
# print('Cross-Validation scores: {}'.format(scores_02))

scores_03 = cross_val_score(rf02 ,X_train1, y_train1)
# print('Cross-Validation scores: {}'.format(scores_02))

scores_04 = cross_val_score(lrcv01 ,X_train1, y_train1)
# print('Cross-Validation scores: {}'.format(scores_02))

# スコアの平均値
# import numpy as np
# print('Average score: {}'.format(np.mean(scores_01)))
# print('Average score: {}'.format(np.mean(scores_02)))



In [167]:
from sklearn.model_selection import cross_validate

In [168]:
#cv_scores = cross_validate(gnb, iris.data, iris.target,
#                            cv=kf, scoring=scoring)

# 分析結果比較
scores_criteria = ['balanced_accuracy', 'precision', 'recall', 'f1']  # 2値分類で有効そうな指標
cv_scores_01 = cross_validate(lr ,X_train1, y_train1, cv=3, scoring=scores_criteria)

cv_scores_02 = cross_validate(xgb01 ,X_train1, y_train1, cv=3, scoring=scores_criteria)

cv_scores_03 = cross_validate(rf02 ,X_train1, y_train1, cv=3, scoring=scores_criteria)

cv_scores_04 = cross_validate(lrcv01 ,X_train1, y_train1, cv=3, scoring=scores_criteria)



In [169]:
# seriesにcross_val_scoreを格納
ser_score_01 = pd.Series(scores_01)
ser_score_02 = pd.Series(scores_02)
ser_score_03 = pd.Series(scores_03)
ser_score_04 = pd.Series(scores_04)

In [170]:
# seriesにcross_validateを格納
ser_cv_score_01 = pd.Series(cv_scores_01)
ser_cv_score_02 = pd.Series(cv_scores_02)
ser_cv_score_03 = pd.Series(cv_scores_03)
ser_cv_score_04 = pd.Series(cv_scores_04)

In [171]:
# cross_val_scoreの平均値
# print("CrossValSore_01の平均値", ser_score_01.mean(), "CrossValScore_02の平均値,", ser_score_02.mean())

CrossValSore_01の平均値 0.8198262355789773 CrossValScore_02の平均値, 0.8265774106252671


In [172]:
print(ser_cv_score_01)

fit_time                  [0.044014692306518555, 0.022696971893310547]
score_time                 [0.017159700393676758, 0.00801396369934082]
test_balanced_accuracy        [0.7450854700854701, 0.5032051282051282]
test_precision                [0.7717391304347826, 0.3956043956043956]
test_recall                   [0.6068376068376068, 0.9230769230769231]
test_f1                       [0.6794258373205742, 0.5538461538461539]
dtype: object


In [173]:
# pd.DataFrame(cv_scores_02)
print(ser_cv_score_02)

fit_time                   [0.04112958908081055, 0.043048858642578125]
score_time                [0.011833429336547852, 0.009324073791503906]
test_balanced_accuracy       [0.6858974358974359, 0.49786324786324787]
test_precision               [0.8333333333333334, 0.39285714285714285]
test_recall                  [0.42735042735042733, 0.9401709401709402]
test_f1                       [0.5649717514124294, 0.5541561712846348]
dtype: object


In [174]:
# pd.DataFrame(cv_scores_03)
print(ser_cv_score_03)

CrossValidate_01の平均値 [0.47737707 0.40107392] CrossValidate_02の平均値, [0.42741933 0.40623674]


In [None]:
# pd.DataFrame(cv_scores_02)
print(ser_cv_score_04)

In [175]:
# 予測用ファイルの読み込み
TEST_DATA = 'test.csv'
predict_data = pd.read_csv(os.path.join(DATA_DIR, TEST_DATA))

In [176]:
predict_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [177]:
# 予測用ファイルをモデリングに使った学習用・テスト用ファイルに合わせ修正

In [179]:
predict_data.shape, X_train1.shape, y_train1.shape, y_test1.shape

((418, 11), (594, 11), (594,), (297,))

In [191]:
# 予測用ファイルを修正 
# 性別をダミー変数化
predict_data['Sex'] = predict_data['Sex'].replace(['male', 'female'], [0, 1])

# カテゴリカルデータをダミー変数化
embarked_le_pred = le.fit_transform(predict_data['Embarked'].values)

name_le_pred = le.fit_transform(predict_data['Name'].values)

ticket_le_pred = le.fit_transform(predict_data['Ticket'].values)

cabin_le_pred = le.fit_transform(predict_data['Cabin'].values)

# ダミー変数化したデータを元データに適用
# data = DATA
predict_data['Embarked'] = embarked_le_pred
predict_data['Name'] = name_le_pred
predict_data['Ticket'] = ticket_le_pred
predict_data['Cabin'] = cabin_le_pred

In [193]:
predict_data.shape

(418, 11)

In [195]:
xgb_pred01 = xgb01.predict(predict_data)

In [199]:
# 提出用ファイル読み込み
SUBMIT_DATA = 'gender_submission.csv'
data_submit = pd.read_csv(os.path.join(DATA_DIR, SUBMIT_DATA))

In [200]:
# 提出用ファイルに予測結果を連結
data_submit['Survived'] = xgb_pred01

In [201]:
# write csv
data_submit.to_csv(os.path.join(DATA_DIR, 'submission_xgb06.csv'), index=False)